In [None]:
import os
import shutil
import csv

import pandas as pd

import sys
sys.path.append("..")

import minetext.visualization.wordcloud_visualization as visualization

In [None]:
def create_directory(dirpath):
    """Create the directory ``dirpath``, including any missing parents.

    Does nothing when the directory already exists.

    Args:
        dirpath: Path of the directory to create.

    Raises:
        FileExistsError: If ``dirpath`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first: another process may create the directory
    # between the test and the makedirs() call.
    os.makedirs(dirpath, exist_ok=True)

In [None]:
def delete_directory(dirpath):
    """Recursively remove ``dirpath`` when it exists; otherwise do nothing.

    Errors raised while removing the tree are ignored
    (``shutil.rmtree`` is called with ``ignore_errors=True``).

    Args:
        dirpath: Path of the directory tree to remove.
    """
    if not os.path.exists(dirpath):
        return

    shutil.rmtree(dirpath, ignore_errors=True)

In [None]:
def read_csv_file(dirpath, filename, header=0, index_col=None, encoding="UTF-8", quoting=csv.QUOTE_NONNUMERIC):
    """Load the CSV file ``filename`` found in ``dirpath`` into a DataFrame.

    Args:
        dirpath: Directory containing the file.
        filename: Name of the CSV file inside ``dirpath``.
        header: Forwarded to ``pandas.read_csv`` as ``header``.
        index_col: Forwarded to ``pandas.read_csv`` as ``index_col``.
        encoding: Forwarded to ``pandas.read_csv`` as ``encoding``.
        quoting: Forwarded to ``pandas.read_csv`` as ``quoting``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pandas.read_csv``.
    """
    read_options = {
        "index_col": index_col,
        "header": header,
        "encoding": encoding,
        "quoting": quoting,
    }

    return pd.read_csv(os.path.join(dirpath, filename), **read_options)

In [None]:
def read_csv_directory(dirpath, header=0, encoding="UTF-8", quoting=csv.QUOTE_NONNUMERIC):
    """Read every ``.csv`` file directly inside ``dirpath`` into one DataFrame.

    Files are read with ``read_csv_file`` in sorted filename order and
    concatenated with ``pandas.concat`` (the per-file indexes are kept as-is).

    Args:
        dirpath: Directory to scan (not recursive).
        header: Forwarded to ``read_csv_file``.
        encoding: Forwarded to ``read_csv_file``.
        quoting: Forwarded to ``read_csv_file``.

    Returns:
        A ``pandas.DataFrame`` with the rows of all CSV files, or an empty
        DataFrame when the directory contains no ``.csv`` file.

    Raises:
        OSError: If ``dirpath`` cannot be listed (e.g. it does not exist).
    """
    # os.listdir() returns entries in arbitrary order; sorting makes the row
    # order of the concatenated result reproducible across runs and platforms.
    csv_filenames = sorted(
        filename
        for filename in map(os.fsdecode, os.listdir(dirpath))
        if filename.endswith(".csv")
    )

    if not csv_filenames:
        # pd.concat() raises ValueError on an empty list, so short-circuit.
        return pd.DataFrame()

    dataframes = [
        read_csv_file(dirpath=dirpath, filename=filename, header=header, encoding=encoding, quoting=quoting)
        for filename in csv_filenames
    ]

    return pd.concat(dataframes)

In [None]:
def data_to_csv(data, dirpath, filename, index=True, index_label="index", columns=None, header=True, encoding="UTF-8", quoting=csv.QUOTE_NONNUMERIC):
    """Write ``data`` as a CSV file named ``filename`` inside ``dirpath``.

    The target directory is created first when it is missing. ``data`` is
    wrapped in a ``pandas.DataFrame`` (restricted/ordered by ``columns``)
    and serialised with ``DataFrame.to_csv``.

    Args:
        data: Anything accepted by the ``pandas.DataFrame`` constructor.
        dirpath: Destination directory.
        filename: Name of the CSV file to write.
        index: Forwarded to ``DataFrame.to_csv`` as ``index``.
        index_label: Forwarded to ``DataFrame.to_csv`` as ``index_label``.
        columns: Forwarded to the ``pandas.DataFrame`` constructor.
        header: Forwarded to ``DataFrame.to_csv`` as ``header``.
        encoding: Forwarded to ``DataFrame.to_csv`` as ``encoding``.
        quoting: Forwarded to ``DataFrame.to_csv`` as ``quoting``.
    """
    create_directory(dirpath)

    destination = os.path.join(dirpath, filename)

    pd.DataFrame(data=data, columns=columns).to_csv(
        path_or_buf=destination,
        index=index,
        index_label=index_label,
        header=header,
        encoding=encoding,
        quoting=quoting,
    )

In [None]:
def overall_results_to_csv(results, dirpath, filename, columns=None):
    """Save ``results`` as the CSV file ``filename`` inside ``dirpath``.

    Thin wrapper around ``data_to_csv`` that keeps that function's defaults
    for index, header, encoding and quoting.

    Args:
        results: Data to write; anything ``data_to_csv`` accepts.
        dirpath: Destination directory.
        filename: Name of the CSV file to write.
        columns: Optional column selection/order forwarded to ``data_to_csv``.
    """
    data_to_csv(results, dirpath, filename, columns=columns)

In [None]:
def clusters_to_csv(result, dirpath, collection_field, header=True, columns=None, encoding="UTF-8"):
    """Write one CSV file per cluster for every clustering run in ``result``.

    Files are written to
    ``<dirpath>/clustering/K_<k>/iteraction_<iteraction>/clusters/cluster_<id>.csv``.

    Args:
        result: Mapping with the parallel, index-aligned sequences ``"k"``,
            ``"iteraction"`` and ``"clusters"``. Each entry of ``"clusters"``
            is an iterable of cluster mappings holding an ``"id"`` key and a
            ``collection_field`` key.
        dirpath: Root directory of the results.
        collection_field: Key of each cluster mapping whose value is written
            as the cluster's CSV content.
        header: Forwarded to ``data_to_csv``.
        columns: Forwarded to ``data_to_csv``.
        encoding: Forwarded to ``data_to_csv``.
    """
    clustering_dirpath = os.path.join(dirpath, "clustering")

    # result: k, iteraction, clusters
    for index, k in enumerate(result["k"]):
        iteraction = result["iteraction"][index]
        clusters = result["clusters"][index]

        subdir_clusters = os.path.join(clustering_dirpath,
                                       "K_" + str(k),
                                       "iteraction_" + str(iteraction),
                                       "clusters")

        # Created up front so the directory exists even when a run has no clusters.
        create_directory(subdir_clusters)

        for cluster in clusters:
            cluster_filename = "cluster_" + str(cluster["id"]) + ".csv"

            # Save the cluster. `encoding` was previously accepted but never
            # forwarded, so the caller's choice was silently ignored.
            data_to_csv(data=cluster[collection_field],
                        filename=cluster_filename,
                        dirpath=subdir_clusters,
                        header=header,
                        columns=columns,
                        encoding=encoding)

In [None]:
def most_similar_to_csv(result, dirpath, header=True, encoding="UTF-8"):
    """Write, per cluster, a CSV with that cluster's most similar documents.

    Files are written to
    ``<dirpath>/clustering/K_<k>/iteraction_<iteraction>/most_similar/most_similar_cluster_<id>.csv``
    with the columns ``content``, ``clean_content``, ``cluster`` and
    ``distance``.

    Args:
        result: Mapping with the parallel, index-aligned sequences ``"k"``,
            ``"iteraction"``, ``"clusters"`` and ``"most_similar"``. Each
            ``"most_similar"`` entry is indexed by cluster id and yields
            ``(distance, document)`` pairs, where ``document`` holds the keys
            ``"content"``, ``"clean_content"`` and ``"cluster"``.
        dirpath: Root directory of the results.
        header: Forwarded to ``data_to_csv``.
        encoding: Forwarded to ``data_to_csv``.
    """
    clustering_dirpath = os.path.join(dirpath, "clustering")

    # result: k, iteraction, clusters, most_similar
    for index, k in enumerate(result["k"]):
        iteraction = result["iteraction"][index]
        clusters = result["clusters"][index]
        most_similar = result["most_similar"][index]

        subdir_most_similar = os.path.join(clustering_dirpath,
                                           "K_" + str(k),
                                           "iteraction_" + str(iteraction),
                                           "most_similar")

        # Created up front so the directory exists even when a run has no clusters.
        create_directory(subdir_most_similar)

        for cluster in clusters:
            cluster_id = cluster["id"]

            cluster_most_similar_filename = "most_similar_cluster_" + str(cluster_id) + ".csv"

            # Key order below determines the column order of the CSV.
            cluster_most_similar = [
                {
                    "content": document["content"],
                    "clean_content": document["clean_content"],
                    "cluster": document["cluster"],
                    "distance": distance,
                }
                for distance, document in most_similar[cluster_id]
            ]

            # Save the cluster's n most similar documents. `encoding` was
            # previously accepted but never forwarded to data_to_csv.
            data_to_csv(data=cluster_most_similar,
                        filename=cluster_most_similar_filename,
                        dirpath=subdir_most_similar,
                        header=header,
                        encoding=encoding)

In [None]:
def medoids_to_csv(result, dirpath, encoding="UTF-8"):
    """Write the medoids of every clustering run in ``result`` to CSV.

    For each run the file
    ``<dirpath>/clustering/K_<k>/iteraction_<iteraction>/clusters_medoids.csv``
    is written with the columns ``content``, ``clean_content``, ``rating``
    and ``cluster``.

    Args:
        result: Mapping with the parallel, index-aligned sequences ``"k"``,
            ``"iteraction"`` and ``"medoids"``.
        dirpath: Root directory of the results.
        encoding: Forwarded to ``data_to_csv``.
    """
    medoid_columns = ["content", "clean_content", "rating", "cluster"]
    medoids_filename = "clusters_medoids.csv"
    clustering_root = os.path.join(dirpath, "clustering")

    # result: k, iteraction, medoids
    for position, k_value in enumerate(result["k"]):
        run_iteraction = result["iteraction"][position]
        run_medoids = result["medoids"][position]

        run_dirpath = os.path.join(
            clustering_root,
            "K_" + str(k_value),
            "iteraction_" + str(run_iteraction),
        )

        create_directory(run_dirpath)

        # Save the medoids of this run
        data_to_csv(
            data=run_medoids,
            filename=medoids_filename,
            dirpath=run_dirpath,
            encoding=encoding,
            columns=medoid_columns,
        )

In [None]:
def clusters_to_wordcloud(result, dirpath, collection_field, text_field_name, filename_preffix="wordcloud_cluster"):
    """Generate word clouds for the clusters of every run in ``result``.

    For each run the directory
    ``<dirpath>/clustering/K_<k>/iteraction_<iteraction>/wordclouds`` is
    created and ``visualization.generate_pure_word_cloud_from_clusters`` is
    called with ``save_dir_file`` set to that directory joined with
    ``filename_preffix``.

    Args:
        result: Mapping with the parallel, index-aligned sequences ``"k"``,
            ``"iteraction"`` and ``"clusters"``.
        dirpath: Root directory of the results.
        collection_field: Forwarded to the visualization helper.
        text_field_name: Forwarded to the visualization helper.
        filename_preffix: Last path component of the ``save_dir_file`` value
            handed to the visualization helper (presumably used as a file
            name prefix -- confirm against the helper).
    """
    clustering_root = os.path.join(dirpath, "clustering")

    # result: k, iteraction, clusters
    for position, k_value in enumerate(result["k"]):
        run_clusters = result["clusters"][position]
        run_iteraction = result["iteraction"][position]

        wordclouds_dirpath = os.path.join(
            clustering_root,
            "K_" + str(k_value),
            "iteraction_" + str(run_iteraction),
            "wordclouds",
        )

        create_directory(wordclouds_dirpath)

        visualization.generate_pure_word_cloud_from_clusters(
            save_dir_file=os.path.join(wordclouds_dirpath, filename_preffix),
            clusters=run_clusters,
            collection_field=collection_field,
            text_field_name=text_field_name,
        )

In [None]:
def _parse_prefixed_int(name, prefix):
    """Return the integer that follows ``prefix`` in ``name``, else ``None``."""
    if not name.startswith(prefix):
        return None

    try:
        return int(name[len(prefix):])
    except ValueError:
        return None


def clusters_dataframe_from_csv(result_dirpath, k=None, k_preffix="K_", iteraction=None, iteraction_preffix="iteraction_"):
    """Load saved cluster CSV files into a single DataFrame.

    Scans ``<result_dirpath>/clustering/<k_preffix><k>/<iteraction_preffix><itr>/clusters``
    and reads every CSV found there with ``read_csv_directory``. Two columns,
    ``"k"`` and ``"iteraction"``, are appended to each loaded frame with the
    values parsed from the directory names.

    Args:
        result_dirpath: Root directory of the results.
        k: Restrict loading to this K value; ``None`` loads every K directory.
        k_preffix: Prefix of the per-K directory names.
        iteraction: Restrict loading to this iteration; ``None`` loads every
            iteration directory.
        iteraction_preffix: Prefix of the per-iteration directory names.

    Returns:
        The concatenation of all loaded frames, or an empty DataFrame when no
        matching directory is found.

    Raises:
        OSError: If a directory that has to be listed cannot be read (e.g.
            the ``clustering`` directory is missing and ``k`` is ``None``).
    """
    clustering_dirpath = os.path.join(result_dirpath, "clustering")

    clusters = []

    # `is not None` (rather than truthiness) so that 0 is a valid filter value.
    if k is not None:
        k_subdirs = [k_preffix + str(k)]
    else:
        # Sorted for a reproducible row order; os.listdir() order is arbitrary.
        k_subdirs = sorted(os.listdir(clustering_dirpath))

    for k_subdir in k_subdirs:
        k_subdirpath = os.path.join(clustering_dirpath, k_subdir)

        # Extract K from the directory name using the caller's prefix; entries
        # that do not match (e.g. ".ipynb_checkpoints") are skipped.
        k_value = _parse_prefixed_int(k_subdir, k_preffix)

        if k_value is None or not os.path.isdir(k_subdirpath):
            continue

        if iteraction is not None:
            itr_subdirs = [iteraction_preffix + str(iteraction)]
        else:
            itr_subdirs = sorted(os.listdir(k_subdirpath))

        for itr_subdir in itr_subdirs:
            itr_subdirpath = os.path.join(k_subdirpath, itr_subdir)

            # Extract the iteration number from the directory name.
            itr_value = _parse_prefixed_int(itr_subdir, iteraction_preffix)

            if itr_value is None or not os.path.isdir(itr_subdirpath):
                continue

            clusters_subdirpath = os.path.join(itr_subdirpath, "clusters")

            k_clusters = read_csv_directory(clusters_subdirpath)

            # Tag every row with the K value it was loaded for
            k_clusters.insert(loc=len(k_clusters.columns), column="k", value=k_value)
            # Tag every row with the iteration it was loaded for
            k_clusters.insert(loc=len(k_clusters.columns), column="iteraction", value=itr_value)

            clusters.append(k_clusters)

    if not clusters:
        # pd.concat() raises ValueError on an empty list.
        return pd.DataFrame()

    return pd.concat(clusters)

In [None]:
def overall_results_from_csv(result_dirpath, overall_results_filename="clustering_overall_results.csv"):
    """Load the overall results CSV stored in ``result_dirpath``.

    Args:
        result_dirpath: Directory containing the results file.
        overall_results_filename: Name of the CSV file to read.

    Returns:
        The DataFrame returned by ``read_csv_file`` with its default settings.
    """
    return read_csv_file(dirpath=result_dirpath, filename=overall_results_filename)