In [None]:
import os.path
import copy

import nbimporter

import utils.file_manager as fm
import utils.pre_processing as ppc
import utils.graph_plotter as gp

import minetext.clustering.distance as distance
import minetext.clustering.kmedoids as kmedoids

## Function that reads the export directories located under the exports base directory

In [None]:
def read_exports_directories(export_directories, exports_basedir):
    """Read the raw exports stored in each export directory.

    Args:
        export_directories: Iterable with the names of the sub-directories
            (relative to ``exports_basedir``) to read.
        exports_basedir: Path of the directory that contains those
            sub-directories.

    Returns:
        list: One entry per directory, in input order, holding whatever
        ``fm.read_csv_directory`` returned for it.
    """
    exports = []

    # Iterate over the argument itself, not the similarly named notebook-level
    # global, so the function works on a fresh kernel and honours its input.
    for directory in export_directories:
        export_dirpath = os.path.join(exports_basedir, directory)

        # header=1 is forwarded unchanged to the reader.
        # NOTE(review): presumably the column names sit on the second line of
        # each CSV file -- confirm against fm.read_csv_directory.
        exports.append(fm.read_csv_directory(dirpath=export_dirpath, header=1))

    return exports

## Function that does the pre-processing for each of the exports passed

In [None]:
def pre_processing(exports):
    """Run the review clean-up over every export.

    Args:
        exports: Iterable of raw exports; each one is handed to
            ``ppc.clean_reviews`` unchanged.

    Returns:
        list: The results of ``ppc.clean_reviews``, in the same order as
        ``exports``.
    """
    return [ppc.clean_reviews(raw_export) for raw_export in exports]

## Function that saves the clean exports in CSV files, at the clean exports directory

In [None]:
def save_clean_exports(exports, export_directories, clean_exports_basedir):
    """Write each pre-processed export to ``<directory>.csv``.

    The target directory is first removed through ``fm.delete_directory``, so
    files left over from a previous run are discarded.

    Args:
        exports: Pre-processed exports, aligned position by position with
            ``export_directories``.
        export_directories: Names of the export directories; each name becomes
            the base name of the CSV file written for the matching export.
        clean_exports_basedir: Directory that receives the CSV files.
    """
    # Columns of the exports that are written out ...
    columns = ["Content", "CleanContent", "Source", "Rating"]
    # ... and the header names they receive in the CSV files.
    header = ["content", "clean_content", "source", "rating"]

    fm.delete_directory(clean_exports_basedir)

    # Pair with the argument, not the similarly named notebook-level global,
    # so the caller decides which directories are written.
    for directory, export in zip(export_directories, exports):
        fm.data_to_csv(
            data=export,
            dirpath=clean_exports_basedir,
            filename=directory + ".csv",
            index=False,
            columns=columns,
            header=header,
        )

## Does the K-medoids clustering
Also defines the helper that turns the rows of a dataframe into a list of dicts, keyed by column name

In [None]:
def dataframe_to_dicts(dataframe):
    """Convert every row of a dataframe into a ``{column: value}`` dict.

    All columns of the dataframe are included; the row index is discarded.

    Args:
        dataframe: pandas DataFrame to convert.

    Returns:
        list: One dict per row, in row order. If two columns share a label,
        the value of the last one wins.
    """
    columns = list(dataframe.keys())

    # name=None yields plain tuples shaped (index, value_1, ..., value_n);
    # dropping position 0 leaves the values aligned with `columns`.
    return [dict(zip(columns, row[1:])) for row in dataframe.itertuples(name=None)]

def clustering_kmedoids(export, distance_calculator, k_min=3, k_max=15, iteractions=20, n=20):
    """Run K-medoids for every K in ``[k_min, k_max]`` and keep the best run of each.

    For each K the algorithm is executed ``iteractions`` times; the run with
    the lowest SSE is kept (ties keep the earliest run, because the comparison
    is strict).

    Args:
        export: Dataframe of reviews, converted with ``dataframe_to_dicts``.
        distance_calculator: Object forwarded to ``kmedoids.Kmedoids``.
        k_min: Smallest number of clusters to try (inclusive).
        k_max: Largest number of clusters to try (inclusive).
        iteractions: Number of runs per K.
        n: Forwarded to ``n_most_similar_for_clusters_medoid``.

    Returns:
        dict: Parallel lists under the keys ``"k"``, ``"sse"``,
        ``"iteraction"``, ``"clusters"`` and ``"most_similar"``, with one
        position per K value tried.
    """
    review_documents = dataframe_to_dicts(export)

    result = {"k": [], "sse": [], "iteraction": [], "clusters": [], "most_similar": []}

    for k in range(k_min, k_max + 1):
        best_sse = float("inf")
        best_run = None
        best_clusters = None
        best_most_similar = None

        for run in range(1, iteractions + 1):
            # Every K-medoids instance receives its own deep copy of the
            # documents, so no state is shared between runs.
            model = kmedoids.Kmedoids(
                k=k,
                documents=copy.deepcopy(review_documents),
                distance_calculator=distance_calculator,
                collection_field="reviews",
                text_field_name="clean_content",
            )
            model.clustering()

            run_sse = model.calculate_sse()

            if run_sse < best_sse:
                best_run, best_sse = run, run_sse
                best_clusters = copy.deepcopy(model.get_clusters())
                best_most_similar = copy.deepcopy(model.n_most_similar_for_clusters_medoid(n))

        result["k"].append(k)
        result["sse"].append(best_sse)
        result["iteraction"].append(best_run)
        result["clusters"].append(best_clusters)
        result["most_similar"].append(best_most_similar)

    return result

## Function that reads the clean reviews CSV files in the clean exports directories

In [None]:
def read_clean_exports_directories(export_directories, clean_exports_basedir):
    """Load the pre-processed CSV file of every export directory.

    Args:
        export_directories: Names of the export directories; the file read for
            each one is ``<name>.csv``.
        clean_exports_basedir: Directory that holds those CSV files.

    Returns:
        list: The results of ``fm.read_csv_file``, in the same order as
        ``export_directories``.
    """
    return [
        fm.read_csv_file(dirpath=clean_exports_basedir, filename=name + ".csv")
        for name in export_directories
    ]

## Saves the clustering results in the results directory
Saves overall results, clusters generated, most similar reviews and word clouds of each cluster, for all the K values

In [None]:
def save_clustering_result(result, directory, results_basedir, collection_field="reviews", text_field_name="clean_content", overall_results_filename="clustering_overall_results.csv"):
    """Persist every artefact of a clustering run under ``results_basedir/directory``.

    The result directory is first removed through ``fm.delete_directory``;
    then the overall results, the cluster contents, the most similar documents
    and the word clouds are written by the matching ``fm`` helpers.

    Args:
        result: Clustering result dict, as returned by ``clustering_kmedoids``.
        directory: Name of the export directory the result belongs to.
        results_basedir: Base directory of the results.
        collection_field: Collection field name forwarded to the cluster CSV
            and word-cloud writers.
        text_field_name: Text field name forwarded to the word-cloud writer.
        overall_results_filename: File name of the overall-results CSV.
    """
    result_directory = os.path.join(results_basedir, directory)

    fm.delete_directory(result_directory)

    # Save overall clustering results
    fm.overall_results_to_csv(
        results=result,
        dirpath=result_directory,
        filename=overall_results_filename,
        columns=["k", "iteraction", "sse"],
    )

    # Save cluster contents. The caller-supplied collection field is forwarded
    # so this writer and the word-cloud writer always agree on it.
    fm.clusters_to_csv(
        result=result,
        dirpath=result_directory,
        collection_field=collection_field,
        columns=["content", "clean_content", "cluster", "rating"],
    )

    # Save the clusters' most similar documents
    fm.most_similar_to_csv(result=result, dirpath=result_directory)

    # Save the clusters' wordclouds
    fm.clusters_to_wordcloud(
        result=result,
        dirpath=result_directory,
        collection_field=collection_field,
        text_field_name=text_field_name,
    )

In [None]:
def plot_graphs(results_basedir, directory, k_min=3, k_max=20):
    """Plot the graphs for the saved clustering results of one export directory.

    Plots the overall results once, then, for every K in ``[k_min, k_max]`` and
    every entry found in ``<results>/clustering/K_<k>``, plots the clusters'
    mean ratings and document counts into that entry's ``graphs`` folder.

    Args:
        results_basedir: Base directory of the results.
        directory: Name of the export directory whose results are plotted.
        k_min: Smallest K to plot (inclusive).
        k_max: Largest K to plot (inclusive).

    Raises:
        OSError: From ``os.listdir`` when a ``K_<k>`` directory is missing.
    """
    base_path = os.path.join(results_basedir, directory)

    gp.plot_overall_clustering_results(fm.overall_results_from_csv(base_path), base_path)

    cluster_frame = fm.clusters_dataframe_from_csv(base_path)

    for k in range(k_min, k_max + 1):
        k_path = os.path.join(base_path, 'clustering', 'K_' + str(k))

        for run_name in os.listdir(k_path):
            target = os.path.join(k_path, run_name, "graphs")

            gp.plot_clusters_mean_ratings(cluster_frame, k=k, dirpath=target)
            gp.plot_clusters_document_count(cluster_frame, k=k, dirpath=target)

## Application variables (mostly workspace directories)

Also defines the exports directories considered in the application execution

In [None]:
# Base directory: all data lives under ./data, relative to the working directory
data_basedir = os.path.join(os.getcwd(), "data")

# Raw exports and pre-processed (clean) exports
exports_basedir, clean_exports_basedir = (
    os.path.join(data_basedir, subdir) for subdir in ("exports", "clean_exports")
)

# Clustering results, one sub-directory per distance metric
results_jaccard_basedir, results_levenshtein_basedir = (
    os.path.join(data_basedir, "results", metric) for metric in ("jaccard", "levenshtein")
)

# Names of the directories that contain the exports
exports_directories = ["facebook", "telegram"]

## Reads the export directories, pre-processes the reviews, and saves them in the clean exports directory, one CSV file per export directory defined

In [None]:
# Load the raw exports of every configured export directory.
raw_exports = read_exports_directories(exports_directories, exports_basedir)

# Clean the review contents of each export.
clean_exports = pre_processing(raw_exports)

# Persist the cleaned exports as one CSV file per export directory.
save_clean_exports(clean_exports, exports_directories, clean_exports_basedir)

## Runs the K-medoids clustering algorithm using the Jaccard distance calculator

In [None]:
print("K-medoids with Jaccard distance started.")
# Reload the pre-processed exports from disk, so this cell does not depend on
# the in-memory output of the pre-processing cell.
exports = read_clean_exports_directories(exports_directories, clean_exports_basedir)

for directory, export in zip(exports_directories, exports):
    print("Directory: ", directory)
    # K from 3 to 80, 40 runs per K, 50 most similar documents per medoid.
    # NOTE(review): the class is named `JaccardCalculatorDistance` here while the
    # Levenshtein cell uses `LevenshteinCalculator` -- confirm both names exist
    # in minetext.clustering.distance.
    result = clustering_kmedoids(export, distance.JaccardCalculatorDistance(), k_min=3, k_max=80, iteractions=40, n=50)
    
    save_clustering_result(result, directory, results_jaccard_basedir)

## Runs the K-medoids clustering algorithm using the Levenshtein distance calculator

In [None]:
print("K-medoids with Levenshtein distance started.")
# Reload the pre-processed exports from disk, so this cell does not depend on
# the in-memory output of the pre-processing cell.
exports = read_clean_exports_directories(exports_directories, clean_exports_basedir)

for directory, export in zip(exports_directories, exports):
    print("Directory: ", directory)
    # K from 3 to 30, 20 runs per K, 50 most similar documents per medoid.
    result = clustering_kmedoids(export, distance.LevenshteinCalculator(), k_min=3, k_max=30, iteractions=20, n=50)
    
    save_clustering_result(result, directory, results_levenshtein_basedir)

In [None]:
# Each results directory is paired with the K range passed to clustering_kmedoids
# in the cell that produced it (Jaccard: 3-80, Levenshtein: 3-30).
for results_basedir, k_min, k_max in (
    (results_jaccard_basedir, 3, 80),
    (results_levenshtein_basedir, 3, 30),
):
    for directory in exports_directories:
        plot_graphs(results_basedir, directory, k_min=k_min, k_max=k_max)

In [None]:
# For every distance metric and export directory, reload the saved overall
# results and print the value computed by gp.calculate_mininal_difference.
for results_basedir in [results_jaccard_basedir, results_levenshtein_basedir]:
    for directory in exports_directories:
        result_dirpath = os.path.join(results_basedir, directory)
        
        overall_results = fm.overall_results_from_csv(result_dirpath)
    
        # NOTE(review): "mininal" is the helper's spelling in utils.graph_plotter;
        # renaming it here alone would break the call.
        print(gp.calculate_mininal_difference(overall_results))