In [None]:
import os
import copy

import nbimporter

import utils.file_manager as fm
import utils.pre_processing as ppc

import minetext.clustering.distance as distance
import minetext.clustering.kmedoids as kmedoids

In [None]:
# Root directory for all data handled by this notebook, resolved against the
# current working directory. `os.getcwd()` is the stdlib call for that; the
# previous `os.cwd()` does not exist and raised AttributeError on a fresh kernel.
data_basedir = os.path.join(os.getcwd(), "data")

# Sub-directories for the raw exports, the pre-processed exports and the results
exports_basedir = os.path.join(data_basedir, "exports")
clean_exports_basedir = os.path.join(data_basedir, "clean_exports")
results_basedir = os.path.join(data_basedir, "results")

# Set up the directory tree through the project's file-manager helper
fm.create_directory(data_basedir)
fm.create_directory(exports_basedir)
fm.create_directory(clean_exports_basedir)
fm.create_directory(results_basedir)

# Names of the directories that contain the exports
exports_directories = ["facebook", "telegram"]

# Export columns to be considered
columns = ["Content", "CleanContent", "Source", "Rating"]

# Distance calculator handed to every k-medoids run
jaccard_calculator = distance.JaccardCalculatorDistance()

In [None]:
# Pre-process the raw exports of every source and store the cleaned version.
for directory in exports_directories:
    export_dirpath = os.path.join(exports_basedir, directory)
    filename = f"{directory}.csv"

    # Read the exports found in this source's directory
    export = fm.read_csv_directory(dirpath=export_dirpath, header=1)

    # Pre-process the content of the exports
    clean_export = ppc.clean_reviews(export=export)

    # Save the pre-processed exports to a .csv file
    fm.data_to_csv(
        data=clean_export,
        dirpath=clean_exports_basedir,
        filename=filename,
        columns=columns,
    )

In [None]:
def tuple_to_dict(tuple_t):
    """Build a review dict from a row object.

    Args:
        tuple_t: object exposing the attributes ``Content``, ``CleanContent``,
            ``Source`` and ``Rating`` (e.g. a named tuple for a dataframe row).

    Returns:
        dict: the four attribute values stored under keys of the same name.
    """
    return {
        "Content": tuple_t.Content,
        "CleanContent": tuple_t.CleanContent,
        "Source": tuple_t.Source,
        "Rating": tuple_t.Rating,
    }

In [None]:
# Cluster the cleaned reviews of every source with k-medoids and export the results.
for directory in exports_directories:
    clean_exports_filename = f"{directory}.csv"
    clean_export = fm.read_csv_file(
        dirpath=clean_exports_basedir,
        filename=clean_exports_filename,
    )

    # Dict holding the reviews as documents to be clustered; every dataframe
    # row (yielded as a tuple by itertuples) becomes one review dict.
    reviews = {
        "reviews": [tuple_to_dict(row) for row in clean_export.itertuples()]
    }

    # Parallel lists: position n of each list describes the n-th run
    results = {"k": [], "iteractions": [], "sse": [], "clusters": []}

    for k in range(2, 16):  # k values 2..15
        for iteration in range(1, 6):  # runs 1..5 for the same k
            # Every run receives its own shallow copy of the document list
            documents = list(reviews["reviews"])

            kmedoids_instance = kmedoids.Kmedoids(
                k=k,
                documents=documents,
                distance_calculator=jaccard_calculator,
                collection_field="reviews",
                text_field_name="CleanContent",
            )
            kmedoids_instance.clustering()

            results["k"].append(k)
            results["sse"].append(kmedoids_instance.calculate_sse())
            results["iteractions"].append(iteration)
            results["clusters"].append(kmedoids_instance.get_clusters())

    results_dirpath = os.path.join(results_basedir, directory)
    results_filename = "overall_results.csv"

    # Summary of all runs, followed by the clusters themselves
    fm.data_to_csv(
        data=results,
        filename=results_filename,
        dirpath=results_dirpath,
        columns=["k", "sse", "iteractions"],
    )
    fm.clusters_to_csv(
        results=results,
        dirpath=results_dirpath,
        collection_field="reviews",
    )