In [None]:
import math
import os.path
import numpy as np
import matplotlib.pyplot as plt

import nbimporter

import sys
sys.path.append("..")

import utils.file_manager as fm

from IPython.display import display

from minetext.clustering import distance

In [None]:
def calculate_minimal_difference(overall_results, interval=5, max_err_increase=5):
    """Find the first run of consecutive results whose SSE has flattened out.

    A window of ``interval`` consecutive rows is slid over ``overall_results``.
    For each window the absolute differences between neighbouring ``"sse"``
    values are summed. The first window whose sum is strictly below
    ``max_err_increase * interval`` is returned; later windows are not
    examined, so despite the function name this is not a global minimum.

    Args:
        overall_results: Table that can be indexed by the ``"k"`` and
            ``"sse"`` keys and sliced by row position with ``[start:stop]``
            (presumably a pandas DataFrame -- confirm against the caller).
        interval: Number of consecutive rows in each window.
        max_err_increase: Tolerance per row; it is multiplied by ``interval``
            to obtain the threshold for the summed absolute differences.

    Returns:
        dict: ``"results"`` holds the matching slice of ``overall_results``
        (``None`` when no window qualifies) and ``"difference"`` holds that
        window's summed absolute SSE difference (``inf`` when none qualifies).
    """
    # Convert once so every window is sliced by position, not by index label.
    sse_values = np.asarray(overall_results["sse"])
    n_rows = len(overall_results["k"])
    threshold = max_err_increase * interval

    min_diff = {"results": None, "difference": float("inf")}

    # "+ 1" so the final window, which ends on the last row, is examined too.
    for start in range(0, n_rows - interval + 1):
        stop = start + interval

        difference = np.sum(np.absolute(np.diff(sse_values[start:stop])))

        if difference < threshold:
            min_diff["results"] = overall_results[start:stop]
            min_diff["difference"] = difference
            break

    return min_diff

In [None]:
def plot_overall_clustering_results(results, dirpath, filename_preffix="clustering_overall_results", xoffset=0, yoffset=0):
    """Save a line chart of SSE against the number of clusters (K) as a PNG.

    The image is written to ``<dirpath>/<filename_preffix>.png`` after
    ``fm.create_directory(dirpath)`` has been called.

    Args:
        results: Mapping/table exposing the ``"k"`` and ``"sse"`` sequences.
        dirpath: Directory that receives the image.
        filename_preffix: File name without the ``.png`` extension.
        xoffset: Added to the upper x limit and to the x tick range.
        yoffset: Lower y limit; also added to the upper y limit and tick range.
    """
    fm.create_directory(dirpath)
    target_path = os.path.join(dirpath, filename_preffix + ".png")

    axis_label_size = 16
    x_step, y_step = 3, 100

    # Round (max + 1) up to the next multiple of the tick step on each axis.
    x_top = math.ceil((max(results["k"]) + 1) / x_step) * x_step
    y_top = math.ceil((max(results["sse"]) + 1) / y_step) * y_step

    fig = plt.figure()
    fig.set_size_inches((14, 7))
    ax = fig.gca()

    ax.set_xlabel("Número de Clusters (K)", fontsize=axis_label_size)
    ax.set_ylabel("Soma dos Quadrados dos Erros (SSE)", fontsize=axis_label_size)

    ax.set_xlim([0, x_top + xoffset])
    ax.set_ylim([yoffset, y_top + yoffset])

    ax.set_xticks(np.arange(xoffset, x_top + xoffset, x_step))
    ax.set_yticks(np.arange(yoffset, y_top + yoffset, y_step))

    ax.plot(results["k"], results["sse"], '-o')
    ax.grid()

    fig.savefig(target_path)
    plt.close(fig)

In [None]:
def plot_clusters_mean_ratings(export, dirpath, k, filename_preffix="mean_rating", xoffset=0, yoffset=0, min_rating=1., max_rating=5.):
    """Print a per-rating summary of cluster means and save them as a bar chart.

    Rows of ``export`` whose ``'k'`` column equals ``k`` are grouped by
    ``'cluster'`` and the mean of ``'rating'`` is taken per cluster. For every
    integer rating from ``int(min_rating)`` to ``int(max_rating)`` the
    1-based ids of the clusters whose mean falls in ``[rating, rating + 1)``
    are printed (for a rating equal to ``max_rating`` the mean must equal it
    exactly), followed by their share of ``k`` rounded up to two decimals.
    The chart goes to ``<dirpath>/<filename_preffix>_k_<k>.png``.

    Args:
        export: DataFrame with at least ``'k'``, ``'cluster'`` and ``'rating'``.
        dirpath: Output directory, passed to ``fm.create_directory`` first.
        k: Number of clusters of the run to plot.
        filename_preffix: Leading part of the output file name.
        xoffset: Shift applied to bar positions, x ticks and the upper x limit.
        yoffset: Accepted for signature parity with sibling plots; unused here.
        min_rating: Lower bound of the rating scale (lower y limit).
        max_rating: Upper bound of the rating scale (upper y limit).
    """
    fm.create_directory(dirpath)
    target_path = os.path.join(dirpath, filename_preffix + "_k_" + str(k) + ".png")

    rows_for_k = export.loc[export['k'] == k]
    mean_groups = (
        rows_for_k.groupby(['k', 'cluster'])["rating"]
        .mean()
        .reset_index(name="mean_rating")
    )

    print("Plotting clusters mean values!")

    means = mean_groups['mean_rating']
    for rating in range(int(min_rating), int(max_rating) + 1):
        at_top = rating == max_rating
        if at_top:
            # The top of the scale has no bucket above it: exact match only.
            selected = means == rating
        else:
            selected = (means >= rating) & (means < rating + 1)

        # Cluster ids are reported 1-based.
        clusters = (mean_groups.loc[selected, 'cluster'] + 1).values.astype(int).tolist()

        if at_top:
            print("Clusters with mean ratings M, so that M =", rating, "are:", clusters)
        else:
            print("Clusters with mean ratings M, so that", rating, " <= M <", rating + 1, "are:", clusters)

        print("Porcentagem do total de clusters:", math.ceil((len(clusters) / k) * 100.0) / 100)

    axis_label_size = 16

    fig = plt.figure()
    fig.set_size_inches((16, 10))
    ax = fig.gca()

    ax.set_xlabel("Cluster", fontsize=axis_label_size)
    ax.set_ylabel("Média das Notas", fontsize=axis_label_size)

    ax.set_xlim([0, k + xoffset])
    ax.set_ylim([min_rating, max_rating])

    # The tick step grows with k, so the number of x ticks stays at roughly 25 or fewer.
    ax.set_xticks(np.arange(xoffset, k + xoffset, math.ceil(k / 25)))
    ax.set_yticks(np.arange(min_rating, max_rating + 0.5, 0.5))

    ax.bar(mean_groups['cluster'] + xoffset, means)

    fig.savefig(target_path)
    plt.close(fig)

In [None]:
def plot_clusters_document_count(export, k, dirpath, filename_preffix="document_count", xoffset=0, yoffset=0):
    """Save a bar chart with the number of documents assigned to each cluster.

    Rows of ``export`` whose ``'k'`` column equals ``k`` are grouped by
    ``'cluster'`` and the non-null entries of the ``'index'`` column are
    counted per group. The chart is written to
    ``<dirpath>/<filename_preffix>_k_<k>.png``.

    Args:
        export: DataFrame with at least ``'k'``, ``'cluster'`` and ``'index'``.
        k: Number of clusters of the run to plot.
        dirpath: Output directory, passed to ``fm.create_directory`` first.
        filename_preffix: Leading part of the output file name.
        xoffset: Shift applied to bar positions, x ticks and the upper x limit.
        yoffset: Lower y limit; also added to the upper y limit and y ticks.
    """
    fm.create_directory(dirpath)

    filename = filename_preffix + "_k_" + str(k) + ".png"
    filepath = os.path.join(dirpath, filename)

    k_clusters = export.loc[export['k'] == k]
    count_groups = k_clusters.groupby(['k', 'cluster'])["index"].count().reset_index(name="document_count")

    label_fontsize = 16

    xticks = 2
    yticks = 50
    # Round (max + 1) up to the next multiple of the tick step on each axis.
    max_x = math.ceil((max(count_groups["k"]) + 1) / xticks) * xticks
    max_y = math.ceil(max(count_groups["document_count"] + 1) / yticks) * yticks

    fig = plt.figure()
    fig.set_size_inches((16, 10))
    ax = fig.gca()

    ax.set_xlabel("Clusters", fontsize=label_fontsize)
    ax.set_ylabel("Número de Comentários", fontsize=label_fontsize)

    ax.set_xlim([0, max_x + xoffset])
    # The y axis is shifted by yoffset (not xoffset) so it agrees with the y ticks.
    ax.set_ylim([yoffset, max_y + yoffset])

    ax.set_xticks(np.arange(xoffset, max_x + xoffset, xticks))
    ax.set_yticks(np.arange(yoffset, max_y + yoffset, yticks))

    ax.bar(count_groups["cluster"] + xoffset, count_groups["document_count"])

    fig.savefig(filepath)
    plt.close(fig)

In [None]:
def plot_clusters_document_percentage(export, k, dirpath, filename_preffix="document_percentage", xoffset=0, yoffset=0):
    """Save a bar chart with the share of documents assigned to each cluster.

    Rows of ``export`` whose ``'k'`` column equals ``k`` are grouped by
    ``'cluster'``; the per-cluster count of non-null ``'index'`` entries is
    divided by the number of rows selected for ``k``. Bars therefore hold
    fractions in ``[0, 1]`` (the axis label says "%", but values are not
    multiplied by 100). The chart is written to
    ``<dirpath>/<filename_preffix>_k_<k>.png``.

    Args:
        export: DataFrame with at least ``'k'``, ``'cluster'`` and ``'index'``.
        k: Number of clusters of the run to plot.
        dirpath: Output directory, passed to ``fm.create_directory`` first.
        filename_preffix: Leading part of the output file name.
        xoffset: Shift applied to bar positions, x ticks and the upper x limit.
        yoffset: Lower y limit; also added to the upper y limit and y ticks.
    """
    fm.create_directory(dirpath)

    filename = filename_preffix + "_k_" + str(k) + ".png"
    filepath = os.path.join(dirpath, filename)

    k_clusters = export.loc[export['k'] == k]

    count_groups = k_clusters.groupby(['k', 'cluster'])["index"].count().reset_index(name="document_count")

    n_documents = len(k_clusters)
    fractions = count_groups["document_count"] / n_documents

    label_fontsize = 16

    xticks = 2
    yticks = 0.1
    max_x = math.ceil((max(count_groups["k"]) + 1) / xticks) * xticks
    # Scale the y axis from the plotted fractions, not the raw counts:
    # round the tallest bar up to the next multiple of the tick step.
    max_y = math.ceil(fractions.max() / yticks) * yticks

    fig = plt.figure()
    fig.set_size_inches((16, 10))
    ax = fig.gca()

    ax.set_xlabel("Clusters", fontsize=label_fontsize)
    ax.set_ylabel("Número de Comentários (%)", fontsize=label_fontsize)

    ax.set_xlim([0, max_x + xoffset])
    # The y axis is shifted by yoffset (not xoffset) so it agrees with the y ticks.
    ax.set_ylim([yoffset, max_y + yoffset])

    ax.set_xticks(np.arange(xoffset, max_x + xoffset, xticks))
    ax.set_yticks(np.arange(yoffset, max_y + yoffset, yticks))

    ax.bar(count_groups["cluster"] + xoffset, fractions)

    fig.savefig(filepath)
    plt.close(fig)

In [None]:
def plot_clusters_rating_frequencies(export, k, dirpath, filename_preffix="rating_frequencies", xoffset=0, yoffset=0):
    """Save one bar chart per cluster with the relative frequency of each rating.

    Rows of ``export`` whose ``'k'`` column equals ``k`` are grouped by
    ``('cluster', 'rating')`` and the non-null ``'index'`` entries are counted.
    For each cluster the counts are divided by that cluster's total, giving
    fractions in ``[0, 1]``, and written to
    ``<dirpath>/<filename_preffix>_cluster_<cluster_id>.png``.

    NOTE(review): unlike the sibling plots the file name does not contain
    ``k``, so runs for different ``k`` overwrite each other when they share
    ``dirpath``. Left unchanged because callers may rely on these paths.

    Args:
        export: DataFrame with at least ``'k'``, ``'cluster'``, ``'rating'``
            and ``'index'``.
        k: Number of clusters of the run to plot.
        dirpath: Output directory, passed to ``fm.create_directory`` first.
        filename_preffix: Leading part of every output file name.
        xoffset: Added to the x tick range and to the upper x limit.
        yoffset: Lower y limit; also added to the upper y limit and y ticks.
    """
    fm.create_directory(dirpath)

    k_clusters = export.loc[export['k'] == k]

    rating_groups = k_clusters.groupby(['cluster', 'rating'])['index'].count().reset_index(name="frequency")

    label_fontsize = 16

    xticks = 1
    yticks = 0.1
    max_x = math.ceil((max(rating_groups['rating']) + 1) / xticks) * xticks
    # Frequencies are fractions, so the y axis always spans the full 0..1 range.
    max_y = math.ceil(1 / yticks) * yticks

    # Axis geometry is identical for every cluster; compute it once.
    x_limits = [0, max_x + xoffset]
    # The y axis is shifted by yoffset (not xoffset) so it agrees with the y ticks.
    y_limits = [yoffset, max_y + yoffset]
    x_tick_positions = np.arange(xoffset, max_x + xoffset, xticks)
    y_tick_positions = np.arange(yoffset, max_y + yoffset, yticks)

    for cluster_id in rating_groups['cluster'].unique():
        cluster_ratings = rating_groups.loc[rating_groups['cluster'] == cluster_id]

        filename = filename_preffix + "_cluster_" + str(cluster_id) + ".png"
        filepath = os.path.join(dirpath, filename)

        n_documents = np.sum(cluster_ratings['frequency'])

        fig = plt.figure()
        fig.set_size_inches((10, 6))
        ax = fig.gca()

        ax.set_xlim(x_limits)
        ax.set_ylim(y_limits)

        ax.set_xticks(x_tick_positions)
        ax.set_yticks(y_tick_positions)

        ax.set_xlabel("Nota", fontsize=label_fontsize)
        ax.set_ylabel("Frequência (%)", fontsize=label_fontsize)

        ax.bar(cluster_ratings["rating"], cluster_ratings["frequency"] / n_documents)

        fig.savefig(filepath)
        # Close this specific figure so figures do not pile up across iterations.
        plt.close(fig)