In [None]:
import math
import os.path
import numpy as np
import matplotlib.pyplot as plt

import nbimporter

import sys
sys.path.append("..")

import utils.file_manager as fm

from minetext.clustering import distance

In [None]:
def calculate_mininal_difference(overall_results):
    """Find the pair of consecutive entries whose SSE values differ the least.

    Parameters
    ----------
    overall_results : mapping
        Must provide ``"k"`` and ``"sse"`` entries that are position-aligned,
        indexable and sliceable sequences of the same length. Only adjacent
        positions ``(i - 1, i)`` are compared.

    Returns
    -------
    dict
        Always contains the keys ``"k1"``, ``"sse1"``, ``"k2"``, ``"sse2"``
        and ``"difference"``. ``"difference"`` is the absolute SSE gap of the
        winning pair. When fewer than two entries are supplied there is no
        pair to compare: the four k/sse values are ``None`` and
        ``"difference"`` is ``float("inf")``.
    """
    k_values = overall_results["k"]
    sse_values = overall_results["sse"]

    # Sentinel result. Every key is initialised so that callers can read
    # "sse2" even when the loop below never runs.
    min_diff = {
        "k1": None,
        "sse1": None,
        "k2": None,
        "sse2": None,
        "difference": float("inf"),
    }

    for i in range(1, len(k_values)):
        difference = np.absolute(np.diff(sse_values[i - 1:i + 1]))[0]

        # Strict comparison: on ties the earliest pair is kept.
        if difference < min_diff["difference"]:
            min_diff = {
                "k1": k_values[i - 1],
                "sse1": sse_values[i - 1],
                "k2": k_values[i],
                "sse2": sse_values[i],
                "difference": difference,
            }

    return min_diff

In [None]:
def plot_overall_clustering_results(results, dirpath, filename_preffix="clustering_overall_results"):
    """Save a scatter plot of SSE against the number of clusters as a PNG.

    Parameters
    ----------
    results : mapping
        Provides the ``"k"`` (x axis) and ``"sse"`` (y axis) values to plot.
    dirpath : str
        Directory receiving the image. It is passed to
        ``fm.create_directory`` first (presumably to make sure it exists --
        confirm against ``utils.file_manager``).
    filename_preffix : str
        File name without extension; ``".png"`` is appended.

    Returns
    -------
    None
        The figure is written to disk and then closed.
    """
    axis_label_size = 16

    fm.create_directory(dirpath)
    output_path = os.path.join(dirpath, filename_preffix + ".png")

    fig = plt.figure()
    fig.set_size_inches((14, 7))
    axes = fig.gca()

    # The axis captions are Portuguese on purpose; they are part of the output.
    axes.set_xlabel("Número de Clusters (K)", fontsize=axis_label_size)
    axes.set_ylabel("Soma dos Quadrados dos Erros (SSE)", fontsize=axis_label_size)

    axes.scatter(results["k"], results["sse"])
    axes.grid()

    fig.savefig(output_path)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [None]:
def plot_clusters_mean_ratings(export, dirpath, k, filename_preffix="mean_rating", xoffset=0, yoffset=0, min_rating=0, max_rating=5):
    """Save a bar chart of the mean rating of every cluster for a given ``k``.

    Parameters
    ----------
    export : DataFrame-like
        Must expose the columns ``"k"``, ``"cluster"`` and ``"rating"``; it is
        filtered with ``.loc`` and aggregated with ``.groupby``.
    dirpath : str
        Directory receiving the image; handed to ``fm.create_directory``.
    k : int
        Value of the ``"k"`` column to select. Also sizes the x axis.
    filename_preffix : str
        The file is named ``<filename_preffix>_k_<k>.png``.
    xoffset : number
        Shift applied to the bar positions, the x limits and the x ticks.
    yoffset : number
        Accepted but not used by this function.
    min_rating, max_rating : number
        Limits of the y axis; y ticks are placed every 0.5 between them.

    Returns
    -------
    None
        The figure is written to disk and then closed.
    """
    fm.create_directory(dirpath)
    output_path = os.path.join(dirpath, filename_preffix + "_k_" + str(k) + ".png")

    selected_rows = export.loc[export["k"] == k]
    rating_by_cluster = (
        selected_rows
        .groupby(["k", "cluster"])["rating"]
        .mean()
        .reset_index(name="mean_rating")
    )

    # At most ~25 x ticks regardless of how many clusters there are.
    tick_step = math.ceil(k / 25)

    fig = plt.figure()
    fig.set_size_inches((14, 7))
    axes = fig.gca()

    axes.set_xlabel("Clusters")
    axes.set_ylabel("Mean ratings")
    axes.set_xlim([xoffset - 1, k + xoffset])
    axes.set_ylim([min_rating, max_rating])
    axes.set_xticks(np.arange(xoffset, k + xoffset, tick_step))
    axes.set_yticks(np.arange(min_rating, max_rating + 0.5, 0.5))

    bar_positions = rating_by_cluster["cluster"] + xoffset
    axes.bar(bar_positions, rating_by_cluster["mean_rating"])

    fig.savefig(output_path)
    plt.close(fig)

In [None]:
def plot_clusters_document_count(export, k, dirpath, filename_preffix="document_count", xoffset=0, yoffset=0):
    """Save a bar chart of the number of documents in every cluster for a given ``k``.

    Note that the positional order here is ``(export, k, dirpath)``, which
    differs from ``plot_clusters_mean_ratings`` (``export, dirpath, k``).

    Parameters
    ----------
    export : DataFrame-like
        Must expose the columns ``"k"``, ``"cluster"`` and ``"index"``; rows
        are counted through the ``"index"`` column after grouping.
    k : int
        Value of the ``"k"`` column to select. Also sizes the x axis.
    dirpath : str
        Directory receiving the image; handed to ``fm.create_directory``.
    filename_preffix : str
        The file is named ``<filename_preffix>_k_<k>.png``.
    xoffset : number
        Shift applied to the bar positions, the x limits and the x ticks.
    yoffset : number
        Accepted but not used by this function.

    Returns
    -------
    None
        The figure is written to disk and then closed.
    """
    fm.create_directory(dirpath)

    filename = filename_preffix + "_k_" + str(k) + ".png"
    filepath = os.path.join(dirpath, filename)

    k_clusters = export.loc[export["k"] == k]
    count_groups = (
        k_clusters
        .groupby(["k", "cluster"])["index"]
        .count()
        .reset_index(name="document_count")
    )

    fig = plt.figure()
    fig.set_size_inches((14, 7))
    axes = fig.gca()

    axes.set_xlabel("Clusters")
    axes.set_ylabel("Document count")
    axes.set_xlim([xoffset - 1, k + xoffset])
    # At most ~25 x ticks regardless of how many clusters there are.
    axes.set_xticks(np.arange(xoffset, k + xoffset, math.ceil(k / 25)))

    # Shift the bars by the same offset as the limits and ticks; otherwise a
    # non-zero xoffset leaves the bars misaligned with the axis.
    axes.bar(count_groups["cluster"] + xoffset, count_groups["document_count"])

    fig.savefig(filepath)
    plt.close(fig)