# Model Results

This notebook runs KMeans clustering over a range of k-values and bias amounts, then saves the per-run results and the average Silhouette score per k-value for further analysis.

## Load Data

In [1]:
import warnings
# Ignore all warnings raised from here on (no category or module filter is given).
# NOTE(review): presumably done to keep cell output readable -- this also hides
# deprecation and convergence warnings, so confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from Biased_Clusters import get_silhouette

In [3]:
# Read the cleaned dataset from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/data_cleaned.csv')

In [4]:
import json
import scipy.sparse

# Training data array that is handed to the clustering runs.
x_vector = np.load(file='data/x_vector.npy')

# Term labels: the file has a .txt extension but its content is parsed as JSON.
with open("data/terms_label.txt", "r") as label_file:
    terms_label = json.load(label_file)

# Sparse term matrix stored in SciPy's .npz format.
terms_sparse_matrix = scipy.sparse.load_npz(file='data/terms_sparse_matrix.npz')

## Run KMeans Models on Various k-values and Bias Amounts

In [5]:
def build_result_data(df, x_vector, n_clusters, max_range=1000, step=10):
    """Run ``get_silhouette`` over a sweep of bias amounts for one k-value.

    The bias amount takes the values ``i * 0.01`` for every ``i`` in
    ``range(1, max_range, step)``.

    Parameters
    ----------
    df
        Passed through unchanged to ``get_silhouette`` (the notebook passes
        the cleaned-data DataFrame).
    x_vector
        Passed through unchanged to ``get_silhouette`` (the notebook passes
        the loaded training array).
    n_clusters : int
        Number of clusters (k), forwarded to ``get_silhouette``.
    max_range : int, default 1000
        Exclusive upper bound of the integer sweep, i.e. every bias amount is
        below ``max_range * 0.01``.
    step : int, default 10
        Stride of the integer sweep. The default reproduces the stride that
        used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per bias amount for which ``get_silhouette`` returned, in
        sweep order. The frame is empty (no columns) when every run was
        skipped.
    """
    rows = []
    skipped = []
    for i in tqdm(range(1, max_range, step)):
        bias = i * .01
        try:
            rows.append(get_silhouette(df, x_vector, bias, n_clusters))
        except ValueError:
            # Best-effort sweep: a bias amount that makes get_silhouette raise
            # ValueError is left out of the result, but remembered so the
            # omission is visible instead of silent.
            skipped.append(bias)

    if skipped:
        # Averages computed from the returned frame only cover the surviving
        # runs, so tell the reader how many were dropped.
        print('k={}: skipped {} of {} bias amounts (ValueError)'.format(
            n_clusters, len(skipped), len(skipped) + len(rows)))

    # create a data frame of result
    return pd.DataFrame(rows)

In [6]:
from tqdm import tqdm

# Per-k outputs of the sweep, kept as parallel lists (same index = same k-value)
result_list = []          # a list of result data frames, one per k-value
k_values = []             # a list of k-values
avg_sil_scores = []       # a list of average Silhouette score per k-value

# run KMeans model on 26 different k-values (k = 5, ..., 30)
for n_clusters in range(5, 31):
    # get model result and save to a list
    df_result = build_result_data(df, x_vector, n_clusters, 2000)
    result_list.append(df_result)

    # compute average Silhouette score for each k value
    k_values.append(n_clusters)
    if 'Silhouette Score' in df_result.columns:
        avg_sil_scores.append(df_result['Silhouette Score'].mean())
    else:
        # Every run for this k was skipped, so the frame has no columns.
        # Record NaN instead of raising KeyError and losing the results
        # already collected for the other k-values.
        avg_sil_scores.append(float('nan'))

100%|██████████| 200/200 [01:47<00:00,  1.86it/s]
100%|██████████| 200/200 [01:53<00:00,  1.76it/s]
100%|██████████| 200/200 [01:58<00:00,  1.69it/s]
100%|██████████| 200/200 [02:00<00:00,  1.65it/s]
100%|██████████| 200/200 [02:04<00:00,  1.61it/s]
100%|██████████| 200/200 [02:13<00:00,  1.50it/s]
100%|██████████| 200/200 [02:45<00:00,  1.21it/s]
100%|██████████| 200/200 [03:18<00:00,  1.01it/s]
100%|██████████| 200/200 [02:44<00:00,  1.22it/s]
100%|██████████| 200/200 [02:48<00:00,  1.19it/s]
100%|██████████| 200/200 [02:52<00:00,  1.16it/s]
100%|██████████| 200/200 [02:55<00:00,  1.14it/s]
100%|██████████| 200/200 [03:33<00:00,  1.07s/it]
100%|██████████| 200/200 [03:07<00:00,  1.06it/s]
100%|██████████| 200/200 [03:14<00:00,  1.03it/s]
100%|██████████| 200/200 [03:19<00:00,  1.00it/s]
100%|██████████| 200/200 [04:11<00:00,  1.26s/it]
100%|██████████| 200/200 [03:45<00:00,  1.13s/it]
100%|██████████| 200/200 [04:35<00:00,  1.38s/it]
100%|██████████| 200/200 [04:05<00:00,  1.23s/it]


## Save Results

In [7]:
import os

# save the result data frames to csv files, one file per k-value
# to_csv does not create missing directories, so make sure the target exists
# before writing anything.
os.makedirs('results', exist_ok=True)

# Pair each frame with the k-value it was computed for instead of repeating
# the sweep range and indexing by offset, so the file names cannot drift out
# of sync with the data if the range of k-values changes.
for k, k_result in zip(k_values, result_list):
    k_result.to_csv('results/results_' + str(k) + '.csv', index=False)

In [8]:
# Tabulate the average Silhouette score against its k-value and write it to csv.
df_avg_sil = pd.DataFrame(
    data={
        'Number of Topics': k_values,
        'Average Silhouette Score': avg_sil_scores,
    }
)
df_avg_sil.to_csv('results/avg_sil_per_k.csv', index=False)