In [18]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.cluster import DBSCAN
import json
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from itertools import product
import plotly.express as px

In [19]:
# Load the scraped activity data and the list of unique subreddit names.
# The files are only read, so they are opened read-only ('r' instead of 'r+')
# and inside `with` blocks so the handles are closed right after parsing.
with open('reddit_scrapper/data/scrapped_data.json','r') as scrapped_file:
    data = json.load(scrapped_file)
with open('reddit_scrapper/data/list_of_unique_subreddits.json','r') as subreddits_file:
    subreddit_names_list = json.load(subreddits_file)

# Two-way lookup between a subreddit name and its position in the list.
subreddit_index = {name: position for position, name in enumerate(subreddit_names_list)}
index_subreddit = dict(enumerate(subreddit_names_list))

In [20]:
def create_matrix(data,matrix_width,subreddit_index):
    """Build a dense (len(data), matrix_width) float matrix from nested dicts.

    Each value of `data` is a mapping of key -> value; the value is written
    into the row of that entry, at the column given by `subreddit_index[key]`.
    Cells that are never written stay at zero.

    Raises KeyError if a key is missing from `subreddit_index`.
    """
    matrix = np.zeros((len(data), matrix_width))
    # Iterating a 2-D array yields row views, so writes land in `matrix`.
    for row, activity in zip(matrix, data.values()):
        for name, count in activity.items():
            row[subreddit_index[name]] = count
    return matrix
#[DVC]
def filter_matrix(matrix,threshold,index_subreddit):
    """Drop rows and columns that never exceed `threshold`.

    A row (column) is kept when at least one of its cells is strictly greater
    than `threshold`. Cells of the surviving rows/columns keep their original
    values, including those at or below the threshold.

    Parameters
    ----------
    matrix : 2-D numpy array.
    threshold : scalar compared against every cell with `>`.
    index_subreddit : mapping passed to `DataFrame.rename(columns=...)`;
        the columns are the original column positions before renaming.

    Returns
    -------
    pandas.DataFrame with the surviving cells and renamed columns.
    """
    mask = matrix > threshold
    rows = mask.any(axis=1)
    columns = mask.any(axis=0)
    del mask
    data = matrix[np.ix_(rows,columns)]
    del rows
    # flatnonzero always returns a 1-D array. The previous
    # squeeze(argwhere(...)) collapsed to a 0-d array when exactly one column
    # survived, which pandas rejects as a column index.
    kept_columns = np.flatnonzero(columns)
    del columns
    df = pd.DataFrame(data,columns=kept_columns)
    del data
    # In-place rename on the freshly built local frame avoids an extra copy.
    df.rename(columns=index_subreddit,inplace=True)
    return df
def extract_most_popular_subreddits(df,lower_limit,upper_limit):
    """Keep only the columns ranked [lower_limit:upper_limit] by column sum.

    Columns are ranked by their total in descending order and the slice
    `[lower_limit:upper_limit]` of that ranking is selected. Rows whose
    selected cells sum to zero are removed, then duplicated rows are dropped.

    Returns a new DataFrame whose columns follow the ranking order.
    """
    ranked_totals = df.sum(axis=0).sort_values(ascending=False)
    most_popular_reddits = ranked_totals[lower_limit:upper_limit].index
    # Translate the selected column labels into positions in the frame.
    position_of = {label: position for position, label in enumerate(df.columns)}
    selected_positions = [position_of[label] for label in most_popular_reddits]
    selected = df.to_numpy()[:, selected_positions]
    # Boolean indexing copies only the rows with a non-zero total.
    non_empty = selected[selected.sum(axis=1) != 0]
    trimmed = pd.DataFrame(non_empty,columns=most_popular_reddits)
    return trimmed.drop_duplicates()


In [21]:
# One row per entry of `data`, one column per known subreddit name.
matrix = create_matrix(
    data,
    matrix_width=len(subreddit_names_list),
    subreddit_index=subreddit_index,
)

In [22]:
# Keep only rows/columns with at least one cell above the threshold (5),
# then release the dense matrix, which is no longer needed.
df = filter_matrix(matrix, threshold=5, index_subreddit=index_subreddit)
del matrix

In [23]:
# Bounds of the popularity ranking slice handed to
# extract_most_popular_subreddits.
lower_limit = 3
upper_limit = 2000  # choose number of most popular subreddits
df = extract_most_popular_subreddits(
    df,
    lower_limit=lower_limit,
    upper_limit=upper_limit,
)

In [24]:
# Embed the filtered activity table with t-SNE.
n_components = 3  # number of dimensions to project the data onto
tsne = TSNE(
    n_components=n_components,
    random_state=69,  # fixed seed so the embedding is repeatable
    n_jobs=-1,
)
X_tsne = tsne.fit_transform(df)

KeyboardInterrupt: 

In [None]:
# Grid-search DBSCAN parameters on the t-SNE embedding. Every (eps, min_samples)
# pair is scored with the mean silhouette coefficient and collected in
# `results` as (n_labels, eps, min_samples, silhouette).
eps_list = list(np.around(np.arange(2, 7.0, 0.1),2))
lower_min_samples = 3
higher_min_samples = 20
min_samples_list = list(np.arange(lower_min_samples, higher_min_samples, 1))
pairs = list(product(eps_list,min_samples_list))
n = 2 # early skip: number of identical consecutive scores treated as a plateau
results = []
last_n_scores = []
progress_bar_value = 0
with tqdm(total=100, desc="Percentage done") as pbar:
    for i in range(len(pairs)):
        # `pairs` is shortened inside the loop while the range was built from
        # its initial length, so stop once the index runs past the list.
        if i >= len(pairs):
            break
        eps,min_samples = pairs[i]
        cluster_labels = DBSCAN(eps=eps, min_samples=min_samples,n_jobs=-1).fit_predict(X_tsne)
        n_labels = np.unique(cluster_labels).shape[0]
        if 2 <= n_labels <= len(cluster_labels) - 1:
            silhouette_avg = silhouette_score(X_tsne, cluster_labels)
            last_n_scores.append(silhouette_avg)
            if len(last_n_scores)>n and np.unique(last_n_scores[-n:]).shape[0] == 1:
                # The score stopped changing: drop `skip-1` pairs from the
                # front of the list so the next index lands `skip` pairs
                # further along the original grid, unless that would leave
                # too few pairs.
                last_n_scores = []
                skip = higher_min_samples-int(min_samples)
                if len(pairs)-skip+1 > 2*len(min_samples_list):
                    pairs = pairs[skip-1:]
        else:
            # silhouette_score raises ValueError unless there are between 2
            # and n_samples-1 distinct labels; record the pair with an
            # undefined score instead of aborting the whole search, and keep
            # it out of the plateau detection.
            silhouette_avg = np.nan
        results.append((n_labels,eps,min_samples,silhouette_avg))
        pbar.update((i/len(pairs) - progress_bar_value)*100)
        progress_bar_value = i/len(pairs)
    # Top the bar up to its total. `progress_bar_value` is a 0-1 fraction, so
    # subtracting it from 100 would overshoot the bar.
    pbar.update(max(0, pbar.total - pbar.n))


In [None]:
# Tabulate the grid-search records, best silhouette first, and save them.
results = (
    pd.DataFrame(results, columns=['n_clusters','eps','min_samples','silhouette'])
    .sort_values(by='silhouette', ascending=False)
)
results.to_csv('tsne_params.csv')