In [37]:
import json
import numpy as np
import pandas as pd
import plotly.express as px
import mlxtend as mlx
from tqdm.notebook import tqdm, trange
from itertools import chain,product

In [38]:
def read_json(path):
    """Count comments per author and subreddit from a JSON-lines file.

    Every line of the file is parsed as one JSON document that must provide
    the keys ``'author'`` and ``'subreddit'``. Lines that are not valid JSON
    are skipped.

    Parameters
    ----------
    path : str or path-like
        UTF-8 encoded file with one JSON document per line.

    Returns
    -------
    dict
        ``{author: {subreddit: number_of_lines}}``.

    Raises
    ------
    KeyError
        If a parsed record has no ``'author'`` or ``'subreddit'`` key.
    """
    data = {}
    # Read-only mode is sufficient; the context manager closes the handle
    # even when a record raises.
    with open(path, 'r', encoding='utf-8') as file:
        # Stream line by line instead of materialising the whole file.
        for line in file:
            try:
                dic = json.loads(line)
            except json.JSONDecodeError:
                continue
            per_author = data.setdefault(dic['author'], {})
            subreddit = dic['subreddit']
            per_author[subreddit] = per_author.get(subreddit, 0) + 1
    return data

In [39]:
def jsonKeys2int(x):
    """Return a new dict with the keys of ``x`` cast to ``int``.

    Only the top-level keys are converted; values are passed through as they
    are. Anything that is not a dict is returned unchanged.
    """
    if not isinstance(x, dict):
        return x
    converted = {}
    for key, value in x.items():
        converted[int(key)] = value
    return converted

In [40]:
#[DVC]
# Load the scraped data and the list of known subreddit names, then build the
# name -> column index and column index -> name lookup tables.
# Both files are only read, so they are opened read-only inside context
# managers, which also guarantees the handles are closed.
with open('reddit_scrapper/data/scrapped_data.json', 'r') as data_file:
    data = json.load(data_file)
with open('reddit_scrapper/data/list_of_unique_subreddits.json', 'r') as names_file:
    subreddit_names_list = json.load(names_file)
subreddit_index = {name: position for position, name in enumerate(subreddit_names_list)}
index_subreddit = dict(enumerate(subreddit_names_list))

In [41]:
#[DVC]
def create_matrix(data,matrix_width,subreddit_index):
    """Build a dense count matrix with one row per entry of ``data``.

    Row ``r`` belongs to the ``r``-th value of ``data`` (iteration order).
    Each value is a mapping ``{subreddit: count}``; the count is written to
    the column given by ``subreddit_index[subreddit]``. All other cells stay
    zero. The result has shape ``(len(data), matrix_width)``.
    """
    counts = np.zeros((len(data), matrix_width))
    for row, activity in enumerate(data.values()):
        if not activity:
            continue
        # Resolve all column positions of this row first, then write them at once.
        target_columns = [subreddit_index[subreddit] for subreddit in activity]
        counts[row, target_columns] = list(activity.values())
    return counts

In [42]:
def update_dictionaries(new_redditors,subreddit_index,index_subreddit,subreddit_names_list):
    """Register subreddits from ``new_redditors`` that are not known yet.

    A subreddit counts as known when it is present in
    ``subreddit_names_list``. Every unknown name gets the next free position
    in ``subreddit_index`` (name -> position) and ``index_subreddit``
    (position -> name) and is appended to ``subreddit_names_list``.

    All three containers are modified in place and returned as a tuple
    ``(subreddit_index, index_subreddit, subreddit_names_list)``.
    """
    # Set mirror of the list: constant-time membership tests instead of
    # scanning the list for every subreddit of every redditor.
    known = set(subreddit_names_list)
    for redditor in new_redditors.values():
        for name in redditor:
            if name in known:
                continue
            subreddit_index[name] = len(subreddit_index)
            index_subreddit[len(index_subreddit)] = name
            subreddit_names_list.append(name)
            known.add(name)
    return subreddit_index,index_subreddit,subreddit_names_list

In [43]:
def resize_matrix_width(matrix,subreddit_index):
    """Append zero columns so the matrix has ``len(subreddit_index)`` columns.

    When the widths already agree the input object itself is returned;
    otherwise a new array made of the original columns followed by the zero
    columns is returned.
    """
    missing_columns = len(subreddit_index) - matrix.shape[1]
    if missing_columns == 0:
        return matrix
    padding = np.zeros((matrix.shape[0], missing_columns))
    return np.hstack((matrix, padding))

In [44]:
def update(matrix,new_redditors,subreddit_index,index_subreddit,subreddit_names_list):
    """Merge newly arrived redditors into the main matrix.

    Steps: register unseen subreddits in the lookup structures, widen the
    existing matrix to the new number of subreddits, build the rows for the
    new redditors and stack them below the existing rows.

    Returns ``(matrix, subreddit_index, index_subreddit, subreddit_names_list)``.
    """
    lookups = update_dictionaries(
        new_redditors, subreddit_index, index_subreddit, subreddit_names_list
    )
    subreddit_index, index_subreddit, subreddit_names_list = lookups
    widened = resize_matrix_width(matrix, subreddit_index)
    recent_data = create_matrix(new_redditors, widened.shape[1], subreddit_index)
    combined = np.vstack((widened, recent_data))
    return combined, subreddit_index, index_subreddit, subreddit_names_list

### Create matrix

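Remove rows and columns in which every cell contains a value less than or equal to the threshold (5). In other words, when a redditor has commented fewer than 6 times in a given subreddit, we consider that activity negligible and treat it as zero when deciding what to keep. If a redditor is below the threshold in all subreddits (an all-0/False vector), we remove them from the data. The same applies to subreddits (columns). Note: `filter_matrix` only drops such rows and columns; counts at or below the threshold in the remaining rows and columns are kept unchanged.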

In [45]:
#[DVC]
def filter_matrix(matrix,threshold,index_subreddit):
    """Drop rows and columns that never exceed ``threshold``.

    A row (column) is kept when at least one of its cells is strictly greater
    than ``threshold``. Cells of the surviving rows and columns keep their
    original values, including values at or below the threshold.

    Parameters
    ----------
    matrix : numpy.ndarray
        2-D count matrix.
    threshold : number
        Cells must be strictly greater than this value to count as activity.
    index_subreddit : dict
        Maps original column positions to the names used as column labels.

    Returns
    -------
    pandas.DataFrame
        The filtered matrix; columns are renamed through ``index_subreddit``.
    """
    mask = matrix > threshold
    rows = mask.any(axis=1)
    columns = mask.any(axis=0)
    del mask
    data = matrix[np.ix_(rows, columns)]
    # flatnonzero always yields a 1-D array of kept column positions, also
    # when exactly one (or no) column survives.
    df = pd.DataFrame(data, columns=np.flatnonzero(columns))
    del data, rows, columns
    df.rename(columns=index_subreddit, inplace=True)
    return df

In [46]:
#[DVC]
def extract_most_popular_subreddits(df,lower_limit,upper_limit,clear_zero_rows=True):
    """Keep the subreddits ranked ``[lower_limit, upper_limit)`` by total count.

    Columns are ranked by their column sum in descending order and the slice
    ``lower_limit:upper_limit`` of that ranking is selected, in ranking order.

    Parameters
    ----------
    df : pandas.DataFrame
        Users in rows, subreddits in columns.
    lower_limit, upper_limit : int
        Slice bounds applied to the ranking (0 is the most popular subreddit).
    clear_zero_rows : bool, default True
        When True, rows whose selected cells sum to zero are removed and the
        result gets a fresh positional index. When False, all rows are kept
        together with the index of ``df``.

    Returns
    -------
    pandas.DataFrame
        The selected columns with duplicate rows removed.
    """
    ranking = df.sum(axis=0).sort_values(ascending=False)
    most_popular_reddits = ranking.iloc[lower_limit:upper_limit].index
    column_base_order = dict(zip(df.columns, range(len(df.columns))))
    column_indexes = [column_base_order[name] for name in most_popular_reddits]
    if not clear_zero_rows:
        # Plain column selection: no row is removed for being empty.
        return df.iloc[:, column_indexes].drop_duplicates()
    X_np = df.to_numpy()[:, column_indexes]
    X_np = X_np[X_np.sum(axis=1) != 0]
    return pd.DataFrame(X_np, columns=most_popular_reddits).drop_duplicates()

In [52]:
#[DVC]
# Build the dense count matrix: one row per entry of `data`, one column per known subreddit.
matrix = create_matrix(data,len(subreddit_names_list),subreddit_index)

In [48]:
#[DVC]
# Keep only rows/columns with at least one count above 5, then free the dense matrix.
df = filter_matrix(matrix,5,index_subreddit)
del matrix

In [49]:
#[DVC]
# 0/1 indicator version of `df` (1 where the count is non-zero); used by the apriori cell.
df_bool = df.astype(bool).astype(int)

In [50]:
#[DVC]
# Subreddits are ranked by total count and ranks lower_limit..upper_limit-1 are kept.
upper_limit = 2000 ##  Choose number of most popular reddits
lower_limit = 3
# NOTE: `df` is overwritten, so re-running this cell works on the already narrowed frame.
df = extract_most_popular_subreddits(df,lower_limit,upper_limit)

In [56]:
# Persist the frame. The row index is written as well and shows up as an
# 'Unnamed: 0' column when the file is read back.
df.to_csv('df.csv')

In [57]:
# Read the file back to inspect what was written (the result is only displayed, not stored).
pd.read_csv('df.csv')

Unnamed: 0.1,Unnamed: 0,memes,politics,CryptoCurrency,nba,soccer,teenagers,AmItheAsshole,amcstock,PublicFreakout,...,adhdmeme,WANDAVISION,peopleofwalmart,PhoenixSC,palegirls,popheadscirclejerk,aspiememes,help,boburnham,BrandNewSentence
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29471,30436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29472,30437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29473,30438,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29474,30439,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
# Drop the last two rows of `df`.
# NOTE(review): the reason is not visible in this notebook — confirm it is still needed.
df = df.iloc[:-2,:]

Unnamed: 0,memes,politics,CryptoCurrency,nba,soccer,teenagers,AmItheAsshole,amcstock,PublicFreakout,news,...,adhdmeme,WANDAVISION,peopleofwalmart,PhoenixSC,palegirls,popheadscirclejerk,aspiememes,help,boburnham,BrandNewSentence
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30435,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30436,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30438,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Clustering

In [15]:
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.manifold import TSNE
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from itertools import product
import plotly.express as px
from tqdm.notebook import tqdm

In [29]:
#[DVC]
# Project `df` onto n_components dimensions with t-SNE; the seed is fixed for repeatability.
n_components = 3 ## Choose number of dimensions to project the data onto
tsne =  TSNE(n_components=n_components,n_jobs=-1,random_state=69)
X_tsne = tsne.fit_transform(df)

NameError: name 'df' is not defined

In [32]:
# Load a previously saved embedding instead of recomputing it.
# presumably created with lower_limit=3 and upper_limit=2000 (going by the file name) — TODO confirm
X_tsne=pd.read_csv('X_tsneL=3U=2000.csv')

In [33]:
# Remove the 'Unnamed: 0' column that came from the CSV (in place).
X_tsne.drop(columns=['Unnamed: 0'],inplace=True)

In [23]:
#[DVC]
# Density-based clustering of the embedded points; the label -1 marks points DBSCAN treats as noise.
clustering = DBSCAN(eps=2, min_samples=8,n_jobs=-1).fit(X_tsne)

In [24]:
#[DVC]
# Name the three embedding columns and attach every row's cluster label as a string.
X_tsne = pd.DataFrame(X_tsne,columns=['component1','component2','component3'])
X_tsne['clustering'] = clustering.labels_
X_tsne['clustering'] = X_tsne['clustering'].astype(str)

### Dictionary of subreddits and the number of their occurrences, divided into clusters

In [25]:
# Cluster sizes: number of rows per cluster label.
X_tsne['clustering'].value_counts()

-1     3281
11     2630
17     1066
20      679
22      585
       ... 
68        8
387       8
702       7
747       7
753       6
Name: clustering, Length: 756, dtype: int64

In [26]:
# X_tsne = X_tsne[X_tsne['clustering'] !='-1']

In [27]:
#[DVC]
# For every cluster, sum the per-subreddit counts of its rows and keep the subreddits
# with a positive total: {cluster label: {subreddit: total}}.
clustered_useres_dicts = {}
# NOTE: this adds a 'clustering' column to `df` in place; it stays there for later cells.
df.loc[:,'clustering'] = clustering.labels_
clustered_users = df.groupby(by=df['clustering']).sum()
clustered_users_matrix = clustered_users.to_numpy().astype(int)
for i in range(clustered_users.to_numpy().shape[0]):
    # Boolean mask of the subreddits with a positive total in cluster row i.
    mask = np.where(clustered_users_matrix[i,:] >0,True,False)
    clustered_useres_dicts[clustered_users.iloc[i].name] = \
    dict(zip(clustered_users.columns[mask],clustered_users_matrix[i,:][mask]))

NameError: name 'df' is not defined

t-SNE will not work on data that was not used during clustering. We assume this is a process that is run once in a while and that ultimately, e.g. once a week, shows where people similar to "me" are posting.

In [61]:
# Print, for every cluster, the five subreddits with the highest totals.
for i in clustered_useres_dicts:
    x = clustered_useres_dicts[i]
    print(sorted(x.items(),key=lambda item: item[1],reverse=True)[:5])

[('memes', 1426), ('nba', 916), ('politics', 770), ('news', 674), ('PublicFreakout', 631)]
[('Chodi', 765), ('memes', 24), ('BollyBlindsNGossip', 18), ('rarepuppers', 13), ('Genshin_Impact', 11)]
[('amcstock', 9645), ('GME', 2988), ('WallStreetbetsELITE', 585), ('Wallstreetbetsnew', 439), ('AMCSTOCKS', 350)]
[('GuessTheMovie', 263), ('whatisthisthing', 209), ('crochet', 23), ('AskOldPeople', 19), ('interestingasfuck', 14)]
[('starterpacks', 396), ('2MiddleEast4you', 20), ('ksi', 19), ('CryptoCurrency', 16), ('ffxiv', 15)]
[('apexlegends', 2454), ('apexuniversity', 171), ('ApexOutlands', 155), ('memes', 65), ('PublicFreakout', 46)]
[('unpopularopinion', 5598), ('NoStupidQuestions', 2391), ('memes', 176), ('TooAfraidToAsk', 176), ('Showerthoughts', 161)]
[('TeenMomOGandTeenMom2', 869), ('FundieSnarkUncensored', 721), ('DuggarsSnark', 688), ('MunchSnark', 69), ('AmItheAsshole', 58)]
[('Sims4', 516), ('thesims', 319), ('harrypotter', 26), ('hockey', 14), ('AmItheAsshole', 13)]
[('okbuddyre

In [29]:
#[DVC]
# Subreddit totals of the cluster assigned to the row labelled 5 (needs the 'clustering' column on `df`).
clustered_useres_dicts[df.loc[5,"clustering"]]

{'memes': 7,
 'politics': 2,
 'CryptoCurrency': 16,
 'nba': 2,
 'teenagers': 9,
 'PublicFreakout': 1,
 'news': 3,
 'unpopularopinion': 5,
 'worldnews': 2,
 'relationship_advice': 1,
 'PoliticalCompassMemes': 1,
 'interestingasfuck': 13,
 'Cringetopia': 4,
 'funny': 4,
 'pics': 2,
 'gaming': 5,
 'dankmemes': 13,
 'todayilearned': 8,
 'conspiracy': 1,
 '196': 2,
 'aww': 8,
 'NoStupidQuestions': 1,
 'WhitePeopleTwitter': 9,
 'movies': 4,
 'AskMen': 2,
 'apexlegends': 5,
 'facepalm': 3,
 'Games': 1,
 'mildlyinteresting': 2,
 'Minecraft': 3,
 'FridayNightFunkin': 5,
 'HolUp': 1,
 'Unexpected': 2,
 'nottheonion': 6,
 'MadeMeSmile': 3,
 'PS5': 4,
 'Whatcouldgowrong': 1,
 'trashy': 6,
 'LivestreamFail': 3,
 'Damnthatsinteresting': 4,
 'mildlyinfuriating': 4,
 'HistoryMemes': 5,
 'shitposting': 7,
 'videos': 2,
 '2007scape': 6,
 'iamatotalpieceofshit': 2,
 'ffxiv': 15,
 'television': 2,
 'PrequelMemes': 2,
 'marvelstudios': 10,
 'changemyview': 8,
 'starterpacks': 396,
 'masseffect': 1,
 'Chris

In [30]:
#[DVC] - the only purpose here is to remove the cluster column so that it is not present for the next clustering methods
# df.drop(columns='clustering',inplace=True)

In [34]:
# Scatter plot of the embedding coloured by cluster label; only 2-D and 3-D
# embeddings are plotted, any other n_components leaves everything untouched.
plot_layouts = {
    2: (['component1', 'component2'], px.scatter, {}),
    3: (['component1', 'component2', 'component3'], px.scatter_3d, {'z': 'component3'}),
}
if n_components in plot_layouts:
    component_columns, plot_function, extra_axes = plot_layouts[n_components]
    X_tsne = pd.DataFrame(X_tsne, columns=component_columns)
    X_tsne['clustering'] = clustering.labels_
    X_tsne['clustering'] = X_tsne['clustering'].astype(str)
    print(f"Number of clusters {pd.unique(X_tsne['clustering']).shape[0]}")
    fig = plot_function(X_tsne, x="component1", y="component2", color="clustering", **extra_axes)
    fig.show()

Number of clusters 756


In [32]:
# Save the interactive figure as an HTML file.
fig.write_html("tsne_clusters.html")

### Find the best parameters

In [20]:
#[DVC] - optional DVC stage, in case we want to run the parameter search with
# the parameter ranges configurable from outside
# Grid search over DBSCAN (eps, min_samples) pairs on X_tsne, scored by the mean silhouette.
eps_list = list(np.around(np.arange(2, 5.0, 0.1),2))
lower_min_samples = 3
higher_min_samples = 15
min_samples_list = list(np.arange(lower_min_samples, higher_min_samples, 1))
pairs = list(product(eps_list,min_samples_list))
n = 2 # early skip
results = []
last_n_scores = []
progress_bar_value = 0
with tqdm(total=100, desc="Percentage done") as pbar:  
    # NOTE(review): range(len(pairs)) is evaluated once, while `pairs` may be
    # shortened further down; the equality check below does not cover
    # i > len(pairs) — confirm pairs[i] cannot run out of range.
    for i in range(len(pairs)):
        if i == len(pairs):
            break
        eps,min_samples = pairs[i]
        cluster_labels = DBSCAN(eps=eps, min_samples=min_samples,n_jobs=-1).fit_predict(X_tsne)
        silhouette_avg = silhouette_score(X_tsne, cluster_labels)
        # One result row: (number of distinct labels, eps, min_samples, score).
        results.append((np.unique(cluster_labels).shape[0],eps,min_samples,silhouette_avg))
        last_n_scores.append(silhouette_avg)
        # Early skip: when the last n recorded scores are all equal (and more than n
        # were recorded since the last reset), entries are cut from the front of `pairs`.
        if  np.unique(last_n_scores[-n:]).shape[0] ==1 and len(last_n_scores)>n:
            last_n_scores = []
            if i < len(pairs):
                skip = higher_min_samples-int(pairs[i][1])
                if len(pairs)-skip+1 > 2*len(min_samples_list):
                    pairs = pairs[skip-1:]
        # Advance the bar by the change of the completed fraction, in percent.
        pbar.update((i/len(pairs) - progress_bar_value)*100)
        progress_bar_value = i/len(pairs)
    # NOTE(review): progress_bar_value is a fraction (i/len(pairs)) while the bar
    # total is 100, so this adds about 100 on top of what was already reported — confirm intent.
    pbar.update(100-progress_bar_value)


HBox(children=(HTML(value='Percentage done'), FloatProgress(value=0.0), HTML(value='')))




KeyboardInterrupt: 

In [21]:
#[DVC] results from the previous cell
# Tabulate the search results, best silhouette first, and save them to CSV.
results = pd.DataFrame(results,columns=['n_clusters','eps','min_samples','silhouette'])
results = results.sort_values(by='silhouette',ascending=False)
results.to_csv('tsne_params.csv')

### PCA test

In [65]:
from sklearn.decomposition import PCA
from sklearn.neighbors import BallTree
from sklearn.cluster import KMeans

In [66]:
# Size of the frame that goes into PCA: (rows, columns).
df.shape

(29476, 1998)

In [67]:
# Fit PCA without limiting the number of components and plot the cumulative
# explained variance ratio against the component index.
pca = PCA().fit(df)
x = np.cumsum(pca.explained_variance_ratio_)
fig = px.line( y=x, x=list(range(len(x))))
fig.update_layout(title='PCA algorithm',
                   xaxis_title='number of components',
                   yaxis_title='cumulative explained variance')
fig.show()

KeyboardInterrupt: 

In [68]:
#[DVC]
# Number of principal components to keep.
# NOTE: this reuses the name `n_components` that the t-SNE and plotting cells also read.
n_components=200

In [69]:
#[DVC]
# Fit PCA with n_components components on `df` and project `df` onto them.
pca = PCA(n_components=n_components).fit(df)
pca_df = pd.DataFrame(pca.transform(df))

In [70]:
#[DVC]
# Cluster the PCA-projected rows with k-means and store every row's label as a string.
pca_df['clustering'] = None
# The placeholder column is the last one, so iloc[:, :-1] passes only the components.
# A fixed seed (the same value the t-SNE cell uses) keeps the cluster ids
# repeatable from run to run.
clustering = KMeans(n_clusters=700, random_state=69).fit(pca_df.iloc[:,:-1])
pca_df['clustering'] = clustering.labels_
pca_df['clustering'] = pca_df['clustering'].astype(str)

In [351]:
#[DVC]
def reshape_new_user(new_user,df):
    """Turn a single-user mapping into a one-row frame aligned with ``df``.

    ``new_user`` has the form ``{user: {subreddit: count}}``; only its first
    entry is used. The returned frame has the user as its only index label and
    the columns of ``df`` in the same order. Subreddits that are not columns of
    ``df`` are ignored; columns the user has no count for are 0.
    """
    user_name, activity = list(new_user.items())[0]
    row = dict.fromkeys(df.columns, 0)
    row.update({name: count for name, count in activity.items() if name in row})
    new_user_df = pd.DataFrame({user_name: row}).T
    return new_user_df[df.columns.to_list()]

In [352]:
#[DVC]
def get_cluster_index(pca_new_user,pca_df,NN=10):
    """Assign a new user to the most frequent cluster among its nearest rows.

    A ball tree is built from every column of ``pca_df`` except the last one,
    the ``NN`` nearest rows of ``pca_new_user`` are looked up and their
    ``'clustering'`` labels are counted. The counts are printed and the label
    with the highest count is returned as ``int``.
    """
    tree = BallTree(pca_df.iloc[:, :-1])
    _, ind = tree.query(pca_new_user, k=NN)
    neighbour_labels = pca_df.iloc[ind[0], pca_df.columns.get_loc("clustering")]
    votes = neighbour_labels.value_counts().sort_values(ascending=False)
    print(votes)
    return int(votes.index[0])

In [353]:
#[DVC]
def get_clustered_subreddits(df,labels):
    """Sum the subreddit counts of ``df`` per cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Users in rows, subreddits in columns.
    labels : array-like
        Cluster label of every row of ``df``, in row order.

    Returns
    -------
    dict
        ``{cluster label: {subreddit: total}}`` containing only subreddits
        whose total (cast to int) is positive.

    Notes
    -----
    A ``'clustering'`` column is written to ``df`` while the totals are
    computed and is dropped again before returning, also when grouping fails.
    """
    clustered_useres_dicts = {}
    # Use the labels handed in by the caller rather than a notebook global.
    df.loc[:,'clustering'] = labels
    try:
        clustered_users = df.groupby(by=df['clustering']).sum()
    finally:
        df.drop(columns=['clustering'],inplace=True)
    clustered_users_matrix = clustered_users.to_numpy().astype(int)
    for cluster_label, totals in zip(clustered_users.index, clustered_users_matrix):
        mask = totals > 0
        clustered_useres_dicts[cluster_label] = dict(zip(clustered_users.columns[mask], totals[mask]))
    return clustered_useres_dicts

In [370]:
new_user = {"KuchniaMagdyGessler": {
    "sex": 5,
    "cock": 15,
    "malegrooming": 1,
    "penis": 40
}}

In [371]:
test_users = [
{"gothsluts": 16, "2000sGirls": 33, "nj4nj": 1, "Yololaceandlingerie": 1, "u_Unknownpleasures89": 1, "TeenBeauties": 1, "assinthong": 1, "18_19": 1, "GirlsGoneWilderness": 1, "collegesluts": 10, "GaybrosGoneWild": 1, "facesitting": 1, "milf": 3, "cougars_and_milfs_sfw": 3, "bigasses": 1},
{"flatearth": 74, "teenagers": 1},
{"HotOnlyfans": 19, "onlyfanschicks": 17, "FreeOnlyFansPage": 17, "HornyOnlyfans": 4, "onlyfansgirls101": 16, "AdorableOnlyfans": 2},
{"CallOfDutyMobile": 75},
{"Amd": 57, "battlefield2042": 1, "Battlefield6": 1, "pathofexile": 15, "PathOfExileBuilds": 1},
{"AskComputerScience": 11, "compsci": 4, "learnmachinelearning": 13, "computerscience": 10, "deeplearning": 6, "learnprogramming": 5, "AskPhysics": 6, "AskProgramming": 7, "Chodi": 1, "algorithms": 2, "linuxquestions": 1, "MachineLearning": 1, "math": 4, "crypto": 3, "softwarearchitecture": 1},
{"starwarsmemes": 1, "nextfuckinglevel": 9, "darkjokes": 1, "juggling": 15, "AskReddit": 4, "MetalMemes": 5, "Unexpected": 2, "trackandfield": 4, "okbuddyretard": 8, "cursedcomments": 3, "Meshuggah": 2, "mildlyinfuriating": 3, "memes": 11, "interestingasfuck": 1, "hermitcraftmemes": 1, "ConservativeMemes": 1, "BrandNewSentence": 2, "oddlyterrifying": 1, "ChoosingBeggars": 1},
{"NoFap": 75},
{"UFOscience": 13, "UFOs": 26, "ufomemes": 1, "UAP": 2, "ufo": 15, "HighStrangeness": 5, "space": 6, "CoronavirusDownunder": 2, "UFObelievers": 2, "australia": 1, "UF0": 1, "brexit": 1}
]
# Wrap every test user as {position: activity} so it has the same shape as `new_user`.
for i in range(len(test_users)):
    test_users[i] = {i:test_users[i]}

In [372]:
# Recommend subreddits for `new_user`: project the user with the fitted PCA,
# pick the dominant cluster among the nearest rows and list that cluster's
# subreddit totals, leaving out subreddits the user already has.
new_user_df = reshape_new_user(new_user, df)
pca_new_user = pca.transform(new_user_df)
new_user_cluster_index = get_cluster_index(pca_new_user, pca_df)
clustered_useres_dicts = get_clustered_subreddits(df, clustering.labels_)
already_visited = list(new_user.values())[0].keys()
user_cluster = {
    subreddit: total
    for subreddit, total in clustered_useres_dicts[new_user_cluster_index].items()
    if subreddit not in already_visited
}
print(new_user_cluster_index, new_user, pd.Series(user_cluster).sort_values(ascending=False).head(20))

0    10
Name: clustering, dtype: int64
0 {'KuchniaMagdyGessler': {'sex': 5, 'cock': 15, 'malegrooming': 1, 'penis': 40}} BreedingMaterial         1493
PokemonGoFriends         1349
pussy                    1264
keoXer                   1147
playboicarti             1038
chubby                   1020
BBW                       973
redscarepod               921
dndnext                   919
EliteDangerous            913
NoFap                     904
Random_Acts_Of_Amazon     903
gme_meltdown              896
mechmarket                890
TwoBestFriendsPlay        880
pokemon                   873
FireEmblemHeroes          860
Chiraqology               849
airsoft                   848
AskRedditAfterDark        845
dtype: int64


In [373]:
# The per-cluster subreddit totals depend only on `df` and the fitted
# clustering, so they are computed once instead of once per test user.
clustered_useres_dicts = get_clustered_subreddits(df,clustering.labels_)
for new_user in test_users:
    # Project the user with the fitted PCA and pick the dominant neighbouring cluster.
    new_user_df = reshape_new_user(new_user,df)
    pca_new_user = pca.transform(new_user_df)
    new_user_cluster_index = get_cluster_index(pca_new_user,pca_df)
    # Leave out subreddits the user already has.
    already_visited = list(new_user.values())[0].keys()
    user_cluster = {
        subreddit: total
        for subreddit, total in clustered_useres_dicts[new_user_cluster_index].items()
        if subreddit not in already_visited
    }
    print(new_user_cluster_index,new_user,pd.Series(user_cluster).sort_values(ascending=False).head(20))

0    10
Name: clustering, dtype: int64
0 {0: {'gothsluts': 16, '2000sGirls': 33, 'nj4nj': 1, 'Yololaceandlingerie': 1, 'u_Unknownpleasures89': 1, 'TeenBeauties': 1, 'assinthong': 1, '18_19': 1, 'GirlsGoneWilderness': 1, 'collegesluts': 10, 'GaybrosGoneWild': 1, 'facesitting': 1, 'milf': 3, 'cougars_and_milfs_sfw': 3, 'bigasses': 1}} BreedingMaterial         1493
PokemonGoFriends         1349
pussy                    1264
keoXer                   1147
playboicarti             1038
chubby                   1020
BBW                       973
redscarepod               921
dndnext                   919
EliteDangerous            913
NoFap                     904
Random_Acts_Of_Amazon     903
gme_meltdown              896
mechmarket                890
TwoBestFriendsPlay        880
pokemon                   873
FireEmblemHeroes          860
Chiraqology               849
airsoft                   848
AskRedditAfterDark        845
dtype: int64
0    10
Name: clustering, dtype: int64
0 {1: {'flate

In [21]:
#[DVC] - optional DVC stage, in case we want to run the parameter search with
# the parameter ranges configurable from outside
# Grid search over DBSCAN (eps, min_samples) pairs on pca_df, scored by the mean silhouette.
# NOTE(review): the whole of pca_df is passed to DBSCAN; if the string 'clustering'
# column has already been added to it, that column is included — confirm.
eps_list = list(np.around(np.arange(2, 5.0, 0.1),2))
lower_min_samples = 3
higher_min_samples = 15
min_samples_list = list(np.arange(lower_min_samples, higher_min_samples, 1))
pairs = list(product(eps_list,min_samples_list))
n = 2 # early skip
results = []
last_n_scores = []
progress_bar_value = 0
with tqdm(total=100, desc="Percentage done") as pbar:  
    # NOTE(review): range(len(pairs)) is evaluated once, while `pairs` may be
    # shortened further down; the equality check below does not cover
    # i > len(pairs) — confirm pairs[i] cannot run out of range.
    for i in range(len(pairs)):
        if i == len(pairs):
            break
        eps,min_samples = pairs[i]
        cluster_labels = DBSCAN(eps=eps, min_samples=min_samples,n_jobs=-1).fit_predict(pca_df)
        silhouette_avg = silhouette_score(pca_df, cluster_labels)
        # One result row: (number of distinct labels, eps, min_samples, score).
        results.append((np.unique(cluster_labels).shape[0],eps,min_samples,silhouette_avg))
        last_n_scores.append(silhouette_avg)
        # Early skip: when the last n recorded scores are all equal (and more than n
        # were recorded since the last reset), entries are cut from the front of `pairs`.
        if  np.unique(last_n_scores[-n:]).shape[0] ==1 and len(last_n_scores)>n:
            last_n_scores = []
            if i < len(pairs):
                skip = higher_min_samples-int(pairs[i][1])
                if len(pairs)-skip+1 > 2*len(min_samples_list):
                    pairs = pairs[skip-1:]
        # Advance the bar by the change of the completed fraction, in percent.
        pbar.update((i/len(pairs) - progress_bar_value)*100)
        progress_bar_value = i/len(pairs)
    # NOTE(review): progress_bar_value is a fraction (i/len(pairs)) while the bar
    # total is 100, so this adds about 100 on top of what was already reported — confirm intent.
    pbar.update(100-progress_bar_value)


NameError: name 'product' is not defined

In [354]:
# new_user = {6: {'starwarsmemes': 1, 'nextfuckinglevel': 9, 'darkjokes': 1, 'juggling': 15, 'AskReddit': 4, 'MetalMemes': 5, 'Unexpected': 2, 'trackandfield': 4, 'okbuddyretard': 8, 'cursedcomments': 3, 'Meshuggah': 2, 'mildlyinfuriating': 3, 'memes': 11, 'interestingasfuck': 1, 'hermitcraftmemes': 1, 'ConservativeMemes': 1, 'BrandNewSentence': 2, 'oddlyterrifying': 1, 'ChoosingBeggars': 1}}

In [66]:
# Tabulate the search results and show the ten rows with the highest silhouette.
results = pd.DataFrame(results,columns=['n_clusters','eps','min_samples','silhouette'])
results = results.sort_values(by='silhouette',ascending=False)
results.head(10)

Unnamed: 0,n_clusters,eps,min_samples,silhouette
248,2,4.9,8,0.507418
247,2,4.9,7,0.507418
238,2,4.8,7,0.505849
239,2,4.8,8,0.503591
230,2,4.7,7,0.502118
231,2,4.7,8,0.500884
240,2,4.8,9,0.500824
224,2,4.6,7,0.499479
225,2,4.6,8,0.499479
219,2,4.5,8,0.499479


# Ball Tree

In [374]:
#[DVC]
# Index all rows of `df` in a ball tree and look up the 100 rows closest to user n.
tree = BallTree(df)
n = 400 # index of initial user
# df.loc selects by index label; expand_dims turns the row into a (1, n_columns) query.
user_vector = np.expand_dims(df.loc[n].to_numpy(),0)
dist, ind = tree.query(user_vector,k=100)

Initial user

In [375]:
#[DVC]
# Subreddits the initial user has a positive count in, with those counts.
mask = np.where(user_vector>0,True,False)[0]
users_subrredits = df.loc[n].index[mask]
query_user = dict(zip(df.loc[n].index[mask],user_vector[0][mask]))
query_user

{'nextfuckinglevel': 2.0,
 'pics': 6.0,
 'Unexpected': 3.0,
 'iamatotalpieceofshit': 3.0,
 'cats': 1.0,
 'AreTheStraightsOK': 4.0,
 'depression': 6.0,
 'UrbanHell': 1.0}

Found users

In [376]:
#[DVC]
# One {subreddit: count} dict (positive counts only) per returned neighbour.
nearest_nb_dict = []
# The first returned position is skipped.
# presumably it is the query user itself, since that row is part of the tree — TODO confirm
nearest_nb = df.iloc[list(*ind)[1:]]
nearest_nb_np = nearest_nb.to_numpy().astype(int)
for i in range(nearest_nb_np.shape[0]):
    mask = np.where(nearest_nb_np[i,:] >0,True,False)
    nearest_nb_dict.append(dict(zip(nearest_nb.columns[mask],nearest_nb_np[i,:][mask])))

Recommendation by number of comments

In [377]:
#[DVC]
# Total number of comments of the neighbours per subreddit.
neighbors = nearest_nb.sum(axis=0).astype(int)
# Leave out the subreddits of the initial user. Index.difference keeps the
# column order and yields an Index, which .loc accepts (a set is rejected by
# recent pandas and has no defined order).
columns = neighbors.index.difference(users_subrredits, sort=False)
neighbors = neighbors.loc[columns]
neighbors[neighbors >0].sort_values(ascending=False)

unpopularopinion        7
Damnthatsinteresting    7
interestingasfuck       6
sex                     5
IdiotsInCars            5
                       ..
WTF                     1
OldSchoolCool           1
hmmm                    1
SafeMoon                1
cycling                 1
Length: 189, dtype: int64

Recommendation by number of users

In [378]:
#[DVC]
# Number of neighbours with at least one comment per subreddit.
neighbors = nearest_nb.astype(bool).astype(int).sum(axis=0)
# Leave out the subreddits of the initial user. Index.difference keeps the
# column order and yields an Index, which .loc accepts (a set is rejected by
# recent pandas and has no defined order).
columns = neighbors.index.difference(users_subrredits, sort=False)
neighbors = neighbors.loc[columns]
neighbors[neighbors >0].sort_values(ascending=False)

Damnthatsinteresting    6
news                    5
IdiotsInCars            4
interestingasfuck       4
anime                   4
                       ..
thenetherlands          1
summonerschool          1
JusticeServed           1
pittsburgh              1
cycling                 1
Length: 189, dtype: int64

In [None]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

In [None]:
#[DVC]
# Frequent itemsets with support of at least 0.01 on the 0/1 frame, then
# association rules with confidence of at least 0.7.
frequent_itemsets = apriori(df_bool, min_support=0.01, use_colnames=True)
rules = association_rules(frequent_itemsets, 
                  metric='confidence', 
                  min_threshold=0.7)

In [None]:
# Save the rules as JSON.
rules.to_json("data/arules.json")