In [1]:
import numpy as np
import matplotlib.pylab as pl
import ot
import ot.plot

import pandas as pd
import praw
import re
import nltk

import gensim.models


import xgboost as xgb

import numpy as np
import seaborn as sns


import sklearn 
from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

from sklearn.cluster import SpectralClustering
# Matches every character that is neither an ASCII letter nor a space.
regex = re.compile('[^a-zA-Z ]')

#@numba.jit # unfortunately this doesn't jit easily :(
def tokenize(text):
    """Split ``text`` into sentences of lower-cased, letters-only words.

    The text is cut into sentences with ``nltk.tokenize.sent_tokenize``; each
    sentence is split on whitespace, every word is lower-cased and stripped of
    all non-alphabetical characters, and words that end up empty are dropped.
    (Stripping digits is debatable, e.g. for user names that contain numbers.)

    Parameters
    ----------
    text : str or any
        Raw text. Anything that is not a string (for example a NaN cell)
        yields an empty result.

    Returns
    -------
    list of list of str
        One list of words per sentence, the format expected by gensim's
        Word2Vec. A sentence with no surviving words contributes an empty list.
    """
    if not isinstance(text, str):
        return []

    sentences = []
    for sentence in nltk.tokenize.sent_tokenize(text):
        # split() with no argument breaks on any whitespace run; splitting on ' '
        # alone left "foo\nbar" as one token, which the regex then fused into "foobar".
        cleaned = (regex.sub('', word.lower()) for word in sentence.split())
        sentences.append([word for word in cleaned if word])
    return sentences

def average_vector(text, model):
    """Return the mean embedding of the words of ``text`` known to ``model``.

    Parameters
    ----------
    text : iterable of str
        Words of one sentence.
    model : object exposing ``wv``
        ``model.wv`` must provide ``key_to_index``, ``vector_size`` and
        item access by word (as used elsewhere in this notebook).

    Returns
    -------
    numpy.ndarray
        The element-wise mean of the vectors of the in-vocabulary words, or a
        float zero vector of length ``model.wv.vector_size`` when none of the
        words is in the vocabulary.
    """
    keyed_vectors = model.wv
    present_keys = [word for word in text if word in keyed_vectors.key_to_index]
    if not present_keys:
        # Same fallback as average_vector_paragraph: float zeros of the embedding
        # width, without needing a first vocabulary entry to measure the length.
        return np.zeros(keyed_vectors.vector_size)
    return sum(keyed_vectors[word] for word in present_keys) / len(present_keys)

def average_vector_paragraph(text, model):
    """Combine the per-sentence average vectors of a tokenized paragraph.

    ``text`` is a list of sentences (each a list of words). An empty list
    yields a zero vector of length ``model.wv.vector_size``; otherwise the
    result is the SUM of ``average_vector(sentence, model)`` over all
    sentences.

    NOTE(review): despite the name, the sentence vectors are added, not
    averaged — confirm this is intended before relying on the magnitude.
    """
    if text == []:
        return np.zeros(model.wv.vector_size)

    # Accumulate from 0, exactly as the builtin sum() would.
    running_total = 0
    for sentence in text:
        running_total = running_total + average_vector(sentence, model)
    return running_total

## Most similar posts?


def similarity(vec_1, vec_2):
    """Cosine similarity between two vectors.

    Each vector is wrapped as a one-row matrix and handed to
    ``sklearn.metrics.pairwise.cosine_similarity``; the first row of the
    resulting matrix is returned, so the value comes back as a one-element
    row rather than a bare float.
    """
    pairwise_matrix = sklearn.metrics.pairwise.cosine_similarity([vec_1], [vec_2])
    first_row = pairwise_matrix[0]
    return first_row

def make_similarity_col(df, given_index):
    """Add a ``similarity`` column comparing every row to one reference row.

    The reference is ``df['avg_vector'][given_index]``; each row's
    ``avg_vector`` is scored against it with ``similarity``. The frame is
    modified in place and nothing is returned.
    """
    reference_vector = df['avg_vector'][given_index]

    def _score(candidate):
        return similarity(candidate, reference_vector)

    df['similarity'] = df['avg_vector'].apply(_score)
    
# Helper that prints the ten nearest neighbours of a word-vector query.

def sims(args, model):
    """Print the 10 most similar words for a ``most_similar`` query.

    ``args`` is a dict of keyword arguments forwarded to
    ``model.wv.most_similar`` (together with ``topn = 10``). One line is
    printed per returned ``(word, similarity)`` pair; nothing is returned.
    """
    neighbours = model.wv.most_similar(**args, topn = 10)
    for neighbour, score in neighbours:
        print( f"{neighbour} - similarity {score}")

        
        
def train_w2v(tokenized_text, min_count=10, vector_size=300, epochs=4):
    """Train a gensim Word2Vec model on tokenized documents.

    Parameters
    ----------
    tokenized_text : iterable
        One entry per document; each entry is an iterable of sentences, and
        each sentence is a list of words (the output format of ``tokenize``).
    min_count, vector_size, epochs : int, optional
        Forwarded unchanged to ``gensim.models.Word2Vec``. The defaults
        (10, 300, 4) are the values this notebook previously hard-coded.

    Returns
    -------
    gensim.models.Word2Vec
        The model built from the flattened list of sentences.
    """
    # Flatten documents -> sentences: Word2Vec wants a single list of sentences.
    corpus = [sentence for document in tokenized_text for sentence in document]

    model = gensim.models.Word2Vec(
        sentences=corpus,
        min_count=min_count,
        vector_size=vector_size,
        epochs=epochs,
    )
    #model_fasttext = gensim.models.FastText(sentences = corpus,  min_count=10, vector_size=200, epochs = 4)

    return model

def vectorize(df, model):
    """Embed every tokenized title and return the vectors as one matrix.

    Writes an ``avg_vector`` column into ``df`` (in place) holding
    ``average_vector_paragraph`` of each ``tokenized_title``, then stacks
    those per-row vectors vertically and returns the resulting array.
    """
    def _embed(tokenized):
        return average_vector_paragraph(tokenized, model)

    df['avg_vector'] = df['tokenized_title'].apply(_embed)
    per_row_vectors = df['avg_vector'].to_numpy()
    return np.vstack(per_row_vectors)

def unpack_vectors(text, model):
    """Collect the embedding of every in-vocabulary word of a tokenized text.

    ``text`` is a list of sentences (lists of words). Words are visited in
    reading order; each word present in ``model.wv.key_to_index`` contributes
    its vector ``model.wv[word]`` (repeats included). The collected vectors
    are returned as a single array via ``np.asarray``.
    """
    collected = [
        model.wv[token]
        for sentence in text
        for token in sentence
        if token in model.wv.key_to_index.keys()
    ]
    return np.asarray(collected)

def cloudify(df, model):
    """Attach a ``point_cloud`` column built from each row's tokenized title.

    Every ``tokenized_title`` is converted with ``unpack_vectors`` and stored
    in ``df['point_cloud']``. The frame is modified in place and also
    returned.
    """
    def _to_cloud(tokenized):
        return unpack_vectors(tokenized, model)

    df['point_cloud'] = df['tokenized_title'].apply(_to_cloud)
    return df

def ot_distance(cloud_a, cloud_b):
    """Exact optimal-transport cost between two uniformly weighted point clouds.

    Parameters
    ----------
    cloud_a, cloud_b : array-like
        One point per row; every point of a cloud gets the same weight
        ``1 / len(cloud)``.

    Returns
    -------
    float
        ``ot.emd2`` evaluated on the pairwise cost matrix from ``ot.dist``
        after that matrix has been divided by its own maximum. Returns 0.0
        when all pairwise costs are zero (every point of both clouds
        coincides).

    NOTE(review): because each pair of clouds is rescaled by its own maximum
    cost, values for different pairs are not on a common scale — confirm that
    this is acceptable for the clustering that consumes them.
    """
    n_a, n_b = len(cloud_a), len(cloud_b)
    a = np.ones((n_a,)) / n_a
    b = np.ones((n_b,)) / n_b

    M = ot.dist(cloud_a, cloud_b)
    largest_cost = M.max()
    if largest_cost == 0:
        # Identical clouds: dividing by the zero maximum would turn M into NaN.
        # With an all-zero cost matrix the transport cost is zero anyway.
        return 0.0
    M /= largest_cost
    return ot.emd2(a, b, M)

def ot_distance_regularized(cloud_a, cloud_b, lambd=1e-3):
    """Entropy-regularised (Sinkhorn) transport cost between two point clouds.

    Parameters
    ----------
    cloud_a, cloud_b : array-like
        One point per row; every point of a cloud gets the same weight
        ``1 / len(cloud)``.
    lambd : float, optional
        Regularisation strength passed to ``ot.sinkhorn2``. Defaults to the
        previously hard-coded ``1e-3``.

    Returns
    -------
    float
        The ``ot.sinkhorn2`` value for the pairwise cost matrix from
        ``ot.dist`` divided by its own maximum, or 0.0 when all pairwise
        costs are zero.
    """
    n_a, n_b = len(cloud_a), len(cloud_b)
    a = np.ones((n_a,)) / n_a
    b = np.ones((n_b,)) / n_b

    M = ot.dist(cloud_a, cloud_b)
    largest_cost = M.max()
    if largest_cost == 0:
        # Identical clouds: dividing by the zero maximum would turn M into NaN.
        return 0.0
    M /= largest_cost

    cost = ot.sinkhorn2(a, b, M, lambd)
    # Depending on the installed POT version sinkhorn2 yields either a bare
    # scalar or a one-element array; normalise both to the single value.
    return np.asarray(cost).ravel()[0]

In [134]:
# Load the cleaned WallStreetBets posts and keep only the rows that have both
# a title and a selftext.
wsb_csv_path = "../Data/subreddit_WallStreetBets/otherdata/wsb_cleaned.csv"
df = pd.read_csv(wsb_csv_path, nrows = None).dropna(subset = ['title','selftext'])


In [135]:
# Tokenize titles and bodies, then train one Word2Vec model on both together.
df['tokenized_title'] = df.title.apply(tokenize)
df['tokenized_selftext'] = df.selftext.apply(tokenize)

# Series.append was removed in pandas 2.0; pd.concat builds the same stacked Series.
all_tokenized = pd.concat([df['tokenized_title'], df['tokenized_selftext']])
model = train_w2v(all_tokenized)

In [136]:
# Export the learned vocabulary: one row per word (column 0) with its vector.
vocabulary = model.wv.key_to_index.keys()
corpus_df = pd.DataFrame(vocabulary)

def _lookup_vector(word):
    return model.wv[word]

corpus_df['vector'] = corpus_df[0].apply(_lookup_vector)
corpus_df.to_csv("learned_embedding.csv")

## Cluster authors based on their word vector point cloud distributions:

In [127]:
%%script false 
author_counts = df.author.value_counts()

frequent_poasters = list(author_counts [ author_counts > 5 ].index)
author_counts [ author_counts > 5 ]

cleaned = df[ df.author.isin(frequent_poasters)]

compounded = cleaned[['tokenized_title', 'author']].groupby("author").agg('sum')
compounded.drop(labels = ["None", "AutoModerator"])
clouded = cloudify(compounded, model)
clouds = clouded[[point_cloud]]
clouds

Couldn't find program: 'false'


In [44]:
%%script false 


distances = np.zeros( shape= (len(clouds), len(clouds)) )
# This is expensive, I don't wnat to redo it every time...
for i in range(len(clouds)):
    print(f"Processing column {i} out of {len(clouds)}")
    for j in range(len(clouds)):
        if i < j:
            d = ot_distance_regularized(clouds.iloc[i], clouds.iloc[j])
            distances[i,j] = d
            distances[j,i] = d
            
aff_matrix = np.exp( -1 * distances / distances.std())

sc = SpectralClustering(n_clusters = 8, affinity = 'precomputed')
labels = sc.fit_predict(aff_matrix)
clouds['clusters'] = labels
clouds.clusters.sort_values()

Couldn't find program: 'false'


It's hard to make sense of these clusters because I don't know the users well enough to judge whether the groupings are meaningful. 

If we cluster posts by title instead, then at least the clusters can be evaluated by inspection.

## Cluster posts based on their word vector clouds.

How well does nearest neighbor classification do?
Can we build a Bayesian hierarchical model that takes into account any groups we find here?
(If we find any conceptually meaningful clusters, what else can we do with that information?)

In [137]:
# Build a point cloud per post and discard posts whose cloud is empty.
post_clouded = cloudify(df, model)
has_points = post_clouded.point_cloud.apply(len) > 0
post_clouded = post_clouded[has_points]

In [138]:
# Work on the first 100 posts only (the pairwise OT distances are expensive).
# .copy() makes this an independent frame, so adding columns to it later does
# not hit pandas' SettingWithCopy ambiguity.
clouds = post_clouded[['id', 'title', 'point_cloud']].iloc[:100].copy()


In [139]:
n_clouds = len(clouds)
distances = np.zeros( shape= (n_clouds, n_clouds) )
# This is expensive, I don't want to redo it every time...
# Report progress about ten times; max(1, ...) keeps the modulo below valid
# when there are fewer than 10 clouds.
k = max(1, n_clouds // 10)

# Pull the arrays out once instead of doing an .iloc row lookup per pair.
point_clouds = clouds.point_cloud.tolist()

for i in range(n_clouds):
    if i % k == 0:
        print(f"Processing column {i} out of {n_clouds}")
    # The matrix is symmetric, so only the pairs with j > i are computed.
    for j in range(i + 1, n_clouds):
        d = ot_distance(point_clouds[i], point_clouds[j])
        distances[i,j] = d
        distances[j,i] = d


Processing column 0 out of 100
Processing column 10 out of 100
Processing column 20 out of 100
Processing column 30 out of 100
Processing column 40 out of 100
Processing column 50 out of 100
Processing column 60 out of 100
Processing column 70 out of 100
Processing column 80 out of 100
Processing column 90 out of 100


In [140]:
# Turn distances into affinities with a kernel scaled by the spread of the distances.
distance_spread = distances.std()
if distance_spread == 0:
    # All distances are zero (the diagonal always is), so every pair is maximally similar.
    aff_matrix = np.ones_like(distances)
else:
    aff_matrix = np.exp( -1 * distances / distance_spread)

# Roughly five posts per cluster; random_state makes the labels reproducible.
sc = SpectralClustering(n_clusters = max(1, len(clouds) // 5), affinity = 'precomputed', random_state = 0)
labels = sc.fit_predict(aff_matrix)
# assign() returns a new frame instead of writing a column into a slice.
clouds = clouds.assign(clusters = labels)
pd.options.display.max_colwidth = None
pd.options.display.max_rows = 100
clouds[['id', 'title', 'clusters']].sort_values(by = 'clusters')

Unnamed: 0,id,title,clusters
135,eic7kt,What the fuck do you consider a dip?,0
58,eikk0r,dO yOu ThiNk Im PlaYing ArOuNd?,0
83,eii4ys,Whoever was on here talking about capes being popular was onto something! Check out Ivanka trump last night,0
104,eif5r3,They’re gonna call it the “hindsight 2020s”,0
47,eilpv9,ACB gonna moon boys,0
131,eics2y,What are you guys doing New Year’s Eve?,0
32,einrlu,"What Are Your Moves Tomorrow, January 02",0
75,eiiq38,I've concluded we're all autistic dumbasses.,0
81,eiicnv,You people have ruined me,0
41,eimigb,Please help now please assist,0
