In [1]:
import numpy as np
import matplotlib.pylab as pl
import ot
import ot.plot

import pandas as pd
import praw
import re
import nltk

import gensim.models


import xgboost as xgb

import numpy as np
import seaborn as sns


import sklearn 
from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


regex = re.compile('[^a-zA-Z ]')

#@numba.jit # unfortunately this doesn't jit easily :(
def tokenize(text):
    # given a body of text, this splits into sentences, then processes each word in the sentence to remove
    # non alphabetical characters... (? bad idea, what about users with numbers in their name)
    # returns it as a list of lists of words, the format desired by gensims word2vec
    
    sentences = []
    if type(text) == str:
        for sentence in nltk.tokenize.sent_tokenize(text):
            processed = [regex.sub('', word.lower()) for word in sentence.split(' ') ]
            processed = [word for word in processed if word not in set( ['' ])]
            sentences.append(processed)
    return sentences

def average_vector(text, model):
    present_keys = [x for x in text if x in model.wv.key_to_index ]
    if not present_keys:
        return np.array([0] * len( model.wv[ model.wv.index_to_key[0]]))
    return sum( [model.wv[x] for x in present_keys] ) /len(present_keys)

def average_vector_paragraph(text, model):
    if text == []:
        return np.zeros(model.wv.vector_size)
    return sum( average_vector(sentence, model)  for sentence in text )

## Most similar posts?


def similarity(vec_1, vec_2):
    return sklearn.metrics.pairwise.cosine_similarity([vec_1], [vec_2])[0]

def make_similarity_col(df, given_index):
    given_vector = df['avg_vector'][given_index] 
    df['similarity'] = df['avg_vector'].apply( lambda x : similarity(x, given_vector))
    
# helper function for printing the most similar word vectors

def sims(args, model):
    for word, sim in model.wv.most_similar(**args, topn = 10):
        print( f"{word} - similarity {sim}")    

        
        
def train_w2v(tokenized_text):
    # the train dataframe ot build the w2v model on
    
    corpus = []
    for tokenized in tokenized_text:
        corpus += tokenized

    model = gensim.models.Word2Vec(sentences = corpus,  min_count=10, vector_size=300, epochs = 4)
    #model_fasttext = gensim.models.FastText(sentences = corpus,  min_count=10, vector_size=200, epochs = 4)
    
    return model

def vectorize(df, model):
    df['avg_vector'] = df['tokenized_title'].apply(lambda text : average_vector_paragraph(text, model)) 
    X = np.vstack(df['avg_vector'].to_numpy())
    #df.concat(axis = 1, X)
    return X

def unpack_vectors(text, model):
    vectors = []
    for sentance in text:
        for word in sentance:
            if word in model.wv.key_to_index.keys():
                vectors.append(model.wv[word])
    return np.asarray(vectors)

def cloudify(df, model):
    df['point_cloud'] = df['tokenized_title'].apply(lambda text : unpack_vectors(text, model)) 

    return df

def ot_distance(cloud_a, cloud_b):
    n_a = len(cloud_a)
    n_b = len(cloud_b)
    a, b = np.ones((n_a,)) / n_a, np.ones((n_b,)) / n_b 
    M = ot.dist(cloud_a, cloud_b)
    M /= M.max()
    d = ot.emd2(a, b, M)
    return d


In [2]:
df = pd.read_csv("../Data/subreddit_WallStreetBets/otherdata/wsb_cleaned.csv", nrows = 10000)

df = df.dropna(subset = ['title'])


In [3]:
df['tokenized_title'] = df.title.apply(tokenize)
#df['tokenized_selftext'] = df.selftext.apply(tokenize)
model = train_w2v(df['tokenized_title']) 

In [4]:
corpus_df = pd.DataFrame(model.wv.key_to_index.keys())
corpus_df['vector'] = corpus_df[0].apply(lambda x : model.wv[x])

In [16]:
author_counts = df.author.value_counts()
#author_counts

In [11]:
more_than_one_post = list(author_counts [ author_counts > 2 ].index)
more_than_one_post

['None',
 'AutoModerator',
 'fallouthong',
 'MaxAds1',
 'Sandvicheater',
 'rawrtherapy',
 'WSBConsensus',
 'OldTrillionaire',
 'thehandsoap',
 'crossroadie666',
 'bobbythebich',
 'TripleBrain',
 'bigbear0083',
 'air_walks',
 'Vinny32295',
 'Ned_Flanderz',
 'EnlargedOrgan',
 'LiquidityMan',
 'blaked_baller',
 'jangles_mcdangles',
 'eyedontgetjokes',
 'StevenMcphearson',
 'tombomassasin',
 'charvo',
 'vegaseller',
 'AntinatalistPoet',
 'Zer033x',
 'alexstreerking203',
 'Smackythefrogs',
 'Ant0n61',
 'stormwillpass',
 'astafe',
 'RoadhouseSwayz3',
 'zachrf1',
 'TheCreatorishere',
 'Zmurray1996',
 'omgoptions',
 'Sakira-Cadman',
 'sgalligan17',
 'TimAppleBurner',
 'oranguthang87',
 'bencointl',
 'spy400qqq300',
 'Hadron90',
 'Toasty_Man00',
 'OGFlakah',
 'narusik',
 'TrendSpiderDan',
 '27onfire',
 'bireland203',
 'MostlyKelp',
 'daddydickie',
 'Stockbaron',
 'DGAF0752',
 'mcele311',
 'DiffManyFold',
 'opoopt',
 'TheWorstTroll',
 'philmacrack123',
 'livestrong2209',
 'triptamimico',
 'Growt

In [17]:
cleaned = df[ df.author.isin(more_than_one_post)]


In [20]:
compounded = cleaned[['tokenized_title', 'author']].groupby("author").agg('sum')
compounded.drop(labels = ["None", "AutoModerator"])
clouded = cloudify(compounded, model)
clouds = clouded.point_cloud
clouds

author
121518nine    [[-0.022375492, 0.19902197, 0.0023248703, 0.11...
1353-         [[-0.023973137, 0.2047774, -0.0038402965, 0.12...
1sildurr      [[-0.013164273, 0.1047017, 0.00011137097, 0.06...
27onfire      [[-0.008771987, 0.07340773, -0.002343914, 0.04...
3Roontgen     [[-0.008042797, 0.07422254, 0.0018973012, 0.04...
                                    ...                        
ynggekko      [[-0.02795647, 0.22810908, -0.0011223668, 0.12...
zachrf1       [[-0.021291675, 0.1968608, 0.003938506, 0.1231...
zevzev        [[-0.023973137, 0.2047774, -0.0038402965, 0.12...
zsd99         [[-0.01478351, 0.16230886, 0.004318557, 0.0981...
ztnabulsi     [[-0.007575758, 0.07291553, -0.0010454685, 0.0...
Name: point_cloud, Length: 394, dtype: object

In [None]:
distances = np.zeros( shape= (len(clouds), len(clouds)) )
for i in range(len(clouds)):
    print(f"Processing column {i} out of {len(clouds)}")
    for j in range(len(clouds)):
        if i < j:
            d = ot_distance(clouds.iloc[i], clouds.iloc[j])
            distances[i,j] = d
            distances[j,i] = d
            

Processing the 0th column out of 394


  check_result(result_code)


Processing the 1th column out of 394
Processing the 2th column out of 394
Processing the 3th column out of 394
