In [1]:
import numpy as np
import matplotlib.pylab as pl
import ot
import ot.plot

import pandas as pd
import praw
import re
import nltk

import gensim.models

import networkx as nx
import xgboost as xgb

import numpy as np
import seaborn as sns


import sklearn 
from sklearn.model_selection import train_test_split

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

from sklearn.cluster import SpectralClustering

from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize

regex = re.compile('[^a-zA-Z ]')

#@numba.jit # unfortunately this doesn't jit easily :(
def tokenize(text):
    # given a body of text, this splits into sentences, then processes each word in the sentence to remove
    # non alphabetical characters... (? bad idea, what about users with numbers in their name)
    # returns it as a list of lists of words, the format desired by gensims word2vec
    
    sentences = []
    if type(text) == str:
        for sentence in nltk.tokenize.sent_tokenize(text):
            processed = [regex.sub('', word.lower()) for word in sentence.split(' ') ]
            processed = [word for word in processed if word not in set( ['' ])]
            sentences.append(processed)
    return sentences

def average_vector(text, model):
    present_keys = [x for x in text if x in model.wv.key_to_index ]
    if not present_keys:
        return np.array([0] * len( model.wv[ model.wv.index_to_key[0]]))
    return sum( [model.wv[x] for x in present_keys] ) /len(present_keys)

def average_vector_paragraph(text, model):
    if text == []:
        return np.zeros(model.wv.vector_size)
    return sum( average_vector(sentence, model)  for sentence in text )

## Most similar posts?


def similarity(vec_1, vec_2):
    return sklearn.metrics.pairwise.cosine_similarity([vec_1], [vec_2])[0]

def make_similarity_col(df, given_index):
    given_vector = df['avg_vector'][given_index] 
    df['similarity'] = df['avg_vector'].apply( lambda x : similarity(x, given_vector))
    
# helper function for printing the most similar word vectors

def sims(args, model):
    for word, sim in model.wv.most_similar(**args, topn = 10):
        print( f"{word} - similarity {sim}")    

        
        
def train_w2v(tokenized_text):
    # the train dataframe ot build the w2v model on
    
    corpus = []
    for tokenized in tokenized_text:
        corpus += tokenized

    model = gensim.models.Word2Vec(sentences = corpus,  min_count=10, vector_size=300, epochs = 4)
    #model_fasttext = gensim.models.FastText(sentences = corpus,  min_count=10, vector_size=200, epochs = 4)
    
    return model

def vectorize(df, model):
    df['avg_vector'] = df['tokenized_title'].apply(lambda text : average_vector_paragraph(text, model)) 
    X = np.vstack(df['avg_vector'].to_numpy())
    #df.concat(axis = 1, X)
    return X

def unpack_vectors(text, model_dict, cut_off = 300):
    vectors = []
    
    for sentance in text:
        for word in sentance:
            if word in model_dict.keys():
                if word not in stopwords.words():
                    vectors.append(model_dict[word])
                if len(vectors) >= cut_off:
                    return np.asarray(vectors)
    return np.asarray(vectors)

def cloudify(df, model_dict, cut_off = 300):
    df['title_point_cloud'] = df['tokenized_title'].apply(lambda text : unpack_vectors(text, model_dict, cut_off)) 
    df['selftext_point_cloud']= df['tokenized_selftext'].apply(lambda text : unpack_vectors(text, model_dict, cut_off)) 
    df['merged_point_cloud'] = df['tokenized_merged'].apply(lambda text : unpack_vectors(text, model_dict, cut_off))
    #df.apply( lambda x : np.concatenate([x['title_point_cloud'], x['selftext_point_cloud']]), axis = 1 )
    return df

def uniform(length):
    return np.ones( (length,))/ length

def decay(length, step = .3, shift = 1):
    #step and shift parameters try to mellow out the decay
    likelihood = 1 / (np.cumsum(np.ones(length)*step) + shift)
    return likelihood / sum(likelihood)

def ot_distance(cloud_a, cloud_b, distribution = 'uniform'):
    n_a = len(cloud_a)
    n_b = len(cloud_b)
    if distribution == 'uniform':
        a, b = uniform(n_a), uniform(n_b)
    if distribution == 'decay':
        a, b = decay(n_a), decay(n_b)        
    M = ot.dist(cloud_a, cloud_b)
    M /= M.max()
    d = ot.emd2(a, b, M)
    return d

def ot_distance_regularized(cloud_a, cloud_b):
    n_a = len(cloud_a)
    n_b = len(cloud_b)
    a, b = np.ones((n_a,)) / n_a, np.ones((n_b,)) / n_b 
    M = ot.dist(cloud_a, cloud_b)
    M /= M.max()
    lambd = 1e-3
    d = ot.sinkhorn2(a, b, M, lambd)[0]
    return d

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lnajt\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv("../Data/subreddit_WallStreetBets/otherdata/wsb_cleaned.csv", nrows = 5000)
df = df.dropna(subset = ['title','selftext'])
df['merged'] = df.apply( lambda x : x.title + " " + x.selftext , axis = 1)
#df.set_index('id')

In [3]:
df['tokenized_title'] = df.title.apply(tokenize)
df['tokenized_selftext'] = df.selftext.apply(tokenize)
df['tokenized_merged'] = df.merged.apply(tokenize)
#model = train_w2v(df['tokenized_title'].append(df['tokenized_selftext'])) 

In [4]:


corpus_df = pd.read_csv('learned_embedding.csv')

_RE_COMBINE_WHITESPACE = re.compile(r"\s+")

def fix_encoding(string):
    string = string.replace('\n', '')
    string = _RE_COMBINE_WHITESPACE.sub(" ", string).strip()
    string = string.replace(" ", ",")
    string = string.replace("[,", "[")
    return eval(string)
model_dict = dict(corpus_df.set_index('0')['vector'].apply(lambda x : fix_encoding(x)))

In [136]:
#corpus_df = pd.DataFrame(model.wv.key_to_index.keys())
#corpus_df['vector'] = corpus_df[0].apply(lambda x : model.wv[x])
#corpus_df.to_csv("learned_embedding.csv")

## Cluster authors based on their word vector point cloud distributions:

In [127]:
%%script false 
author_counts = df.author.value_counts()

frequent_poasters = list(author_counts [ author_counts > 5 ].index)
author_counts [ author_counts > 5 ]

cleaned = df[ df.author.isin(frequent_poasters)]

compounded = cleaned[['tokenized_title', 'author']].groupby("author").agg('sum')
compounded.drop(labels = ["None", "AutoModerator"])
clouded = cloudify(compounded, model)
clouds = clouded[[point_cloud]]
clouds

Couldn't find program: 'false'


In [44]:
%%script false 


distances = np.zeros( shape= (len(clouds), len(clouds)) )
# This is expensive, I don't wnat to redo it every time...
for i in range(len(clouds)):
    print(f"Processing column {i} out of {len(clouds)}")
    for j in range(len(clouds)):
        if i < j:
            d = ot_distance_regularized(clouds.iloc[i], clouds.iloc[j])
            distances[i,j] = d
            distances[j,i] = d
            
aff_matrix = np.exp( -1 * distances / distances.std())

sc = SpectralClustering(n_clusters = 8, affinity = 'precomputed')
labels = sc.fit_predict(aff_matrix)
clouds['clusters'] = labels
clouds.clusters.sort_values()

Couldn't find program: 'false'


It's hard to make sense of this because I don't know the users well enough to cluster them. 

If we cluster posts by title instead at least then the clusters can be evaluated by inspection.

## Cluster posts based on their word vector clouds.

How well does nearest neighbor classification do?
Can we build a Bayesian hierarchical model that takes into account any groups we find here?
(If we find any conceptually meaningful clusters, what else can we do with that information?)

In [154]:
num_rows = 100
cut_off = 20 # ONly take the first cut_off words...
df_slice = pd.DataFrame(df.loc[:num_rows, :])
post_clouded = cloudify(df_slice, model_dict, cut_off = cut_off)
cloud_col = ['selftext_point_cloud','title_point_cloud', 'merged_point_cloud'][2]

post_clouded = pd.DataFrame(post_clouded [ post_clouded[cloud_col].apply(lambda x : len(x) > 0)])



In [155]:
distances = np.zeros( shape= (len(post_clouded), len(post_clouded)) )
distances_decay = np.zeros( shape= (len(post_clouded), len(post_clouded)) )

k = int(len(post_clouded) / 10)

for i in range(len(post_clouded)):
    if i % k == 0:
        print(f"Processing column {i} out of {len(post_clouded)}")
    for j in range(len(post_clouded)):
        if i < j:
            d = ot_distance(post_clouded.iloc[i][cloud_col], post_clouded.iloc[j][cloud_col], distribution = "uniform")
            distances[i,j] = d
            distances[j,i] = d
            d = ot_distance(post_clouded.iloc[i][cloud_col], post_clouded.iloc[j][cloud_col], distribution = "decay")
            distances_decay[i,j] = d
            distances_decay[j,i] = d

Processing column 0 out of 74
Processing column 7 out of 74
Processing column 14 out of 74
Processing column 21 out of 74
Processing column 28 out of 74
Processing column 35 out of 74
Processing column 42 out of 74
Processing column 49 out of 74
Processing column 56 out of 74
Processing column 63 out of 74
Processing column 70 out of 74


In [156]:
distance_matrix = [ distances, distances_decay][0]

aff_matrix = np.exp( -1 * distance_matrix / distance_matrix.std())

sc = SpectralClustering(n_clusters = int(len(post_clouded)/5), affinity = 'precomputed')
labels = sc.fit_predict(aff_matrix)
post_clouded['clusters'] = labels

In [160]:

pd.options.display.max_colwidth = 80
pd.options.display.max_rows = num_rows
#post_clouded['selftext_trunc'] = post_clouded.selftext.apply( lambda x : x[:cut_off])
#post_clouded[['id', 'ups', 'title', 'selftext_trunc', 'clusters']].sort_values(by = 'clusters')
post_clouded[['id', 'ups', 'title', 'selftext',  'clusters']].sort_values(by = 'clusters')

Unnamed: 0,id,ups,title,selftext,clusters
100,eifobk,0,Hot inside tip AMD crashing,[removed]\n,0
64,eijzxq,1,What do you think about CD Projekt?,[removed]\n,0
21,eio958,3,Is Robinhood a good platform for buying small amounts of stock for kids,[deleted]\n,0
61,eik56h,1,What do you think about CD Projekt?,[removed]\n,0
30,einvbk,44,Advanced portfolio funding strategy,[deleted]\n,0
31,einsgv,5,Teach me your ways - Shorting Dry Stock Vessels,"I'm pretty new to investing, so obviously my only experience is 2-3 weeks on...",0
60,eikbad,1,What do you think about CD Projekt?,[removed]\n,0
67,eijht4,1,A WSB lurker's 2019 results; primarily via position trading beaten down biot...,[deleted]\n,0
34,einjt5,3,Index funds don’t run same risk of assignment as stocks?,Warning: actual useful information \nStudying up on iron butterflies and it ...,0
36,einfso,4,Phase 1 trade deal charity bet,"Tards:\nA few days ago, in response to news that the Phase1 trade deal is sc...",0


### There are too many posts to cluster the entire dataframe. Instead, we can first group posts by time.



In [168]:
num_rows = 1000
cut_off = 30
df_slice = pd.DataFrame(df.loc[:num_rows, :])

post_clouded = cloudify(df_slice, model_dict, cut_off = cut_off)
cloud_col = ['selftext_point_cloud','title_point_cloud', 'merged_point_cloud'][2]

post_clouded = pd.DataFrame(post_clouded [ post_clouded[cloud_col].apply(lambda x : len(x) > 0)])


post_clouded= post_clouded.set_index('id')

In [169]:
post_graph = nx.Graph()
for i in range(len(df)):
    if df.iloc[i].id in post_clouded.index:
        post_graph.add_node(df.iloc[i].id)
        for j in range(-200,200): # should replace by posting time?
            if j >= 0 and j < len(df) and i != j:
                if df.iloc[j].id in post_clouded.index:
                    post_graph.add_edge( df.iloc[i].id, df.iloc[j].id)


In [170]:
possible_reposts = [] # these might not be resposts -- we removed stopwords and emojis.

for edge in post_graph.edges():
    distance = ot_distance(post_clouded.loc[edge[0]][cloud_col], post_clouded.loc[edge[1]][cloud_col])
    post_graph.edges[edge]["distance"] = distance
    if distance == 0:
        print("Repost:", edge)
        post_graph.edges[edge]["affinity"] = 99999999999999999
        possible_reposts.append(edge)
    else:
        post_graph.edges[edge]["affinity"] = 1/distance

In [171]:

node_list = list(post_graph.nodes())
aff_matrix = nx.adj_matrix(post_graph, nodelist = node_list)
sc = SpectralClustering(n_clusters = int(len(post_clouded)/5), affinity = 'precomputed')
labels = sc.fit_predict(aff_matrix)


In [172]:

post_df = pd.DataFrame()
post_df['id'] = node_list
post_df['title'] = [post_clouded.loc[post_id].title for post_id in node_list]
post_df['label'] = labels

In [174]:
pd.options.display.max_colwidth = 80
pd.options.display.max_rows = num_rows
post_df.sort_values(by = 'label')

Unnamed: 0,id,title,label
0,eipxnr,Good time to get on $BLUE,0
450,ejiop9,Presented without comment,0
451,ejilpz,Best way to close options sold,0
452,ejijn6,War go boom boom where will we live?,0
453,ejig8u,You'd think Investopedia would a bit intelligent,0
454,ejig5s,Best way to close options sold,0
455,ejidtq,Which of you degenerates did I find on Super Mario Maker 2?,0
457,ejicyv,"Bully ,Jezz chill , Bear gotta eat two",0
458,ejibiy,Any good weed ETFs?,0
459,eji9n6,Nice,0


Well that didn't work at all!

In [None]:
epsilon = 0
for e in post_graph.edges():
    if post_graph.edges[e]["distance"] <= epsilon:
        print(e)

In [121]:
e

('eibdob', 'ejpe44')

In [122]:
 post_graph.edges[e]["distance"]

0.5158603238983451

We could also try using various smooth functions on the graph just constructed as regression features.

## Clustering with BERT



In [7]:

import torch
import transformers as ppb
import warnings
import tqdm
warnings.filterwarnings('ignore')
# For DistilBERT:

col = 'title'
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

## Want BERT instead of distilBERT? Uncomment the following line:
#model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

batched_features = []

def split_dataframe(df, chunk_size = 1000): 
    # https://stackoverflow.com/questions/17315737/split-a-large-pandas-dataframe
    chunks = list()
    num_chunks = len(df) // chunk_size + (1 if len(df) % chunk_size else 0)
    for i in range(num_chunks):
        chunks.append(df[i*chunk_size:(i+1)*chunk_size])
    return chunks

cut_off = len(df) #200
for batch in tqdm.tqdm(split_dataframe(df[:cut_off])):
    # have to restrict into a smaller batch because my computer doesn't have enough memory...
    
    tokenized = batch[col].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))


    max_len = 0
    for i in tokenized.values:
        if len(i) > max_len:
            max_len = len(i)

    padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

    attention_mask = np.where(padded != 0, 1, 0)
    attention_mask.shape


    input_ids = torch.tensor(padded).type(torch.LongTensor) # have to cast to Longs
    attention_mask = torch.tensor(attention_mask).type(torch.LongTensor)

    with torch.no_grad():
        last_hidden_states = model(input_ids, attention_mask=attention_mask)

    features = last_hidden_states[0][:,0,:].numpy()
    batched_features.append(features)
    
features = np.vstack(batched_features)
labels = df[:cut_off].ups

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 4/4 [04:19<00:00, 64.88s/it]


In [11]:
df_partial = pd.DataFrame(df[:cut_off])
df_partial['bert_vector'] = list(features)

In [13]:
sc = SpectralClustering(n_clusters = int(len(df_partial)/5))
labels = sc.fit_predict(features)
df_partial['clusters'] = labels



In [1]:
pd.options.display.max_colwidth = 300
pd.options.display.max_rows = cut_off
df_partial[['title', 'clusters']].sort_values(by = 'clusters')

NameError: name 'pd' is not defined

Restricting to clustering popular posts only.
Q: Can we model culture change as new clusters among popular posts.