In [None]:
# https://medium.com/@black_swan/how-to-train-word2vec-and-fasttext-embedding-on-wikipedia-corpus-9e8ac45a0c0a
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html

In [1]:
import pandas as pd
import praw
import re
import nltk

In [2]:
df = pd.read_csv("../Data/wsb_cleaned.csv")

In [3]:
regex = re.compile('[^a-zA-Z ]')
for col in ['title', 'author', 'selftext']:
    df[col] = df[col].apply(lambda x : regex.sub('', str(x) ))

In [4]:
df['title']

0                                   Good time to get on BLUE
1          i wanna buy a call but i dont want to bet too ...
2                                               Buy INTU  DD
3          If you thought your  was bad at least you aren...
4                                            I will remember
                                 ...                        
1260232                        ASO technical breakout and DD
1260233                                   First YOLO on CRSR
1260234                                      Rkt to the moon
1260235    My therapist told me Im delusional for thinkin...
1260236    CCIV  Lucid motors testing their FSD in Fremon...
Name: title, Length: 1260237, dtype: object

In [6]:
#nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lnajt\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [8]:
import gensim.models


text = df['title']
#sentences = [nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text)]
sentences = [x.split(' ') for x in text]
model = gensim.models.Word2Vec(sentences = sentences,  min_count=10, vector_size=20, epochs = 1)

In [9]:
for index, word in enumerate(model.wv.index_to_key):
    if index == 200:
        break
    print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

word #0/29759 is 
word #1/29759 is the
word #2/29759 is to
word #3/29759 is I
word #4/29759 is a
word #5/29759 is and
word #6/29759 is GME
word #7/29759 is is
word #8/29759 is on
word #9/29759 is of
word #10/29759 is in
word #11/29759 is for
word #12/29759 is this
word #13/29759 is you
word #14/29759 is my
word #15/29759 is it
word #16/29759 is AMC
word #17/29759 is are
word #18/29759 is with
word #19/29759 is THE
word #20/29759 is buy
word #21/29759 is all
word #22/29759 is at
word #23/29759 is we
word #24/29759 is up
word #25/29759 is Im
word #26/29759 is The
word #27/29759 is me
word #28/29759 is be
word #29/29759 is but
word #30/29759 is that
word #31/29759 is Robinhood
word #32/29759 is not
word #33/29759 is stock
word #34/29759 is from
word #35/29759 is have
word #36/29759 is do
word #37/29759 is TO
word #38/29759 is like
word #39/29759 is just
word #40/29759 is now
word #41/29759 is will
word #42/29759 is about
word #43/29759 is can
word #44/29759 is What
word #45/29759 is HOLD


In [12]:
for word, sim in model.wv.most_similar(positive=['GME'], negative=[]):
    print('\"%s\"\t- similarity: %g' % (word, sim))
    print('')

"AMC"	- similarity: 0.915942

"shares"	- similarity: 0.835391

"gme"	- similarity: 0.802144

"amc"	- similarity: 0.801782

"dip"	- similarity: 0.778836

"everything"	- similarity: 0.759182

"puts"	- similarity: 0.756371

"calls"	- similarity: 0.753323

"today"	- similarity: 0.750671

"yesterday"	- similarity: 0.749364



In [21]:
for word, sim in model.wv.most_similar(positive=['tendies']):
    print('\"%s\"\t- similarity: %g' % (word, sim))
    print('')

"gains"	- similarity: 0.876917

"profits"	- similarity: 0.868857

"losses"	- similarity: 0.822006

"hands"	- similarity: 0.819225

"money"	- similarity: 0.808949

"stonks"	- similarity: 0.793267

"ass"	- similarity: 0.792297

"cause"	- similarity: 0.789839

"friends"	- similarity: 0.785216

"rich"	- similarity: 0.783376



In [9]:
## Saving and loading

import tempfile

with tempfile.NamedTemporaryFile(prefix='gensim-model-', delete=False) as tmp:
    temporary_filepath = tmp.name
    model.save(temporary_filepath)
    #
    # The model is now safely stored in the filepath.
    # You can copy it to other machines, share it with others, etc.
    #
    # To load a saved model:
    #
    new_model = gensim.models.Word2Vec.load(temporary_filepath)

In [13]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    num_dimensions = 2  # final num dimensions (2D, 3D, etc)

    # extract the words & their vectors, as numpy arrays
    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # reduce using t-SNE
    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    x_vals = [v[0] for v in vectors]
    y_vals = [v[1] for v in vectors]
    return x_vals, y_vals, labels


x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    trace = go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)
    data = [trace]

    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(data, filename='word-embedding-plot')
    else:
        plot(data, filename='word-embedding-plot.html')


def plot_with_matplotlib(x_vals, y_vals, labels):
    import matplotlib.pyplot as plt
    import random

    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled 25 data points
    #
    indices = list(range(len(labels)))
    selected_indices = random.sample(indices, 25)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

try:
    get_ipython()
except Exception:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)


KeyboardInterrupt: 