In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from nltk.corpus import stopwords
# Rebind `stopwords` from the imported nltk corpus object to a plain list of unique English stopwords
stopwords = list(set(stopwords.words('english')))

# Helper Functions

In [5]:
def preprocess(s, lower=True, strip_punc=True):
    '''
    Tokenize a piece of text into a list of word tokens.

    Input:
        s: String (split on whitespace) or an already-tokenized iterable of Strings
        lower (Bool): lowercase every token
        strip_punc (Bool): strip leading/trailing punctuation from every token
    Output: List of Strings
    '''
    punc = '.-,?<>:;"\'!%'
    # tokenize raw text; pre-tokenized input is copied so the caller's object is never aliased
    tokens = s.split() if isinstance(s, str) else list(s)
    if lower:
        tokens = [t.lower() for t in tokens]
    if strip_punc:
        # strip() only trims the ends, so inner punctuation ("u.s", "200-point") is kept.
        # A token made up solely of punctuation (e.g. a standalone "-") strips down to ''
        # and is dropped so empty strings never reach downstream consumers.
        stripped = (t.strip(punc) for t in tokens)
        tokens = [t for t in stripped if t]

    return tokens

def token_frequency(tokens, tf= None, relative=False):
    """
    Count how often each token occurs, optionally continuing from an existing count table.

    Inputs: 
        tokens = List of Strings or None (None is treated as "no new tokens")
        tf = dict or None; an existing token -> count table to keep accumulating into.
             When given, it is updated in place with the new counts.
        relative = Boolean; if True, return a NEW dict of count / total instead of raw counts
    Output: 
        Dictionary of a token frequencies
    """
    tf = {} if tf is None else tf
    tokens = [] if tokens is None else tokens

    if tf and relative:
        # Only the first stored value is inspected: a float there is taken to mean the table
        # already holds relative frequencies, so raw counts must not be added to it.
        if isinstance(next(iter(tf.values())), float):
            print('WARNING: Adding raw counts to relative frequency')
            return tf

    for token in tokens:
        tf[token] = tf.get(token, 0) + 1

    if relative:
        total = sum(tf.values())
        # an empty table yields an empty dict here, so no division by zero for empty input
        tf = {k: v / total for k, v in tf.items()}

    return tf

# Read in and Tokenize News Headlines

In [7]:
# Load the CNBC articles and clean them in a single chain:
#   1. parse publish_date while reading, then convert it to UTC
#   2. discard rows containing any null, then exact duplicate rows
#   3. replace each headline with its token list
df = (
    pd.read_csv('data/cnbc_news.csv', parse_dates=['publish_date'])
    .assign(publish_date=lambda frame: frame['publish_date'].dt.tz_convert('UTC'))
    .dropna()
    .drop_duplicates()
    .assign(headline=lambda frame: frame['headline'].apply(preprocess))
)
df.tail()

Unnamed: 0,publish_date,headline
53377,2020-12-28 00:00:00+00:00,"[japan's, nikkei, 225, surges, to, levels, not..."
53378,2020-12-28 00:00:00+00:00,"[u.s, might, be, missing, the, new, covid, var..."
53496,2020-12-27 00:00:00+00:00,"[dow, futures, indicate, 200-point, gain, as, ..."
53501,2020-12-27 00:00:00+00:00,"[dow, jumps, 300, points, to, record, high, as..."
53504,2020-12-27 00:00:00+00:00,"[alibaba, shares, plunge, about, 8, for, secon..."


# Word2Vec With Gensim

Resources: 
- https://radimrehurek.com/gensim/models/word2vec.html
- https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92

In [9]:
from gensim.models import Word2Vec

In [10]:
# Gensim expects a corpus shaped as a sequence of tokenized documents (a list of token lists).
# Each news headline is treated as one document.
docs = df['headline'].to_numpy()
docs[:2]

array([list(['lobbyist', 'brother', 'of', 'biden', 'advisor', 'has', 'reputation', 'for', 'deep', 'connections', 'and', 'looking', 'to', 'avoid', 'possible', 'conflicts']),
       list(['how', 'to', 'navigate', 'the', 'world', 'of', 'sustainable', 'investing', 'ratings'])],
      dtype=object)

In [93]:
# Train a word2vec model on the tokenized headlines with gensim.
model = Word2Vec(
    sentences=docs,
    size=16,       # dimensionality of each word vector (gensim default: 100)
    window=3,      # max distance between the target word and its context words (default: 5)
    min_count=10,  # words occurring fewer times than this are ignored (default: 5)
    workers=3,     # number of worker threads used for training (default: 3)
    sg=1,          # training algorithm: 0 = CBOW, 1 = skip-gram
)

# keep a handle on the trained word vectors for the lookups below
vecs = model.wv

In [94]:
# look up the learned embedding for one token (one float per dimension requested via `size`)
vecs['bull']

array([ 0.13671018,  0.49693862,  0.1728348 ,  0.19943671, -0.21791488,
        1.3944778 , -0.33834928, -1.0886257 ,  0.2216287 ,  0.36374584,
        0.21042699, -0.41904923,  0.1970994 , -0.75551695,  0.3579834 ,
       -0.31583467], dtype=float32)

In [95]:
# cosine similarity ranges from -1 to 1: values near 1 mean the two vectors point the same way,
# values near 0 mean they are unrelated, and negative values mean they point in opposite directions
vecs.similarity('biden','trump')

0.90213865

In [96]:
# nearest neighbours of 'sell': returns (word, similarity) pairs, most similar first
vecs.most_similar('sell')

[('buy', 0.9362697601318359),
 ('like', 0.9204457998275757),
 ('bet', 0.9000283479690552),
 ('upgrade', 0.8943588733673096),
 ('bought', 0.8809952735900879),
 ('rivals', 0.8772242069244385),
 ('selling', 0.8758612871170044),
 ('risky', 0.8747047185897827),
 ('expensive', 0.8741658926010132),
 ('bonds', 0.8724066019058228)]

In [52]:
# save a model
# model.save("word2vec.model")

# load model
# model = Word2Vec.load("word2vec.model")

# Write to TSV file

In [101]:
# Export the embeddings as two parallel TSV files:
#   vecs.tsv  - one row per word, the vector components separated by tabs (no header)
#   words.tsv - a "word" header line followed by one word per row
# NOTE(review): words.tsv therefore has one more line than vecs.tsv; confirm that whatever
# consumes these files expects the header.
# NOTE(review): `model.wv.vocab` is the gensim 3.x vocabulary API, matching the `size=` keyword
# used when the model was trained; revisit both if gensim is upgraded.

# Snapshot the vocabulary once so both files are guaranteed to list words in the same order.
words = list(model.wv.vocab.keys())

# Explicit UTF-8 so non-ASCII tokens are written the same way on every platform
# instead of depending on the locale's default encoding.
with open('vecs.tsv', 'w', encoding='utf-8') as vecs_tsv, \
        open('words.tsv', 'w', encoding='utf-8') as words_tsv:
    words_tsv.write("word" + '\n')

    for word in words:
        vector = [str(n) for n in model.wv[word]]
        vecs_tsv.write('\t'.join(vector) + '\n')
        words_tsv.write(word + '\n')