In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
# English stopwords from nltk, de-duplicated through a set.
# Note: this rebinds `stopwords` from the nltk corpus reader to a plain list.
stopwords = list(set(stopwords.words('english')))

# Helper Functions

In [2]:
def preprocess(s, lower=True, strip_punc=True):
    '''
    Tokenize a text and optionally normalise its tokens.

    Input:
        s: String (split on whitespace) or an already tokenized list of Strings
        lower(Bool): lowercase every token
        strip_punc(Bool): strip leading/trailing punctuation from every token
            and drop tokens that consist of punctuation only
    Output: List of Strings
    '''
    punc = '.-,?<>:;"\'!%'
    # only raw strings need tokenizing; token lists are used as given
    tokens = s.split() if isinstance(s, str) else s
    if lower:
        tokens = [t.lower() for t in tokens]
    if strip_punc:
        stripped = (t.strip(punc) for t in tokens)
        # a token such as '-' or '...' strips down to '', which is not a word
        tokens = [t for t in stripped if t]

    return tokens

def token_frequency(tokens, tf=None, relative=False):
    """
    Count token occurrences, optionally continuing from an existing count dict.

    Inputs: 
        tokens = List of Strings or None (None is treated as "no tokens")
        tf = dict of earlier counts to extend, or None to start from scratch;
             a dict passed in is updated in place
        relative = Boolean; if True, return each count divided by the total count
    Output: 
        Dictionary of token frequencies
    """
    tf = {} if tf is None else tf
    tokens = [] if tokens is None else tokens

    # If tf already holds relative frequencies (first value is a float), raw
    # counts cannot be added meaningfully: warn and hand tf back untouched.
    if tf and relative:
        if isinstance(next(iter(tf.values())), float):
            print('WARNING: Adding raw counts to relative frequency')
            return tf

    for token in tokens:
        tf[token] = tf.get(token, 0) + 1

    if relative:
        total = sum(tf.values())
        # skip normalisation when there is nothing to divide by
        if total:
            tf = {k: v / total for k, v in tf.items()}

    return tf

# Read in and Tokenize News Headlines

In [3]:
# load the CNBC articles, parsing publish_date as datetimes
df = pd.read_csv('data/cnbc_news.csv', parse_dates=['publish_date'])
# tz_convert(None) converts to UTC and removes the timezone information
df['publish_date'] = df['publish_date'].dt.tz_convert(None)

# discard rows with any null value, then exact duplicate rows
# NOTE(review): the output shows an "Unnamed: 0" column; if it holds a unique
# row id, drop_duplicates() will never remove anything - confirm
df = df.dropna().drop_duplicates()

# tokenize every headline (lowercased, punctuation stripped by default)
df['headline'] = df['headline'].apply(preprocess)
df.tail()

Unnamed: 0,publish_date,headline
16910,2020-07-02,"[morgan, stanley, analyst, upgrades, avis, bud..."
16911,2020-07-02,"[tesla, stock, hits, record, high, and, smashe..."
16912,2020-07-02,"[fiat, chrysler, unveils, dodge, durango, hell..."
16913,2020-07-02,"[bar, owners, reckon, with, costly, stop, and,..."
16914,2020-07-02,"[texas, issues, statewide, order, requiring, f..."


# Word2Vec With Gensim

- https://radimrehurek.com/gensim/models/word2vec.html
- https://towardsdatascience.com/a-beginners-guide-to-word-embedding-with-gensim-word2vec-model-5970fa56cc92

In [9]:
from gensim.models import Word2Vec

In [7]:
# gensim expects a collection of tokenized documents (a list of token lists);
# each news headline is treated as one document
docs = df['headline'].to_numpy()
docs[:2]

array([list(['how', 'to', 'play', 'a', 'stock', 'market', 'that', 'may', 'be', 'stuck', 'in', 'place', 'for', 'a', 'while']),
       list(['stock', 'market', 'live', 'updates', 'stocks', 'give', 'up', 'gains', 'apple', 'to', 'reclose', 'some', 'stores', 'spotify', 'pops'])],
      dtype=object)

In [41]:
# Train a word2vec model on the tokenized headlines with gensim.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4.0+ calls it
# `vector_size` - confirm against the installed version.
model = Word2Vec(
    docs,
    size=50,      # number of components in each word vector (default is 100)
    window=3,     # max distance between a target word and the words around it (default is 5)
    min_count=5,  # words occurring fewer times than this are ignored in training (default is 5)
    workers=3,    # number of worker threads used for training (default is 3)
    sg=1,         # training algorithm: CBOW (0) or skip-gram (1)
)
vecs = model.wv  # the trained word vectors

In [42]:
# embedding learned for the token 'bull' (50 components, matching size=50 in the model cell)
vecs['bull']

array([-0.46253225, -0.17376314, -0.5132673 , -0.08568393, -0.2092115 ,
        0.06243603,  0.16513984,  0.16365807, -0.2592522 ,  0.04411814,
       -0.05382431, -0.03621269, -0.33821273, -0.00457379,  0.05200712,
       -0.1989373 , -0.23323207, -0.06765874,  0.04428462,  0.06372458,
        0.04689382,  0.4443095 ,  0.01307581, -0.59132373,  0.16506147,
        0.18172164, -0.14536189,  0.14935048, -0.35808894,  0.12052608,
       -0.04363895,  0.10720734, -0.01859086, -0.09799474,  0.27447236,
        0.08243097, -0.07105912, -0.06738936, -0.11797523, -0.35667685,
        0.18708815,  0.02740279, -0.0369225 , -0.23976038,  0.28822368,
       -0.02022022, -0.54944307, -0.45074967,  0.26717478,  0.10301977],
      dtype=float32)

In [43]:
# cosine similarity between two word vectors: ranges from -1 to 1, where values
# near 1 mean very similar and values near 0 mean not similar at all
# NOTE(review): the high score for 'bull' vs 'crash' presumably reflects shared
# headline contexts rather than similar meaning - confirm before interpreting
vecs.similarity('bull','crash')

0.9426507

In [45]:
# words whose vectors are most similar to 'coronavirus', as (word, similarity) pairs
vecs.most_similar('coronavirus')

[('covid-19', 0.8384757041931152),
 ('virus', 0.8301467895507812),
 ('global', 0.7958473563194275),
 ('us', 0.7786386013031006),
 ('slow', 0.719185471534729),
 ('early', 0.7063236236572266),
 ('trial', 0.7009888291358948),
 ('lockdowns', 0.6995306611061096),
 ('canceled', 0.6964773535728455),
 ('antibody', 0.6947957277297974)]

In [None]:
# save a model
# model.save("word2vec.model")

# load model
# model = Word2Vec.load("word2vec.model")