# NLP Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
!spacy download 'en_core_web_md'

import time
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument


[38;5;1m✘ No compatible model found for 'en_core_web_md ' (spaCy v2.3.2).[0m



In [None]:
# Read both cleaned tweet exports as UTF-8 and cast every column to str.
cleaned_tweets, cleaned_raw_tweets = (
    pd.read_csv(csv_path, encoding = 'utf-8').astype(str)
    for csv_path in ('./data/cleaned_tweets.csv', './data/cleaned_raw_tweets.csv')
)

In [None]:
# Sanity check: importing the saved cleaned tweet files used to raise an error because some
# `text` values were read back as float. Count the Python type of every value and display
# any rows whose text is still a float.
types = pd.Series(list(map(type, cleaned_raw_tweets.text)))
print(types.value_counts())
cleaned_raw_tweets.loc[types.index[(types == float).values].to_list()]

<class 'str'>    48368
dtype: int64


Unnamed: 0,date,time,user_name,screen_name,text,user_description,retweeted,geo,location,source


## Tokenize Text

In [None]:
# Request GPU execution from spaCy, then load the medium English pipeline used for all parsing below.
spacy.prefer_gpu()
nlp = spacy.load(name = 'en_core_web_md')

In [None]:
start_time = time.time()

# Run every tweet through the spaCy pipeline; `docs` keeps the full parsed Doc objects.
docs = list(nlp.pipe(cleaned_tweets.text))

# For each Doc keep only the tokens that are alphabetic or flagged as stop words.
cleaned_docs = [
    [tok for tok in parsed if tok.is_alpha or tok.is_stop]
    for parsed in docs
]


print('---- Total time to execute: %s seconds ----' %(time.time() - start_time))

---- Total time to execute: 114.63301873207092 seconds ----


In [None]:
# Number of documents that have no tokens left after filtering.
sum(tokens == [] for tokens in cleaned_docs)

1

In [None]:
# Positions of the documents that have no tokens left after filtering.
# The loop walks cleaned_docs itself instead of range(len(docs)): once the blank
# documents are removed from cleaned_docs the two lists differ in length, and
# indexing cleaned_docs with positions taken from docs raised IndexError on re-run.
blanks = [i for i, tokens in enumerate(cleaned_docs) if tokens == []]

print(blanks, '\n')

print('Number of total cleaned docs: %i' %len(cleaned_docs))
print('Number of blank docs: %i' %len(blanks))
print('Number of total non-blank cleaned docs: %i' %(len(cleaned_docs) - len(blanks)))


[26841] 

Number of total cleaned docs: 41366
Number of blank docs: 1
Number of total non-blank cleaned docs: 41365


In [None]:
# Discard the documents with no tokens, then display how many documents remain.
cleaned_docs = [tokens for tokens in cleaned_docs if len(tokens) > 0]
len(cleaned_docs)

41365

In [None]:
# Confirm that no empty documents are left.
[tokens == [] for tokens in cleaned_docs].count(True)

0

## Doc2Vec

In [None]:
start_time = time.time()

# Rebuild every document as a space-joined string of lemmas, skipping stop words.
# Tokens whose lemma is the "-PRON-" placeholder contribute their original text instead.
lemma_docs = [
    ' '.join(
        tok.text if tok.lemma_ == "-PRON-" else tok.lemma_
        for tok in tokens
        if not tok.is_stop
    )
    for tokens in cleaned_docs
]

# Parse the lemma strings again so that each entry is a spaCy Doc.
lemma_docs = list(nlp.pipe(lemma_docs))

print('---- Total time to execute: %s seconds ----' %(time.time() - start_time))

---- Total time to execute: 57.562408685684204 seconds ----


In [None]:
# Hold out 5000 documents for validation (fixed random_state), then wrap each training
# document as a TaggedDocument whose single tag is 'train_<position in train_set>'.
train_set, val_set = train_test_split(lemma_docs, test_size = 5000, random_state = 42)
tagged_train = [
    TaggedDocument(words = [word.text for word in doc], tags = ['train_%s' % idx])
    for idx, doc in enumerate(train_set)
]


In [None]:
import pickle

# Persist the train/validation splits with pickle.
# Note: the files carry a .csv extension but contain pickled Python objects.
for dataset, target_path in (
    (train_set, './data/lemma_text_train_set.csv'),
    (val_set, './data/lemma_text_val_set.csv'),
):
    with open(target_path, 'wb') as output:
        pickle.dump(dataset, output)

In [None]:
# Reload the pickled train/validation splits (binary pickle data despite the .csv extension).
# pickle.load can execute arbitrary code, so only load files produced by this notebook.
with open('./data/lemma_text_train_set.csv', mode = 'rb') as train_file:
    train_set = pickle.load(train_file)

with open('./data/lemma_text_val_set.csv', mode = 'rb') as val_file:
    val_set = pickle.load(val_file)

In [None]:
import multiprocessing
# CPU count reported by multiprocessing; the commented-out Doc2Vec training calls pass it as `workers`.
# NOTE(review): the trailing "19937" is unexplained — TODO confirm what it refers to.
cores = multiprocessing.cpu_count()  # 19937

# start_time = time.time()

# dm_model = Doc2Vec(dm = 1, vector_size = 50, min_count = 3, epochs = 30, workers = cores, seed = 42, alpha = 0.05, min_alpha = 0.001)
# dm_model.build_vocab(tagged_train)
# dm_model.train(tagged_train, total_examples = dm_model.corpus_count, epochs = dm_model.epochs)
# dm_model.save('./models/nlp models/dm_model')

# print('---- Total time to execute: %s seconds ----' %(time.time() - start_time))

In [None]:
# start_time = time.time()

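# NOTE(review): this constructor call is identical to the dm_model one (dm = 1, no dm_mean argument);
# confirm which settings the saved dm_mean_model was actually trained with.
# dm_mean_model = Doc2Vec(dm = 1, vector_size = 50, min_count = 3, epochs = 30, workers = cores, seed = 42, alpha = 0.05, min_alpha = 0.001)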
# dm_mean_model.build_vocab(tagged_train)
# dm_mean_model.train(tagged_train, total_examples = dm_mean_model.corpus_count, epochs = dm_mean_model.epochs)
# dm_mean_model.save('./models/nlp models/dm_mean_model')

# print('---- Total time to execute: %s seconds ----' %(time.time() - start_time))

In [None]:
# start_time = time.time()

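# NOTE(review): this constructor call is identical to the dm_model one (dm = 1, no dm_concat argument);
# confirm which settings the saved dm_concat_model was actually trained with.
# dm_concat_model = Doc2Vec(dm = 1, vector_size = 50, min_count = 3, epochs = 30, workers = cores, seed = 42, alpha = 0.05, min_alpha = 0.001)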
# dm_concat_model.build_vocab(tagged_train)
# dm_concat_model.train(tagged_train, total_examples = dm_concat_model.corpus_count, epochs = dm_concat_model.epochs)
# dm_concat_model.save('./models/nlp models/dm_concat_model')

# print('---- Total time to execute: %s seconds ----' %(time.time() - start_time))

In [None]:
# start_time = time.time()

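# NOTE(review): dm = 1 selects distributed memory (PV-DM) in gensim; distributed bag of words needs dm = 0.
# As written this is the same configuration as dm_model — confirm how the saved dbow_model was trained.
# dbow_model = Doc2Vec(dm = 1, vector_size = 50, min_count = 3, epochs = 30, workers = cores, seed = 42, alpha = 0.05, min_alpha = 0.001)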
# dbow_model.build_vocab(tagged_train)
# dbow_model.train(tagged_train, total_examples = dbow_model.corpus_count, epochs = dbow_model.epochs)
# dbow_model.save('./models/nlp models/dbow_model')

# print('---- Total time to execute: %s seconds ----' %(time.time() - start_time))

In [None]:
# Load the four previously trained and saved Doc2Vec models from disk.
dm_model, dm_mean_model, dm_concat_model, dbow_model = [
    Doc2Vec.load(model_path)
    for model_path in (
        './models/nlp models/dm_model',
        './models/nlp models/dm_mean_model',
        './models/nlp models/dm_concat_model',
        './models/nlp models/dbow_model',
    )
]

In [None]:
# Ten nearest words to 'economy' in the dm_model word-vector space.
dm_model.wv.most_similar(positive = ['economy'], topn = 10)
# dm_model.infer_vector([token.text for token in val_set[0]])   # to "predict" the paragraph vector for a new tweet

[('economic', 0.7207037210464478),
 ('recession', 0.6659744381904602),
 ('modest', 0.5987817049026489),
 ('market', 0.5922441482543945),
 ('coronavirus', 0.5887203216552734),
 ('sector', 0.5802310705184937),
 ('outbreak', 0.5655859112739563),
 ('pandemic', 0.5629265904426575),
 ('manufacturing', 0.5598702430725098),
 ('factory', 0.5186828970909119)]

In [None]:
# Ten nearest words to 'virus' in the dm_model word-vector space.
dm_model.wv.most_similar(positive = ['virus'], topn = 10)

[('coronavirus', 0.8227459192276001),
 ('covid', 0.6431212425231934),
 ('viral', 0.5927944779396057),
 ('region', 0.5651749968528748),
 ('illness', 0.5579205751419067),
 ('novel', 0.5516122579574585),
 ('deadly', 0.5470724105834961),
 ('china', 0.5455681085586548),
 ('outbreak', 0.5437846183776855),
 ('dependence', 0.5412525534629822)]

## Prepare Word Vectors

In [None]:
def get_vectors(model, corpus, test_set = False):
    """Return one document vector per entry of `corpus` as a DataFrame.

    Parameters
    ----------
    model : object
        Must expose `docvecs` (indexable by integer position) and
        `infer_vector(words)`; the notebook passes gensim Doc2Vec models.
    corpus : iterable
        The documents. When `test_set` is truthy each document must be an
        iterable of tokens with a `.text` attribute; otherwise only the
        number of documents is used.
    test_set : bool, default False
        If truthy, a vector is inferred for every document from its token
        texts. If falsy, row i is `model.docvecs[i]`.
        NOTE(review): the lookup is purely positional, so it assumes `corpus`
        is in the same order as the documents the model was trained on —
        confirm against the training cell.

    Returns
    -------
    pandas.DataFrame
        One row per document, one column per vector component.
    """
    if test_set:
        # Unseen documents: infer a vector from each document's token texts.
        vecs = np.array([model.infer_vector([token.text for token in doc]) for doc in corpus])
    else:
        # Training documents: read the stored vectors by position.
        vecs = np.asarray([model.docvecs[i] for i, _ in enumerate(corpus)])
    return pd.DataFrame(vecs)

In [None]:
# dm_model: stored vectors for the training split, inferred vectors for the validation split.
dm_train_vecs = get_vectors(model = dm_model, corpus = train_set)
dm_val_vecs = get_vectors(model = dm_model, corpus = val_set, test_set = True)
# dm_train_vecs.to_csv('./data/dm_train_vecs.csv', index = False, header = False)
# dm_val_vecs.to_csv('./data/dm_val_vecs.csv', index = False, header = False)
print(
    'The dimensions of the dm_model training set are:', dm_train_vecs.shape,
    '\nThe dimensions of the dm_model validation set are:', dm_val_vecs.shape,
)


The dimensions of the dm_model training set are: (36365, 50) 
The dimensions of the dm_model validation set are: (5000, 50)


In [None]:
# dm_mean_model: stored vectors for the training split, inferred vectors for the validation split.
dm_mean_train_vecs = get_vectors(model = dm_mean_model, corpus = train_set)
dm_mean_val_vecs = get_vectors(model = dm_mean_model, corpus = val_set, test_set = True)
# dm_mean_train_vecs.to_csv('./data/dm_mean_train_vecs.csv', index = False, header = False)
# dm_mean_val_vecs.to_csv('./data/dm_mean_val_vecs.csv', index = False, header = False)
print(
    'The dimensions of the dm_mean_model training set are:', dm_mean_train_vecs.shape,
    '\nThe dimensions of the dm_mean_model validation set are:', dm_mean_val_vecs.shape,
)


The dimensions of the dm_mean_model training set are: (36365, 50) 
The dimensions of the dm_mean_model validation set are: (5000, 50)


In [None]:
# dm_concat_model: stored vectors for the training split, inferred vectors for the validation split.
dm_concat_train_vecs = get_vectors(model = dm_concat_model, corpus = train_set)
dm_concat_val_vecs = get_vectors(model = dm_concat_model, corpus = val_set, test_set = True)
# dm_concat_train_vecs.to_csv('./data/dm_concat_train_vecs.csv', index = False, header = False)
# dm_concat_val_vecs.to_csv('./data/dm_concat_val_vecs.csv', index = False, header = False)
print(
    'The dimensions of the dm_concat_model training set are:', dm_concat_train_vecs.shape,
    '\nThe dimensions of the dm_concat_model validation set are:', dm_concat_val_vecs.shape,
)


The dimensions of the dm_concat_model training set are: (36365, 50) 
The dimensions of the dm_concat_model validation set are: (5000, 50)


In [None]:
# dbow_model: stored vectors for the training split, inferred vectors for the validation split.
dbow_train_vecs = get_vectors(model = dbow_model, corpus = train_set)
dbow_val_vecs = get_vectors(model = dbow_model, corpus = val_set, test_set = True)
# dbow_train_vecs.to_csv('./data/dbow_train_vecs.csv', index = False, header = False)
# dbow_val_vecs.to_csv('./data/dbow_val_vecs.csv', index = False, header = False)
print(
    'The dimensions of the dbow_model training set are:', dbow_train_vecs.shape,
    '\nThe dimensions of the dbow_model validation set are:', dbow_val_vecs.shape,
)


The dimensions of the dbow_model training set are: (36365, 50) 
The dimensions of the dbow_model validation set are: (5000, 50)


## TF-IDF

In [None]:
def get_sparse_vecs():
    """Placeholder that is not implemented yet; takes no arguments and returns None."""
    return None

In [None]:
import pickle

# Restore the pickled train/validation splits for the TF-IDF section
# (binary pickle data despite the .csv extension; only load trusted files).
with open('./data/lemma_text_train_set.csv', 'rb') as train_fp:
    train_set = pickle.loads(train_fp.read())

with open('./data/lemma_text_val_set.csv', 'rb') as val_fp:
    val_set = pickle.loads(val_fp.read())

In [None]:
# Turn every document into a single string of its token texts joined by spaces,
# giving one plain string per document for each split.
train_text = [' '.join(word.text for word in doc) for doc in train_set]


val_text = [' '.join(word.text for word in doc) for doc in val_set]

#### Uni-Gram Model

In [None]:
# Fit a TF-IDF vectorizer (n-gram range (1, 1), at most 500 features) on the training text,
# then apply the fitted vocabulary to the validation text. Both results are stored as dense DataFrames.
tfidf_ugram = TfidfVectorizer(max_features = 500, ngram_range = (1, 1))
tf_uni_train_vecs = pd.DataFrame(tfidf_ugram.fit_transform(train_text).toarray())

tf_uni_val_vecs = pd.DataFrame(tfidf_ugram.transform(val_text).toarray())

In [None]:
# Write the (1, 1) n-gram TF-IDF matrices to CSV without index or header rows.
for frame, csv_path in (
    (tf_uni_train_vecs, './data/tf_uni_train_vecs.csv'),
    (tf_uni_val_vecs, './data/tf_uni_val_vecs.csv'),
):
    frame.to_csv(csv_path, index = False, header = False)

#### Bi-Gram Model

In [None]:
# Fit a TF-IDF vectorizer (n-gram range (1, 2), at most 500 features) on the training text,
# then apply the fitted vocabulary to the validation text. Both results are stored as dense DataFrames.
tfidf_bigram = TfidfVectorizer(max_features = 500, ngram_range = (1, 2))
tf_bi_train_vecs = pd.DataFrame(tfidf_bigram.fit_transform(train_text).toarray())

tf_bi_val_vecs = pd.DataFrame(tfidf_bigram.transform(val_text).toarray())

In [None]:
# Write the (1, 2) n-gram TF-IDF matrices to CSV without index or header rows.
for frame, csv_path in (
    (tf_bi_train_vecs, './data/tf_bi_train_vecs.csv'),
    (tf_bi_val_vecs, './data/tf_bi_val_vecs.csv'),
):
    frame.to_csv(csv_path, index = False, header = False)

#### Tri-Gram Model

In [None]:
# Fit a TF-IDF vectorizer (n-gram range (1, 3), at most 500 features) on the training text,
# then apply the fitted vocabulary to the validation text. Both results are stored as dense DataFrames.
tfidf_trigram = TfidfVectorizer(max_features = 500, ngram_range = (1, 3))
tf_tri_train_vecs = pd.DataFrame(tfidf_trigram.fit_transform(train_text).toarray())

tf_tri_val_vecs = pd.DataFrame(tfidf_trigram.transform(val_text).toarray())

In [None]:
# Write the (1, 3) n-gram TF-IDF matrices to CSV without index or header rows.
for frame, csv_path in (
    (tf_tri_train_vecs, './data/tf_tri_train_vecs.csv'),
    (tf_tri_val_vecs, './data/tf_tri_val_vecs.csv'),
):
    frame.to_csv(csv_path, index = False, header = False)