In [0]:
import pandas as pd 
pd.options.mode.chained_assignment = None
import numpy as np 
from copy import deepcopy
from string import punctuation
from random import shuffle
from collections import Counter

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence 

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from nltk.tokenize import TweetTokenizer # a tweet tokenizer from nltk.
tokenizer = TweetTokenizer()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
def ingest(path='./cleaned_dataset_v5.csv'):
    """Load the tweet dataset from a CSV file into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the file this notebook was
        written against, so ``ingest()`` keeps working unchanged.

    Returns
    -------
    pandas.DataFrame
        The parsed file, exactly as ``pd.read_csv`` returns it.
    """
    data = pd.read_csv(path)
    # Echo the shape so a truncated or wrong file is noticed immediately.
    print('dataset loaded with shape', data.shape)
    return data

data = ingest()
data.head(5)

dataset loaded with shape (14235, 7)


Unnamed: 0,Tweet_date,Tweet_time,Tweet_City,Tweet_Country,Tweet_account,Retweet_count,Tweet_Text
0,4/1/2020,0:08:00,,Australia,GSK_AU,0,ask award research excellence open nomination ...
1,4/1/2020,0:35:00,,Australia,GSK_AU,3,award research excellence open nomination awar...
2,4/1/2020,0:45:00,Basel,Switzerland,Novartis News,31,face global crisis novartis mobilized ramp cap...
3,4/1/2020,0:46:00,Basel,Switzerland,Novartis News,2,information response visit
4,4/1/2020,0:59:00,Basel,Switzerland,Novartis News,4,clinical investigation assessing portfolio exi...


In [0]:
def tokenize(tweet):
    """Split one tweet into tokens using the module-level ``tokenizer``.

    Parameters
    ----------
    tweet : str
        Text of a single tweet.

    Returns
    -------
    list of str, or the string ``'NC'``
        The tokens produced by ``tokenizer.tokenize``. If tokenization
        raises (presumably for non-string cells such as missing values),
        the sentinel string ``'NC'`` is returned instead so that mapping
        over a whole column never aborts.
    """
    try:
        return tokenizer.tokenize(tweet)
    except Exception:
        # Deliberate best-effort: keep the sentinel callers may filter on,
        # but do not swallow KeyboardInterrupt / SystemExit like a bare
        # ``except:`` would.
        return 'NC'

In [0]:
def postprocess(data, n=100):
    """Add a ``tokens`` column to ``data`` and return that same frame.

    The frame is modified in place: the column is assigned on the object
    that was passed in, and the index reset / drop below also use
    ``inplace=True``. The caller's DataFrame therefore changes too.

    Parameters
    ----------
    data : DataFrame with a ``Tweet_Text`` column.
    n : unused by the live code; only the commented-out ``head(n)`` line
        below refers to it.

    Returns
    -------
    The same ``data`` object that was passed in.
    """
    # data = data.head(n)
    # progress_map is the tqdm-provided variant of Series.map that also draws a progress bar.
    data['tokens'] = data['Tweet_Text'].progress_map(tokenize)
    # The filter below is disabled, so rows for which tokenize() returned the
    # 'NC' sentinel are kept.
    # data = data[data.tokens != 'NC']
    # Move the current index into a column, then discard that column.
    # NOTE(review): assumes the index is unnamed so the new column is called
    # 'index' -- confirm.
    data.reset_index(inplace=True)
    data.drop('index', inplace=True, axis=1)
    return data
tokenData = postprocess(data)

progress-bar: 100%|██████████| 14235/14235 [00:00<00:00, 32324.72it/s]


In [0]:
data.head(10)

Unnamed: 0,Tweet_date,Tweet_time,Tweet_City,Tweet_Country,Tweet_account,Retweet_count,Tweet_Text,tokens
0,4/1/2020,0:08:00,,Australia,GSK_AU,0,ask award research excellence open nomination ...,"[ask, award, research, excellence, open, nomin..."
1,4/1/2020,0:35:00,,Australia,GSK_AU,3,award research excellence open nomination awar...,"[award, research, excellence, open, nomination..."
2,4/1/2020,0:45:00,Basel,Switzerland,Novartis News,31,face global crisis novartis mobilized ramp cap...,"[face, global, crisis, novartis, mobilized, ra..."
3,4/1/2020,0:46:00,Basel,Switzerland,Novartis News,2,information response visit,"[information, response, visit]"
4,4/1/2020,0:59:00,Basel,Switzerland,Novartis News,4,clinical investigation assessing portfolio exi...,"[clinical, investigation, assessing, portfolio..."
5,4/1/2020,0:59:00,Basel,Switzerland,Novartis News,3,ramp response leveraging capability discovery ...,"[ramp, response, leveraging, capability, disco..."
6,4/1/2020,1:00:00,Basel,Switzerland,Novartis News,3,community fund response fund provide million s...,"[community, fund, response, fund, provide, mil..."
7,4/1/2020,1:02:00,Basel,Switzerland,Novartis News,3,drug donation amp pricing commitment committed...,"[drug, donation, amp, pricing, commitment, com..."
8,4/1/2020,1:17:00,Basel,Switzerland,Novartis,0,reaching contact office uk click,"[reaching, contact, office, uk, click]"
9,4/1/2020,1:22:00,Basel,Switzerland,Novartis,0,reaching hear experience provide contact info,"[reaching, hear, experience, provide, contact,..."


In [0]:
# Hold out 20% of the tokenized tweets as a test split. The fixed seed makes
# the split itself repeatable across kernel restarts.
x_train, x_test = train_test_split(np.array(data.tokens),
                                   test_size=0.2,
                                   random_state=0)

In [0]:
# Cosine similarity between sentences
from nltk.corpus import stopwords 
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
data.tokens.size

14235

In [0]:
# Cosine Similarity
#
# Each tweet is reduced to the set of its non-stopword tokens. For two such
# sets X and Y the cosine similarity of their binary bag-of-words vectors is
#     |X intersect Y| / sqrt(|X| * |Y|)
# which is what the explicit 0/1 vectors would give. Pairs whose score lies
# strictly between the two thresholds (near-duplicates, but not identical
# keyword sets) are printed.

COSINE_SAMPLE_SIZE = 1000  # only the first N tweets are compared: the scan is O(N^2)
COSINE_LOWER = 0.85
COSINE_UPPER = 1.0

# Load the stopword list once, as a set, instead of re-reading it for every
# pair and scanning a list for every token.
sw = set(stopwords.words('english'))

# Never index past the end of the frame if it holds fewer rows than the sample size.
n_compare = min(COSINE_SAMPLE_SIZE, len(data))

# Build each tweet's keyword set once rather than once per pair.
keyword_sets = [{w for w in data.tokens[k] if w not in sw} for k in range(n_compare)]

for ii in range(n_compare):
    X_set = keyword_sets[ii]
    if not X_set:
        # No keywords left after stopword removal: the cosine is undefined
        # (zero-length vector), so there is nothing to compare.
        continue
    for jj in range(ii + 1, n_compare):
        Y_set = keyword_sets[jj]
        if not Y_set:
            continue

        c = len(X_set & Y_set)
        cosine = c / float((len(X_set) * len(Y_set)) ** 0.5)
        if COSINE_LOWER < cosine < COSINE_UPPER:
            print("similarity: ", cosine)
            print(data.Tweet_Text[ii])
            print(data.Tweet_Text[jj])
            print("**********************")

similarity:  0.8571428571428571
doug dm booking reference email address guide
david dm booking reference email address guide
**********************
similarity:  0.8571428571428571
doug dm booking reference email address guide
sunset dm booking reference email address guide
**********************
similarity:  0.9258200997725514
doug dm booking reference email address guide
dm booking reference email address guide
**********************
similarity:  0.9258200997725514
doug dm booking reference email address guide
dm booking reference email address guide
**********************
similarity:  0.8888888888888888
oscarspanna send direct message booking reference email address contact
lindsay send direct message booking reference email address contact
**********************
similarity:  0.8888888888888888
oscarspanna send direct message booking reference email address contact
send direct message booking reference email address contact number
**********************
similarity:  0.857142857142857

In [0]:
# Jaccard similarity

def jaccard_similarity(list1, list2):
    """Return the Jaccard index of the distinct items in two sequences.

    The index is ``|A & B| / |A | B|`` where A and B are the *sets* of items
    in ``list1`` and ``list2``. Repeated tokens inside one input therefore
    do not affect the score.

    Parameters
    ----------
    list1, list2 : iterable of hashable
        Token sequences to compare.

    Returns
    -------
    float
        A value in [0.0, 1.0]. Two empty inputs are treated as identical
        and score 1.0 (the usual convention), which also avoids a division
        by zero.
    """
    set1, set2 = set(list1), set(list2)
    union = len(set1 | set2)
    if union == 0:
        return 1.0
    return float(len(set1 & set2)) / union

# Score every unordered pair among the first 10000 tweets and report the
# near-duplicates: pairs scoring above 0.85 but below 1 (a score of exactly 1
# is skipped).
for first in range(0, 10000):
    for second in range(first + 1, 10000):
        score = jaccard_similarity(data.tokens[first], data.tokens[second])
        if not (0.85 < score < 1):
            continue
        print("*******************************")
        print(score)
        print(data.Tweet_Text[first])
        print(data.Tweet_Text[second])

*******************************
0.8571428571428571
doug dm booking reference email address guide
dm booking reference email address guide
*******************************
0.8571428571428571
doug dm booking reference email address guide
dm booking reference email address guide
*******************************
0.8571428571428571
david dm booking reference email address guide
dm booking reference email address guide
*******************************
0.8571428571428571
david dm booking reference email address guide
dm booking reference email address guide
*******************************
0.8571428571428571
pasgaspard flight suspended uae government directive check
flight suspended uae government directive check
*******************************
0.8571428571428571
pasgaspard flight suspended uae government directive check
flight suspended uae government directive check
*******************************
0.875
oliver dm booking reference email address check option
dm booking reference email address ch

In [0]:
def labelizeTweets(tweets, label_type):
    """Wrap each token sequence in a ``LabeledSentence`` with a unique tag.

    Parameters
    ----------
    tweets : iterable
        Token sequences, one per tweet.
    label_type : str
        Prefix for the tag; the item at position ``i`` is tagged
        ``'<label_type>_<i>'``.

    Returns
    -------
    list
        One ``LabeledSentence`` per input item, in input order.
    """
    # tqdm wraps the enumerate() iterator, so it shows a running count.
    return [
        LabeledSentence(words, ['%s_%s' % (label_type, position)])
        for position, words in tqdm(enumerate(tweets))
    ]

# Wrap the token lists of both splits as LabeledSentence objects tagged
# TRAIN_<i> / TEST_<i>.
# NOTE: x_train / x_test are rebound to the wrapped versions, so running this
# cell a second time would wrap already-wrapped objects; re-run the
# train_test_split cell first.
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')

  """
11388it [00:00, 214284.84it/s]
2847it [00:00, 204307.89it/s]


In [0]:
# Word2Vec on the training tweets: 2000-dimensional vectors, min_count=10.
tweet_w2v = Word2Vec(size=2000, min_count=10)

# First pass over the corpus: collect the vocabulary.
vocab_corpus = [doc.words for doc in tqdm(x_train)]
tweet_w2v.build_vocab(vocab_corpus)

# Second pass: train for 100 epochs. corpus_count was set by build_vocab.
# The call is the last expression of the cell, so its result is displayed.
train_corpus = [doc.words for doc in tqdm(x_train)]
tweet_w2v.train(train_corpus, total_examples=tweet_w2v.corpus_count, epochs=100)

100%|██████████| 11388/11388 [00:00<00:00, 1245495.02it/s]
100%|██████████| 11388/11388 [00:00<00:00, 1404679.86it/s]


(5389407, 8152800)

In [0]:
# tweet_w2v.wv.most_similar('cough')

In [0]:
tweet_w2v.wv.most_similar('ventilator')

  if np.issubdtype(vec.dtype, np.int):


[('army', 0.6084273457527161),
 ('expected', 0.5938495993614197),
 ('glove', 0.5040854811668396),
 ('germany', 0.47624343633651733),
 ('machine', 0.4717262387275696),
 ('speaks', 0.4636801481246948),
 ('lifenplease', 0.450458824634552),
 ('pandemicnbut', 0.4500352740287781),
 ('necessity', 0.44189393520355225),
 ('japanese', 0.4290078282356262)]

In [0]:
tweet_w2v.wv.most_similar('corona')

  if np.issubdtype(vec.dtype, np.int):


[('giovanni', 0.5837835073471069),
 ('agar', 0.5084028244018555),
 ('shiva', 0.4971287250518799),
 ('table', 0.4835934638977051),
 ('ke', 0.4747019112110138),
 ('pandemicnbut', 0.4571372866630554),
 ('lifenplease', 0.44821739196777344),
 ('anna', 0.4251677393913269),
 ('necessity', 0.4217962920665741),
 ('ka', 0.4120282232761383)]

In [0]:
from gensim.models import FastText

# FastText embeddings on the training tweets, skip-gram variant (sg=1):
# 100-dimensional vectors, context window of 5, min_count=5.
f2vec = FastText(size=100, window=5, min_count=5, workers=4, sg=1)
f2vec.build_vocab([x.words for x in tqdm(x_train)])
# Use this model's own corpus_count (set by build_vocab above) so the cell
# does not depend on the separate tweet_w2v model having been built.
f2vec.train([x.words for x in tqdm(x_train)], total_examples=f2vec.corpus_count, epochs=100)

100%|██████████| 11388/11388 [00:00<00:00, 1285172.84it/s]
100%|██████████| 11388/11388 [00:00<00:00, 1436706.19it/s]


In [0]:
f2vec.wv.most_similar('cough')

  if np.issubdtype(vec.dtype, np.int):


[('sneeze', 0.7949872612953186),
 ('crowding', 0.7304932475090027),
 ('needle', 0.7257699966430664),
 ('smother', 0.7132829427719116),
 ('cou', 0.6959042549133301),
 ('mouth', 0.617779552936554),
 ('gps', 0.6068882346153259),
 ('congregate', 0.6036831736564636),
 ('assume', 0.6035717129707336),
 ('tough', 0.6027818918228149)]

In [0]:
f2vec.wv.most_similar('corona')

  if np.issubdtype(vec.dtype, np.int):


[('coronavirus', 0.5861368775367737),
 ('giovanni', 0.5428277254104614),
 ('table', 0.5203661918640137),
 ('shiva', 0.5107232928276062),
 ('condolence', 0.506089985370636),
 ('ke', 0.4809196889400482),
 ('ka', 0.4804372787475586),
 ('cor', 0.471768319606781),
 ('gathering', 0.454797625541687),
 ('capitalism', 0.4547179639339447)]

In [0]:
f2vec.wv.most_similar('ventilator')

  if np.issubdtype(vec.dtype, np.int):


[('run-up', 0.7273077964782715),
 ('donates', 0.6279716491699219),
 ('army', 0.5959568023681641),
 ('stockpile', 0.5734863877296448),
 ('breaking', 0.5491451621055603),
 ('between', 0.5468143224716187),
 ('breakthrough', 0.5440250635147095),
 ('expected', 0.5429250001907349),
 ('respirator', 0.5412840843200684),
 ('spanish-speaking', 0.5374648571014404)]

In [0]:
# CBOW with FastText: sg=0 selects CBOW training (sg=1 would be skip-gram).
# NOTE: this rebinds f2vec, replacing the FastText model trained earlier in
# the notebook; the queries that follow run against this CBOW model.
f2vec = FastText(size=100, window=5, min_count=5, workers=4, sg=0)
f2vec.build_vocab([x.words for x in tqdm(x_train)])
# Use this model's own corpus_count (set by build_vocab above) so the cell
# does not depend on the separate tweet_w2v model having been built.
f2vec.train([x.words for x in tqdm(x_train)], total_examples=f2vec.corpus_count, epochs=100)

100%|██████████| 11388/11388 [00:00<00:00, 1027950.20it/s]
100%|██████████| 11388/11388 [00:00<00:00, 1904646.86it/s]


In [0]:
f2vec.wv.most_similar('ventilator')

  if np.issubdtype(vec.dtype, np.int):


[('run-up', 0.5872458815574646),
 ('senator', 0.540156364440918),
 ('army', 0.5372908115386963),
 ('donates', 0.5039277672767639),
 ('respirator', 0.4817245900630951),
 ('laboratory', 0.4521855413913727),
 ('stockpile', 0.4458812475204468),
 ('italy', 0.418903648853302),
 ('equipment', 0.4101434051990509),
 ('japanese', 0.392594575881958)]

In [0]:
f2vec.wv.most_similar('pandemic')

  if np.issubdtype(vec.dtype, np.int):


[('pandemicnbut', 0.8193777799606323),
 ('postpandemic', 0.8191128969192505),
 ('pandemicnnhttps', 0.7057230472564697),
 ('pandemicnplease', 0.6511387825012207),
 ('epidemic', 0.5263695120811462),
 ('academic', 0.5219470858573914),
 ('healthcare', 0.3021814823150635),
 ('economic', 0.2832545340061188),
 ('untold', 0.2740529775619507),
 ('emi', 0.2541292607784271)]

In [0]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
from sklearn.manifold import TSNE


# defining the chart
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

# Snapshot the vocabulary once so the vectors and their labels below are
# built from the same list, in the same order.
vocab_words = list(tweet_w2v.wv.vocab.keys())

# One vector per vocabulary word (2000 dimensions each, matching size=2000).
# Vectors are read through `.wv`, like the most_similar queries elsewhere in
# this notebook; indexing the model object directly is deprecated.
word_vectors = [tweet_w2v.wv[w] for w in vocab_words]

# dimensionality reduction. converting the vectors to 2d vectors
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)

# putting everything in a dataframe: 2-d coordinates plus the word each row stands for
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = vocab_words


  


[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 1434 samples in 0.196s...
[t-SNE] Computed neighbors for 1434 samples in 8.553s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1434
[t-SNE] Computed conditional probabilities for sample 1434 / 1434
[t-SNE] Mean sigma: 2.391683
[t-SNE] KL divergence after 250 iterations with early exaggeration: 95.235306
[t-SNE] KL divergence after 1000 iterations: 2.369486


In [0]:
# Draw one point per word from tsne_df, then configure the figure's hover
# tool so that the matching entry of the 'words' column is shown on mouse-over.
plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover_tools = plot_tfidf.select({'type': HoverTool})
hover_tools.tooltips = {"word": "@words"}
show(plot_tfidf)