# Inspect Data and Apply Word2Vec 
The methods shown in this notebook are deliberately simple. More accurate methods are shown in the later notebooks.

Reference: https://www.kaggle.com/liananapalkova/simply-about-word2vec

In [1]:
import os

import gensim
import nltk
import pandas as pd
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
from bokeh.plotting import figure, show, output_notebook, save
from sklearn.manifold import TSNE
from tqdm import tqdm

nltk.download('wordnet')
output_notebook()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [2]:
# Path of the tweet CSV to analyse.
# Alternative, smaller file that was used previously: './datasets/twitter_trump_2019_05.csv'
# NOTE(review): the file name suggests tweets from 2019-01-01 to 2019-05-31 — confirm.
filename_dataset =  './datasets/twitter_trump_2019_0101-2019_0531.csv'

# Load the whole CSV into a DataFrame (relative path: run from the project root).
data_df = pd.read_csv(filename_dataset)

In [3]:
# (rows, columns) of the loaded table.
data_df.shape

(2125, 7)

In [4]:
# Peek at the first rows to see which columns are available.
data_df.head()

Unnamed: 0,source,text,created_at,retweet_count,favorite_count,is_retweet,id_str
0,Twitter for iPhone,Robert Mueller came to the Oval Office (along ...,05-30-2019 15:34:11,27304,101121,False,1134120831389392896
1,Twitter for iPhone,“Comey and Brennan are turning on each other.”...,05-30-2019 14:41:24,15412,66318,False,1134107544681455616
2,Twitter for iPhone,Congressman John Ratcliffe “The Trump Campaign...,05-30-2019 13:41:43,14424,64311,False,1134092525218590721
3,Twitter for iPhone,....say he fought back against this phony crim...,05-30-2019 11:57:47,22154,104244,False,1134066372584062976
4,Twitter for iPhone,Russia Russia Russia! That’s all you heard at ...,05-30-2019 11:57:47,27082,118033,False,1134066371510378501


In [5]:
# Only the tweet text is used from here on; each row is one raw document.
raw_docs = data_df['text']

In [6]:
# First five raw tweets, before any cleaning.
raw_docs[:5]

0    Robert Mueller came to the Oval Office (along ...
1    “Comey and Brennan are turning on each other.”...
2    Congressman John Ratcliffe “The Trump Campaign...
3    ....say he fought back against this phony crim...
4    Russia Russia Russia! That’s all you heard at ...
Name: text, dtype: object

# Normalize, Tokenize, Remove Stopwords

In [7]:
# Turn every raw tweet into a list of cleaned tokens:
#   1. normalize + tokenize with gensim's simple_preprocess,
#   2. drop gensim's built-in English stopwords,
#   3. lemmatize each remaining token as a verb (e.g. past tense -> present tense).
#
# The lemmatizer and the stopword set do not depend on the tweet, so they are
# created / looked up once instead of once per document (or once per token).
lemmatizer = nltk.stem.WordNetLemmatizer()
stopwords = gensim.parsing.preprocessing.STOPWORDS

docs = []
for d in tqdm(raw_docs):
    # normalize and tokenize
    raw_tokens = gensim.utils.simple_preprocess(d)

    # remove stopwords, then lemmatize what is left (pos='v': treat tokens as verbs)
    doc = [lemmatizer.lemmatize(t, pos='v')
           for t in raw_tokens
           if t not in stopwords]

    docs.append(doc)

100%|██████████| 2125/2125 [00:02<00:00, 749.50it/s]


In [8]:
# Sanity check: cleaned token lists of the first two tweets.
print(docs[0])
print(docs[1])

['robert', 'mueller', 'come', 'oval', 'office', 'potential', 'candidates', 'seek', 'name', 'director', 'fbi', 'position', 'years', 'tell', 'day', 'name', 'special', 'counsel', 'total', 'conflict', 'nice']
['comey', 'brennan', 'turn', 'kilmeade']


# Build Word2Vec

In [9]:
# Create an (untrained) Word2Vec model; vocabulary and training happen in later cells.
#   size=150    -> length of every word vector
#   window=10   -> context window passed to Word2Vec
#   min_count=2 -> minimum word frequency passed to Word2Vec
#   workers=1   -> single worker thread (presumably for reproducible runs — confirm)
# NOTE(review): `size=` is the gensim 3.x spelling; gensim 4 renamed it to
# `vector_size` — confirm which gensim version this notebook is pinned to.
model = gensim.models.Word2Vec(size=150,
                              window=10,
                              min_count=2,
                              sg=1,    # 1: Skip-Gram. 0:CBOW
                              workers=1)



# Prepare the Model Vocabulary

In [10]:
# Scan the tokenized tweets once to build the model's vocabulary (must happen before train()).
model.build_vocab(docs)

In [11]:
# Number of distinct words kept in the vocabulary.
# NOTE(review): `model.wv.vocab` is the gensim 3.x API — confirm the gensim version in use.
print('Size of vocabulary: {}'.format(len(model.wv.vocab)))

Size of vocabulary: 2584


In [12]:
# First 100 vocabulary entries.
print(list(model.wv.vocab.keys())[:100])

['robert', 'mueller', 'come', 'oval', 'office', 'potential', 'seek', 'name', 'director', 'fbi', 'position', 'years', 'tell', 'day', 'special', 'counsel', 'total', 'conflict', 'nice', 'comey', 'brennan', 'turn', 'kilmeade', 'congressman', 'john', 'ratcliffe', 'trump', 'campaign', 'clearly', 'conspire', 'collude', 'foxnews', 'fight', 'phony', 'crime', 'exist', 'horrendous', 'false', 'shouldn', 'sit', 'obstruction', 'presidential', 'harassment', 'russia', 'hear', 'begin', 'witch', 'hunt', 'hoax', 'disappear', 'help', 'elect', 'dems', 'partner', 'fake', 'news', 'media', 'greatest', 'history', 'spend', 'dark', 'unlimited', 'access', 'people', 'resources', 'cooperation', 'highly', 'bring', 'charge', 'inform', 'have', 'navy', 'ship', 'uss', 'mccain', 'recent', 'visit', 'japan', 'flotus', 'love', 'great', 'military', 'men', 'women', 'spectacular', 'job', 'tonight', 'seanhannity', 'number', 'far', 'mark', 'levin', 'congrats', 'book', 'impeach', 'republican', 'president', 'commit', 'democrats', 

# Train Word2Vec

In [13]:
# Epoch count stored on the model; the train() call below passes its own `epochs` explicitly.
model.epochs

5

In [14]:
# Train on the tokenized tweets.
#   total_examples=len(docs) -> number of documents per pass over the data
#   epochs=30                -> passed explicitly instead of relying on model.epochs
#   report_delay=1           -> progress-report interval passed to train()
results = model.train(sentences=docs,
                       total_examples=len(docs),
                       epochs=30,
                      report_delay=1)

In [15]:
# Return value of train(): a 2-tuple of word counts.
# NOTE(review): per the gensim docs this is (effective words trained, raw words seen) — confirm.
results

(738202, 919680)

# Save the model

In [16]:
# Persist the trained model so later cells / notebooks can reload it without retraining.
filename_prefix = './models/trump_twitts.word2vec'

# Make sure the target folder exists first: on a fresh checkout './models' may be
# missing, and saving into a non-existent directory raises an error.
os.makedirs(os.path.dirname(filename_prefix), exist_ok=True)
model.save(filename_prefix)

# Restore the Model

In [17]:
# Reload the model from the file written by the previous cell (round-trip check).
model = gensim.models.Word2Vec.load(filename_prefix)

# Inspect Length of the Encoded Vector
The length is the same as defined in the following snippet:
```
gensim.models.Word2Vec(size=150,...)
```

In [18]:
# Look up the learned embedding of a single word.
vector = model.wv['trump']

In [19]:
# Should equal the `size` passed to Word2Vec.
len(vector)

150

In [20]:
# First ten components of the embedding.
vector[:10]

array([-0.32850593,  0.12494201, -0.23195346,  0.10563996, -0.118173  ,
       -0.22661002, -0.19973227, -0.12601261, -0.50985044,  0.30842406], dtype=float32)

In [21]:
# Value range of this one embedding vector.
print('min: {}, max: {}'.format(min(vector),
                               max(vector)))

min: -0.6710901260375977, max: 0.6229332685470581


# Generate Features

In [22]:
# Three hand-written word lists of ten words each.
# NOTE(review): presumably the top words of three topics from a topic model in
# another notebook — confirm the source.
words_topic_1 = ['dbongino', 'great', 'today', 'trump', 'job', 'joe', 'china', 'vote', 'realdonaldtrump', 'biden']
words_topic_2 = ['president', 'mueller', 'democrats', 'report', 'collusion', 'trump', 'realdonaldtrump', 'tomfitton', 'know', 'want']
words_topic_3 = ['china', 'great', 'tariff', 'dollars', 'state', 'president', 'billion', 'want', 'japan', 'years']

# Plain concatenation: words that occur in several lists (e.g. 'trump') appear more than once.
words_testing = words_topic_1 + words_topic_2 + words_topic_3

In [23]:
# Total number of entries across the three lists (duplicates included).
len(words_testing)

30

In [24]:
# Embedding matrix for the whole vocabulary: one row per word, in the order of `total_words`.
# (To restrict the matrix to the hand-picked words, index with `words_testing` instead.)
total_words = list(model.wv.vocab)
features = model.wv[total_words]

# Confirm what kind of object the lookup returns.
type(features)

numpy.ndarray

In [25]:
# First ten words; row i of `features` is the vector of total_words[i].
total_words[:10]

['robert',
 'mueller',
 'come',
 'oval',
 'office',
 'potential',
 'seek',
 'name',
 'director',
 'fbi']

In [26]:
# Rows = number of words, columns = embedding dimension.
print('Length = {}\nfeature size = {}'.format(features.shape[0], features.shape[1]))

Length = 2584
feature size = 150


In [27]:
# Largest single component over all word vectors.
features.max()

1.4015204

In [28]:
# Smallest single component over all word vectors.
features.min()

-1.2704477

# Test Some Words

In [29]:
# Nearest neighbours of 'trump' in the embedding space, as (word, similarity) pairs.
model.wv.most_similar('trump')

[('donald', 0.5802708864212036),
 ('graham', 0.5547558069229126),
 ('breitbart', 0.5287465453147888),
 ('shred', 0.5283063054084778),
 ('nolte', 0.5161701440811157),
 ('ratcliffe', 0.5117678046226501),
 ('marist', 0.5114983320236206),
 ('stevehiltonx', 0.5089367628097534),
 ('wsj', 0.5008276104927063),
 ('breitbartnews', 0.49553459882736206)]

In [30]:
# Nearest neighbours of 'china'.
model.wv.most_similar('china')

[('renegotiate', 0.7260676622390747),
 ('tariff', 0.7085151076316833),
 ('asia', 0.6824506521224976),
 ('slowly', 0.6776384711265564),
 ('xi', 0.676251232624054),
 ('wto', 0.6683308482170105),
 ('intellectual', 0.6676908135414124),
 ('subsidize', 0.6625998020172119),
 ('degree', 0.6609886884689331),
 ('property', 0.6486684679985046)]

In [31]:
# Nearest neighbours of 'doj'.
model.wv.most_similar('doj')

[('jw', 0.8678972125053406),
 ('cia', 0.8633334040641785),
 ('communications', 0.8540863990783691),
 ('bruce', 0.8527845144271851),
 ('nsa', 0.8480514287948608),
 ('overthrow', 0.8381711840629578),
 ('coup', 0.8376858234405518),
 ('fb', 0.835588812828064),
 ('ohr', 0.8275354504585266),
 ('docs', 0.8227896094322205)]

In [32]:
# Nearest neighbours of 'border'.
model.wv.most_similar('border')

[('southern', 0.727216362953186),
 ('facilities', 0.6854546070098877),
 ('loopholes', 0.6798961162567139),
 ('patrol', 0.6747167110443115),
 ('favor', 0.6743601560592651),
 ('veto', 0.6720930337905884),
 ('security', 0.6640868186950684),
 ('traffic', 0.6633055210113525),
 ('emergency', 0.6579341292381287),
 ('invasion', 0.65627521276474)]

In [33]:
# Nearest neighbours of 'space'.
model.wv.most_similar('space')

[('uc', 0.7828674912452698),
 ('chuckgrassley', 0.768885612487793),
 ('moon', 0.7604418396949768),
 ('workforce', 0.7584115266799927),
 ('initiative', 0.7498108148574829),
 ('balance', 0.7455888390541077),
 ('guard', 0.7425921559333801),
 ('annual', 0.7373372316360474),
 ('louisiana', 0.7320613265037537),
 ('wgdp', 0.7293070554733276)]

In [34]:
# Nearest neighbours of 'women'.
model.wv.most_similar('women')

[('men', 0.89604651927948),
 ('vital', 0.7698303461074829),
 ('army', 0.7636630535125732),
 ('sale', 0.7604930400848389),
 ('sheriffs', 0.7366962432861328),
 ('cemetery', 0.735630452632904),
 ('devotion', 0.7328227758407593),
 ('guard', 0.7274119257926941),
 ('brave', 0.7255415320396423),
 ('arlington', 0.7187284231185913)]

In [36]:
# Nearest neighbours of 'friend'.
model.wv.most_similar('friend')

[('smith', 0.669882595539093),
 ('unbelievable', 0.6672016978263855),
 ('els', 0.6471987366676331),
 ('awesome', 0.6462419033050537),
 ('staff', 0.6409105658531189),
 ('majesty', 0.6407461166381836),
 ('indianapolis', 0.6365588307380676),
 ('themasters', 0.6363410353660583),
 ('brilliant', 0.6347208023071289),
 ('google', 0.6306241750717163)]

In [39]:
# Nearest neighbours of 'liar'.
model.wv.most_similar('liar')

[('fraudster', 0.9649631381034851),
 ('testimony', 0.8125750422477722),
 ('contradict', 0.7634248733520508),
 ('cohen', 0.7626761198043823),
 ('manuscript', 0.762614905834198),
 ('hostile', 0.7515232563018799),
 ('adam', 0.7421088218688965),
 ('nonsense', 0.7411695718765259),
 ('prison', 0.7385744452476501),
 ('spin', 0.7373374104499817)]

In [37]:
# Nearest neighbours of 'google'.
model.wv.most_similar('google')

[('fairness', 0.7470129132270813),
 ('dirty', 0.7232107520103455),
 ('extremely', 0.7169377207756042),
 ('column', 0.7101934552192688),
 ('andy', 0.7091387510299683),
 ('narrative', 0.7075923085212708),
 ('difficult', 0.7023041248321533),
 ('correctly', 0.6871157884597778),
 ('cop', 0.6864380836486816),
 ('conclusion', 0.6833996772766113)]

In [38]:
# Nearest neighbours of 'amazon'.
model.wv.most_similar('amazon')

[('lobbyist', 0.8553866147994995),
 ('post', 0.8528805375099182),
 ('checker', 0.8169164061546326),
 ('outlets', 0.8106592297554016),
 ('washington', 0.8053920269012451),
 ('purposely', 0.8022329807281494),
 ('paper', 0.7894401550292969),
 ('accurate', 0.7827918529510498),
 ('derange', 0.7775524854660034),
 ('favorite', 0.7726907730102539)]

In [55]:
# Nearest neighbours of 'son'.
model.wv.most_similar('son')

[('uniform', 0.870519757270813),
 ('spouses', 0.8508966565132141),
 ('ronnebeck', 0.8229634165763855),
 ('devotion', 0.8049011826515198),
 ('electionday', 0.7890223264694214),
 ('godblessamerica', 0.7816034555435181),
 ('nd', 0.7810846567153931),
 ('liberty', 0.7784813046455383),
 ('presidenttrump', 0.7780365943908691),
 ('nf', 0.7778208255767822)]

In [60]:
# Nearest neighbours of 'family'.
model.wv.most_similar('family')

[('otto', 0.6536098122596741),
 ('dingell', 0.6499768495559692),
 ('warmbier', 0.6348521709442139),
 ('quest', 0.6266598105430603),
 ('corporations', 0.6148864030838013),
 ('accomplishments', 0.6085590720176697),
 ('gr', 0.6070329546928406),
 ('hearts', 0.5983178019523621),
 ('gather', 0.5933815836906433),
 ('smaller', 0.5827628970146179)]

In [61]:
# Nearest neighbours of 'father'.
model.wv.most_similar('father')

[('meghan', 0.8162789344787598),
 ('lrihendry', 0.7671878337860107),
 ('paulsperry_', 0.7174932360649109),
 ('fireman', 0.6764699816703796),
 ('realsaavedra', 0.6746256947517395),
 ('unfortunately', 0.6732372641563416),
 ('trish_regan', 0.6700133085250854),
 ('clients', 0.667826235294342),
 ('nbcnews', 0.6560743451118469),
 ('ken', 0.6549465656280518)]

In [48]:
# Nearest neighbours of 'good'.
model.wv.most_similar('good')

[('kremlin', 0.5977956056594849),
 ('putin', 0.5510589480400085),
 ('rough', 0.5310858488082886),
 ('fashion', 0.5306275486946106),
 ('excellent', 0.5294192433357239),
 ('tumble', 0.5243400931358337),
 ('vladimir', 0.5118155479431152),
 ('luck', 0.5062312483787537),
 ('export', 0.5041525363922119),
 ('wet', 0.5024435520172119)]

In [49]:
# Nearest neighbours of 'bad'.
model.wv.most_similar('bad')

[('violation', 0.5866803526878357),
 ('pure', 0.5797054767608643),
 ('hurt', 0.5650379657745361),
 ('pretty', 0.5526587963104248),
 ('cheat', 0.5475195050239563),
 ('portray', 0.5343050360679626),
 ('deception', 0.5326645970344543),
 ('passive', 0.529072642326355),
 ('kremlin', 0.5259602069854736),
 ('overall', 0.5257994532585144)]

In [51]:
# Vector-arithmetic query: words close to 'clinton' + 'xi' and far from 'good'.
model.wv.most_similar(positive=['clinton', 'xi'], negative=['good'])

[('dnc', 0.5811668634414673),
 ('hu', 0.5692771673202515),
 ('brother', 0.5690678954124451),
 ('ukraine', 0.5671563148498535),
 ('ohr', 0.5658411979675293),
 ('nellie', 0.5656121373176575),
 ('podesta', 0.5590076446533203),
 ('kerry', 0.5587401390075684),
 ('smoke', 0.5580701231956482),
 ('wh', 0.5567339658737183)]

# Display Words using t-SNE

In [57]:
# 2-D t-SNE projection with perplexity 20; a fixed random_state makes the layout repeatable.
SEED=0
tsne_model = TSNE(perplexity=20, n_components=2, random_state=SEED)

In [58]:
# Project every word vector in `features` to 2-D (n_components=2): one output row per word.
X_tsne = tsne_model.fit_transform(features)

In [59]:
# Interactive Bokeh scatter plot of the 2-D t-SNE embedding, one point per vocabulary word.
x_tsne = X_tsne[:, 0]
y_tsne = X_tsne[:, 1]
label = total_words      # text drawn next to each point
contents = total_words   # text shown in the hover tooltip

# Colour palette keyed by cluster id. Not used by this plot; kept only so that any
# other cell that refers to it keeps working.
cluster_colors = {0: 'blue', 1: 'green', 2: 'yellow', 3: 'red', 4: 'skyblue', 5:'salmon', 6:'orange', 7:'maroon', 8:'crimson', 9:'black', 10:'gray'}

# Column data shared by the scatter glyphs, the text labels and the hover tool.
settings = dict(x=x_tsne,
                y=y_tsne,
                label=label,
                content=contents)

source = ColumnDataSource(settings)

labels = LabelSet(x='x', y='y', text='label', level='glyph',
                  x_offset=5, y_offset=5, source=source,
                  render_mode='canvas', text_font_size='6pt')

title = 'T-SNE visualization of Trump\'s twitts'

# Axes are hidden: t-SNE coordinates have no interpretable units.
plot_lda = figure(plot_width=1000, plot_height=600,
                  title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                  x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y', source=source, alpha=0.8, size=10)

plot_lda.add_layout(labels)

# Show the word under the cursor.
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content"}

# No legend location is set here: the scatter call attaches no legend, so assigning
# `plot_lda.legend.location` would have no effect and only makes Bokeh warn.

show(plot_lda)

In [52]:
# Second t-SNE run with a larger perplexity (40) for comparison; same seed as before.
# Note: this rebinds `tsne_model`, replacing the perplexity-20 model.
SEED=0
tsne_model = TSNE(perplexity=40, n_components=2, random_state=SEED)

In [53]:
# Re-project the same word vectors; this overwrites the previous `X_tsne`.
X_tsne = tsne_model.fit_transform(features)

In [54]:
# Interactive Bokeh scatter plot of the current 2-D t-SNE embedding (`X_tsne`),
# one point per vocabulary word.
x_tsne = X_tsne[:, 0]
y_tsne = X_tsne[:, 1]
label = total_words      # text drawn next to each point
contents = total_words   # text shown in the hover tooltip

# Colour palette keyed by cluster id. Not used by this plot; kept only so that any
# other cell that refers to it keeps working.
cluster_colors = {0: 'blue', 1: 'green', 2: 'yellow', 3: 'red', 4: 'skyblue', 5:'salmon', 6:'orange', 7:'maroon', 8:'crimson', 9:'black', 10:'gray'}

# Column data shared by the scatter glyphs, the text labels and the hover tool.
settings = dict(x=x_tsne,
                y=y_tsne,
                label=label,
                content=contents)

source = ColumnDataSource(settings)

labels = LabelSet(x='x', y='y', text='label', level='glyph',
                  x_offset=5, y_offset=5, source=source,
                  render_mode='canvas', text_font_size='6pt')

title = 'T-SNE visualization of Trump\'s twitts'

# Axes are hidden: t-SNE coordinates have no interpretable units.
plot_lda = figure(plot_width=1000, plot_height=600,
                  title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                  x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y', source=source, alpha=0.8, size=10)

plot_lda.add_layout(labels)

# Show the word under the cursor.
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content"}

# No legend location is set here: the scatter call attaches no legend, so assigning
# `plot_lda.legend.location` would have no effect and only makes Bokeh warn.

show(plot_lda)