# Inspect Data and Apply Word2Vec 
This notebook demonstrates simple baseline methods. More accurate methods are covered in later notebooks.

In [35]:
import pandas as pd
import gensim
import nltk
# The WordNet corpus is needed by the WordNetLemmatizer used in the
# preprocessing cell; the download is skipped when it is already present.
nltk.download('wordnet')


from sklearn.manifold import TSNE

from tqdm import tqdm


from bokeh.plotting import figure, show, output_notebook, save
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
# Route Bokeh output to the notebook so that show() renders inline.
output_notebook()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Location of the tweet dump, relative to the notebook's working directory.
filename_dataset = './datasets/twitter_trump_2019_05.csv'

# Read the CSV into a DataFrame (columns: text, created_at, id_str).
data_df = pd.read_csv(filename_dataset)

In [4]:
data_df.shape

(677, 3)

In [5]:
data_df.head()

Unnamed: 0,text,created_at,id_str
0,Robert Mueller came to the Oval Office (along ...,05-30-2019 15:34:11,1134120831389392896
1,“Comey and Brennan are turning on each other.”...,05-30-2019 14:41:24,1134107544681455616
2,Congressman John Ratcliffe “The Trump Campaign...,05-30-2019 13:41:43,1134092525218590721
3,Russia Russia Russia! That’s all you heard at ...,05-30-2019 11:57:47,1134066371510378501
4,....say he fought back against this phony crim...,05-30-2019 11:57:47,1134066372584062976


In [6]:
# Keep only the tweet text; each row is treated as one raw document.
raw_docs = data_df['text']

In [7]:
raw_docs[:5]

0    Robert Mueller came to the Oval Office (along ...
1    “Comey and Brennan are turning on each other.”...
2    Congressman John Ratcliffe “The Trump Campaign...
3    Russia Russia Russia! That’s all you heard at ...
4    ....say he fought back against this phony crim...
Name: text, dtype: object

# Normalize, Tokenize, Remove Stopwords

In [8]:
# The lemmatizer and the stopword set do not depend on the document being
# processed, so build/resolve them once instead of once per tweet.
lemmatizer = nltk.stem.WordNetLemmatizer()
stopwords = gensim.parsing.preprocessing.STOPWORDS

# One list of cleaned tokens per input document, in the same order as raw_docs.
docs = []

for d in tqdm(raw_docs):
    # normalize and tokenize
    raw_tokens = gensim.utils.simple_preprocess(d)

    # remove stopwords, then lemmatize the remaining tokens as verbs
    # (pos='v') so that e.g. past-tense forms map to the present tense
    doc = [lemmatizer.lemmatize(t, pos='v')
           for t in raw_tokens
           if t not in stopwords]

    docs.append(doc)

100%|██████████| 677/677 [00:02<00:00, 257.69it/s]


In [9]:
print(docs[0])
print(docs[1])

['robert', 'mueller', 'come', 'oval', 'office', 'potential', 'candidates', 'seek', 'name', 'director', 'fbi', 'position', 'years', 'tell', 'day', 'name', 'special', 'counsel', 'total', 'conflict', 'nice']
['comey', 'brennan', 'turn', 'kilmeade']


# Build Word2Vec

In [10]:
# Configure the embedding model only; no corpus is passed here, so the
# vocabulary is built and the training is run in the cells that follow.
model = gensim.models.Word2Vec(
    size=150,      # dimensionality of every word vector
    window=10,     # context window size
    min_count=2,   # ignore words that occur fewer than 2 times in the corpus
    sg=1,          # 1: Skip-Gram. 0:CBOW
    workers=1,     # number of worker threads used for training
)



# Prepare the Model Vocabulary

In [11]:
# Scan the tokenized documents once to collect the vocabulary the model will
# learn vectors for (subject to the min_count threshold set on the model).
model.build_vocab(docs)

In [12]:
print('Size of vocabulary: {}'.format(len(model.wv.vocab)))

Size of vocabulary: 1187


In [13]:
print(list(model.wv.vocab.keys())[:100])

['robert', 'mueller', 'come', 'oval', 'office', 'potential', 'name', 'director', 'fbi', 'position', 'years', 'tell', 'day', 'special', 'counsel', 'total', 'conflict', 'nice', 'comey', 'brennan', 'turn', 'kilmeade', 'congressman', 'john', 'trump', 'campaign', 'clearly', 'collude', 'foxnews', 'russia', 'hear', 'begin', 'witch', 'hunt', 'hoax', 'disappear', 'help', 'elect', 'crime', 'exist', 'dems', 'partner', 'fake', 'news', 'media', 'fight', 'phony', 'horrendous', 'false', 'shouldn', 'sit', 'obstruction', 'presidential', 'harassment', 'greatest', 'history', 'spend', 'dark', 'unlimited', 'access', 'people', 'highly', 'bring', 'charge', 'inform', 'have', 'ship', 'recent', 'visit', 'japan', 'flotus', 'love', 'great', 'military', 'men', 'women', 'job', 'tonight', 'seanhannity', 'number', 'far', 'mark', 'levin', 'congrats', 'book', 'impeach', 'republican', 'president', 'commit', 'democrats', 'rt', 'presssec', 'white', 'house', 'statement', 'today', 'https', 'kayleighmcenany', 'time', 'spy']


# Train Word2Vec

In [14]:
model.epochs

5

In [15]:
# Train on the same corpus the vocabulary was built from. total_examples is
# the number of documents, and epochs reuses the model's own setting
# (model.epochs, inspected in the previous cell).
results = model.train(sentences=docs,
                       total_examples=len(docs),
                       epochs=model.epochs)

In [16]:
results

(34447, 47905)

# Save the model

In [17]:
# Path the trained model is written to (and later reloaded from).
# NOTE(review): assumes the ./models directory already exists - TODO confirm,
# or create it before calling save().
filename_prefix = './models/trump_twitts.word2vec'
model.save(filename_prefix)

# Restore the Model

In [18]:
# Rebind `model` to the copy restored from filename_prefix; all later cells
# use this reloaded instance.
model = gensim.models.Word2Vec.load(filename_prefix)

# Inspect Length of the Encoded Vector
The length is the same as defined in the following snippet:
```
gensim.models.Word2Vec(size=150,...)
```

In [19]:
# Look up the learned embedding of a single vocabulary word.
vector = model.wv['trump']

In [20]:
len(vector)

150

In [21]:
vector[:10]

array([-0.10676198, -0.18929283,  0.20128597,  0.02491658, -0.09900754,
       -0.08931006,  0.03638783,  0.25493547, -0.19223677, -0.1081264 ], dtype=float32)

In [22]:
# Use the array's own reductions (the same way `features` is summarised
# further down) instead of the Python builtins, which walk the ndarray
# element by element.
print('min: {}, max: {}'.format(vector.min(),
                                vector.max()))

min: -0.29632511734962463, max: 0.3292270004749298


# Generate Features

In [51]:
# Hand-picked word lists, one per topic.
words_topic_1 = ['dbongino', 'great', 'today', 'trump', 'job', 'joe', 'china', 'vote', 'realdonaldtrump', 'biden']
words_topic_2 = ['president', 'mueller', 'democrats', 'report', 'collusion', 'trump', 'realdonaldtrump', 'tomfitton', 'know', 'want']
words_topic_3 = ['china', 'great', 'tariff', 'dollars', 'state', 'president', 'billion', 'want', 'japan', 'years']

# Plain concatenation: words shared between topics (e.g. 'trump', 'great',
# 'china') appear more than once, so the result has 3 * 10 = 30 entries.
words_testing = words_topic_1 + words_topic_2 + words_topic_3

In [52]:
len(words_testing)

30

In [65]:
# Stack the embedding of every vocabulary word into one 2-D array, one row
# per word, in the same order as `total_words`.
# To restrict the plot to the hand-picked topic words, index with
# `words_testing` instead of `total_words`.
total_words = list(model.wv.vocab)
features = model.wv[total_words]
type(features)

numpy.ndarray

In [67]:
total_words[:10]

['robert',
 'mueller',
 'come',
 'oval',
 'office',
 'potential',
 'name',
 'director',
 'fbi',
 'position']

In [57]:
print('Length = {}\nfeature size = {}'.format(features.shape[0], features.shape[1]))

Length = 1187
feature size = 150


In [58]:
features.max()

0.41159707

In [59]:
features.min()

-0.38196316

# Display Words using t-SNE

In [70]:
# Fixed random_state so the 2-D layout is repeatable from run to run.
SEED=0
# n_components=2 gives one (x, y) coordinate pair per input row.
tsne_model = TSNE(perplexity=20, n_components=2, random_state=SEED)

In [71]:
# Project the 150-dimensional word vectors down to 2-D; row i of X_tsne
# corresponds to total_words[i].
X_tsne = tsne_model.fit_transform(features)

In [72]:
# Split the 2-D embedding into plot coordinates. Each point is both labelled
# and hover-annotated with the vocabulary word it represents.
x_tsne = X_tsne[:, 0]
y_tsne = X_tsne[:, 1]
label = total_words
contents = total_words


# Palette for optional per-cluster colouring; not used by the plot below.
cluster_colors = {0: 'blue', 1: 'green', 2: 'yellow', 3: 'red', 4: 'skyblue', 5:'salmon', 6:'orange', 7:'maroon', 8:'crimson', 9:'black', 10:'gray'}

# Column names here are the keys referenced by the glyph, the LabelSet and
# the hover tooltip ('x', 'y', 'label', 'content').
settings = dict(x=x_tsne,
                y=y_tsne,
                label=label,
                content=contents)

source = ColumnDataSource(settings)

# Text label drawn next to every point (offset by 5 px in x and y).
labels = LabelSet(x='x', y='y', text='label', level='glyph',
                  x_offset=5, y_offset=5, source=source, render_mode='canvas', text_font_size='6pt')


title = 'T-SNE visualization of Trump\'s twitts'

# NOTE: despite its name, `plot_lda` holds the t-SNE scatter of word vectors.
# 'save' is the canonical name of the save tool (replaces the legacy
# 'previewsave' alias).
plot_lda = figure(plot_width=1000, plot_height=600,
                  title=title, tools="pan,wheel_zoom,box_zoom,reset,hover,save",
                  x_axis_type=None, y_axis_type=None, min_border=1)

plot_lda.scatter(x='x', y='y',
                 source=source,
                 alpha=0.8, size=10)


plot_lda.add_layout(labels)

# Show the word itself when hovering over a point.
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content"}

# No legend is configured: the scatter call defines no legend entries, so
# there is nothing for a legend location setting to act on.

show(plot_lda)