# Add Phrases (Bigrams)
Using gensim.models.phrases.Phrases and gensim.models.phrases.Phraser to add phrases (bigrams) automatically.

Reference: https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

Correct way to tokenize when using gensim.models.phrases.Phrases: https://stackoverflow.com/questions/50009030/correct-way-of-using-phrases-and-preprocess-string-gensim

In [1]:
import pandas as pd
import gensim
import nltk
nltk.download('wordnet')


from sklearn.manifold import TSNE


from tqdm import tqdm


from bokeh.plotting import figure, show, output_notebook, save
from bokeh.models import HoverTool, value, LabelSet, Legend, ColumnDataSource
output_notebook()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# Preprocess

In [2]:
def read_dataset(filename):
    """
    Load a CSV file and return its 'text' column.

    filename: path (or anything pandas.read_csv accepts) of the CSV to read.
    Returns the 'text' column as a pandas Series; a missing column raises KeyError.
    """
    frame = pd.read_csv(filename)
    text_column = frame['text']
    return text_column

def preprocess(raw_docs, has_stopwords=False):
    """
    Normalize and tokenize every document, then optionally filter and lemmatize.

    raw_docs: iterable of raw text strings (one per document).
    has_stopwords: when falsy (default), English stopwords are dropped and each
        surviving token is lemmatized as a verb. When truthy, stopwords are
        kept and NO lemmatization is applied, so the tokens stay in their
        surface form.

    The tokens 'https', 'rt' and 'amp' are removed in both modes.

    Returns a list with one list of tokens per input document, in input order.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Presumably URL / retweet / HTML-entity residue left over after
    # tokenizing tweets; always discarded regardless of has_stopwords.
    stopwords_http = frozenset(['https', 'rt', 'amp'])
    english_stopwords = gensim.parsing.preprocessing.STOPWORDS

    docs = []

    for raw_doc in tqdm(raw_docs):
        # simple_preprocess performs the normalization and tokenization.
        tokens = [t for t in gensim.utils.simple_preprocess(raw_doc)
                  if t not in stopwords_http]

        # Truthiness test rather than `is False`, so flags such as 0 or
        # numpy.bool_(False) are honoured instead of silently keeping stopwords.
        if not has_stopwords:
            # The stopword check uses the surface form; only surviving tokens
            # are lemmatized (pos='v' treats every token as a verb).
            tokens = [lemmatizer.lemmatize(t, pos='v')
                      for t in tokens
                      if t not in english_stopwords]

        docs.append(tokens)

    return docs


# def simple_preprocess(raw_docs):
#     docs = [list(gensim.utils.tokenize(d, lower=True)) for d in raw_docs]
    
#     return docs

In [3]:
# Path of the tweet CSV; read_dataset() returns only its 'text' column.
filename_dataset = './datasets/twitter_trump_2019_0101-2019_0531.csv'

raw_docs = read_dataset(filename_dataset)
# Default has_stopwords=False: stopwords are removed and the remaining tokens
# are lemmatized. These token lists are what Word2Vec is later trained on
# (after bigram detection).
docs = preprocess(raw_docs)

100%|██████████| 2125/2125 [00:02<00:00, 740.12it/s]


In [4]:
# Second pass over the same raw text that keeps the stopwords (and therefore
# skips lemmatization); this version feeds the bigram (Phrases) model.
docs_with_stopwords  = preprocess(raw_docs, has_stopwords=True) 
# docs_with_stopwords = simple_preprocess(raw_docs)

100%|██████████| 2125/2125 [00:00<00:00, 16421.64it/s]


# \*\*\* Create Bigram Model Using Documents w/ **STOPWORDS**
The bigram model is trained on documents that still contain stopwords; otherwise it fails to generate good results.

Reference: https://stackoverflow.com/questions/50009030/correct-way-of-using-phrases-and-preprocess-string-gensim

In [5]:
# Learn collocation statistics from the stopword-preserving token lists.
# min_count=1 and threshold=2 are lower than gensim's defaults (5 and 10.0),
# presumably because the corpus is small — TODO confirm the intent.
phrase = gensim.models.phrases.Phrases(docs_with_stopwords, min_count=1, threshold=2)
# Wrap the trained Phrases model in a Phraser, which is what later cells apply
# to token lists via indexing (bigram_model[tokens]).
bigram_model = gensim.models.phrases.Phraser(phrase)

# Validate the Bigram

In [6]:
# Spot-check one document: raw text, its stopword-preserving tokens, and the
# bigram model's output on both token variants.
index_doc = 8
print('raw doc:\n{}\n'.format(raw_docs[index_doc]))
print('doc w/ stopwords:\n{}\n'.format(docs_with_stopwords[index_doc]))
# print(docs[index_doc])
print('Bigrams of doc w/ stopwords:')
print(bigram_model[docs_with_stopwords[index_doc]])
print()

# NOTE(review): `docs` is stopword-filtered and lemmatized, whereas the bigram
# model was trained on unlemmatized tokens, so some learned pairs may not match
# here.
print('Bigrams of doc w/0 stopwords:')
print(bigram_model[docs[index_doc]])

raw doc:
I was not informed about anything having to do with the Navy Ship USS John S. McCain during my recent visit to Japan. Nevertheless @FLOTUS and I loved being with our great Military Men and Women - what a spectacular job they do!

doc w/ stopwords:
['was', 'not', 'informed', 'about', 'anything', 'having', 'to', 'do', 'with', 'the', 'navy', 'ship', 'uss', 'john', 'mccain', 'during', 'my', 'recent', 'visit', 'to', 'japan', 'nevertheless', 'flotus', 'and', 'loved', 'being', 'with', 'our', 'great', 'military', 'men', 'and', 'women', 'what', 'spectacular', 'job', 'they', 'do']

Bigrams of doc w/ stopwords:
['was_not', 'informed', 'about_anything', 'having', 'to_do', 'with_the', 'navy', 'ship', 'uss', 'john_mccain', 'during_my', 'recent', 'visit_to', 'japan', 'nevertheless', 'flotus', 'and', 'loved', 'being_with', 'our_great', 'military', 'men_and', 'women', 'what', 'spectacular', 'job', 'they_do']

Bigrams of doc w/0 stopwords:
['inform', 'have', 'navy', 'ship', 'uss', 'john_mccain'

# Generate Docs with Bigrams

In [7]:
# Apply the bigram model to the whole stopword-free, lemmatized corpus.
docs_with_phrases = bigram_model[docs]

# Corpus used for vocabulary building and training in the following cells; the
# trailing comment hints that the plain `docs` corpus is the alternative choice.
docs_target = docs_with_phrases  # doc

# Init Program

In [8]:
size_feature = 150  # dimensionality of each word vector
window = 10  # context window size passed to Word2Vec
min_count = 2   # ignore words with total frequency lower than this value.
# No corpus is passed to the constructor, so the model is created untrained;
# the vocabulary is built explicitly below and training runs in a later cell.
model = gensim.models.Word2Vec(size=size_feature,
                              window=window,
                              min_count=min_count,
                              sg=1, # 1: Skip-Gram. 0: CBOW
                              workers=1)

# Scan the phrase-augmented corpus once to build the vocabulary.
model.build_vocab(docs_target)


# Inspect the Results

In [9]:
# Vocabulary size after build_vocab (words rarer than min_count are excluded).
print('Size of vocab: {}'.format(len(model.wv.vocab)))
print()
# Peek at the first 100 entries; underscore-joined tokens are detected bigrams.
print('The first 100 words:\n{}'.format(list(model.wv.vocab.keys())[:100]))

Size of vocab: 3065

The first 100 words:
['robert_mueller', 'come', 'oval_office', 'potential', 'seek', 'name', 'director', 'fbi', 'position', 'years', 'tell', 'day', 'special_counsel', 'total', 'conflict', 'nice', 'comey', 'brennan', 'turn', 'kilmeade', 'congressman', 'john', 'ratcliffe', 'trump_campaign', 'clearly', 'conspire', 'collude', 'foxnews', 'fight', 'phony_crime', 'exist', 'horrendous_false', 'shouldn_fight', 'sit', 'obstruction', 'mueller', 'presidential_harassment', 'russia_russia', 'russia', 'hear', 'begin', 'witch_hunt', 'hoax', 'disappear', 'help', 'elect', 'crime', 'dems', 'partner', 'fake_news', 'media', 'greatest', 'history', 'spend', 'dark', 'unlimited', 'access', 'people', 'resources', 'cooperation', 'highly', 'bring', 'charge', 'inform', 'have', 'navy', 'ship', 'uss', 'john_mccain', 'recent', 'visit', 'japan', 'flotus', 'love', 'great_military', 'men_women', 'spectacular', 'job', 'great', 'tonight', 'seanhannity_foxnews', 'number', 'far', 'mark_levin', 'congrats'

# Train Program

In [10]:
# Train for 30 epochs over the phrase-augmented corpus. total_examples tells
# gensim how many documents one pass contains; report_delay is the number of
# seconds between progress reports.
results = model.train(sentences=docs_target,
                     total_examples=len(docs_target),
                     epochs=30,
                     report_delay=1)

# Bare expression so the notebook displays the value returned by train().
results

(646787, 773580)

# Save Model

In [11]:
# Persist the trained model; the same path is reused by the restore cell.
# Assumes the ./models directory already exists — TODO confirm.
filename_model = './models/trump_twitts_with_phrases.word2vec'
model.save(filename_model)

# Restore Model

In [12]:
# Reload the model from disk, rebinding `model` to the loaded copy.
model = gensim.models.Word2Vec.load(filename_model)

# Test Some Words

In [21]:
# Nearest neighbours of the 'donald_trump' bigram token (seed word of the 2nd figure).
model.wv.most_similar('donald_trump')

[('throw', 0.7470657825469971),
 ('suppose', 0.7421317100524902),
 ('gopleader', 0.7401736378669739),
 ('jr', 0.7380250692367554),
 ('shred', 0.7378155589103699),
 ('clearly', 0.7351237535476685),
 ('stevehiltonx', 0.7343442440032959),
 ('jessebwatters', 0.7274690270423889),
 ('supporters', 0.7269744277000427),
 ('biggest_scandal', 0.7187231779098511)]

In [20]:
# Nearest neighbours of the 'hillary_clinton' bigram token (seed word of the 2nd figure).
model.wv.most_similar('hillary_clinton')

[('crook', 0.8719490766525269),
 ('oh', 0.8405627012252808),
 ('davis', 0.8357037901878357),
 ('data', 0.8278486728668213),
 ('democratic_national', 0.8071596622467041),
 ('wash', 0.8057330250740051),
 ('acid', 0.8051459193229675),
 ('russia_investigation', 0.8020377159118652),
 ('tomfitton_fbi', 0.8002674579620361),
 ('roger_stone', 0.7979101538658142)]

In [26]:
# Nearest neighbours of the 'kim_jong' bigram token (seed word of the 2nd figure).
model.wv.most_similar('kim_jong')

[('economic_powerhouse', 0.8804218769073486),
 ('summit', 0.8724449872970581),
 ('nuclear_weapons', 0.8592386841773987),
 ('chairman_kim', 0.8500730991363525),
 ('north_korea', 0.8458744287490845),
 ('dinner', 0.8355292677879333),
 ('awesome', 0.8307632803916931),
 ('wise', 0.8241710662841797),
 ('continuation', 0.8207369446754456),
 ('hanoi_vietnam', 0.8188294768333435)]

In [14]:
# Nearest neighbours of 'china' (seed word of the 2nd figure).
model.wv.most_similar('china')

[('tariff', 0.7596133351325989),
 ('trade_negotiations', 0.7594571113586426),
 ('subsidize', 0.7531495690345764),
 ('asia', 0.7526028752326965),
 ('renegotiate', 0.7512441873550415),
 ('best_idea', 0.7504585981369019),
 ('large_degree', 0.7472637891769409),
 ('greatly_slow', 0.7443885207176208),
 ('product_inside', 0.7406420707702637),
 ('automatically_speed', 0.7401478886604309)]

In [52]:
# Nearest neighbours of 'women' (seed word of the 2nd figure).
model.wv.most_similar('women')

[('great_interview', 0.8672328591346741),
 ('great_men', 0.8204637765884399),
 ('brave_men', 0.8119024634361267),
 ('grateful', 0.7967354655265808),
 ('balance', 0.7953092455863953),
 ('founder', 0.7947789430618286),
 ('incredible_men', 0.7894577980041504),
 ('believer', 0.7850879430770874),
 ('ways', 0.7794942259788513),
 ('amy', 0.7754597067832947)]

In [33]:
# Nearest neighbours of 'mexico' (seed word of the 2nd figure).
model.wv.most_similar('mexico')

[('migration', 0.8289365768432617),
 ('illegals', 0.8092644214630127),
 ('big_caravan', 0.8062359094619751),
 ('sadly_murder', 0.8034271597862244),
 ('strong_immigration', 0.7937170267105103),
 ('big_contributor', 0.7895392179489136),
 ('stop_illegals', 0.7876311540603638),
 ('long_march', 0.7849839925765991),
 ('americans_die', 0.7693694233894348),
 ('coyotes', 0.7673327326774597)]

In [38]:
# topic 0 — 'america' is the first seed word of the first t-SNE figure.
model.wv.most_similar('america')

[('veterans', 0.651187002658844),
 ('caldwell', 0.6485228538513184),
 ('good_day', 0.6383963823318481),
 ('credit_dan', 0.633927583694458),
 ('brave', 0.6325311660766602),
 ('modern', 0.6265507340431213),
 ('heroes', 0.6232959032058716),
 ('heart', 0.6206722855567932),
 ('whitehouse_happy', 0.6043614149093628),
 ('worker', 0.604219377040863)]

In [39]:
# topic 1 — 'job' is the second seed word of the first t-SNE figure.
model.wv.most_similar('job')

[('april_unemployment', 0.8094142079353333),
 ('april', 0.7974698543548584),
 ('roar', 0.7521603107452393),
 ('shocker', 0.7327584028244019),
 ('surge', 0.7264478206634521),
 ('fell', 0.7083963751792908),
 ('create', 0.7025642991065979),
 ('expectations', 0.6824380159378052),
 ('percent', 0.6718088388442993),
 ('gdp_growth', 0.6691694855690002)]

In [40]:
# topic 2 — 'dbongino' is the third seed word of the first t-SNE figure.
model.wv.most_similar('dbongino')

[('endorse', 0.8733628392219543),
 ('firefighter', 0.8590326309204102),
 ('retire', 0.8258934020996094),
 ('piss', 0.8217328190803528),
 ('dues', 0.8202773928642273),
 ('union_leadership', 0.81878262758255),
 ('union_dues', 0.8125136494636536),
 ('membership', 0.7974640130996704),
 ('support_trump', 0.793086051940918),
 ('fireman', 0.7831147909164429)]

# Get Similar Words

In [43]:
# Seed words for the first figure; each is followed in `similar_words` by the
# neighbours that most_similar() returns for it, in ranking order.
key_words = ['america', 'job', 'dbongino']
similar_words = []

for key_w in key_words:
    similar_words.append(key_w)
    # most_similar yields (word, score) pairs; only the word is kept.
    similar_words.extend(s_w for s_w, _ in model.wv.most_similar(key_w))

print(similar_words)

['america', 'veterans', 'caldwell', 'good_day', 'credit_dan', 'brave', 'modern', 'heroes', 'heart', 'whitehouse_happy', 'worker', 'job', 'april_unemployment', 'april', 'roar', 'shocker', 'surge', 'fell', 'create', 'expectations', 'percent', 'gdp_growth', 'dbongino', 'endorse', 'firefighter', 'retire', 'piss', 'dues', 'union_leadership', 'union_dues', 'membership', 'support_trump', 'fireman']


# Display Words using t-SNE

In [44]:
# total_words = list(model.wv.vocab)
# features = model.wv.__getitem__(total_words)

# Plot only the seed words and their neighbours instead of the full vocabulary.
total_words = similar_words
# Indexing the keyed vectors with a list of words returns one row per word.
features = model.wv[total_words]

print(total_words[:10])
print('Length = {}\nfeature size = {}'.format(*features.shape))
print('feature max: {}, min: {}'.format(features.max(), features.min()))

['america', 'veterans', 'caldwell', 'good_day', 'credit_dan', 'brave', 'modern', 'heroes', 'heart', 'whitehouse_happy']
Length = 33
feature size = 150
feature max: 0.924269437789917, min: -0.9417712688446045


In [45]:
# Fixed random_state so the 2-D layout can be reproduced between runs.
SEED = 0
# Project the selected word vectors down to two dimensions for plotting.
tsne = TSNE(perplexity=20, n_components=2, random_state=SEED)
X_tsne = tsne.fit_transform(features)

In [48]:
# Split the 2-D embedding into the two plot coordinates.
x_tsne, y_tsne = X_tsne[:, 0], X_tsne[:, 1]
# The same word list serves as both the on-canvas label and the hover text.
label = contents = total_words

# One colour per seed-word group (up to 11 groups).
cluster_colors = dict(enumerate([
    'blue', 'green', 'yellow', 'red', 'skyblue', 'salmon',
    'orange', 'maroon', 'crimson', 'black', 'gray',
]))

# total_words is laid out in runs of 11 (a seed word followed by its 10
# neighbours), so integer division by 11 yields the group index of each word.
topic_colors = [cluster_colors[position // 11] for position in range(len(total_words))]

# Column names here are the ones referenced by the glyph, label and tooltip specs.
settings = {
    'x': x_tsne,
    'y': y_tsne,
    'label': label,
    'color': topic_colors,
    'content': contents,
}
source = ColumnDataSource(settings)

# Text labels drawn next to every point.
labels = LabelSet(
    x='x', y='y', text='label', level='glyph',
    x_offset=5, y_offset=5,
    source=source, render_mode='canvas', text_font_size='6pt',
)

title = "T-SNE visualization of Trump's twitts"

# Axes are hidden: t-SNE coordinates carry no interpretable units.
plot_lda = figure(
    plot_width=1000, plot_height=600,
    title=title,
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1,
)

plot_lda.scatter(x='x', y='y', source=source, color='color', alpha=0.8, size=10)
plot_lda.add_layout(labels)

# Show the word itself when hovering over a point.
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content"}
# NOTE(review): the scatter call passes no legend, so this setting presumably
# has no visible effect — confirm before relying on it.
plot_lda.legend.location = "top_left"

show(plot_lda)

# Plot 2nd Figure

# Get Similar Words

In [61]:
# Seed words for the second figure; each is followed in `similar_words` by the
# neighbours that most_similar() returns for it, in ranking order.
key_words = ['donald_trump', 'hillary_clinton', 'kim_jong', 'china', 'women', 'mexico']
similar_words = []

for key_w in key_words:
    similar_words.append(key_w)
    # most_similar yields (word, score) pairs; only the word is kept.
    similar_words.extend(s_w for s_w, _ in model.wv.most_similar(key_w))

print(similar_words)

['donald_trump', 'throw', 'suppose', 'gopleader', 'jr', 'shred', 'clearly', 'stevehiltonx', 'jessebwatters', 'supporters', 'biggest_scandal', 'hillary_clinton', 'crook', 'oh', 'davis', 'data', 'democratic_national', 'wash', 'acid', 'russia_investigation', 'tomfitton_fbi', 'roger_stone', 'kim_jong', 'economic_powerhouse', 'summit', 'nuclear_weapons', 'chairman_kim', 'north_korea', 'dinner', 'awesome', 'wise', 'continuation', 'hanoi_vietnam', 'china', 'tariff', 'trade_negotiations', 'subsidize', 'asia', 'renegotiate', 'best_idea', 'large_degree', 'greatly_slow', 'product_inside', 'automatically_speed', 'women', 'great_interview', 'great_men', 'brave_men', 'grateful', 'balance', 'founder', 'incredible_men', 'believer', 'ways', 'amy', 'mexico', 'migration', 'illegals', 'big_caravan', 'sadly_murder', 'strong_immigration', 'big_contributor', 'stop_illegals', 'long_march', 'americans_die', 'coyotes']


# Display Words using t-SNE

In [62]:
# total_words = list(model.wv.vocab)
# features = model.wv.__getitem__(total_words)

# Plot only the seed words and their neighbours instead of the full vocabulary.
total_words = similar_words
# Indexing the keyed vectors with a list of words returns one row per word.
features = model.wv[total_words]

print(total_words[:10])
print('Length = {}\nfeature size = {}'.format(*features.shape))
print('feature max: {}, min: {}'.format(features.max(), features.min()))

['donald_trump', 'throw', 'suppose', 'gopleader', 'jr', 'shred', 'clearly', 'stevehiltonx', 'jessebwatters', 'supporters']
Length = 66
feature size = 150
feature max: 1.0633093118667603, min: -0.9720051288604736


In [63]:
# Fixed random_state so the 2-D layout can be reproduced between runs.
SEED = 0
# Project the selected word vectors down to two dimensions for plotting.
tsne = TSNE(perplexity=20, n_components=2, random_state=SEED)
X_tsne = tsne.fit_transform(features)

In [64]:
# Split the 2-D embedding into the two plot coordinates.
x_tsne, y_tsne = X_tsne[:, 0], X_tsne[:, 1]
# The same word list serves as both the on-canvas label and the hover text.
label = contents = total_words

# One colour per seed-word group (up to 11 groups).
cluster_colors = dict(enumerate([
    'blue', 'green', 'yellow', 'red', 'skyblue', 'salmon',
    'orange', 'maroon', 'crimson', 'black', 'gray',
]))

# total_words is laid out in runs of 11 (a seed word followed by its 10
# neighbours), so integer division by 11 yields the group index of each word.
topic_colors = [cluster_colors[position // 11] for position in range(len(total_words))]

# Column names here are the ones referenced by the glyph, label and tooltip specs.
settings = {
    'x': x_tsne,
    'y': y_tsne,
    'label': label,
    'color': topic_colors,
    'content': contents,
}
source = ColumnDataSource(settings)

# Text labels drawn next to every point.
labels = LabelSet(
    x='x', y='y', text='label', level='glyph',
    x_offset=5, y_offset=5,
    source=source, render_mode='canvas', text_font_size='6pt',
)

title = "T-SNE visualization of Trump's twitts"

# Axes are hidden: t-SNE coordinates carry no interpretable units.
plot_lda = figure(
    plot_width=1000, plot_height=600,
    title=title,
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1,
)

plot_lda.scatter(x='x', y='y', source=source, color='color', alpha=0.8, size=10)
plot_lda.add_layout(labels)

# Show the word itself when hovering over a point.
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"content": "@content"}
# NOTE(review): the scatter call passes no legend, so this setting presumably
# has no visible effect — confirm before relying on it.
plot_lda.legend.location = "top_left"

show(plot_lda)