In [1]:
import pandas as pd
import numpy as np
import preprocessor as p
import re
from gensim.parsing.preprocessing import remove_stopwords
import gensim
from gensim.models.phrases import Phraser, Phrases

In [2]:
# Loading Tweets
# The CSV path is relative to the notebook's working directory.
tweets_csv = "../data/tweets.csv"
df = pd.read_csv(tweets_csv)

# Preview the raw tweet text column.
df["tweet"]

0       stopped doing that 2 years ago.\n\nswitched to...
1                 Thank you @iambolajiayo for resharing 😌
2                                            My pleasure~
3       Day 63:\n\n#100DaysOfCode\n#hashnode\n\nI got ...
4                       thank you so much for the article
                              ...                        
4311    20 Best React Landing Page Templates\n{ by Ale...
4312    @hashnode: Check out my developer blog.  https...
4313    DAY 13 OF #100DAYSOFGADS2020\n{ by Moseti Zach...
4314    Ahhhhhh... You followed me sha! 🤣🤣🤣...\n\nI'm ...
4315                                                  😌😌😌
Name: tweet, Length: 4316, dtype: object

In [3]:
def vaccum_cleaner(tweet):
    """Normalise one raw tweet into a lowercase, space-separated token string.

    Processing order:
      1. ``p.clean`` (tweet-preprocessor) with the URL, MENTION, EMOJI,
         SMILEY, NUMBER and RESERVED options enabled.
      2. Lowercase the text and replace every character outside ``[a-z0-9]``
         with a space. Apostrophes are replaced too, so contractions leave
         fragments behind (e.g. "don't" -> "don t").
      3. ``remove_stopwords`` (gensim).

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-``str`` value (e.g. the float NaN that
        ``pd.read_csv`` produces for an empty cell, or ``None``) is treated
        as an empty tweet.

    Returns
    -------
    str
        The cleaned text, or ``""`` when ``tweet`` is not a string.
    """
    # Guard against missing values: NaN/None have no .lower() and would make
    # the cleaning pipeline raise in the middle of a Series.map call.
    if not isinstance(tweet, str):
        return ""

    # Options are (re)applied on every call so the result does not depend on
    # whatever options other code may have set on the shared preprocessor module.
    p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.EMOJI, p.OPT.SMILEY, p.OPT.NUMBER, p.OPT.RESERVED)
    tweet = p.clean(tweet)
    # Lowercase first so the character class only needs a-z.
    tweet = re.sub(r"[^a-z0-9]", ' ', tweet.lower())
    tweet = remove_stopwords(tweet)
    return tweet

In [4]:
# Run the cleaner over every raw tweet and keep the result next to the original text.
df["clean_tweet"] = df["tweet"].apply(vaccum_cleaner)

In [5]:
# For training word embedding models, build a list of sentences, where each
# sentence is a list of word tokens (one sentence per cleaned tweet).
#
# str.split() with no argument is used instead of split(" "): a tweet that was
# cleaned down to "" (e.g. an emoji-only tweet) then yields an empty sentence []
# rather than [""], which would otherwise add an empty-string "word" to the
# vocabulary seen by the phrase and embedding models.
all_sentences = [tweet.split() for tweet in df["clean_tweet"]]
all_sentences[:5]

[['stopped',
  'years',
  'ago',
  'switched',
  'markdown',
  'articles',
  'source',
  'code'],
 ['thank', 'resharing'],
 ['pleasure'],
 ['day',
  '100daysofcode',
  'hashnode',
  'got',
  'fed',
  'coding',
  'got',
  'stuck',
  'problem',
  'past',
  'days',
  't',
  'feel',
  'like',
  'coding',
  't',
  'understand',
  'flow',
  'instead',
  'wasting',
  'time',
  'switched',
  'writing',
  'blog'],
 ['thank', 'article']]

In [6]:

# Learn frequently co-occurring word pairs from the tokenised tweets and merge
# each detected pair into a single token joined by "_" (the output below shows
# 'feel', 'like' becoming 'feel_like').
# min_count and threshold are handed straight to gensim's Phrases; see the
# gensim Phrases documentation for their exact meaning. The values used here
# look hand-picked for this small corpus -- TODO confirm.
phrases = Phrases(all_sentences, min_count=5, threshold=2)
bigram = Phraser(phrases)
# NOTE: this rebinds all_sentences to the phrase-merged sentences. Re-running
# this cell on its own therefore trains Phrases on already-merged tokens;
# re-run the cell that builds all_sentences from df["clean_tweet"] first.
all_sentences = list(bigram[all_sentences])
all_sentences[:10]

[['stopped',
  'years',
  'ago',
  'switched',
  'markdown',
  'articles',
  'source',
  'code'],
 ['thank', 'resharing'],
 ['pleasure'],
 ['day',
  '100daysofcode',
  'hashnode',
  'got',
  'fed',
  'coding',
  'got',
  'stuck',
  'problem',
  'past',
  'days',
  't',
  'feel_like',
  'coding',
  't',
  'understand',
  'flow',
  'instead',
  'wasting',
  'time',
  'switched',
  'writing',
  'blog'],
 ['thank', 'article'],
 ['non',
  'podcast',
  'developer',
  'hosting',
  'personal',
  'giveaway',
  'announcement'],
 ['famous',
  'podcasts',
  'tech',
  'webdevelopment',
  'technology',
  '2articles1week',
  'beginners',
  'codenewbies1'],
 ['congratulations', 'ruth'],
 ['positively',
  'lost',
  'damon',
  'schulz',
  'quick',
  'piece',
  'short',
  'path',
  'brought',
  'developer',
  'juniordeveloper',
  'javascript',
  'learning',
  'education'],
 ['thinking',
  'blogging',
  'long',
  'time',
  'couldn',
  't',
  'finally',
  'started',
  'blog',
  'introduction',
  'write',
 

In [7]:
# Train word2vec embeddings on the phrase-merged tweet sentences.
# NOTE(review): gensim documents that training with more than one worker
# thread is not exactly reproducible from run to run -- confirm whether the
# neighbour lists shown further down need to be stable.
model = gensim.models.Word2Vec(
    all_sentences,
    window=5,       # context window size passed to gensim
    min_count=15,   # vocabulary frequency cut-off passed to gensim
    epochs=100,     # number of training passes over the corpus
    workers=4,      # number of training threads
)

In [8]:
def pp(obj):
    """Print `obj` as a plain-text table by wrapping it in a pandas DataFrame.

    `obj` is passed unchanged to ``pd.DataFrame``; nothing is returned.
    """
    table = pd.DataFrame(obj)
    print(table)
    
def analogy(worda, wordb, wordc):
    """Return the best answer to "worda is to wordb as wordc is to ?".

    Queries the notebook-level `model` for the words most similar to
    ``wordb + wordc - worda`` (`wordb` and `wordc` as positive terms, `worda`
    as the negative term) and returns only the top-ranked word, without its
    similarity score.
    """
    ranked = model.wv.most_similar(
        positive=[wordb, wordc],
        negative=[worda],
    )
    top_match = ranked[0]
    return top_match[0]
print("Finding some analogies")
keywords = ['webdevelopment', 'nodejs', "beginners", "2articles1week","restapi"]
# For each JavaScript spelling, ask "<spelling> is to python as <keyword> is to ?"
# and print the keyword/answer pairs as a table followed by a separator line.
for source_word in ('js', 'javascript'):
    answers = [analogy(source_word, 'python', kw) for kw in keywords]
    pp(zip(keywords, answers))
    print("-"*10)

# Nearest neighbours of the two language names in the embedding space.
print("Finding some similarities")
python_neighbours = model.wv.most_similar("python", topn=20)
print(python_neighbours)
print("-"*10)
javascript_neighbours = model.wv.most_similar("javascript", topn=20)
print(javascript_neighbours)

Finding some analogies
                0                1
0  webdevelopment  machinelearning
1          nodejs          python3
2       beginners          python3
3  2articles1week          python3
4         restapi  machinelearning
----------
                0                1
0  webdevelopment  machinelearning
1          nodejs          python3
2       beginners          python3
3  2articles1week          python3
4         restapi          python3
----------
Finding some similarities
[('python3', 0.7117981910705566), ('machinelearning', 0.610938549041748), ('datascience', 0.5108599066734314), ('devblogging', 0.4940609037876129), ('django', 0.4740975797176361), ('beginner', 0.46559756994247437), ('2articles1week', 0.36954450607299805), ('stuff', 0.3520037531852722), ('java', 0.34186333417892456), ('womenwhocode', 0.3401317000389099), ('different', 0.34008634090423584), ('coding', 0.3388567864894867), ('data', 0.3375648260116577), ('2articles1week_javascript', 0.33489835262298584), ('d