In [48]:
# imports
import os
import re
import gensim
import string
import pickle 
import warnings
import pyLDAvis
import numpy as np
import pandas as pd
import seaborn as sns
from gensim import corpora
from pprint import pprint
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from gensim.models import CoherenceModel
from wordcloud import STOPWORDS,WordCloud
import pyLDAvis.gensim_models as gensimvis
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices
from sklearn.decomposition import NMF, LatentDirichletAllocation
warnings.filterwarnings("ignore")

In [3]:
# import custom libraries and scripts
# sys.path.append(os.path.abspath(os.path.join("../..")))
sys.path.append(".")
sys.path.append("..")

from defaults import *
from extract_dataframe import read_json
from extract_dataframe import TweetDfExtractor
from clean_tweets_dataframe import Clean_Tweets

In [4]:
# read csv
tweets_df = pd.read_csv("../data/clean_data.csv")
tweets_df.head()

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,status_count,retweet_count,screen_name,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,2022-08-07 22:31:20+00:00,Twitter for Android,RT @i_ameztoy: Extra random image (I):\n\nLets...,-0.125,0.190625,en,4,8097,2,i_ameztoy,i_ameztoy,20497,2621,unknown,"[{'text': 'City', 'indices': [132, 137]}]","[{'screen_name': 'i_ameztoy', 'name': 'Iban Am...",unknown
1,2022-08-07 22:31:16+00:00,Twitter for Android,RT @IndoPac_Info: #China's media explains the ...,-0.1,0.1,en,691,5831,201,ZIisq,ZIisq,65,272,unknown,"[{'text': 'China', 'indices': [18, 24]}, {'tex...","[{'screen_name': 'IndoPac_Info', 'name': 'Indo...",unknown
2,2022-08-07 22:31:07+00:00,Twitter for Android,"China even cut off communication, they don't a...",0.0,0.0,en,0,1627,0,Fin21Free,Fin21Free,85,392,unknown,"[{'text': 'XiJinping', 'indices': [127, 137]}]","[{'screen_name': 'ZelenskyyUa', 'name': 'Волод...",Netherlands
3,2022-08-07 22:31:06+00:00,Twitter for Android,"Putin to #XiJinping : I told you my friend, Ta...",0.1,0.35,en,0,1627,0,Fin21Free,Fin21Free,85,392,unknown,"[{'text': 'XiJinping', 'indices': [9, 19]}]",[],Netherlands
4,2022-08-07 22:31:04+00:00,Twitter for iPhone,"RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.55625,en,1521,18958,381,VizziniDolores,VizziniDolores,910,2608,unknown,[],"[{'screen_name': 'ChinaUncensored', 'name': 'C...","Ayent, Schweiz"


In [5]:
cleaner = Clean_Tweets(tweets_df.copy())

Automation in Action...!!!


### Sentiment analysis

In [11]:
# select necessary columns
cleanTweet = tweets_df[["original_text", "polarity", 'subjectivity']]
cleanTweet

Unnamed: 0,original_text,polarity,subjectivity
0,RT @i_ameztoy: Extra random image (I):\n\nLets...,-1.250000e-01,0.190625
1,RT @IndoPac_Info: #China's media explains the ...,-1.000000e-01,0.100000
2,"China even cut off communication, they don't a...",0.000000e+00,0.000000
3,"Putin to #XiJinping : I told you my friend, Ta...",1.000000e-01,0.350000
4,"RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.556250
...,...,...,...
7435,RT @metesohtaoglu: 📌📸 Map of #China's possible...,0.000000e+00,1.000000
7436,RT @NEVERBOW: China is doing #exactly what #Ru...,2.500000e-01,0.250000
7437,Minister Wu is crystal clear in his @BBCNews i...,1.583333e-01,0.419444
7438,Reports say that #China is planning to seize #...,0.000000e+00,0.000000


In [8]:
def text_category(p):
    """
    converts polarity into sentiment category
    """
    if p > 0:
        return "positive"
    elif p < 0:
        return "negative"
    else:
        return "neutral"

In [12]:
cleanTweet["score"] = cleanTweet["polarity"].apply(text_category)
cleanTweet

Unnamed: 0,original_text,polarity,subjectivity,score
0,RT @i_ameztoy: Extra random image (I):\n\nLets...,-1.250000e-01,0.190625,negative
1,RT @IndoPac_Info: #China's media explains the ...,-1.000000e-01,0.100000,negative
2,"China even cut off communication, they don't a...",0.000000e+00,0.000000,neutral
3,"Putin to #XiJinping : I told you my friend, Ta...",1.000000e-01,0.350000,positive
4,"RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.556250,negative
...,...,...,...,...
7435,RT @metesohtaoglu: 📌📸 Map of #China's possible...,0.000000e+00,1.000000,neutral
7436,RT @NEVERBOW: China is doing #exactly what #Ru...,2.500000e-01,0.250000,positive
7437,Minister Wu is crystal clear in his @BBCNews i...,1.583333e-01,0.419444,positive
7438,Reports say that #China is planning to seize #...,0.000000e+00,0.000000,neutral


In [13]:
sentiment = cleanTweet['score'].value_counts()
sentiment

neutral     2894
positive    2768
negative    1778
Name: score, dtype: int64

#### as we can see there are:
    * 2894 neutral sentiment
    * 2768 positive sentiment
    * 1778 negative sentiment
#### tweets

#### removing neutral sentiments

In [14]:
# remove neutral tweets
cleanTweet = cleanTweet[cleanTweet['score'] != 'neutral']
cleanTweet

Unnamed: 0,original_text,polarity,subjectivity,score
0,RT @i_ameztoy: Extra random image (I):\n\nLets...,-1.250000e-01,0.190625,negative
1,RT @IndoPac_Info: #China's media explains the ...,-1.000000e-01,0.100000,negative
3,"Putin to #XiJinping : I told you my friend, Ta...",1.000000e-01,0.350000,positive
4,"RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.556250,negative
5,RT @benedictrogers: We must not let this happe...,2.000000e-01,0.500000,positive
...,...,...,...,...
7432,@kiwispillow @dbongino No ones buying this non...,1.000000e-01,1.000000,positive
7433,Senior Taiwanese missile development official ...,-2.000000e-01,0.400000,negative
7436,RT @NEVERBOW: China is doing #exactly what #Ru...,2.500000e-01,0.250000,positive
7437,Minister Wu is crystal clear in his @BBCNews i...,1.583333e-01,0.419444,positive


In [16]:
# add score map column
def scoremap(score):
    """
    maps sentiment score to 0 and 1
    """
    if score == "positive":
        return 1
    else:
        return 0

In [17]:
cleanTweet['scoremap'] = cleanTweet["score"].map(scoremap)
cleanTweet

Unnamed: 0,original_text,polarity,subjectivity,score,scoremap
0,RT @i_ameztoy: Extra random image (I):\n\nLets...,-1.250000e-01,0.190625,negative,0
1,RT @IndoPac_Info: #China's media explains the ...,-1.000000e-01,0.100000,negative,0
3,"Putin to #XiJinping : I told you my friend, Ta...",1.000000e-01,0.350000,positive,1
4,"RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.556250,negative,0
5,RT @benedictrogers: We must not let this happe...,2.000000e-01,0.500000,positive,1
...,...,...,...,...,...
7432,@kiwispillow @dbongino No ones buying this non...,1.000000e-01,1.000000,positive,1
7433,Senior Taiwanese missile development official ...,-2.000000e-01,0.400000,negative,0
7436,RT @NEVERBOW: China is doing #exactly what #Ru...,2.500000e-01,0.250000,positive,1
7437,Minister Wu is crystal clear in his @BBCNews i...,1.583333e-01,0.419444,positive,1


In [21]:
(X, y) = cleanTweet['original_text'], cleanTweet['scoremap']
trigram_vectorizer = CountVectorizer(ngram_range=(1, 3))
trigram_vectorizer.fit(X.values)

In [22]:
X_trigram_vectorizer = trigram_vectorizer.transform(X.values)

In [24]:
def train_and_show_scores(X: csr_matrix, y: np.array, title: str) -> None:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, stratify=y
    )

    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print(f'{title}\nTrain score: {round(train_score, 2)} ; Test score: {round(test_score, 2)}\n')

In [25]:
train_and_show_scores(X_trigram_vectorizer, y.values, 'Trigram Counts')

Trigram Counts
Train score: 1.0 ; Test score: 0.8



### Topic modeling

In [27]:
tweets_df.columns

Index(['created_at', 'source', 'original_text', 'polarity', 'subjectivity',
       'lang', 'favorite_count', 'status_count', 'retweet_count',
       'screen_name', 'original_author', 'followers_count', 'friends_count',
       'possibly_sensitive', 'hashtags', 'user_mentions', 'place'],
      dtype='object')

In [28]:
#text Preprocessing
tweets_df['original_text']=tweets_df['original_text'].astype(str)
tweets_df['original_text'] = tweets_df['original_text'].apply(lambda x: x.lower())
tweets_df['original_text']= tweets_df['original_text'].apply(lambda x: x.translate(str.maketrans(' ', ' ', string.punctuation)))

In [29]:
#Converting tweets to list of words For feature engineering
sentence_list = [tweet for tweet in tweets_df['original_text']]
word_list = [sent.split() for sent in sentence_list]

In [30]:
word_list

[['rt',
  'iameztoy',
  'extra',
  'random',
  'image',
  'i',
  'lets',
  'focus',
  'in',
  'one',
  'very',
  'specific',
  'zone',
  'of',
  'the',
  'western',
  'coast',
  'gt',
  'longjing',
  'district',
  'taichung',
  'city',
  'ta…'],
 ['rt',
  'indopacinfo',
  'chinas',
  'media',
  'explains',
  'the',
  'military',
  'reasons',
  'for',
  'each',
  'area',
  'of',
  'the',
  'drills',
  'in',
  'the',
  'taiwan',
  'strait',
  'read',
  'the',
  'labels',
  'in',
  'the',
  'pi…'],
 ['china',
  'even',
  'cut',
  'off',
  'communication',
  'they',
  'dont',
  'anwer',
  'phonecalls',
  'from',
  'the',
  'us',
  'but',
  'here',
  'clown',
  'zelenskyyua',
  'enters',
  'the',
  'stage',
  'to',
  'ask',
  'xijinping',
  'to',
  'change',
  'putins',
  'mind'],
 ['putin',
  'to',
  'xijinping',
  'i',
  'told',
  'you',
  'my',
  'friend',
  'taiwan',
  'will',
  'be',
  'a',
  'vassal',
  'state',
  'including',
  'nukes',
  'much',
  'like',
  'the',
  'ukrainian',
  '

In [32]:
len(word_list), type(word_list)

(7440, list)

In [33]:
#Create dictionary which contains Id and word 
id2word = corpora.Dictionary(word_list)
corpus= [id2word.doc2bow(tweet) for tweet in word_list]

In [37]:
print(id2word)

Dictionary<25113 unique tokens: ['city', 'coast', 'district', 'extra', 'focus']...>


In [39]:
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1)],
 [(9, 2),
  (12, 1),
  (15, 1),
  (19, 5),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1)],
 [(19, 2),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 2),
  (58, 1),
  (59, 1),
  (60, 1)],
 [(6, 2),
  (19, 1),
  (24, 1),
  (37, 1),
  (40, 1),
  (57, 2),
  (59, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1

In [40]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [43]:
pprint(lda_model.show_topics(formatted=False))

[(0,
  [('taiwan', 0.061315455),
   ('the', 0.05964462),
   ('to', 0.040232584),
   ('china', 0.035543233),
   ('of', 0.030504225),
   ('and', 0.027452258),
   ('in', 0.023780696),
   ('is', 0.022769393),
   ('rt', 0.0222886),
   ('a', 0.022058424)]),
 (1,
  [('you', 0.025121083),
   ('we', 0.019022327),
   ('i', 0.017762445),
   ('taiwanstraitscrisis', 0.010783229),
   ('their', 0.010219298),
   ('your', 0.010165102),
   ('or', 0.009268895),
   ('if', 0.008947265),
   ('now', 0.008390369),
   ('time', 0.008370824)]),
 (2,
  [('russia', 0.024044784),
   ('war', 0.02392964),
   ('ukraine', 0.023536507),
   ('usa', 0.022903256),
   ('speakerpelosi', 0.010289242),
   ('israel', 0.009817532),
   ('against', 0.008910389),
   ('japan', 0.008573305),
   ('taiwanchina', 0.007948937),
   ('america', 0.0077911797)]),
 (3,
  [('taiwanese', 0.02229605),
   ('who', 0.013180205),
   ('missile', 0.011471781),
   ('found', 0.009427182),
   ('global', 0.008946125),
   ('missiles', 0.0082056075),
   ('a

In [44]:
# Compute Perplexity

#It's a measure of how good the model is. The lower the better. Perplexity is a negative value
print('Perplexity: ', lda_model.log_perplexity(corpus))  
doc_lda = lda_model[corpus]


# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=word_list, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()

Perplexity:  -8.240797422616497
Ldamodel Coherence Score/Accuracy on Tweets:  0.37401248504713375


In [46]:
print('Lda model Coherence Score/Accuracy on Tweets: ', coherence_lda)

Lda model Coherence Score/Accuracy on Tweets:  0.37401248504713375


In [49]:
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
LDAvis_prepared