In [23]:
# imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import warnings
warnings.filterwarnings("ignore")

In [3]:
# import custom libraries and scripts
# sys.path.append(os.path.abspath(os.path.join("../..")))
sys.path.append(".")
sys.path.append("..")

from defaults import *
from extract_dataframe import read_json
from extract_dataframe import TweetDfExtractor
from clean_tweets_dataframe import Clean_Tweets

In [4]:
# read csv
tweets_df = pd.read_csv("../data/clean_data.csv")
tweets_df.head()

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,status_count,retweet_count,screen_name,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
0,2022-08-07 22:31:20+00:00,Twitter for Android,RT @i_ameztoy: Extra random image (I):\n\nLets...,-0.125,0.190625,en,4,8097,2,i_ameztoy,i_ameztoy,20497,2621,unknown,"[{'text': 'City', 'indices': [132, 137]}]","[{'screen_name': 'i_ameztoy', 'name': 'Iban Am...",unknown
1,2022-08-07 22:31:16+00:00,Twitter for Android,RT @IndoPac_Info: #China's media explains the ...,-0.1,0.1,en,691,5831,201,ZIisq,ZIisq,65,272,unknown,"[{'text': 'China', 'indices': [18, 24]}, {'tex...","[{'screen_name': 'IndoPac_Info', 'name': 'Indo...",unknown
2,2022-08-07 22:31:07+00:00,Twitter for Android,"China even cut off communication, they don't a...",0.0,0.0,en,0,1627,0,Fin21Free,Fin21Free,85,392,unknown,"[{'text': 'XiJinping', 'indices': [127, 137]}]","[{'screen_name': 'ZelenskyyUa', 'name': 'Волод...",Netherlands
3,2022-08-07 22:31:06+00:00,Twitter for Android,"Putin to #XiJinping : I told you my friend, Ta...",0.1,0.35,en,0,1627,0,Fin21Free,Fin21Free,85,392,unknown,"[{'text': 'XiJinping', 'indices': [9, 19]}]",[],Netherlands
4,2022-08-07 22:31:04+00:00,Twitter for iPhone,"RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.55625,en,1521,18958,381,VizziniDolores,VizziniDolores,910,2608,unknown,[],"[{'screen_name': 'ChinaUncensored', 'name': 'C...","Ayent, Schweiz"


In [5]:
cleaner = Clean_Tweets(tweets_df.copy())

Automation in Action...!!!


### Sentiment analysis

In [6]:
tweets_df.columns

Index(['created_at', 'source', 'original_text', 'polarity', 'subjectivity',
       'lang', 'favorite_count', 'status_count', 'retweet_count',
       'screen_name', 'original_author', 'followers_count', 'friends_count',
       'possibly_sensitive', 'hashtags', 'user_mentions', 'place'],
      dtype='object')

In [11]:
# select necessary columns
cleanTweet = tweets_df[["original_text", "polarity", 'subjectivity']]
cleanTweet

Unnamed: 0,original_text,polarity,subjectivity
0,RT @i_ameztoy: Extra random image (I):\n\nLets...,-1.250000e-01,0.190625
1,RT @IndoPac_Info: #China's media explains the ...,-1.000000e-01,0.100000
2,"China even cut off communication, they don't a...",0.000000e+00,0.000000
3,"Putin to #XiJinping : I told you my friend, Ta...",1.000000e-01,0.350000
4,"RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.556250
...,...,...,...
7435,RT @metesohtaoglu: 📌📸 Map of #China's possible...,0.000000e+00,1.000000
7436,RT @NEVERBOW: China is doing #exactly what #Ru...,2.500000e-01,0.250000
7437,Minister Wu is crystal clear in his @BBCNews i...,1.583333e-01,0.419444
7438,Reports say that #China is planning to seize #...,0.000000e+00,0.000000


In [8]:
def text_category(p):
    """
    converts polarity into sentiment category
    """
    if p > 0:
        return "positive"
    elif p < 0:
        return "negative"
    else:
        return "neutral"

In [12]:
cleanTweet["score"] = cleanTweet["polarity"].apply(text_category)
cleanTweet

Unnamed: 0,original_text,polarity,subjectivity,score
0,RT @i_ameztoy: Extra random image (I):\n\nLets...,-1.250000e-01,0.190625,negative
1,RT @IndoPac_Info: #China's media explains the ...,-1.000000e-01,0.100000,negative
2,"China even cut off communication, they don't a...",0.000000e+00,0.000000,neutral
3,"Putin to #XiJinping : I told you my friend, Ta...",1.000000e-01,0.350000,positive
4,"RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.556250,negative
...,...,...,...,...
7435,RT @metesohtaoglu: 📌📸 Map of #China's possible...,0.000000e+00,1.000000,neutral
7436,RT @NEVERBOW: China is doing #exactly what #Ru...,2.500000e-01,0.250000,positive
7437,Minister Wu is crystal clear in his @BBCNews i...,1.583333e-01,0.419444,positive
7438,Reports say that #China is planning to seize #...,0.000000e+00,0.000000,neutral


In [13]:
sentiment = cleanTweet['score'].value_counts()
sentiment

neutral     2894
positive    2768
negative    1778
Name: score, dtype: int64

#### as we can see there are:
    * 2894 neutral sentiment
    * 2768 positive sentiment
    * 1778 negative sentiment
#### tweets

#### removing neutral sentiments

In [14]:
# remove neutral tweets
cleanTweet = cleanTweet[cleanTweet['score'] != 'neutral']
cleanTweet

Unnamed: 0,original_text,polarity,subjectivity,score
0,RT @i_ameztoy: Extra random image (I):\n\nLets...,-1.250000e-01,0.190625,negative
1,RT @IndoPac_Info: #China's media explains the ...,-1.000000e-01,0.100000,negative
3,"Putin to #XiJinping : I told you my friend, Ta...",1.000000e-01,0.350000,positive
4,"RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.556250,negative
5,RT @benedictrogers: We must not let this happe...,2.000000e-01,0.500000,positive
...,...,...,...,...
7432,@kiwispillow @dbongino No ones buying this non...,1.000000e-01,1.000000,positive
7433,Senior Taiwanese missile development official ...,-2.000000e-01,0.400000,negative
7436,RT @NEVERBOW: China is doing #exactly what #Ru...,2.500000e-01,0.250000,positive
7437,Minister Wu is crystal clear in his @BBCNews i...,1.583333e-01,0.419444,positive


In [16]:
# add score map column
def scoremap(score):
    """
    maps sentiment score to 0 and 1
    """
    if score == "positive":
        return 1
    else:
        return 0

In [17]:
cleanTweet['scoremap'] = cleanTweet["score"].map(scoremap)
cleanTweet

Unnamed: 0,original_text,polarity,subjectivity,score,scoremap
0,RT @i_ameztoy: Extra random image (I):\n\nLets...,-1.250000e-01,0.190625,negative,0
1,RT @IndoPac_Info: #China's media explains the ...,-1.000000e-01,0.100000,negative,0
3,"Putin to #XiJinping : I told you my friend, Ta...",1.000000e-01,0.350000,positive,1
4,"RT @ChinaUncensored: I’m sorry, I thought Taiw...",-6.938894e-18,0.556250,negative,0
5,RT @benedictrogers: We must not let this happe...,2.000000e-01,0.500000,positive,1
...,...,...,...,...,...
7432,@kiwispillow @dbongino No ones buying this non...,1.000000e-01,1.000000,positive,1
7433,Senior Taiwanese missile development official ...,-2.000000e-01,0.400000,negative,0
7436,RT @NEVERBOW: China is doing #exactly what #Ru...,2.500000e-01,0.250000,positive,1
7437,Minister Wu is crystal clear in his @BBCNews i...,1.583333e-01,0.419444,positive,1


In [21]:
(X, y) = cleanTweet['original_text'], cleanTweet['scoremap']
trigram_vectorizer = CountVectorizer(ngram_range=(1, 3))
trigram_vectorizer.fit(X.values)

In [22]:
X_trigram_vectorizer = trigram_vectorizer.transform(X.values)

In [24]:
def train_and_show_scores(X: csr_matrix, y: np.array, title: str) -> None:
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, stratify=y
    )

    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)
    print(f'{title}\nTrain score: {round(train_score, 2)} ; Test score: {round(test_score, 2)}\n')

In [25]:
train_and_show_scores(X_trigram_vectorizer, y.values, 'Trigram Counts')

Trigram Counts
Train score: 1.0 ; Test score: 0.8



### Topic modeling