Topic Modelling and Sentiment Analysis

In [3]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import STOPWORDS,WordCloud
from gensim import corpora
import pandas as pd
import statistics
import string
import os
import re

In [4]:
# project-local modules (not built-ins)
from extract_dataframe import ExtractTweets
from clean_tweets_dataframe import TweetCleanser

In [5]:
# Build the tweet dataframe from the raw JSON dump; save=False is forwarded
# to get_tweet_df as-is.
tweets_json_path = "data/Economic_Twitter_Data.json"
extracted_tweets = ExtractTweets(tweets_json_path)
df = extracted_tweets.get_tweet_df(save=False)


Preprocessing Tasks

In [15]:
# clean the dataframe
# NOTE(review): TweetCleanser is imported from the project-local
# clean_tweets_dataframe module; its implementation is not visible here.
cleanser = TweetCleanser(df)
# drop unwanted columns
# NOTE(review): the return value of this call (and of the next two) is
# discarded, so these steps presumably mutate `df` in place — TODO confirm.
cleanser.drop_unwanted_column(df)
# drop duplicate values from original text
cleanser.drop_duplicate(df)
# convert date data to appropriate datetime
cleanser.convert_to_datetime(df)
# remove non english texts
# Unlike the steps above, this result is captured: `df_` is the frame the
# following cells inspect and modify.
df_ = cleanser.remove_non_english_tweets(df)

Automation in Action...!!!


In [16]:
# preview the first rows of the frame returned by the cleaning cell
df_.head()

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
38,2022-04-22 22:17:05+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @NorthstarCharts: The 10-year yield is tell...,0.16,0.54,en,188,43,davideiacovozzi,18,55,,"[{'text': 'gold', 'indices': [116, 121]}, {'te...",,
39,2022-04-22 13:44:53+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @MichaelAArouet: German 10y mortgage rate w...,0.15,0.175,en,179,32,davideiacovozzi,18,55,,[],,
41,2022-04-22 06:10:34+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @goldseek: When? https://t.co/kO2FfHKaZg,0.0,0.0,en,193,26,davideiacovozzi,18,55,False,[],,
42,2022-04-21 17:22:09+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @charliebilello: The 30-year mortgage rate ...,0.0,0.183333,en,620,213,davideiacovozzi,18,55,,[],,
43,2022-04-21 10:32:26+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @biancoresearch: Rates rise until something...,-0.4,0.4,en,1787,417,davideiacovozzi,18,55,False,[],,


In [18]:
# summarise each column's dtype and non-null count
df_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15180 entries, 38 to 24622
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   created_at          15180 non-null  datetime64[ns, UTC]
 1   source              15180 non-null  object             
 2   original_text       15180 non-null  object             
 3   polarity            15180 non-null  float64            
 4   subjectivity        15180 non-null  float64            
 5   lang                15180 non-null  object             
 6   favorite_count      15180 non-null  int64              
 7   retweet_count       15180 non-null  int64              
 8   original_author     15180 non-null  object             
 9   followers_count     15180 non-null  int64              
 10  friends_count       15180 non-null  int64              
 11  possibly_sensitive  6057 non-null   object             
 12  hashtags            15180 non-n

In [19]:
# Total number of null cells in the frame: count nulls per column first,
# then add the per-column counts together.
missing_values = df_.isna().sum(axis=0).sum()


In [20]:
# show the total number of null cells counted in the previous cell
missing_values

24303

In [24]:
# Names of the columns that contain at least one null value.
columns_with_null_values = df_.columns[df_.isna().any(axis=0)]

In [25]:
# list the names of the columns that contain at least one null
columns_with_null_values

Index(['possibly_sensitive', 'user_mentions'], dtype='object')

In [31]:
# univariate analysis on hashtags
def get_hashtags(tweet):
    '''Extract the hashtags that appear in a tweet's text.

    A hashtag is a ``#`` followed by one ASCII letter and then any run of
    ASCII letters, digits, hyphens or underscores.

    Args:
        tweet: Tweet text. Any non-string value (for example the float NaN
            pandas uses for a missing text cell) is treated as containing
            no hashtags.

    Returns:
        list of str: the matched hashtags, each including its leading ``#``,
        in order of appearance; an empty list when there are none.
    '''
    if not isinstance(tweet, str):
        # re.findall raises TypeError on non-strings such as NaN or None,
        # which would abort a whole Series.apply over the text column.
        return []
    # One leading letter followed by *zero* or more tag characters, so that
    # single-letter tags such as "#X" are kept; requiring "+" on both parts
    # would demand at least two characters after the "#".
    return re.findall(r'#[A-Za-z][A-Za-z0-9_-]*', tweet)

In [34]:
# Replace the `hashtags` column with the tags parsed from each tweet's text.
df_['hashtags'] = df_['original_text'].map(get_hashtags)

In [39]:
# inspect the last rows after re-deriving the `hashtags` column
# NOTE(review): the saved output below lists five rows, which does not match
# tail(2) — re-run the cell to refresh the output.
df_.tail(2)

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
24573,2022-04-22 21:31:18+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @Salt_Project_OS: Free yourself from writin...,0.4,0.8,en,46,3,RosieG1029,148,1240,,[],,
24586,2022-04-22 15:22:29+00:00,"<a href=""http://twitter.com/download/iphone"" r...",RT @pbhushan1: Thank you @BajpayeeManoj for th...,0.85,1.0,en,14671,5006,kitukalesatya,706,643,,[],,
24596,2022-04-22 15:01:27+00:00,"<a href=""http://twitter.com/download/iphone"" r...",RT @s_shreyatweets: Agree ?👇 https://t.co/R54Z...,0.0,0.0,en,5056,973,kitukalesatya,706,643,False,[],,
24599,2022-04-22 14:58:12+00:00,"<a href=""http://twitter.com/download/iphone"" r...",RT @tejjINC: 1. Peace Yatra by Late Sunil Dutt...,-0.3,0.6,en,636,115,kitukalesatya,706,643,False,[],,
24622,2022-04-22 14:44:35+00:00,"<a href=""http://twitter.com/download/iphone"" r...",RT @Parthtiwari25: Gujarat Congress MLA arrest...,0.0,0.0,en,1025,203,kitukalesatya,706,643,,[],,
