### Import the packages that are needed for the analysis.

In [2]:
! pip install wordcloud

Collecting wordcloud
  Obtaining dependency information for wordcloud from https://files.pythonhosted.org/packages/34/ac/72a4e42e76bf549dfd91791a6b10a9832f046c1d48b5e778be9ec012aa47/wordcloud-1.9.2-cp311-cp311-win_amd64.whl.metadata
  Downloading wordcloud-1.9.2-cp311-cp311-win_amd64.whl.metadata (3.4 kB)
Downloading wordcloud-1.9.2-cp311-cp311-win_amd64.whl (151 kB)
   ---------------------------------------- 0.0/151.4 kB ? eta -:--:--
   ---------------------------------------- 0.0/151.4 kB ? eta -:--:--
   -- ------------------------------------- 10.2/151.4 kB ? eta -:--:--
   ------- ------------------------------- 30.7/151.4 kB 262.6 kB/s eta 0:00:01
   ---------- ---------------------------- 41.0/151.4 kB 279.3 kB/s eta 0:00:01
   ----------------------- --------------- 92.2/151.4 kB 476.3 kB/s eta 0:00:01
   -------------------------------------- 151.4/151.4 kB 644.7 kB/s eta 0:00:00
Installing collected packages: wordcloud
Successfully installed wordcloud-1.9.2


In [6]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import STOPWORDS,WordCloud
from gensim import corpora
from gensim.models.ldamodel import LdaModel,CoherenceModel
from pprint import pprint
import pandas as pd
import statistics
import string
import os
import re
#import pyLDAvis.gensim_models as gensimvis
import pickle 
#import pyLDAvis
from random import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
import numpy as np 
from joblib import dump, load 
from scipy.sparse import save_npz, load_npz 
from scipy.stats import uniform
from scipy.sparse import csr_matrix

In [10]:
import pandas as pd
import json
from textblob import TextBlob


class ExtractTweets:
    """
    - class for extracting tweets
    """
    def __init__(self,json_file: str)->list:
        data = []
        for tweets in open(json_file,'r'):
            data.append(json.loads(tweets))
        self.tweets = data

    def get_tweets(self):
        """just a basic function of getting tweets"""
        return self.tweets

    def find_statuses_count(self)->list:
        """
        - this function returns the status counts
        """
        return list(map(lambda tweet: tweet['user']['statuses_count'], self.tweets))
    
    def find_full_text(self)->list:
        """
        - this function extracts the full texts of
        the tweets
        """
        return list(map(lambda tweet: tweet['text'], self.tweets))
        

    def find_sentiments(self, data:list)->list:
        """
        - this function finds sentiments from the dataset
        """
        get_polarity = [TextBlob(text).sentiment.polarity for text in data]
        get_subjectivity = [TextBlob(text).sentiment.subjectivity for text in data]
        return get_polarity, get_subjectivity

    def find_created_time(self)->list:
        """
        - this function returns a list of 
        the created time tags for when the tweet was generated
        """
        return  list(map(lambda tweet: tweet['created_at'], self.tweets))


    def find_source(self)->list:
        """
        - this function returns the source of the tweet
        """
        return  list(map(lambda tweet: tweet['source'], self.tweets))
        
    def find_screen_name(self)->list:
        """
        - this function returns the screen name of the person who has tweeted
        """
        return  list(map(lambda tweet: tweet['user']['screen_name'], self.tweets))
        
    def find_followers_count(self)->list:
        """
        - this function returns the amount of followers per user
        """
        return  list(map(lambda tweet: tweet['user']['followers_count'], self.tweets))
        
    def find_friends_count(self)->list:
        """
        - this function returns the number of friends the user has
        """
        return  list(map(lambda tweet: tweet['user']['friends_count'], self.tweets))
        
    def is_sensitive(self)->list:
        """
        - this function checks whether the data is sensitive or not
        """
        return [tweet['possibly_sensitive'] if "possibly_sensitive" in tweet.keys() else None \
        for tweet in self.tweets]
               

    def find_favourite_count(self)->list:
        """
        - this function returns the amount of times the tweet has been counted
        as favorite
        """
        return [tweet['retweeted_status']['favorite_count'] if 'retweeted_status' in tweet.keys() else 0 \
        for tweet in self.tweets]
        
    def find_retweet_count(self)->list:
        """
        - this function finds how many times a tweet has been retweeted
        """
        return [tweet['retweeted_status']['retweet_count'] if 'retweeted_status' in tweet.keys() else 0 \
        for tweet in self.tweets]
        
    
    def find_hashtags(self) -> list:
        """
        return the amount of hashtags in tweets
        """
        return [tweet.get('entities',dict()).get('hashtags', None)
                    for tweet in self.tweets]

        
    def find_mentions(self)->list:
        """
        - this function returns how many times 
        a person was mentioned in a tweet
        """
        return [" , ".join([count_['screen_name']  for tweet in self.tweets for count_ in tweet['entities']['user_mentions']])]
    
    def find_lang(self)->list:
        """
        return the language used to tweet
        """
        return list(map(lambda tweet:tweet['lang'],self.tweets))

    def find_location(self)->list:
        """
        returns the location in which the tweet was published
        """
        return [tweet['user']['location'] for tweet in self.tweets]
        
    def get_tweet_df(self, save=False)->pd.DataFrame:
        """required column to be generated you should be creative and add more features"""
        
        columns = ['created_at', 'source', 'original_text','polarity','subjectivity', 'lang', 'favorite_count', 'retweet_count', 
            'original_author', 'followers_count','friends_count','possibly_sensitive', 'hashtags', 'user_mentions', 'place']
        
        created_at = self.find_created_time()
        source = self.find_source()
        text = self.find_full_text()
        polarity, subjectivity = self.find_sentiments(text)
        lang = self.find_lang()
        fav_count = self.find_favourite_count()
        retweet_count = self.find_retweet_count()
        screen_name = self.find_screen_name()
        follower_count = self.find_followers_count()
        friends_count = self.find_friends_count()
        sensitivity = self.is_sensitive()
        hashtags = self.find_hashtags()
        mentions = self.find_mentions()
        location = self.find_location()
        values = [created_at, source, text, polarity, subjectivity, lang, fav_count, retweet_count, screen_name, follower_count, friends_count, sensitivity, hashtags, mentions, location]
        data_ = dict(zip(columns,values))
        data  = { key:pd.Series(value) for key, value in data_.items() }
        df = pd.DataFrame(data=data)
        
        if save:
            df.to_csv('C:/Users/Admin/Desktop/Desmondonam/Data_Science/Week0/Twitter-Data-Analysis/data/processed_tweet_data.csv', index=False)
            print('File Successfully Saved.!!!')
            
        return df


if __name__ == "__main__":

    extracted_tweets = ExtractTweets(r"C:\Users\Admin\Desktop\Desmondonam\Data_Science\Week0\Twitter-Data-Analysis\data\Economic_Twitter_Data_minified.json")
    df = extracted_tweets.get_tweet_df(save=True)

File Successfully Saved.!!!


In [11]:
import pandas as pd
import re
# import enchant



class TweetCleanser:
    """
    -this class cleans the tweets and
    ensures that the data is easy to work with
    """
    # en_us = enchant.Dict("en_US")

    def __init__(self, df:pd.DataFrame):
        self.df = df
        print('Automation in Action...!!!')

    def drop_unwanted_column(self, df:pd.DataFrame)->pd.DataFrame:
        """
        remove rows that has column names. This error originated from
        the data collection stage.  
        """
        unwanted_rows = self.df[self.df['retweet_count'] == 'retweet_count' ].index
        self.df.drop(unwanted_rows , inplace=True)
        self.df = self.df[self.df['polarity'] != 'polarity']
        return self.df
        
    
    def drop_duplicate(self, df:pd.DataFrame)->pd.DataFrame:
        """
        - this function drop duplicate rows
        """
        self.df = self.df.drop_duplicates(subset=['original_text'])
        return self.df
        
    def convert_to_datetime(self, df:pd.DataFrame)->pd.DataFrame:
        """
        convert column to datetime
        """
        self.df['created_at'] = pd.to_datetime(self.df['created_at'])
        return self.df
    
    def convert_to_numbers(self, df:pd.DataFrame)->pd.DataFrame:
        """
        convert columns like polarity, subjectivity, retweet_count
        favorite_count etc to numbers
        """
        for key in self.df.columns:
            if self.df[key].dtype == 'float64':
                self.df[key] = self.df[key].astype(int)
        return self.df
    
    def remove_non_english_tweets(self,df:pd.DataFrame)->pd.DataFrame:
        """
        remove non english tweets from lang that is the language 
        column
        """
        self.df = self.df[self.df['lang'].str.contains("en")]
        return self.df

    def get_hashtags(self,tweet):
        '''This function will extract hashtags'''
        return re.findall('(#[A-Za-z]+[A-Za-z0-9-_]+)', tweet)

    def save_changes(self)->pd.DataFrame:
        self.df.to_csv(r"C:\Users\Admin\Desktop\Desmondonam\Data_Science\Week0\Twitter-Data-Analysis\data\cleaned_data.csv", index=False)

    # def clean_text(self,tweet):
    #     """this function cleans the original text"""
    #     return ' '.join(w for w in tweet.split() if self.en_us.check(w))

if __name__ == "__main__":
    df = pd.read_csv(r"C:\Users\Admin\Desktop\Desmondonam\Data_Science\Week0\Twitter-Data-Analysis\data\processed_tweet_data.csv")
    cleanser = TweetCleanser(df)
    cleanser.drop_unwanted_column(df)
    cleanser.drop_duplicate(df)
    cleanser.convert_to_datetime(df)
    cleanser.remove_non_english_tweets(df)
    cleanser.save_changes()



    

Automation in Action...!!!


## Data Preprocessing

In [16]:
# dataframe from extracted tweets
extracted_tweets = ExtractTweets(r"C:\Users\Admin\Desktop\Desmondonam\Data_Science\Week0\Twitter-Data-Analysis\data\Economic_Twitter_Data_minified.json")
df = extracted_tweets.get_tweet_df(save=False)
df.dropna()

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place


Processing tasks

In [17]:
# clean the dataframe
cleanser = TweetCleanser(df)
# drop unwanted columns
cleanser.drop_unwanted_column(df)
# drop duplicate values from original text
cleanser.drop_duplicate(df)
# convert date data to appropriate datetime
cleanser.convert_to_datetime(df)
# remove non english texts
df_ = cleanser.remove_non_english_tweets(df)

Automation in Action...!!!


In [18]:
df_.head()

Unnamed: 0,created_at,source,original_text,polarity,subjectivity,lang,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,user_mentions,place
38,2022-04-22 22:17:05+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @NorthstarCharts: The 10-year yield is tell...,0.16,0.54,en,188,43,davideiacovozzi,18,55,,"[{'text': 'gold', 'indices': [116, 121]}, {'te...",,
39,2022-04-22 13:44:53+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @MichaelAArouet: German 10y mortgage rate w...,0.15,0.175,en,179,32,davideiacovozzi,18,55,,[],,
41,2022-04-22 06:10:34+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @goldseek: When? https://t.co/kO2FfHKaZg,0.0,0.0,en,193,26,davideiacovozzi,18,55,False,[],,
42,2022-04-21 17:22:09+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @charliebilello: The 30-year mortgage rate ...,0.0,0.183333,en,620,213,davideiacovozzi,18,55,,[],,
43,2022-04-21 10:32:26+00:00,"<a href=""http://twitter.com/download/android"" ...",RT @biancoresearch: Rates rise until something...,-0.4,0.4,en,1787,417,davideiacovozzi,18,55,False,[],,


In [19]:
df_.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, 38 to 54
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype              
---  ------              --------------  -----              
 0   created_at          16 non-null     datetime64[ns, UTC]
 1   source              16 non-null     object             
 2   original_text       16 non-null     object             
 3   polarity            16 non-null     float64            
 4   subjectivity        16 non-null     float64            
 5   lang                16 non-null     object             
 6   favorite_count      16 non-null     int64              
 7   retweet_count       16 non-null     int64              
 8   original_author     16 non-null     object             
 9   followers_count     16 non-null     int64              
 10  friends_count       16 non-null     int64              
 11  possibly_sensitive  5 non-null      object             
 12  hashtags            16 non-null     object

In [20]:
# checking for any missing values from the data
missing_values = df_.isnull().sum().sum()


In [21]:
missing_values

27

In [22]:
# check the columns that have values
columns_with_null_values = df_.columns[df_.isnull().any()]

In [23]:
columns_with_null_values

Index(['possibly_sensitive', 'user_mentions'], dtype='object')

## Exploratory Data Analysis

In [24]:
# univariate analysis on hashtags
def get_hashtags(tweet):
    '''This function will extract hashtags'''
    return re.findall('(#[A-Za-z]+[A-Za-z0-9-_]+)', tweet)