# Project: Topic Modeling and Sentiment Analysis on Twitter Data

## **Objective**
### Social Media Tweet Analysis on Twitter Dataset
*   Topic Modeling on Twitter Dataset
*   Sentiment analysis on Twitter Dataset

### **Topic modeling**
Topic modeling is a type of statistical model for discovering the abstract "topics" that occur in a collection of texts.


*   The task here is to discover abstract topics from tweets.


### **Sentiment analysis**
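Sentiment analysis identifies the emotional tone (for example positive or negative) expressed in a piece of text. It is used in social media monitoring, allowing businesses to gain insights into how customers feel about certain topics and to detect urgent issues in real time before they spiral out of control.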


*   The task here is to classify the sentiment of each tweet as positive or negative.

## Data Understanding
### Loading necessary packages

In [16]:
import json
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import STOPWORDS,WordCloud
from gensim import corpora
import pandas as pd
import string
import re
from textblob import TextBlob

In [104]:
def read_json(json_file: str)->tuple:
    """
    json file reader to open and read a line-delimited json file into a list

    Every non-blank line of the file is parsed as one standalone JSON
    document (one tweet per line).

    Args:
    -----
    json_file: str - path of a json file

    Returns
    -------
    a tuple ``(count, tweets)``: the number of parsed JSON documents and the
    list of parsed documents, in file order

    Raises
    ------
    OSError - if the file cannot be opened
    json.JSONDecodeError - if a non-blank line is not valid JSON
    """
    # JSON is UTF-8 by specification; being explicit avoids locale-dependent
    # decoding of emoji/non-ASCII tweet text. The context manager guarantees
    # the handle is closed even when a line fails to parse.
    with open(json_file, 'r', encoding='utf-8') as tweets_file:
        # Whitespace-only lines (e.g. a trailing blank line) carry no
        # document and would otherwise make json.loads raise.
        tweets_data = [json.loads(line) for line in tweets_file if line.strip()]

    return len(tweets_data), tweets_data


In [105]:
class TweetDfExtractor:
    """
    this class will parse a list of tweet json objects (dicts) into a pandas
    dataframe

    Every extractor method returns a list with exactly one entry per tweet,
    in the order of ``tweets_list``, so the lists can be zipped into rows.

    Return
    ------
    dataframe (built by ``get_tweet_df``)
    """
    def __init__(self, tweets_list):
        # tweets_list: list of parsed tweet dicts, one per tweet
        self.tweets_list = tweets_list

    @staticmethod
    def _status_text(status)->str:
        """Return a status' ``extended_tweet.full_text`` if present, else its ``text``."""
        if 'extended_tweet' in status:
            return status['extended_tweet']['full_text']
        return status['text']

    def find_full_text(self)->list:
        """
        Return the text of every tweet.

        Precedence per tweet: the retweeted status' text, otherwise the
        quoted status' text, otherwise the tweet's own ``text``. For the
        retweeted/quoted status the extended ``full_text`` is preferred over
        ``text`` when an ``extended_tweet`` entry exists.
        """
        text = []
        for element in self.tweets_list:
            if 'retweeted_status' in element:
                text.append(self._status_text(element['retweeted_status']))
                continue
            try:
                text.append(self._status_text(element['quoted_status']))
            except (KeyError, TypeError):
                # No quoted status (or an unusable one, e.g. None or missing
                # its text keys): fall back to the tweet's own text.
                text.append(element['text'])

        return text

    def find_created_time(self)->list:
        """
        Return the ``created_at`` value of every tweet; for a retweet the
        creation time of the retweeted status is used instead.
        """
        return [
            element['retweeted_status']['created_at']
            if 'retweeted_status' in element
            else element['created_at']
            for element in self.tweets_list
        ]

    def is_sensitive(self)->list:
        """
        Return the ``possibly_sensitive`` flag of every tweet.

        The flag is only read from the retweeted status; tweets that are not
        retweets, or whose retweeted status carries no flag, yield ``None``.
        """
        flags = []
        for element in self.tweets_list:
            flag = None
            if 'retweeted_status' in element:
                try:
                    flag = element['retweeted_status']['possibly_sensitive']
                except (KeyError, TypeError):
                    # Flag absent (or the retweeted status is not a dict).
                    flag = None
            flags.append(flag)

        return flags

    def find_lang(self)->list:
        """
        Return the ``lang`` value of every tweet, ``None`` where it is missing.
        """
        # One entry per tweet, even when 'lang' is absent, so the result
        # stays aligned with the other per-tweet lists.
        return [element.get('lang') for element in self.tweets_list]

    def get_tweet_df(self, save=False)->pd.DataFrame:
        """
        Build the tweets dataframe (required columns; you should be creative
        and add more features).

        Args:
        -----
        save: bool - when True, also write the dataframe to
              'processed_tweet_data.csv' (without the index) and print a
              confirmation message

        Returns
        -------
        dataframe with the columns 'created_at', 'original_text', 'lang' and
        'possibly_sensitive', one row per tweet
        """
        columns = ['created_at', 'original_text', 'lang','possibly_sensitive']

        # Materialise the rows so an empty tweets_list still yields an empty
        # dataframe with the expected columns.
        data = list(zip(
            self.find_created_time(),
            self.find_full_text(),
            self.find_lang(),
            self.is_sensitive(),
        ))

        df = pd.DataFrame(data=data, columns=columns)
        if save:
            df.to_csv('processed_tweet_data.csv', index=False)
            print('File Successfully Saved.!!!')

        return df

                

# Target schema for the final dataframe. The extractor currently produces only
# a subset of these columns; be creative and add more features.
columns = [
    'created_at', 'original_text', 'clean_text', 'sentiment',
    'polarity', 'subjectivity', 'lang', 'possibly_sensitive',
]

# Load the raw tweets together with their count.
tweet_len, tweet_list = read_json("data/covid19.json")
tweet_list[:1]  # peek at the first tweet (not displayed: not the cell's last expression)

# Use the defined extractor to generate the dataframe; the CSV export to
# 'processed_tweet_data.csv' is requested explicitly.
tweet = TweetDfExtractor(tweet_list)
tweet_df = tweet.get_tweet_df(save=True)
tweet_df.head()

File Successfully Saved.!!!


Unnamed: 0,created_at,original_text,lang,possibly_sensitive
0,Thu Jun 17 16:18:28 +0000 2021,"🚨Africa is ""in the midst of a full-blown third...",en,False
1,Fri Jun 18 16:40:24 +0000 2021,"Dr Moeti is head of WHO in Africa, and one of ...",en,False
2,Fri Jun 18 17:45:27 +0000 2021,Thank you @research2note for creating this ama...,en,
3,Wed Jun 16 00:21:22 +0000 2021,"Former Pfizer VP and Virologist, Dr. Michael Y...",en,False
4,Fri Jun 18 13:34:47 +0000 2021,I think it’s important that we don’t sell COVA...,en,
