# Real Housewives of Lagos: An Analysis of Twitter's Perception

### 1. Introduction

In this data analysis project, I mined over 170,000 tweets relating to the Real Housewives of Lagos reality TV show using the Python library Tweepy, then performed a sentiment analysis on the data using the Python library TextBlob.

### Contents
1. [Introduction](#1.-Introduction)
2. [Data Gathering](#2.-Data-Gathering)
3. [Data Assessment and Cleaning](#3.-Data-Assessment-and-Cleaning)
4. [Data Preprocessing](#4.-Data-Preprocessing)
5. [Sentiment Analysis](#5.-Sentiment-Analysis)
6. [Data Visualization](#6.-Data-Visualization)
7. [Conclusion](#7.-Conclusion)

### 2. Data Gathering

In [None]:
# Import libraries
import tweepy 
import pandas as pd
import numpy as np
import csv 
import re 
import string 
import glob  
import requests 
import matplotlib.pyplot as plt

from collections import Counter

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer

import textblob
from textblob import TextBlob

from wordcloud import WordCloud
from emot.emo_unicode import UNICODE_EMOJI

import warnings
%matplotlib inline

In [None]:
# Access keys and codes from Twitter Developer Account
# Each credential is read from an environment variable so real secrets never
# have to be saved inside the notebook; the redacted placeholders remain the
# fallback when a variable is not set.
import os

consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', 'XXXXXXXXXXXXXXXXXXXXX')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')
access_key = os.environ.get('TWITTER_ACCESS_TOKEN', '##########-XXXXXXXXXXXXXXXXXXXXX')
access_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', 'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX')

In [None]:
# Pass in twitter API authentication key
# The handler is built from the consumer credentials, then given the access
# token pair.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret) 
# wait_on_rate_limit=True asks tweepy to wait for the rate limit to replenish
# instead of failing (per the tweepy docs — confirm for the installed version).
api = tweepy.API(auth,wait_on_rate_limit=True)
# NOTE(review): plain module-level variable; it is not passed to tweepy.API
# above and nothing else in the visible notebook reads it.
sleep_on_rate_limit=False

In [None]:
# Timeframe: start and end dates, written as YYYY-MM-DD strings
since_start, since_end = "2022-07-01", "2022-07-08"

In [None]:
# Collect tweets using the Cursor object and scrape tweets individually:
def get_tweets(search_query, num_tweets):
    """Search tweets and append one CSV row per tweet to ``rholagos.csv``.

    Each row holds: tweet id, creation time, full text, the author's
    location, retweet count and like (favorite) count.

    Args:
        search_query: Twitter search query string.
        num_tweets: Maximum number of tweets to pull from the cursor.

    Relies on the notebook-level ``api`` client and ``since_start`` date.
    """
    # ``since_id`` expects a numeric tweet ID, not a date, so the start date is
    # expressed with the ``since:`` search operator inside the query instead.
    dated_query = search_query + " since:" + since_start
    cursor = tweepy.Cursor(api.search_tweets,
                           q=dated_query,
                           lang="en",
                           tweet_mode='extended')
    # Open the output once instead of re-opening it for every tweet.
    with open('rholagos.csv', 'a', newline='', encoding='utf-8') as csvFile:
        csv_writer = csv.writer(csvFile, delimiter=',')
        for tweet in cursor.items(num_tweets):
            csv_writer.writerow([
                tweet.id,              # tweet id
                tweet.created_at,      # time of tweet
                tweet.full_text,       # the tweet text
                tweet.user.location,   # user's location
                tweet.retweet_count,   # number of retweets
                tweet.favorite_count,  # number of likes
            ])

In [None]:
# Create keywords to search for; retweets and replies are filtered out.
# The multi-word show title is quoted so it is matched as an exact phrase
# instead of being split into separate words between the OR terms.
search_words = 'RHOLagos OR RHOLagosReunion OR "Real housewives of lagos" OR RHOL OR #realhousewivesoflagos OR #RHOL OR #rholagos OR #rhol'
search_query = search_words + " -filter:retweets AND -filter:replies"

#  Pass in search query and the number of tweets to retrieve
get_tweets(search_query,50000) 

In [None]:
# Collect older tweets using the Cursor object and scrape tweets individually:
def get_tweets2(search_query, num_tweets, since_id_num):
    """Search tweets and append them, oldest first, to ``rholagos2.csv``.

    Each row holds: tweet id, creation time, full text, the author's
    location, retweet count and like (favorite) count.

    Args:
        search_query: Twitter search query string.
        num_tweets: Maximum number of tweets to pull from the cursor.
        since_id_num: Lower bound of the search. An ``int`` is sent as the
            ``since_id`` tweet ID; anything else is treated as a
            ``YYYY-MM-DD`` date and added as a ``since:`` operator.

    Relies on the notebook-level ``api`` client and ``since_end`` date.
    """
    # NOTE(review): the original passed the ``since_end`` date string as
    # ``since_id`` (which expects a tweet ID) and ignored ``since_id_num``.
    # Using ``since_end`` as the upper bound is inferred from the
    # "older tweets" intent — confirm.
    dated_query = search_query + " until:" + since_end
    search_kwargs = {}
    if isinstance(since_id_num, int):
        search_kwargs['since_id'] = since_id_num
    else:
        dated_query += " since:" + str(since_id_num)

    tweet_list = list(tweepy.Cursor(api.search_tweets,
                                    q=dated_query,
                                    lang="en",
                                    tweet_mode='extended',
                                    **search_kwargs).items(num_tweets))
    # Open the output once instead of re-opening it for every tweet.
    with open('rholagos2.csv', 'a', newline='', encoding='utf-8') as csvFile:
        csv_writer = csv.writer(csvFile, delimiter=',')
        # The list is walked back to front so rows are written in the
        # reverse of the order the cursor returned them.
        for tweet in reversed(tweet_list):
            csv_writer.writerow([
                tweet.id,              # tweet id
                tweet.created_at,      # time of tweet
                tweet.full_text,       # the tweet text
                tweet.user.location,   # user's location
                tweet.retweet_count,   # number of retweets
                tweet.favorite_count,  # number of likes
            ])

In [None]:
# Create keywords to search for; retweets and replies are filtered out.
# The multi-word show title is quoted so it is matched as an exact phrase
# instead of being split into separate words between the OR terms.
search_words = 'RHOLagos OR RHOLagosReunion OR "Real housewives of lagos" OR RHOL OR #realhousewivesoflagos OR #RHOL OR #rholagos OR #rhol'
search_query = search_words + " -filter:retweets AND -filter:replies"

#  Pass in search query and the number of tweets to retrieve
get_tweets2(search_query,50000,since_start)

In [None]:
# Read every CSV in the project folder and stack them into one dataframe
path = r"C:\Users\THERESA\Desktop\Tina Project"
files = glob.glob(path + "/*.csv")

# One header-less dataframe per CSV file
tweets = [pd.read_csv(csv_path, index_col=None, header=None) for csv_path in files]

# Merge all dataframes
tweets_df = pd.concat(tweets, axis=0, ignore_index=True)

tweets_df.head()

In [None]:
# Rename column names
# The mapping (positional column label -> descriptive name) is called
# column_names so that the builtin `dict` is not shadowed for later cells.
column_names = {0: 'Id', 1: 'Time_of_tweet', 2: 'Tweet', 3: 'Location', 4: 'Retweets', 5: 'Likes'}
tweets_df.rename(columns=column_names, inplace=True)
tweets_df.head()

### 3. Data Assessment and Cleaning

In [None]:
# Check first five rows
# Quick visual check of the merged data before assessing and cleaning it
tweets_df.head()

In [None]:
# Count rows whose 'Id' repeats the 'Id' of an earlier row
# NOTE(review): duplicates are only counted here; nothing in the visible
# notebook drops them afterwards.
tweets_df.duplicated(subset='Id').sum()

In [None]:
# Inspect DataFrame
# shape is a (number of rows, number of columns) tuple
tweets_df.shape

In [None]:
# Check for missing values
# Summing the boolean mask gives the number of missing entries per column,
# which is readable at a glance; the bare mask is only a truncated True/False table.
tweets_df.isna().sum()

In [None]:
# Fill missing locations with "No location"
# Only the 'Location' column is filled; other columns keep their missing values
tweets_df["Location"]=tweets_df["Location"].fillna('No location')

In [None]:
# Drop unnecessary columns
# Columns 6-10 are the unnamed extras left after the rename. errors='ignore'
# keeps this cell from raising KeyError when the loaded CSVs did not produce
# them (for example when every file has only the six expected columns).
tweets_df.drop(columns=[6, 7, 8, 9, 10], inplace=True, errors='ignore')

### 4. Data Preprocessing

In [None]:
# Define function to extract hashtags (the leading # is kept) with REGEX
def getHashtags(tweet):
    """Return the hashtags found in *tweet*, lower-cased and space-separated.

    Every hashtag keeps its leading ``#``. An empty string is returned when
    the tweet contains no hashtag.
    """
    found = re.findall(r'\#\w+', tweet.lower())
    return " ".join(found)

# Store each tweet's hashtags (one space-separated string per row) in a new column
tweets_df['Hashtags'] = tweets_df['Tweet'].apply(getHashtags)
tweets_df.head()

In [None]:
hashtags_list = tweets_df['Hashtags'].tolist()

# Flatten: every whitespace-separated hashtag of every tweet becomes one entry
hashtags = [tag for entry in hashtags_list for tag in entry.split()]

# Tally each distinct hashtag, then tabulate with the most used first
counts = Counter(hashtags)
hashtags_df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
hashtags_df.columns = ['Hashtags', 'Count']
hashtags_df = hashtags_df.sort_values(by='Count', ascending=False)

In [None]:
# Check for top 10 hashtags
# hashtags_df is already sorted by 'Count' in descending order, so the first
# ten rows are the ten most used hashtags
hashtags_df.head(10)

In [None]:
# Names of the cast members looked for in the tweets.
# NOTE(review): the names are capitalised, while the code that compares tweet
# tokens against this list lower-cases the tweet first — confirm the casing.
Rhol_cast = ["Chioma", "Laura", "Toyin", "Iyabo", "Maryam", "Carolyna"]

In [None]:
# Define function to extract rhol casts from each Tweet
def getrholcast(tweet):
    """Return the cast names mentioned in *tweet*, lower-cased and space-separated.

    The tweet is lower-cased and tokenised; every token equal to a cast name
    is kept, in order of appearance (repeats included). An empty string is
    returned when no cast member is mentioned.

    Relies on the notebook-level ``Rhol_cast`` list.
    """
    # The tokens are lower-case, so they must be compared against lower-cased
    # names; comparing with the capitalised entries of Rhol_cast never matches.
    known_cast = {name.lower() for name in Rhol_cast}
    tweet_tokens = word_tokenize(tweet.lower())
    return " ".join(token for token in tweet_tokens if token in known_cast)

In [None]:
# Extract casts to a new column
# Each row gets the space-separated names returned by getrholcast for its tweet
tweets_df['Rhol_cast'] = tweets_df['Tweet'].apply(getrholcast)
tweets_df.head()

In [None]:
# Define function to replace characters names with correct spellings
def castNames(rhol_cast):
    """Normalise alternative spellings of cast names to the canonical one.

    Args:
        rhol_cast: Space-separated, lower-case cast names.

    Returns:
        The same string with ``carolyn``/``caroline`` replaced by
        ``carolyna``, ``tiannah`` by ``toyin`` and ``mariam`` by ``maryam``.
    """
    replacements = [('carolyn','carolyna'), ('caroline', 'carolyna'), ('tiannah', 'toyin'), ('mariam', 'maryam')]
    for pat, repl in replacements:
        # Word boundaries restrict the match to whole names; without them
        # 'carolyn' also matches inside 'carolyna' and yields 'carolynaa'.
        rhol_cast = re.sub(r'\b' + pat + r'\b', repl, rhol_cast)
    return rhol_cast
# Overwrite the column with the spelling-normalised names
tweets_df['Rhol_cast'] = tweets_df['Rhol_cast'].apply(castNames)
tweets_df.head()

In [None]:
cast_list = tweets_df['Rhol_cast'].tolist()

# Flatten: every whitespace-separated cast name of every tweet becomes one entry
cast = [name for entry in cast_list for name in entry.split()]

# Tally the mentions of each cast member, then tabulate with the most mentioned first
counts = Counter(cast)
cast_df = pd.DataFrame.from_dict(counts, orient='index').reset_index()
cast_df.columns = ['Rhol_cast', 'Count']
cast_df = cast_df.sort_values(by='Count', ascending=False)
cast_df.head(10)

In [None]:
# Build the list of all stop words: NLTK's English stop words, the custom
# words below, the single letters a-z and the cast names
stop_words = list(stopwords.words('english'))
user_stop_words = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", 
                   "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself",
                   "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", 
                   "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", 
                   "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", 
                   "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", 
                   "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", 
                   "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how",
                   "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not",
                   "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", 
                   "now",'anyone','today','yesterday','day', 'already','real','housewife', 'housewives', 'lagos']
alphabets = list(string.ascii_lowercase)
# Tweets are lower-cased before stop words are removed, so the cast names are
# added in lower case; the capitalised names would never match a token.
cast_stop_words = [name.lower() for name in Rhol_cast]
stop_words = stop_words + user_stop_words + alphabets + cast_stop_words


In [None]:
# Keys of emot's UNICODE_EMOJI mapping, kept as a list for filtering tokens;
# presumably the keys are the emoji characters themselves — verify against the
# installed emot version.
emojis = list(UNICODE_EMOJI.keys())

In [None]:
# preProcess tweet for sentiment analysis
def preprocessTweets(tweet):
    """Clean a raw tweet into a space-separated string of lemmatised words.

    Steps: lower-case the text; strip URLs, @mentions, #hashtags and digits;
    tokenise; drop stop words, emoji tokens and punctuation-only tokens;
    lemmatise what is left.

    Relies on the notebook-level ``stop_words`` and ``emojis`` collections.
    """
    tweet = tweet.lower()
    # Cleaning and removing URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags = re.MULTILINE)
    # Cleaning and removing @mentions, #hashtags and digits
    tweet = re.sub(r'\@\w+|\#\w+|\d+', '', tweet)
    # Sets give constant-time membership tests; scanning the stop-word and
    # emoji lists once per token is needlessly slow on a large corpus.
    excluded = set(stop_words)
    excluded.update(emojis)
    punctuation = set(string.punctuation)
    lemmatizer = WordNetLemmatizer()
    # A token is treated as punctuation when every character is punctuation,
    # which also removes multi-character tokens such as "..." that a substring
    # test against string.punctuation lets through.
    lemma_words = [
        lemmatizer.lemmatize(w)
        for w in word_tokenize(tweet)
        if w not in excluded and not all(ch in punctuation for ch in w)
    ]
    return " ".join(lemma_words)

In [None]:
# Generate a new column called 'Processed_Tweets' by applying the preprocessTweets function to the 'Tweet' column.
# The raw 'Tweet' column is left untouched; the cleaned text lives beside it.
tweets_df['Processed_Tweets'] = tweets_df['Tweet'].apply(preprocessTweets)
tweets_df.head()

In [None]:
# Join every processed tweet into one long string, separated by single spaces
tweets_long_string = " ".join(tweets_df['Processed_Tweets'].tolist())

### 5. Sentiment Analysis

In [None]:
# Define function to obtain Polarity Score
def getPolarity(tweet):
    """Return the TextBlob sentiment polarity score of *tweet*."""
    blob = TextBlob(tweet)
    return blob.sentiment.polarity

# Define function to obtain Sentiment category
def getSentimentTextBlob(polarity):
    """Map a polarity score to a sentiment label.

    Returns "Negative" for scores below zero, "Neutral" for exactly zero and
    "Positive" for everything else.
    """
    if polarity < 0:
        return "Negative"
    return "Neutral" if polarity == 0 else "Positive"

In [None]:
# Apply the functions to respective columns
tweets_df['Polarity'] = tweets_df['Processed_Tweets'].apply(getPolarity)
tweets_df['Sentiment'] = tweets_df['Polarity'].apply(getSentimentTextBlob)
# Printed explicitly: a notebook cell only echoes its last expression, so a
# bare value_counts() on this line would be computed and then discarded.
print(tweets_df['Sentiment'].value_counts())
tweets_df.head()

### 6. Data Visualization

In [None]:
# Configure the word cloud (single words only, at most 400 of them, white background)
cloud_builder = WordCloud(
    collocations=False,
    max_words=400,
    background_color='white',
)
# Build the cloud from the combined processed-tweet text
tweet_wc = cloud_builder.generate(tweets_long_string)

# Render the cloud as an image with the axes hidden
plt.imshow(tweet_wc, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Save final file
# The relative path means the CSV is written to the current working directory;
# index=False keeps the DataFrame index out of the file.
tweets_df.to_csv('Rholreunion_Finall_File.csv', index=False)

In [None]:
# Save the word cloud as an image. The word cloud object in this notebook is
# named tweet_wc; the name twitter_wc is never defined and raised a NameError.
tweet_wc.to_file("wordcloud.png")

### 7. Conclusion

I exported this file to Power BI, where I built a dashboard to better display my analysis.