# Twitter Sentiment Analysis of Nigerian Banks
### Introduction
The banking industry in Nigeria is the most digitized on the continent, and the nation was recently named the leader in digital payments in Africa. A [press release](https://www.businesswire.com/news/home/20220719005265/en/) from 2022 states that Nigeria recorded 3.7 billion real-time payments in 2021, placing it sixth among nations with the largest real-time payments markets.

However, high inflation, rising interest rates, a shortage of US dollars, regulatory interference, and a shortage of naira notes ahead of the general election in February weakened bank operating conditions in 2023.

### Aims and Objectives
This project aims to uncover insights from bank customers' tweets on Twitter.com, to see how the banks' online customers perceive or react to the respective banks' services.

### Project Overview
There are three major sections to this project:

Data Collection: A python library called Snscrape is used to scrape tweets from Twitter, while pandas is used to read in the scraped data.

Data Cleaning and Preprocessing: The libraries used are pandas (for data cleaning and analysis), textblob (for sentiment analysis), and nltk (the Natural Language Toolkit).

Data Analysis and Visualization: For data visualization, the libraries matplotlib, seaborn and wordcloud were used.

#### Import necessary libraries

In [None]:
import pandas as pd
import snscrape.modules.twitter as sntwitter
import itertools
from tqdm.notebook import tqdm

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import nest_asyncio
import re
from textblob import TextBlob
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords   
from datetime import datetime
import glob                     
import os
nest_asyncio.apply()
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud

### Data Collection

In [None]:
# Maps each bank's display name to the Twitter search query used to find
# tweets about it. `OR` lets a query match either of two spellings.
# The queries must not contain a backslash: '\ ' is not a valid Python escape,
# so it would be sent to the search as a literal backslash.
bank_name = {
    'Guarantee Trust Bank': 'gtb OR gtbank',
    'Zenith Bank': 'zenith bank',
    'UBA': 'UBA',
    'Access Bank': 'access bank',
    'Fidelity Bank': 'fidelity bank',
    'Eco Bank': 'eco bank',
    'First Bank': 'first bank',
    'Wema Bank': 'wema bank',
    'Kuda bank': 'kuda bank OR kuda',
    'FCMB': 'fcmb',
    'Sterling bank': 'sterling bank',
    'Opay': 'opay',
    'Palmpay': 'palmpay',
}

In [None]:
def scrape_config(search_string, since='2023-01-01', until='2023-05-30', limit=10000):
    """Scrape tweets matching one search query into a DataFrame.

    Args:
        search_string: An indexable pair whose element at index 1 is the
            search query text (element 0 is not used here).
        since: Value placed after the ``since:`` operator in the query.
        until: Value placed after the ``until:`` operator in the query.
        limit: Maximum number of items taken from the scraper.

    Returns:
        A DataFrame built from at most ``limit`` scraped items.
    """
    # presumably "latitude, longitude, radius" for the geocode filter — confirm
    loc = '9.0820, 8.6753, 923768km'
    query = f'{search_string[1]} + since:{since} until:{until} geocode:"{loc}"'
    # islice stops pulling from the scraper once `limit` items were read.
    scraped_items = itertools.islice(
        sntwitter.TwitterSearchScraper(query).get_items(), limit)
    return pd.DataFrame(scraped_items)

In [None]:
def run_snscrape(search_vals):
    """Scrape tweets for every entry of ``search_vals`` and stack the results.

    Args:
        search_vals: Mapping of bank name -> search query string.

    Returns:
        One DataFrame holding every scraped tweet, with a 'Bank' column set to
        the bank name the tweet was scraped for. An empty DataFrame is
        returned when ``search_vals`` has no entries.
    """
    # Collect the per-bank frames and concatenate once at the end; calling
    # pd.concat inside the loop recopies all accumulated rows on every pass.
    bank_frames = []

    for bank in search_vals.items():
        print("running for search item: " + bank[0] + "\n")
        print("Search string: " + bank[1] + "\n")

        # run snscrape
        tweets_df = scrape_config(bank)

        # tag every row with the bank it was scraped for
        tweets_df["Bank"] = bank[0]
        bank_frames.append(tweets_df)

    # pd.concat raises ValueError when given an empty list
    if not bank_frames:
        return pd.DataFrame()
    return pd.concat(bank_frames)

In [None]:
# Scrape tweets for every entry in bank_name and keep the combined result
tweets_df = run_snscrape(bank_name)

In [None]:
# Raise pandas' display limits: show up to 10000 rows and columns, and never
# truncate the text inside a cell.
display_settings = {
    'display.max_rows': 10000,
    'display.max_columns': 10000,
    'display.max_colwidth': None,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [None]:
# (rows, columns) of the scraped data
tweets_df.shape

In [None]:
# Preview the first rows of the scraped data
tweets_df.head()

In [None]:
# saving the dataframe
#tweets_df.to_csv('raw_tweets.csv')

In [None]:
# read saved csv file; the file's first column is used as the row index
tweets_df = pd.read_csv('raw_tweets.csv', index_col=0)

In [None]:
# Preview the first rows of the reloaded data
tweets_df.head()

### Data Cleaning

In [None]:
# Drop every column whose name starts with "Unnamed"
is_unnamed_column = tweets_df.columns.str.contains('^Unnamed')
tweets_df = tweets_df.loc[:, ~is_unnamed_column]

In [None]:
# check for duplicate tweets: list the rows whose rawContent repeats an
# earlier row (the first occurrence of each text is not listed)
tweets_df[tweets_df['rawContent'].duplicated()]

In [None]:
 # Drop duplicate tweets
# keep=False removes EVERY row whose rawContent occurs more than once,
# including the first occurrence.
# NOTE(review): if one copy of each repeated tweet should survive, this needs
# keep='first' instead — confirm which is intended.
tweets_df.drop_duplicates(subset='rawContent', keep=False, inplace=True)
# Re-run the duplicate check; it should now list no rows
tweets_df[tweets_df['rawContent'].duplicated()]

In [None]:
# (rows, columns) after removing duplicated tweets
tweets_df.shape

In [None]:
# Give the tweet text column the shorter name 'tweet'
tweets_df = tweets_df.rename(columns={'rawContent': 'tweet'})

In [None]:
# List the column names after the rename
tweets_df.columns

In [None]:
# Fragments stripped from a tweet. Alternatives are tried left to right at
# each position, so the specific patterns must come before the catch-all.
_TWEET_NOISE = re.compile(
    r'|'.join((
        r'@\S+',                          # @mentions, up to the next whitespace
        r'https?://\S+',                  # links, including paths and query strings
        r'\'s',                           # floating 's
        r'\#\w+',                         # hashtags
        r'&(?:amp|lt|gt|quot|apos)\b;?',  # HTML entities such as &amp;
        r'[^A-Za-z\s]',                   # anything that is not a letter or whitespace
    ))
)
_WHITESPACE_RUN = re.compile(r'\s+')


# function to clean tweet column
def clean_text(text):
    """Return ``text`` lower-cased with tweet noise removed.

    Mentions, links, hashtags, possessive ``'s``, HTML entities and every
    character that is not a letter or whitespace are deleted. Runs of
    whitespace, including newlines, are then collapsed to one space, so words
    on separate lines stay separate instead of being fused together.

    Args:
        text: The raw tweet text (a ``str``).

    Returns:
        The cleaned text, stripped of leading and trailing whitespace.
    """
    without_noise = _TWEET_NOISE.sub("", text)
    single_spaced = _WHITESPACE_RUN.sub(" ", without_noise)
    return single_spaced.lower().strip()

In [None]:
# Clean every tweet, then overwrite the column with the cleaned text
cleaned_tweets = tweets_df["tweet"].apply(clean_text)
tweets_df["tweet"] = cleaned_tweets

In [None]:
# Display the cleaned tweet text
tweets_df["tweet"]

In [None]:
# Distinct language codes present in the data
tweets_df["lang"].unique()

In [None]:
# Keep only the rows whose language code is 'en' (English)
is_english = tweets_df["lang"] == 'en'
tweets_df = tweets_df[is_english]

In [None]:
# Confirm which language codes remain after the filter
tweets_df["lang"].unique()

#### Any tweet that contains "thank you for contacting us" will be removed.

In [None]:
# Preview tweets that contain a "thank you / thanks for contacting" phrase
contacting_pattern = "thank you for contacting|thanks for contacting"
has_contacting_phrase = tweets_df['tweet'].str.contains(contacting_pattern, case=False)
contacting_df = tweets_df.loc[has_contacting_phrase]
contacting_df.head(2)

In [None]:
# Phrases matched case-insensitively anywhere in a tweet; every tweet that
# contains at least one of them is dropped.
boilerplate_phrases = [
    "thank you for contacting",
    "thanks for contacting",
    "thank you for reaching out",
    "sorry your message is yet to be responded",
    "apologize for any inconvenience",
    "your complaint via dm",
]
has_boilerplate = tweets_df['tweet'].str.contains("|".join(boilerplate_phrases), case=False)
# .copy() makes the filtered result an independent frame, so adding columns
# to it afterwards does not trigger pandas' SettingWithCopyWarning.
tweets_df = tweets_df.loc[~has_boilerplate].copy()

In [None]:
# Take the 4th '/'-separated segment of each tweet URL as the user name
# (presumably URLs look like https://twitter.com/<user>/status/<id> — confirm)
tweets_df['User'] = [tweet_url.split('/')[3] for tweet_url in tweets_df['url']]

In [None]:
# selecting the important columns for this analysis
analysis_columns = ['date', 'tweet', 'likeCount', 'Bank', 'User']
# .copy() detaches the selection from the frame it was taken from, so columns
# assigned to it afterwards are set on an independent frame (this avoids
# pandas' SettingWithCopyWarning).
tweets_df = tweets_df[analysis_columns].copy()

In [None]:
# Preview the last 10 rows of the trimmed data
tweets_df.tail(10)

### Data Preprocessing

In [None]:
print("Running sentiment process")


def getSubjectivity(text):
    """Return the subjectivity score TextBlob assigns to ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.subjectivity


def getPolarity(text):
    """Return the polarity score TextBlob assigns to ``text``."""
    blob = TextBlob(text)
    return blob.sentiment.polarity


# Score every tweet and store the results in two new columns
subjectivity_scores = tweets_df['tweet'].apply(getSubjectivity)
tweets_df["Subjectivity"] = subjectivity_scores
polarity_scores = tweets_df['tweet'].apply(getPolarity)
tweets_df["Polarity"] = polarity_scores

In [None]:
# Preview the two scores next to the tweet text they were computed from
tweets_df[['Subjectivity','Polarity','tweet']].head()

In [None]:
# Turn a polarity score into a sentiment label
def analysis(score):
    """Return "Neutral" for a score of exactly 0, "Negative" for a score
    below 0, and "Positive" for anything else."""
    if score == 0:
        return "Neutral"
    return "Negative" if score < 0 else "Positive"

# Label every tweet from its polarity score
sentiment_labels = tweets_df['Polarity'].apply(analysis)
tweets_df['Analysis'] = sentiment_labels

In [None]:
# Convert the 'date' column to pandas datetime values
tweets_df["date"] = pd.to_datetime(tweets_df["date"])

# #set index = date so as to create rolling mean 
# tweets_df = tweets_df.sort_values("date").set_index("date")

In [None]:
# Preview the data with the sentiment columns added
tweets_df.head()

In [None]:
# check for duplicate tweets: count the rows whose cleaned text repeats an
# earlier row
tweets_df['tweet'].duplicated().sum()

In [None]:
# Drop duplicate tweets
# keep=False removes EVERY row whose cleaned text occurs more than once,
# including the first occurrence.
# NOTE(review): if one copy of each repeated tweet should survive, this needs
# keep='first' instead — confirm which is intended.
tweets_df.drop_duplicates(subset='tweet', keep=False, inplace=True)
# Re-count the duplicates; the result should now be 0
tweets_df['tweet'].duplicated().sum()

### Data Visualization and Exploratory Analysis

In [None]:
# Plotting a pie chart to show distribution of Sentiments
sentiment_counts = tweets_df['Analysis'].value_counts()
# value_counts() orders its result by frequency, not by a fixed label order,
# so the label, colour and offset of each slice are looked up from the
# result's own index instead of being listed in an assumed order.
slice_colors = {'Neutral': '#1dc5af', 'Positive': '#666666', 'Negative': '#E6f2ee'}
slice_labels = list(sentiment_counts.index)

plt.figure(figsize=[7,5], facecolor='none')
plt.pie(
    sentiment_counts,
    labels=slice_labels,
    colors=[slice_colors[label] for label in slice_labels],
    startangle=90,
    # pull the "Negative" slice slightly out of the pie
    explode=[0.09 if label == 'Negative' else 0 for label in slice_labels],
    autopct='%1.1f%%',
);
plt.title('Twitter Users Sentiments');

In [None]:
# Bar chart of tweet counts per bank, with one bar per sentiment label
chart_title = 'Tweet Analysis of Nigerian Banks'.upper()
sns.countplot(x='Bank', hue='Analysis', data=tweets_df)
plt.title(chart_title)

plt.show()

#### Creating a word cloud

In [None]:
#import nltk
#nltk.download('stopwords')
#from nltk.corpus import stopwords

# nltk's English stop words plus extra filler words to leave out of the cloud
stop_words = stopwords.words('english')
stop_words = stop_words + ['u', 'na', 'know', 'one', 'go', 'make', 'see', 'dont', 'amp', 'im', 'cant']
# Every word of every tweet is tested for membership, so the lookup uses a
# set built once; stop_words itself stays a list.
stop_word_set = set(stop_words)
tweets_df['cleaned_words'] = tweets_df['tweet'].astype(str).apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop_word_set])
)

In [None]:
# Show the full stop-word list that was applied
print(stop_words)

In [None]:
# Join the remaining words of every tweet into one long string
text = " ".join(tweets_df['cleaned_words'])

# Build the word cloud on a white 1000 x 800 canvas
wc = WordCloud(background_color="white", width=1000, height=800).generate(text)

# Hide the axes and draw the cloud as an image
plt.axis("off")
plt.imshow(wc, interpolation="bilinear")

In [None]:
# List the columns of the processed data
tweets_df.columns

In [None]:
# save processed tweets as csv (the row index is written as the first column)
tweets_df.to_csv('processed_bank_tweets.csv')