In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
import re
import time
import string
import warnings

# for all NLP related operations on text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *
from nltk.classify import NaiveBayesClassifier
from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# To consume Twitter's API
import tweepy
from tweepy import OAuthHandler 

# To identify the sentiment of text
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
from textblob.np_extractors import ConllExtractor

# ignoring all the warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# downloading stopwords corpus
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')
nltk.download('averaged_perceptron_tagger')
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('conll2000')
nltk.download('brown')
stopwords = set(stopwords.words("english"))

# for showing all the plots inline
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khura\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\khura\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\khura\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\khura\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\khura\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\khura\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is alr

In [34]:
class TwitterClient(object): 
    def __init__(self): 
        #Initialization method. 
        try: 
            # create OAuthHandler object 
            auth = OAuthHandler('MWagHIMZgBCE8ExJw4ZEhBeqd', 'pHoXxG9mXVDlLInne7xajFJ8WdyPKhuXdAwJOnYu9qUOT0Go8U') 
            # set access token and secret 
            auth.set_access_token('18837312-7UAgD8xarB9RgUKv263ftqGKnYkSIHAgjzVvf7K1f', 'pjc3MOSDmCXz3mAFt28gfjWdQqNVqcSUzA4BhVChNFQqP') 
            # create tweepy API object to fetch tweets 
            # add hyper parameter 'proxy' if executing from behind proxy "proxy='http://172.22.218.218:8085'"
            self.api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
            
        except tweepy.TweepError as e:
            print(f"Error: Twitter Authentication Failed - \n{str(e)}")

    def get_tweets(self, query, maxTweets = 1000):
        #Function to fetch tweets. 
        # empty list to store parsed tweets 
        tweets = [] 
        sinceId = None
        max_id = -1
        tweetCount = 0
        tweetsPerQry = 100

        while tweetCount < maxTweets:
            try:
                if (max_id <= 0):
                    if (not sinceId):
                        new_tweets = self.api.search(q=query, geo = '43.651070,-79.347015,250km', lang='en', count=tweetsPerQry)
                    else:
                        new_tweets = self.api.search(q=query, count=tweetsPerQry,geo = '43.651070,-79.347015,250km', lang='en',
                                                since_id=sinceId)
                else:
                    if (not sinceId):
                        new_tweets = self.api.search(q=query, count=tweetsPerQry,geo = '43.651070,-79.347015,250km', lang='en',
                                                max_id=str(max_id - 1))
                    else:
                        new_tweets = self.api.search(q=query, count=tweetsPerQry,geo = '43.651070,-79.347015,250km', lang='en',
                                                max_id=str(max_id - 1),
                                                since_id=sinceId)
                if not new_tweets:
                    print("No more tweets found")
                    break

                for tweet in new_tweets:
                    parsed_tweet = {} 
                    parsed_tweet['tweets'] = tweet.text 

                    # appending parsed tweet to tweets list 
                    if tweet.retweet_count > 0: 
                        # if tweet has retweets, ensure that it is appended only once 
                        if parsed_tweet not in tweets: 
                            tweets.append(parsed_tweet) 
                    else: 
                        tweets.append(parsed_tweet) 
                        
                tweetCount += len(new_tweets)
                print("Downloaded {0} tweets".format(tweetCount))
                max_id = new_tweets[-1].id

            except tweepy.TweepError as e:
                # Just exit if any error
                print("Tweepy error : " + str(e))
                break
        
        return pd.DataFrame(tweets)

In [35]:
twitter_client = TwitterClient()

# calling function to get tweets
tweets_df = twitter_client.get_tweets('HSBC', maxTweets=7000)
print(f'tweets_df Shape - {tweets_df.shape}')
# tweets_df.head(10)
# len(tweets_df)

Downloaded 66 tweets
Downloaded 138 tweets
Downloaded 230 tweets
Downloaded 326 tweets
Downloaded 425 tweets
Downloaded 524 tweets
Downloaded 603 tweets
Downloaded 701 tweets
Downloaded 795 tweets
Downloaded 883 tweets
Downloaded 973 tweets
Downloaded 1061 tweets
Downloaded 1153 tweets
Downloaded 1229 tweets
Downloaded 1310 tweets
Downloaded 1391 tweets
Downloaded 1469 tweets
Downloaded 1541 tweets
Downloaded 1608 tweets
Downloaded 1686 tweets
Downloaded 1752 tweets
Downloaded 1846 tweets
Downloaded 1938 tweets
Downloaded 2015 tweets
Downloaded 2098 tweets
Downloaded 2185 tweets
Downloaded 2265 tweets
Downloaded 2354 tweets
Downloaded 2436 tweets
Downloaded 2517 tweets
Downloaded 2611 tweets
Downloaded 2687 tweets
Downloaded 2752 tweets
Downloaded 2834 tweets
Downloaded 2896 tweets
Downloaded 2965 tweets
Downloaded 3033 tweets
Downloaded 3093 tweets
Downloaded 3154 tweets
Downloaded 3215 tweets
Downloaded 3268 tweets
Downloaded 3343 tweets
Downloaded 3407 tweets
Downloaded 3458 tweets


In [51]:
tweets_df

Unnamed: 0,tweets,sentiment,tidy_tweets
0,RT @ipacglobal: The Telegraph: Senior staff at...,pos,"The Telegraph: Senior staff at HSBC, Standard..."
1,@HSBC had a cheque that I need to pay in for l...,pos,@HSBC had a cheque that I need to pay in for l...
2,RT @smitaprakash: Leaked files expose mass inf...,neg,Leaked files expose mass infiltration of UK f...
3,RT @BenGartside: Scoop with @JackHHazlewood an...,pos,Scoop with @JackHHazlewood and Senior staff a...
4,RT @stopadani: “There’s no long-term growth pr...,pos,“There’s no long-term growth prospect at all ...
...,...,...,...
1242,@MeghUpdates I heard HSBC had blocked account ...,neg,@MeghUpdates I heard HSBC had blocked account ...
1243,@afneil The HSBC Office is relatively close to...,pos,@afneil The HSBC Office is relatively close to...
1244,"@HSBC_UK Hi, thanks for your reply. I just sen...",pos,"@HSBC_UK Hi, thanks for your reply. I just sen..."
1245,Leaked files expose mass infiltration of UK fi...,neg,Leaked files expose mass infiltration of UK fi...


In [37]:
# Sentiment classification for above tweets

# 1 way
def fetch_sentiment_using_SIA(text):
    sid = SentimentIntensityAnalyzer()
    polarity_scores = sid.polarity_scores(text)
    return 'neg' if polarity_scores['neg'] > polarity_scores['pos'] else 'pos'

# 2 way
def fetch_sentiment_using_textblob(text):
    analysis = TextBlob(text)
    return 'pos' if analysis.sentiment.polarity >= 0 else 'neg'

In [38]:
# Using NLTK's SentimentIntensityAnalyzer
sentiments_using_SIA = tweets_df.tweets.apply(lambda tweet: fetch_sentiment_using_SIA(tweet))
pd.DataFrame(sentiments_using_SIA.value_counts())

Unnamed: 0,tweets
pos,830
neg,417


In [40]:
# Using TextBlob
sentiments_using_textblob = tweets_df.tweets.apply(lambda tweet: fetch_sentiment_using_textblob(tweet))
pd.DataFrame(sentiments_using_textblob.value_counts())

Unnamed: 0,tweets
pos,1083
neg,164


In [50]:
# higher negative classification by SIA so using that
tweets_df['sentiment'] = sentiments_using_SIA
tweets_df.head()

Unnamed: 0,tweets,sentiment,tidy_tweets
0,RT @ipacglobal: The Telegraph: Senior staff at...,pos,"The Telegraph: Senior staff at HSBC, Standard..."
1,@HSBC had a cheque that I need to pay in for l...,pos,@HSBC had a cheque that I need to pay in for l...
2,RT @smitaprakash: Leaked files expose mass inf...,neg,Leaked files expose mass infiltration of UK f...
3,RT @BenGartside: Scoop with @JackHHazlewood an...,pos,Scoop with @JackHHazlewood and Senior staff a...
4,RT @stopadani: “There’s no long-term growth pr...,pos,“There’s no long-term growth prospect at all ...


In [48]:
# Pre-processing - cleaning up tweets

# remove handles

def remove_pattern(text, pattern_regex):
    r = re.findall(pattern_regex, text)
    for i in r:
        text = re.sub(i, '', text)
    
    return text 

In [68]:
tweets_df['tidy_tweets'] = np.vectorize(remove_pattern)(tweets_df['tweets'], "@[\w]* | RT* | *@[\w]*:")
tweets_df.head(50)

Unnamed: 0,tweets,sentiment,tidy_tweets
0,RT @ipacglobal: The Telegraph: Senior staff at...,pos,"RT The Telegraph: Senior staff at HSBC, Standa..."
1,@HSBC had a cheque that I need to pay in for l...,pos,had a cheque that I need to pay in for like 2 ...
2,RT @smitaprakash: Leaked files expose mass inf...,neg,RT Leaked files expose mass infiltration of UK...
3,RT @BenGartside: Scoop with @JackHHazlewood an...,pos,"RT Scoop with and Senior staff at HSBC, Standa..."
4,RT @stopadani: “There’s no long-term growth pr...,pos,RT “There’s no long-term growth prospect at al...
5,RT @benedictrogers: HSBC should reveal its han...,pos,"RT HSBC should reveal its hand on Hong Kong, w..."
6,RT @ipacglobal: Full story:\n\nLeaked files ex...,neg,RT Full story:\n\nLeaked files expose mass inf...
7,RT @d1gitalflow: Leaked database of 1.95millio...,pos,RT Leaked database of 1.95million members from...
8,@stuyoso Hi there - Thanks for spreading aware...,pos,Hi there - Thanks for spreading awareness. ⚠️ ...
9,RT @benedictrogers: This may be part of the ex...,neg,RT This may be part of the explanation for why...


In [22]:
! git fetch
! git add Sentiment_Analysis_updated.ipynb
! git commit -m "New code push" Sentiment_Analysis_updated.ipynb
! git push origin

The file will have its original line endings in your working directory


[main e3be616] New code push
 1 file changed, 106 insertions(+), 138 deletions(-)


The file will have its original line endings in your working directory
To https://github.com/Ashishkhurana01/NLP.git
   b56c6a4..e3be616  main -> main
