In [23]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# import geopandas as gp
import nltk
nltk.download('vader_lexicon')
nltk.download('stopwords')
from nltk.stem.porter import *
stemmer = PorterStemmer()
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')
from textblob import TextBlob
from textblob import Blobber
from textblob.sentiments import NaiveBayesAnalyzer

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
tweets = pd.read_csv('/content/Charities_tweets.csv')

## Text pre-processing

In [25]:
# function to remove @user
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i,'',input_txt)
    return input_txt

In [26]:
# additional cleaning
tweets['Tweet'] = np.vectorize(remove_pattern)(tweets['text'], '@[\w]*') # create new column with removed @user
tweets['Tweet'] = tweets['Tweet'].apply(lambda x: re.split('http:\/\/.*', str(x))[0]) # remove urls
tweets['Tweet'] = tweets['Tweet'].str.replace('[^a-zA-Z#]+',' ') # remove special characters, numbers, punctuations

In [27]:
#Creating a function that takes care of all the preprocessing stuff.
def preprocess():
    tweets['Tweet'] = tweets['Tweet'].str.lower() # Ensuring all words in the Tweet column of training data are lowercased
    #Parsing the stop_words.txt file and storing all the words in a list.
    stopwords = nltk.corpus.stopwords.words("english")

    #Removing all stopwords from all the tweets in training data.
    tweets["Tweet"] = tweets["Tweet"].apply(lambda func: ' '.join(sw 
                                            for sw in func.split() 
                                            if sw not in stopwords))
    #Training Data
    tweets['Tweet'] = tweets['Tweet'].str.replace(r'http?://[^\s<>"]+|www\.[^\s<>"]+', '') # Removing hyperlinks from all the tweets
    tweets['Tweet'] = tweets['Tweet'].str.replace('@[A-Za-z0-9]+', '') # Removing usernames from all the tweets.
    tweets['Tweet'] = tweets['Tweet'].str.replace(r'\B#\w*[a-zA-Z]+\w*', '') # Removing hashtags, including the text, from all the tweets
    tweets['Tweet'] = tweets['Tweet'].str.replace('\d+', '') # Removing numbers from all the tweets
    special_chars = ["!",'"',"%","&","amp","'","(",")", "*","+",",","-",".","/",":",";","<","=",">","?","[","\\","]","^","_","`","{","|","}","~","–","@","#","$"]
    for c in special_chars:
        tweets['Tweet'] = tweets['Tweet'].str.replace(c,'') # Removing all special characters from all the tweets
preprocess()

In [28]:
# create new variable tokenized tweet 
tokenized_tweet = tweets['Tweet'].apply(lambda x: x.split())
# remove stopwords
stopwords = nltk.corpus.stopwords.words("english")
tokenized_tweet = [w for w in tokenized_tweet if w not in stopwords]

In [29]:
# join tokens into one sentence
for i in range(len(tokenized_tweet)):
    tokenized_tweet[i] = ' '.join(tokenized_tweet[i])
# change df['Tweet'] to tokenized_tweet
tweets['Tweet']  = tokenized_tweet

In [30]:
# tweets after cleaning
tweets['Tweet']

0       catholic charities provides low cost immigrati...
1       rt media focuses negative sells copy lot envir...
2       friendly reminder donate portion proceeds take...
3       world poorest countries need support face show...
4       rt cause close hearts believe one feel alone i...
                              ...                        
1995    rt pls rt u retweet https co bq peyi ro giveaw...
1996    th annual bigelow tea community challenge call...
1997    funraising season officially begun learn help ...
1998    rt please follow links donate fundraising two ...
1999    rt yesterday cake cutting beginning upcoming d...
Name: Tweet, Length: 2000, dtype: object

## Deriving sentiment

In [32]:
# assign sentiment scores
scores = []
for tweet in tweets['Tweet']:
    score = sia.polarity_scores(tweet)
    scores.append(score['compound'])
tweets['sentiment_scores'] = scores
tweets['sentiment_derived'] = ["positive" if w >0 else "negative" if w < 0 else "neutral" for w in tweets['sentiment_scores']]

In [33]:
tweets['sentiment_scores']

0       0.2732
1      -0.1280
2       0.4939
3      -0.2023
4      -0.0772
         ...  
1995    0.0772
1996    0.0772
1997    0.8225
1998    0.8934
1999   -0.1280
Name: sentiment_scores, Length: 2000, dtype: float64

In [35]:
# percent match between assigned and derived sentiment
tweets['match'] = (tweets['sentiment_derived']==tweets['Geo_Enabled']).astype(int)
tweets[['Geo_Enabled','sentiment_derived','match']]
tweets['match'].mean()

0.0

In [36]:
# crosstab of assigned vs derived sentiment
pd.crosstab(tweets.Geo_Enabled, tweets.sentiment_derived)

sentiment_derived,negative,neutral,positive
Geo_Enabled,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,215,68,1006
True,117,61,533


In [40]:
import nltk
nltk.download('movie_reviews')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [41]:
blobber = Blobber(analyzer=NaiveBayesAnalyzer())

blob = TextBlob("i love it!")
print(blob.sentiment)

blob = blobber("i hate it!")
print(blob.sentiment)

Sentiment(polarity=0.625, subjectivity=0.6)
Sentiment(classification='pos', p_pos=0.523148148148148, p_neg=0.4768518518518517)


In [42]:
scores = []
for tweet in tweets['Tweet']:
    score = TextBlob(tweet)
    scores.append(score.sentiment[0])
tweets['textblob_scores'] = scores
tweets['textblob_derived'] = ["positive" if w >0 else "negative" if w < 0 else "neutral" for w in tweets['textblob_scores']]

In [44]:
pd.crosstab(tweets.Geo_Enabled, tweets.textblob_derived)
pd.crosstab(tweets.sentiment_derived, tweets.textblob_derived)

textblob_derived,negative,neutral,positive
sentiment_derived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,158,134,40
neutral,11,79,39
positive,173,728,638


In [None]:
# tweets.to_csv('test.csv')

In [45]:
def combined_sentiment(tweets):
    if (tweets['textblob_derived'] == 'negative') or (tweets['sentiment_derived'] == 'negative'):
        return 'negative'
    if (tweets['textblob_derived'] == 'neutral') and (tweets['sentiment_derived'] == 'positive'):
        return 'neutral'
    if (tweets['textblob_derived'] == 'positive') and (tweets['sentiment_derived'] == 'neutral'):
        return 'neutral'
    if (tweets['textblob_derived'] == 'neutral') and (tweets['sentiment_derived'] == 'neutral'):
        return 'negative'
    if (tweets['textblob_derived'] == 'positive') and (tweets['sentiment_derived'] == 'positive'):
        return 'positive'
    else:
        return '0'

In [46]:
tweets['final_derived'] = tweets.apply(combined_sentiment, axis=1)

In [48]:
pd.crosstab(tweets.final_derived, tweets.Geo_Enabled)

Geo_Enabled,False,True
final_derived,Unnamed: 1_level_1,Unnamed: 2_level_1
negative,373,222
neutral,514,253
positive,402,236
