In [None]:
import re
import string
import numpy as np
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.sentiment import SentimentIntensityAnalyzer
import seaborn as sns
import matplotlib.pyplot as plt



class MySentimentModel:
    """Logistic-regression sentiment scorer trained on NLTK's ``twitter_samples``.

    Every text is reduced to three features -- a constant bias term, the summed
    positive-class frequency of its tokens and the summed negative-class
    frequency of its tokens -- and the three weights are fitted with batch
    gradient descent. Constructing an instance loads the corpus, builds the
    frequency table and trains the model.
    """

    # Number of tweets per class used for training; the rest form the test split.
    __TRAIN_PER_CLASS = 4000
    # Lazily built (tokenizer, stop-word set, stemmer) tuple shared by all instances.
    __nlpTools = None

    def __init__(self):
        self.__trainX, self.__trainY, self.__testX, self.__testY = self.__getTrainAndTestData()
        self.__freqDict = self.__populateFrequencies()
        self.__matrix = self.__createMatrix()
        self.__J, self.__theta = self.__gradientDescent_algo(1e-9, 1500)

    @staticmethod
    def __getTrainAndTestData():
        """Return ``(trainX, trainY, testX, testY)`` built from ``twitter_samples``.

        The X values are lists of raw tweet strings (positives first, then
        negatives); the Y values are column vectors of 1.0 / 0.0 labels in the
        same order.
        """
        positiveTweets = twitter_samples.strings('positive_tweets.json')
        negativeTweets = twitter_samples.strings('negative_tweets.json')

        split = MySentimentModel.__TRAIN_PER_CLASS
        trainPos, testPos = positiveTweets[:split], positiveTweets[split:]
        trainNeg, testNeg = negativeTweets[:split], negativeTweets[split:]

        trainX = trainPos + trainNeg
        testX = testPos + testNeg
        trainY = np.append(np.ones((len(trainPos), 1)), np.zeros((len(trainNeg), 1)), axis=0)
        testY = np.append(np.ones((len(testPos), 1)), np.zeros((len(testNeg), 1)), axis=0)
        return trainX, trainY, testX, testY

    @classmethod
    def __getNlpTools(cls):
        """Build the tokenizer, stop-word set and stemmer once and reuse them.

        The original code rebuilt all three (re-reading the stop-word list) for
        every single tweet; a set also makes the membership test O(1).
        """
        if cls.__nlpTools is None:
            cls.__nlpTools = (
                TweetTokenizer(),
                frozenset(stopwords.words('english')),
                PorterStemmer(),
            )
        return cls.__nlpTools

    def __processText(self, text):
        """Clean, tokenize and stem ``text``; return the list of stemmed tokens."""
        text = re.sub(r'^RT[\s]+', '', text)
        # NOTE(review): the greedy `.*` drops everything from the URL to the end
        # of the line, not just the URL. Left as-is so trained weights and the
        # recorded scores stay reproducible.
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
        text = re.sub(r'@\S*', '', text)
        tokenizer, stopWords, stemmer = self.__getNlpTools()
        # `word not in string.punctuation` is a substring test, so single
        # punctuation marks are dropped while emoticons such as ":)" survive.
        return [stemmer.stem(word) for word in tokenizer.tokenize(text)
                if word not in stopWords and word not in string.punctuation]

    def __populateFrequencies(self):
        """Count stemmed tokens per class.

        Returns a dict mapping ``(token, label)`` -- label 1 for positive
        tweets, 0 for negative -- to the number of occurrences in the
        training set. The label comes from ``trainY`` rather than from a
        hard-coded split index.
        """
        frequencies = {}
        for sentence, label in zip(self.__trainX, self.__trainY[:, 0]):
            classId = int(label)
            for word in self.__processText(sentence):
                key = (word, classId)
                frequencies[key] = frequencies.get(key, 0) + 1
        return frequencies

    def __features_extraction(self, text):
        """Return the ``(1, 3)`` feature row ``[1, sum pos freq, sum neg freq]`` for ``text``."""
        x = np.zeros((1, 3))
        x[0, 0] = 1  # bias term
        for word in self.__processText(text):
            # Unknown tokens simply contribute nothing.
            x[0, 1] += self.__freqDict.get((word, 1), 0)
            x[0, 2] += self.__freqDict.get((word, 0), 0)
        return x

    @staticmethod
    def __sigmoid(x):
        """Element-wise logistic function ``1 / (1 + exp(-x))``."""
        return 1/(1+np.exp(-x))

    def __gradientDescent_algo(self, alpha, num_iters):
        """Fit the weights with batch gradient descent.

        Args:
            alpha: learning rate.
            num_iters: number of update steps; zero or negative performs none.

        Returns:
            ``(J, theta)``. ``J`` is a Python float: the cross-entropy cost
            evaluated just before the final update, or at the initial
            all-zero weights when no update was performed. ``theta`` is the
            ``(n_features, 1)`` weight column.
        """
        X, y = self.__matrix, self.__trainY
        m = X.shape[0]
        theta = np.zeros((X.shape[1], 1))

        def cost(h):
            # `.item()` extracts the scalar from the (1, 1) result; calling
            # float() on a non-0-d array is deprecated since NumPy 1.25.
            return (-1/m*(np.dot(y.T, np.log(h)) + np.dot((1 - y).T, np.log(1 - h)))).item()

        J = None
        for _ in range(num_iters):
            h = self.__sigmoid(np.dot(X, theta))
            J = cost(h)
            theta = theta - (alpha/m)*np.dot(X.T, h - y)
        if J is None:
            # No iteration ran: report the cost of the untouched weights
            # instead of failing on an unbound variable.
            J = cost(self.__sigmoid(np.dot(X, theta)))
        return J, theta

    def __createMatrix(self):
        """Stack the feature rows of all training tweets into an ``(m, 3)`` matrix."""
        X = np.zeros((len(self.__trainX), 3))
        for i, tweet in enumerate(self.__trainX):
            X[i, :] = self.__features_extraction(tweet)
        return X

    def compareWithNLTK(self):
        """Plot this model's scores against VADER's compound score for the first 1000 test tweets."""
        sia = SentimentIntensityAnalyzer()
        # Only the first 1000 points are plotted, so only those are scored.
        sample = self.__testX[:1000]
        myScores = [self.sentimentAnalysis(tweet)[0] for tweet in sample]
        nltkScores = [sia.polarity_scores(tweet)['compound'] for tweet in sample]
        plt.figure()
        plt.plot(myScores, 'o', label='my model')
        plt.plot(nltkScores, '.', label='nltk model')
        plt.ylabel('Sentiment Score')
        plt.legend()
        plt.show()

    def sentimentAnalysis(self, text):
        """Score ``text``.

        Returns a one-element array: the sigmoid output passed through the
        module-level ``equalize`` helper, so values below 0.5 are shifted
        down by 1.
        """
        x = self.__features_extraction(text)
        sent = self.__sigmoid(np.dot(x, self.__theta))
        return equalize(sent[0])






#################################################################################
############################# HELPER FUNCTIONS ##################################
#################################################################################

# Removes punctuation from a string
def stripPunctuation(s, all=False):
    """Return ``s`` trimmed of surrounding whitespace and with every
    ``string.punctuation`` character deleted.

    The ``all`` argument is accepted but never consulted.
    """
    deletePunctuation = str.maketrans('', '', string.punctuation)
    trimmed = s.strip()
    return trimmed.translate(deletePunctuation)

def equalize(score):
    """Shift scores below 0.5 down by one; return scores of 0.5 or more unchanged."""
    return score - 1 if score < 0.5 else score
#
# # Returns a list of word tokens
# def tokenize(text, includePunc=True):
#     tokens = nltk.tokenize.word_tokenize(text)
#     if includePunc:
#         return tokens
#     else:
#         return [word if word.startswith("'") else stripPunctuation(word, all=False)
#                 for word in tokens if stripPunctuation(word, all=False)]



In [None]:
import nltk
# Fetch the NLTK resources the notebook relies on. This cell must have been
# run at least once before MySentimentModel() or SentimentIntensityAnalyzer()
# is instantiated.
nltk.download('stopwords')        # read via stopwords.words('english')
nltk.download('vader_lexicon')    # presumably the lexicon behind SentimentIntensityAnalyzer -- confirm
nltk.download('twitter_samples')  # read via twitter_samples.strings(...)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd

# Reference scorer: get_scores reads its 'compound' value.
sia = SentimentIntensityAnalyzer()
# Instantiating the model runs MySentimentModel.__init__, which loads the
# twitter_samples corpus and trains the weights, so this line is the slow one.
myModel = MySentimentModel()


def get_scores(content):
    """Score one text with both models.

    Returns a Series with three entries: the text itself under 'content',
    the analyzer's compound score under 'nltk', and the first element of
    ``myModel.sentimentAnalysis(content)`` under 'my model'.
    """
    vaderCompound = sia.polarity_scores(content)['compound']
    modelScore = myModel.sentimentAnalysis(content)[0]
    return pd.Series(
        [content, vaderCompound, modelScore],
        index=['content', 'nltk', 'my model'],
    )


def main():
    """Score a fixed list of hand-written sentences with both models.

    Returns the styled score table, colour-graded with the 'RdYlGn' colormap.
    Also widens pandas' ``display.max_colwidth`` option to 400.
    """
    sentences = [
        "I love love love love this kitten",
        "I hate hate hate hate this keyboard",
        "I'm not sure how I feel about toast",
        "Did you see the world cup game yesterday?",
        "The package was delivered late and the contents were broken",
        "Trashy television shows are some of my favorites",
        "I'm seeing a Kubrick film tomorrow, I hear not so great things about it.",
        "I find chirping birds irritating, but I know I'm not the only one",
        "I do not dislike cabin cruisers",
        "Disliking people is not really my thing.",
        "I love love love love this kitten :)",
        "I'd really truly love going out in this weather!",
    ]
    pd.set_option("display.max_colwidth", 400)
    frame = pd.DataFrame({'content': sentences})
    scored = frame['content'].apply(get_scores)
    return scored.style.background_gradient(cmap='RdYlGn', axis=None, low=0.4, high=0.4)

In [None]:
main()

Unnamed: 0,content,nltk,my model
0,I love love love love this kitten,0.9571,0.542918
1,I hate hate hate hate this keyboard,-0.9413,-0.588225
2,I'm not sure how I feel about toast,-0.2411,-0.596676
3,Did you see the world cup game yesterday?,0.0,0.508071
4,The package was delivered late and the contents were broken,-0.4767,0.501554
5,Trashy television shows are some of my favorites,0.4215,0.501132
6,"I'm seeing a Kubrick film tomorrow, I hear not so great things about it.",-0.6296,-0.560734
7,"I find chirping birds irritating, but I know I'm not the only one",-0.25,-0.656157
8,I do not dislike cabin cruisers,0.2924,-0.569802
9,Disliking people is not really my thing.,-0.3182,-0.506007


In [None]:
# Rebuild, at notebook level, the same per-class split at index 4000 that
# MySentimentModel uses internally, so the held-out tweets can be inspected.
positiveTweets = twitter_samples.strings('positive_tweets.json')
negativeTweets = twitter_samples.strings('negative_tweets.json')

trainPos, testPos = positiveTweets[:4000], positiveTweets[4000:]
trainNeg, testNeg = negativeTweets[:4000], negativeTweets[4000:]
# Positives come first, then negatives.
testX = [*testPos, *testNeg]

In [None]:
len(testPos)

1000

In [None]:
# Score every held-out negative tweet with both models.
myScores = [myModel.sentimentAnalysis(tweet)[0] for tweet in testNeg]
nltkScores = [sia.polarity_scores(tweet)['compound'] for tweet in testNeg]
# "Outliers": negative-labelled tweets whose compound score from `sia` is positive.
outliers = [tweet for tweet, compound in zip(testNeg, nltkScores) if compound > 0]
# plt.figure(figsize=(10, 8))
# plt.plot(myScores[900:1100], 'o', label='my model')
# plt.plot(nltkScores[900:1100], 's', label = 'nltk model')
# # plt.plot(nltkScores)
# plt.ylabel("Sentiment Score")
# plt.legend()
# plt.draw()
# plt.show()

In [None]:
outliers

['Splendour :(',
 '@archietalanay dont be sad :(((((( ily',
 "@RamaZafar hayeee :( hayeee :( patwari here mam but for IK's vision I would say nothing rather than a lil laugh",
 '@TheKelseeey awhhh ok ok :( see you nalang when class opens!!! Hehe',
 "@ryannhough I can imagine! This would shatter my dreams :-( We'll let our @CooperativeFood colleagues know all about this. ^SB",
 'I WANT A WHITE FRENCH BULLDOG :(((',
 'Wanna feel loved :(',
 "@WeeklyChris Aww Poor you T.T I wish I was there to help you. Even though I can't really help much :(",
 '@rcdlccom hello, any info about possible interest in Jonathas ?? He is close to join Betis :( greatings',
 'can some1 pls download smosh:the movie free online? Hahahaha :(',
 "There's a huge bag of presents from Luke and I can't open them until he's back from work :-((((",
 "@rauhlstilinski I'M OMW😍 lol I wish :(((((",
 '@lukesdagger JOKE LANG EH :( HAHDHDHSHHS',
 'RIP TOM MOORE...... I LOVE READING HIS COMIC BOOKS....\nANOTHER GREAT ARTIST I WIL

In [None]:
print(testX[921:928])

['me: as long as i feel comfortable im gonna wear what i want\nmy mother: haha...that sounds nice...but no :-)', '@richardosman congratulations to you daughter! :D', 'goodnight I love everyone but hate myself because in stupid :)', '@ailyngarciia Thank you for filling me in! Although my opinion still stands, BUT we can just agree to disagree, no harm done! :)', "@StormyKittyhawk You'll see me Saturday :p I'll see you then Stormy :D", 'Its time 2 party :D http://t.co/hjnT6v40eT', '@RblSports upgraded ans synced up. Plus it can be done from a single remote device. Getting there folks. :)']


In [None]:
myModel.sentimentAnalysis('goodnight I love everyone but hate myself because i\'m stupid')

array([-0.55731294])

In [None]:
print(testX[1023:1027])

['@archietalanay dont be sad :(((((( ily', '@sonzhi No, I am going to spend the night in Prague and then leaving tomorrow :(', 'one of my friend is following me , a little heart attack , im sorry youre blocked :((((( sadis', "@ellierowexo no I'm annoyed :("]


In [None]:
def main2():
    """Score a second fixed list of sentences with both models.

    Returns the styled score table, colour-graded with the 'RdYlGn' colormap.
    Also widens pandas' ``display.max_colwidth`` option to 400.
    """
    sentences = [
        "RIP TOM MOORE! I LOVE READING HIS COMIC BOOKS. ANOTHER GREAT ARTIST I WILL TRULY MISS",
        "dont be sad :(((((( i love you",
        "i want to go back to the time where everything is still fine",
        "I like your eyes :D",
        "mom + :) = horror movie",
        "sure...",
        "I wish I was there to help :(",
        "You make me suffer, You make me feel.. \n\nAddictive song i always sing in KARAOKE :-)",
    ]
    pd.set_option("display.max_colwidth", 400)
    frame = pd.DataFrame({'content': sentences})
    scored = frame['content'].apply(get_scores)
    return scored.style.background_gradient(cmap='RdYlGn', axis=None, low=0.4, high=0.4)

In [None]:
main2()

Unnamed: 0,content,nltk,my model
0,RIP TOM MOORE! I LOVE READING HIS COMIC BOOKS. ANOTHER GREAT ARTIST I WILL TRULY MISS,0.8977,-0.606285
1,dont be sad :(((((( i love you,0.7753,-0.869361
2,i want to go back to the time where everything is still fine,0.2732,-0.538241
3,I like your eyes :D,0.7603,-0.501075
4,mom + :) = horror movie,-0.1779,0.822836
5,sure...,0.0,-0.505224
6,I wish I was there to help :(,0.3612,-0.92601
7,"You make me suffer, You make me feel.. Addictive song i always sing in KARAOKE :-)",-0.296,0.560603
