# Main notebook file
This file contains most of the work done on the Twitter sentiment classifier and its comparison with TextBlob.

## Import required libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords
import nltk.stem
import nltk
from sklearn import cross_validation
from textblob import TextBlob
from nltk.corpus import wordnet
import sys, os

# Stream that was active when blockPrint() silenced output, and the devnull
# handle that replaced it. Both are None while printing is enabled.
_stdout_before_block = None
_devnull_stream = None


def blockPrint():
    """Silence ``print`` by pointing ``sys.stdout`` at ``os.devnull``.

    The stream that was active at call time is remembered so enablePrint()
    can restore exactly that stream. Calling blockPrint() again while output
    is already blocked is a no-op, so the remembered stream is never lost and
    no second devnull handle is opened.
    """
    global _stdout_before_block, _devnull_stream
    if _devnull_stream is not None:
        return
    _stdout_before_block = sys.stdout
    _devnull_stream = open(os.devnull, 'w')
    sys.stdout = _devnull_stream


def enablePrint():
    """Undo blockPrint(): restore the saved stream and close the devnull handle.

    Restoring the saved stream (rather than ``sys.__stdout__``) matters when
    ``sys.stdout`` had been replaced before blockPrint() was called, because
    ``sys.__stdout__`` is then a different object and output would end up in
    the wrong place. Calling enablePrint() without a matching blockPrint()
    leaves ``sys.stdout`` untouched.
    """
    global _stdout_before_block, _devnull_stream
    if _devnull_stream is None:
        return
    sys.stdout = _stdout_before_block
    _devnull_stream.close()
    _stdout_before_block = None
    _devnull_stream = None

from sematch.semantic.similarity import WordNetSimilarity
wns = WordNetSimilarity()


%matplotlib inline

### import sts gold dataset

In [2]:
# Load the STS-Gold tweet file (semicolon-separated) and preview the first rows.
sts_gold = pd.read_csv('sts_gold_v03/sts_gold_tweet.csv', sep=';')
print(sts_gold.head())

           id  polarity                                              tweet
0  1467933112         0  the angel is going to miss the athlete this we...
1  2323395086         0  It looks as though Shaq is getting traded to C...
2  1467968979         0     @clarianne APRIL 9TH ISN'T COMING SOON ENOUGH 
3  1990283756         0  drinking a McDonalds coffee and not understand...
4  1988884918         0  So dissapointed Taylor Swift doesnt have a Twi...


# Pre processing
### Tokenizing

In [4]:
def preprocess_tokenize(input_dataset):
    """Return a copy of ``input_dataset`` with the 'tweet' column tokenized.

    Each tweet string is lower-cased and split with NLTK's
    ``wordpunct_tokenize``, so every cell of 'tweet' becomes a list of tokens.
    The input frame itself is not modified.

    Parameters
    ----------
    input_dataset : pandas.DataFrame
        Frame with a 'tweet' column holding strings.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with 'tweet' replaced by token lists.
    """
    sts_gold_tokenized = input_dataset.copy()
    # Map over the column instead of looping with iterrows()/.at: this avoids
    # building a Series per row and does not depend on index labels being unique.
    sts_gold_tokenized['tweet'] = input_dataset['tweet'].map(
        lambda tweet: wordpunct_tokenize(tweet.lower()))
    return sts_gold_tokenized


# Tokenized STS-Gold tweets; input to the POS-filtering step.
sts_gold_tokenized = preprocess_tokenize(sts_gold)

In [96]:
# Show one raw tweet next to its wordpunct tokenization (no lower-casing here).
print(sts_gold['tweet'][1])
print(wordpunct_tokenize(sts_gold['tweet'][1]))


It looks as though Shaq is getting traded to Cleveland to play w/ LeBron... Too bad for Suns' fans. The Big Cactus is no more 
['It', 'looks', 'as', 'though', 'Shaq', 'is', 'getting', 'traded', 'to', 'Cleveland', 'to', 'play', 'w', '/', 'LeBron', '...', 'Too', 'bad', 'for', 'Suns', "'", 'fans', '.', 'The', 'Big', 'Cactus', 'is', 'no', 'more']


### Stemming

In [5]:

def preprocess_stemming(input_dataset):
    """Return a copy of ``input_dataset`` with every token Porter-stemmed.

    Expects each cell of the 'tweet' column to be a list of tokens (a plain
    string would be iterated character by character). The input frame is not
    modified.

    Parameters
    ----------
    input_dataset : pandas.DataFrame
        Frame whose 'tweet' column holds token lists.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with each token replaced by ``PorterStemmer.stem``.
    """
    sts_gold_stemmed = input_dataset.copy()
    ps = nltk.stem.PorterStemmer()
    # Column-wise map instead of iterrows()/.at: faster on large frames and
    # independent of index labels being unique.
    sts_gold_stemmed['tweet'] = input_dataset['tweet'].map(
        lambda tokens: [ps.stem(word) for word in tokens])
    return sts_gold_stemmed



### POS-Tagging (filtering)

In [6]:
def preprocess_pos(input_dataset):
    """Return a copy of ``input_dataset`` keeping only sentiment-bearing tokens.

    Every token list in the 'tweet' column is tagged with ``nltk.pos_tag``.
    A token is kept when its tag is one of the accepted adjective (JJ*),
    adverb (RB*) or verb (VB*) tags, or when WordNet lists at least one synset
    for it with part of speech 'a'. The input frame is not modified.

    Parameters
    ----------
    input_dataset : pandas.DataFrame
        Frame whose 'tweet' column holds token lists.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with 'tweet' reduced to the kept tokens (possibly an
        empty list).
    """
    # A frozenset gives constant-time membership; the duplicated 'VBG' entry
    # of the previous list is dropped, which does not change the filter.
    accepted_tags = frozenset(['JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS',
                               'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'])

    def check_adj(word):
        """Return True when any WordNet synset of ``word`` has pos() == 'a'."""
        # NOTE(review): synsets whose pos() is 's' (e.g. 'amazing.s.01') are
        # not matched here - confirm whether that is intentional.
        return any(ss.pos() == 'a' for ss in wordnet.synsets(word))

    def keep_sentiment_tokens(tokens):
        """Filter one token list by POS tag or WordNet adjective sense."""
        return [token for token, tag in nltk.pos_tag(tokens)
                if tag in accepted_tags or check_adj(token)]

    sts_gold_pos = input_dataset.copy()
    # Column-wise map instead of iterrows()/.at: faster and independent of
    # index labels being unique.
    sts_gold_pos['tweet'] = input_dataset['tweet'].map(keep_sentiment_tokens)
    return sts_gold_pos


# POS-filtered STS-Gold tweets; input to stopword removal.
sts_gold_pos = preprocess_pos(sts_gold_tokenized)

### Filtering using synsets to check for adjectives in related words, (not in final model, gave unsatisfactory results on cross validation)

In [7]:
def check_adj(word):
    """Report whether WordNet has a synset for ``word`` with pos 'a', 'r' or 'v'.

    Broader variant of the filter used while experimenting; returns False for
    words WordNet does not know.
    """
    wanted_pos = ('a', 'r', 'v')
    return any(sense.pos() in wanted_pos for sense in wordnet.synsets(word))
# Scratch check of check_adj on one example tweet. Only the last expression of
# the cell (sts_gold_pos) is displayed; the two expressions above it are
# evaluated and discarded.
[word for word in wordpunct_tokenize("@mikefish  Fair enough. But i have the Kindle2 and I think it's perfect  :)") if check_adj(word)]

wordnet.synsets('enough')
sts_gold_pos


Unnamed: 0,id,polarity,tweet
0,1467933112,0,"[is, going, miss]"
1,2323395086,0,"[looks, is, getting, traded, cleveland, play, ..."
2,1467968979,0,"[@, april, coming, soon, enough]"
3,1990283756,0,"[drinking, mcdonalds, not, understanding, hurt..."
4,1988884918,0,"[so, dissapointed, have]"
5,1467984364,0,"[was, on, fling, sigh]"
6,1964951623,0,"[got, got, just, left, work]"
7,1881133744,0,"[only, been, i, miss, especially, @, ktjade]"
8,1753802024,0,"[not, working, again]"
9,1980497384,0,[lebron]


### Stopword removal

In [8]:
def preprocess_remove_stopwords(input_dataset):
    """Return a copy of ``input_dataset`` without stopwords and 1-char tokens.

    A token is dropped when it is an NLTK English stopword, one of the listed
    punctuation marks, or shorter than two characters. The input frame is not
    modified.

    Parameters
    ----------
    input_dataset : pandas.DataFrame
        Frame whose 'tweet' column holds token lists.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the filtered token lists (possibly empty).
    """
    sts_gold_no_stopwords = input_dataset.copy()
    s_words = set(stopwords.words('english'))
    s_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}','@'])

    # Column-wise map instead of iterrows()/.at. The former second pass that
    # only counted empty lists for a commented-out print has been removed.
    sts_gold_no_stopwords['tweet'] = input_dataset['tweet'].map(
        lambda tokens: [word for word in tokens
                        if word not in s_words and len(word) >= 2])
    return sts_gold_no_stopwords


# Stopword-free STS-Gold tweets; input to stemming.
sts_gold_no_stopwords = preprocess_remove_stopwords(sts_gold_pos)

### Lemmatizing (not in final model)

In [9]:
def preprocess_lemmatize(input_dataset):
    """Return a copy of ``input_dataset`` with every token WordNet-lemmatized.

    Rows whose token list is empty after lemmatizing get polarity 2. The input
    frame is not modified.

    Parameters
    ----------
    input_dataset : pandas.DataFrame
        Frame whose 'tweet' column holds token lists.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with lemmatized tokens and adjusted 'polarity'.
    """
    sts_gold_lemmatized = input_dataset.copy()
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()

    # Column-wise map instead of iterrows()/.at: faster and independent of
    # index labels being unique.
    sts_gold_lemmatized['tweet'] = input_dataset['tweet'].map(
        lambda tokens: [wordnet_lemmatizer.lemmatize(word) for word in tokens])

    # Positional boolean mask, so rows are addressed by position, not label.
    is_empty = np.array([len(tokens) == 0 for tokens in sts_gold_lemmatized['tweet']],
                        dtype=bool)
    if is_empty.any():
        sts_gold_lemmatized.loc[is_empty, 'polarity'] = 2
    return sts_gold_lemmatized


# NOTE: despite its name this variable holds the *stemmed* tweets; the
# lemmatizer above is defined but not called here.
sts_gold_lemmatized = preprocess_stemming(sts_gold_no_stopwords)


### Our classifier should consider tweets without allowed POS-tags to be neutral
For example, a tweet that contains no adjectives ends up with an empty token list and is labelled neutral.

In [10]:
def preprocess_make_neutral(input_dataset):
    """Return a copy of ``input_dataset`` with empty tweets labelled neutral.

    Every row whose 'tweet' cell is an empty list gets polarity 2; all other
    rows are unchanged. The input frame is not modified.

    Parameters
    ----------
    input_dataset : pandas.DataFrame
        Frame whose 'tweet' column holds token lists.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with 'polarity' set to 2 for empty token lists.
    """
    sts_gold_neutral = input_dataset.copy()

    # Build a positional mask instead of looking rows up with .at[label]:
    # with duplicate index labels .at returns a Series and the comparison
    # against [] fails.
    is_empty = np.array(
        [isinstance(tokens, list) and len(tokens) == 0
         for tokens in sts_gold_neutral['tweet']],
        dtype=bool)

    # Only touch 'polarity' when something has to change, so a frame without
    # that column stays untouched unless it has an empty tweet.
    if is_empty.any():
        sts_gold_neutral.loc[is_empty, 'polarity'] = 2

    return sts_gold_neutral

# Final preprocessed STS-Gold frame: rows with an empty token list carry polarity 2.
sts_gold_neutral = preprocess_make_neutral(sts_gold_lemmatized)

In [11]:
def preprocess_all(input_dataset):
    """Run the complete preprocessing chain on a frame with a 'tweet' column.

    Stages are applied in order: tokenize, POS filter, stopword removal,
    stemming, and finally labelling empty tweets as neutral.
    """
    stages = (
        preprocess_tokenize,
        preprocess_pos,
        preprocess_remove_stopwords,
        preprocess_stemming,
        preprocess_make_neutral,
    )
    processed = input_dataset
    for stage in stages:
        processed = stage(processed)
    return processed

# Classifier
### training the classifier and creating the features (bag of words)

In [12]:
# Vocabulary for the bag-of-words features: every distinct token that occurs
# in the preprocessed STS-Gold tweets. Going through a set means the order of
# the resulting list is not guaranteed.
all_words = []
for i, row in sts_gold_neutral.iterrows():
    all_words.extend(row['tweet'])
all_words = list(set(all_words)) #remove duplicates

def contains_feature(tweet):
    """Build the bag-of-words feature dict for one token list.

    For every word in the module-level ``all_words`` vocabulary the result has
    a key 'contains(<word>)' whose value tells whether the word is in ``tweet``.
    """
    present = set(tweet)
    return {'contains(%s)' % vocab_word: (vocab_word in present)
            for vocab_word in all_words}


# Shuffle the rows (no random_state is passed, so the order differs between
# runs) and drop the tweets labelled neutral (polarity 2) before training.
sts_gold_shuffled = sts_gold_neutral.sample(frac=1)
sts_gold_shuffled = sts_gold_shuffled[sts_gold_shuffled['polarity'] != 2]

def polarity_mapping(numb):
    """Translate a numeric polarity code into its text label.

    0 maps to 'negative', 2 to 'neutral', and any other value to 'positive'.
    """
    if numb == 0:
        return 'negative'
    return 'neutral' if numb == 2 else 'positive'

# Pair every shuffled tweet's token list with its text label and let NLTK
# apply contains_feature to the token lists.
training_set = nltk.classify.apply_features(contains_feature,
[(row['tweet'],polarity_mapping(row['polarity'])) for i,row in sts_gold_shuffled.iterrows()],labeled=True)


# Train the Naive Bayes classifier that the following cells refer to as `clf`.
clf = nltk.NaiveBayesClassifier.train(training_set)

clf.show_most_informative_features()








Most Informative Features
        contains(awesom) = True           positi : negati =     31.5 : 1.0
        contains(beauti) = True           positi : negati =     15.4 : 1.0
          contains(love) = True           positi : negati =     14.3 : 1.0
          contains(glad) = True           positi : negati =     12.4 : 1.0
          contains(sick) = True           negati : positi =     11.7 : 1.0
          contains(amaz) = True           positi : negati =     11.0 : 1.0
         contains(funni) = True           positi : negati =      9.5 : 1.0
         contains(excit) = True           positi : negati =      8.5 : 1.0
        contains(person) = True           positi : negati =      8.0 : 1.0
         contains(magic) = True           positi : negati =      8.0 : 1.0


### classify custom tweets/youtube comments

In [293]:
# Sanity check: count training examples labelled 'neutral'.
neutrals = 0
for dat in training_set:
    if dat[1] == 'neutral':
        neutrals += 1
print(len(training_set),neutrals)

# Hand-written sentences to classify. Note that `dat` is reused: loop variable
# above, then a dict, then a DataFrame.
dat = {
    'tweet' : ['bad things happen to terrible people','i like awesome stuff','so sentimental','i love you','what a sad world we live in',
               'i liked kiwi and corolina of his songs', 'amazing']
}
dat = pd.DataFrame(dat)
dat_pre = preprocess_all(dat)
dat_feat = [contains_feature(sent) for sent in dat_pre['tweet']]

# Last expression of the cell: the predicted label for each sentence.
clf.classify_many(dat_feat)


1985 0


['negative',
 'positive',
 'negative',
 'positive',
 'negative',
 'negative',
 'positive']

### Crossvalidation
With some multithreading that didn't improve the speed that much

In [294]:


def get_cross_val_score(resval):
    """Run 10-fold cross-validation of Naive Bayes on the global ``training_set``.

    Prints the accuracy of every fold and the mean, and returns the mean
    accuracy in percent. ``resval`` is not used by the function; the
    thread-pool experiment that used to follow this definition passed it via
    ``pool.map``.
    """
    from operator import itemgetter

    folds = cross_validation.KFold(len(training_set), n_folds=10, shuffle=True, random_state=None)

    fold_scores = []
    for train_idx, test_idx in folds:
        # itemgetter(*indices) picks the selected examples out of training_set.
        pick_train = itemgetter(*train_idx)
        pick_test = itemgetter(*test_idx)
        fold_clf = nltk.NaiveBayesClassifier.train(pick_train(training_set))
        fold_acc = nltk.classify.util.accuracy(fold_clf, pick_test(training_set))
        print('\naccuracy: {:.02f}'.format(fold_acc*100))
        fold_scores.append(fold_acc)

    mean_pct = np.mean(fold_scores)*100
    print('\n\naverage accuracy: {:.02f}'.format(mean_pct))
    return mean_pct


# A multiprocessing.dummy thread pool mapping get_cross_val_score over three
# inputs was tried here; per the section heading it did not speed things up much.

print(get_cross_val_score(0))



accuracy: 77.89



accuracy: 81.91



accuracy: 78.39



accuracy: 82.41



accuracy: 76.38



accuracy: 78.28



accuracy: 77.78



accuracy: 78.79



accuracy: 78.28



accuracy: 84.85


average accuracy: 79.50
79.4964722603


### Synsets testing

In [252]:
# Experiment: Wu-Palmer similarity between the noun synset 'good.n.01' and
# every synset of 'super'.
from nltk.corpus import wordnet
# print(wns.word_similarity('true','good',"wup"))
g1 = wordnet.synset('good.n.01')
g2 = wordnet.synset('amazing.s.01')
print(g1,g2)
# The loop variable reuses the name g2, so the 'amazing.s.01' synset is only
# used by the print above.
for g2 in wordnet.synsets('super'):
    print(g1.wup_similarity(g2))

Synset('good.n.01') Synset('amazing.s.01')
0.125
None
None
None
None


### comparing results on the sentiment140 test-set

In [277]:
# NOTE(review): `cv` and `itemgetter` are not used anywhere else in this cell.
cv = cross_validation.KFold(len(training_set), n_folds=10, shuffle=True, random_state=None)

from operator import itemgetter

# The sentiment140 test file has no header row, so column names are supplied.
header = ['polarity','tweet_id','time','query','user','tweet']
sent140_test = pd.read_csv('sentiment140/testdata.manual.2009.06.14.csv',
                            error_bad_lines=False, encoding='latin-1',names=header,header=None)

# Drop neutral tweets (polarity 2). The index is not reset, so row labels keep
# their positions from the file.
sent140_test = sent140_test[sent140_test['polarity'] != 2]

# Counts of polarity 0 / 2 / 4 rows and the total after filtering.
print(len(sent140_test[sent140_test['polarity'] == 0]),len(sent140_test[sent140_test['polarity'] == 2]),len(sent140_test[sent140_test['polarity'] == 4]),len(sent140_test))

sent140_feat = preprocess_all(sent140_test)

print(len(sent140_test),len(sent140_test[sent140_test['polarity'] == 2]))

def polarity_mapping(numb):
    """Translate a numeric polarity code into its text label.

    0 maps to 'negative', 2 to 'neutral', and any other value to 'positive'.
    """
    if numb == 0:
        return 'negative'
    return 'neutral' if numb == 2 else 'positive'


# Featurize the preprocessed sentiment140 test tweets together with their
# text labels.
test_set = nltk.classify.apply_features(contains_feature,
[(row['tweet'],polarity_mapping(row['polarity'])) for i,row in sent140_feat.iterrows()],labeled=True)



# `errors` counts tweets where TextBlob's label differs from the ground truth,
# so the percentage printed after the loop is TextBlob's accuracy.
errors = 0
for i, row in sent140_test.iterrows():
    # TextBlob label from the sign of its polarity score; exactly 0 is neutral.
    tb = TextBlob(text=row['tweet'])
    pol_tb = tb.polarity
    if (pol_tb < 0):
        pol_tb = 'negative'
    elif (pol_tb > 0):
        pol_tb = 'positive'
    else:
        pol_tb = 'neutral'
    # Our classifier is run on a one-row frame built from the raw tweet.
    tweet_data = pd.DataFrame({'tweet' : [row['tweet']]});
    pol_gt = polarity_mapping(row['polarity'])
    pol_clf = clf.classify(contains_feature(preprocess_all(tweet_data).at[0,'tweet']));
    
    # if (pol_gt != polarity_mapping(row['polarity'])):
    # Print TextBlob / classifier / ground truth for rows with index label < 50
    # (labels are file positions because the index was not reset after filtering).
    if (i < 50):
        print(pol_tb,pol_clf,pol_gt,row['tweet'],preprocess_all(tweet_data).at[0,'tweet'])
     
    if not pol_gt == pol_tb:
        errors += 1
        if (i < 500):
            pass
            # print(pol_tb,pol_clf,row['polarity'],row['tweet'],sts_gold_neutral.at[i,'tweet'])
print("{:.02f}".format(100-(errors/len(sent140_test))*100))

# print("train indices: {} to {}\ntest indices: {} to {}".format(traincv[0],traincv[len(traincv)-1],testcv[0],testcv[len(testcv)-1]))
# Retrain Naive Bayes on the whole training_set and report its accuracy on the
# sentiment140 test set.
classifier = nltk.NaiveBayesClassifier.train(training_set)
accuracy = nltk.classify.util.accuracy(classifier, test_set)
print('\naccuracy: {:.02f}'.format(accuracy*100))


177 0 182 359


359 0
positive positive positive @stellargirl I loooooooovvvvvveee my Kindle2. Not that the DX is cool, but the 2 is fantastic in its own right. ['loooooooovvvvvvee', 'cool', 'fantast']
positive positive positive Reading my kindle2...  Love it... Lee childs is good read. ['read', 'love', 'lee', 'good']
negative negative positive Ok, first assesment of the #kindle2 ...it fucking rocks!!! ['first', 'fuck', '!!!']
positive negative positive @kenburbary You'll love your Kindle2. I've had mine for a few months and never looked back. The new big one is huge! No need for remorse! :) ['never', 'look', 'back', 'new', 'big', 'huge']
positive negative positive @mikefish  Fair enough. But i have the Kindle2 and I think it's perfect  :) ['mikefish', 'enough', 'think']
positive positive positive @richardebaker no. it is too big. I'm quite happy with the Kindle2. ['big', 'quit', 'happi']


negative negative negative Fuck this economy. I hate aig and their non loan given asses. ['fuck', 'hate', 'given']
positive positive positive Jquery is my new best friend. ['new', 'best']
neutral negative positive Loves twitter []
negative positive positive how can you not love Obama? he makes jokes about himself. ['love', 'obama', 'make']
negative negative negative @Karoli I firmly believe that Obama/Pelosi have ZERO desire to be civil.  It's a charade and a slogan, but they want to destroy conservatism ['firmli', 'believ', 'zero', 'civil', 'want', 'destroy']
neutral negative positive House Correspondents dinner was last night whoopi, barbara &amp; sherri went, Obama got a standing ovation ['last', 'went', 'got']
positive positive positive Watchin Espn..Jus seen this new Nike Commerical with a Puppet Lebron..sh*t was hilarious...LMAO!!! ['watchin', 'seen', 'new', 'nike', 'hilari']
negative negative negative dear nike, stop with the flywire. that shit is a waste of science. and ugly. l

neutral negative positive @Pmillzz lebron IS THE BOSS []
positive positive positive @sketchbug Lebron is a hometown hero to me, lol I love the Lakers but let's go Cavs, lol ['hometown', 'lol', 'love', 'let', 'go']
positive positive positive lebron and zydrunas are such an awesome duo ['awesom']
neutral negative positive @wordwhizkid Lebron is a beast... nobody in the NBA comes even close. ['come', 'even', 'close']
positive negative positive downloading apps for my iphone! So much fun :-) There literally is an app for just about anything. ['download', 'much', 'liter']
negative positive positive good news, just had a call from the Visa office, saying everything is fine.....what a relief! I am sick of scams out there! Stealing! ['good', 'say', 'fine']
positive positive positive http://twurl.nl/epkr4b - awesome come back from @biz (via @fredwilson) ['nl', 'awesom', 'come', 'back']
positive negative positive In montreal for a long weekend of R&amp;R. Much needed. ['long', 'much', 'need']
ne

negative negative negative US planning to resume the military tribunals at Guantanamo Bay... only this time those on trial will be AIG execs and Chrysler debt holders ['plan', 'resum', 'militari', 'aig', 'chrysler']
negative negative negative omg so bored &amp; my tattoooos are so itchy!!  help! aha =) ['bore', 'itchi']
negative negative negative I'm itchy and miserable! ['miser']
positive negative negative @sekseemess no. I'm not itchy for now. Maybe later, lol. ['itchi', 'mayb', 'later']
positive positive positive RT @jessverr I love the nerdy Stanford human biology videos - makes me miss school. http://bit.ly/13t7NR ['love', 'nerdi', 'human', 'make', 'miss', '://', 'ly']
negative positive positive @spinuzzi: Has been a bit crazy, with steep learning curve, but LyX is really good for long docs. For anything shorter, it would be insane. ['crazi', 'steep', 'learn', 'realli', 'good', 'long']
positive positive positive I'm listening to "P.Y.T" by Danny Gokey &lt;3 &lt;3 &lt;3 Aww, he's s

neutral positive negative started to think that Citi is in really deep s&amp;^t. Are they gonna survive the turmoil or are they gonna be the next AIG? ['start', 'think', 'realli', 'deep', 'gonna', 'surviv', 'gonna', 'next']
64.07



accuracy: 70.19


In [67]:

# Batch-classify every STS-Gold tweet. Iterating over a DataFrame yields its
# column labels, so the 'tweet' column must be selected explicitly to get one
# feature dict per tweet.
clf_polarities = clf.classify_many([contains_feature(sent) for sent in preprocess_all(sts_gold)['tweet']])


# `errors` counts tweets on which our classifier and TextBlob disagree, so the
# percentage printed at the end is their agreement rate, not an accuracy
# against the ground truth.
errors = 0
for i, row in sts_gold.iterrows():
    # TextBlob label from the sign of its polarity score; exactly 0 is neutral.
    tb = TextBlob(text=row['tweet'])
    pol_tb = tb.polarity
    if (pol_tb < 0):
        pol_tb = 'negative'
    elif (pol_tb > 0):
        pol_tb = 'positive'
    else:
        pol_tb = 'neutral'
    # Our classifier is run on a one-row frame built from the raw tweet.
    tweet_data = pd.DataFrame({'tweet' : [row['tweet']]})
    pol_clf = clf.classify(contains_feature(preprocess_all(tweet_data).at[0,'tweet']))

    # Spot-check: show positive tweets that contain 'amazing'.
    if ('amazing' in row['tweet'] and row['polarity'] == 4):
        print(pol_tb,pol_clf,row['polarity'],row['tweet'],sts_gold_neutral.at[i,'tweet'])

    if not pol_clf == pol_tb:
        errors += 1


print(errors,len(sts_gold),100-(errors/len(sts_gold))*100)

positive positive 4 @JBsFanArgentina Hey I luv this pic!!! was amazing of the last CHAT of The JB in FACEBOOK!  ['luv', 'amaz', 'last']
positive positive 4 I L&lt;3VE Taylor Swift.She is just so amazing and her music is wonderfull and makes me so happy.  ['amaz', 'wonderful', 'make', 'happi']


positive positive 4 watching you belong to mee video  --taylor swift is amazing, love her...my new idol lol  ['watch', 'belong', 'mee', 'amaz', 'love', 'new']


positive positive 4 @pcdnicole Hi Nicole, glad you loved Sydney! Was amazing working with you the other night at 301! Looking forward to you coming back  ['love', 'amaz', 'work', 'look', 'forward', 'come', 'back']


positive positive 4 @_Chelsea_Marie does target ship things to london? thanks so much! im such a demi fan shes amazing!  ['target', 'ship', 'london', 'much', 'amaz']


positive positive 4 Whirlpool Galaxy Deep Field  : http://apod.nasa.gov/apod/ap090526.html  what an amazing universe  ['galaxi', 'deep', 'html', 'amaz']


positive positive 4 hung out by Notre Dame today hoping  for a glimpse of Obama...and I got it!   the energy was amazing! ['notr', 'hope', 'got', 'amaz']
positive positive 4 Youtube comedy people are just amazing.  ['youtub', 'amaz']


positive positive 4 its amazing how a starbucks caramel frappucino can relieve stress..itz seriously an exordianry drink..am feelin much the better now!  ['amaz', 'caramel', 'reliev', 'serious', 'feelin', 'much', 'better']
positive positive 4 @ddlovato yesterday &quot;sonny with a chance&quot; came to brazil, i loved it! you're amazing &lt;33 please reply  ['came', 'brazil', 'love', 'amaz']


positive positive 4 @sydney_sider yes thanks I think they're amazing too! the images were taken by @insidecuisine photographer the very talented @rovingrob  ['sydney_sid', 'ye', 'think', 'amaz', 'taken', 'talent']
positive positive 4 and now off to bed after an amazing night chatting with a pretty amazing guy  ( you know who you are) ['amaz', 'chat', 'pretti', 'amaz', 'know']


positive positive 4 @adeline_sky  that sounds fantastic! You're amazing! We need to watch some Muse gigs too! Shall we do it Saturday night? ['sound', 'fantast', 'need', 'watch', 'saturday']


positive positive 4 @DonnieWahlberg YOU give me joy....and you have for years. You are such an amazing man and we are all lucky to know you.  ['give', 'amaz', 'lucki', 'know']
1052 2034 48.279252704031464


### testing training our classifier on a huge (automatically labelled) training set
The code works, but the classifier's performance was lacking.

In [24]:
# WARNING: this cell rebinds the module-level names all_words, contains_feature,
# training_set and clf. Cells executed after it use the sentiment140-based
# versions instead of the STS-Gold ones.
# The saved output ends in KeyboardInterrupt before "preprocessing done" is
# printed, i.e. the recorded run was interrupted.
header = ['polarity','tweet_id','time','query','user','tweet']
sent140_train = pd.read_csv('sentiment140/training.1600000.processed.noemoticon.csv',
                            error_bad_lines=False, encoding='latin-1',names=header,header=None)
# print(sent140_train.head())

print("dataset length=",len(sent140_train))

# Drop neutral rows, then shuffle (no random_state, so order differs per run).
sent140_train = sent140_train[sent140_train.polarity != 2]
print("dataset length=",len(sent140_train))
sent140_train = sent140_train.sample(frac=1)

print("dataset length=",len(sent140_train))


sent140_train_processed = preprocess_all(sent140_train)

print("preprocessing done")


# Rebuild the vocabulary from the sentiment140 training tweets.
all_words = []
for i, row in sent140_train_processed.iterrows():
    all_words.extend(row['tweet'])
all_words = list(set(all_words)) #remove duplicates

def contains_feature(tweet):
    """Build the bag-of-words feature dict for one token list.

    For every word in the module-level ``all_words`` vocabulary the result has
    a key 'contains(<word>)' whose value tells whether the word is in ``tweet``.
    """
    present = set(tweet)
    return {'contains(%s)' % vocab_word: (vocab_word in present)
            for vocab_word in all_words}


sent140_train_shuffled = sent140_train_processed.sample(frac=1)

# Label polarity 0 or 2 as 'negative' and everything else as 'positive'.
training_set = nltk.classify.apply_features(contains_feature,
[(row['tweet'],'negative') if (row['polarity'] == 0 or row['polarity'] == 2) else (row['tweet'],'positive') for i,row in sent140_train_shuffled.iterrows()],labeled=True)


# Rebinds the global classifier `clf`.
clf = nltk.NaiveBayesClassifier.train(training_set)


clf.show_most_informative_features(10)

# Class probabilities for a tweet consisting of the single token 'awesome'.
probas = clf.prob_classify(contains_feature(['awesome']))
print(probas.prob('positive'),probas.prob('negative'))

dat = {
    'tweet' : ['bad things happen to terrible people','i like awesome stuff','so sentimental','i love you','what a sad world we live in','i liked kiwi and corolina of his songs']
}
dat = pd.DataFrame(dat)
dat_pre = preprocess_all(dat)
dat_feat = [contains_feature(sent) for sent in dat_pre['tweet']]


# Despite the name, probas_l holds predicted labels (classify_many), not
# probability distributions.
probas_l = clf.classify_many(dat_feat)
print("probas_l" + str(probas_l))

# for probas in probas_l:
#     print(probas.prob('positive'),probas.prob('negative'))
# 10-fold cross-validation on the new training set, inlined rather than
# calling get_cross_val_score.
cv = cross_validation.KFold(len(training_set), n_folds=10, shuffle=True, random_state=None)

from operator import itemgetter



accuracies = []
for traincv, testcv in cv:
    # print("train indices: {} to {}\ntest indices: {} to {}".format(traincv[0],traincv[len(traincv)-1],testcv[0],testcv[len(testcv)-1]))
    classifier = nltk.NaiveBayesClassifier.train(itemgetter(*traincv)(training_set))
    accuracy = nltk.classify.util.accuracy(classifier, itemgetter(*testcv)(training_set))
    print('\naccuracy: {:.02f}'.format(accuracy*100))
    accuracies.append(accuracy)
print('\n\naverage accuracy: {:.02f}'.format(np.mean(accuracies)*100))

dataset length= 1600000
dataset length= 1600000


dataset length= 1600000


KeyboardInterrupt: 

### More detailed analysis and comparison to textblob on testset
including recall and precision

In [18]:
# Reload the sentiment140 test file (no header row) and drop neutral tweets.
header = ['polarity','tweet_id','time','query','user','tweet']
sent140_test = pd.read_csv('sentiment140/testdata.manual.2009.06.14.csv',
                            error_bad_lines=False, encoding='latin-1',names=header,header=None)

sent140_test = sent140_test[sent140_test['polarity'] != 2]

# NOTE(review): named *_pos but selects polarity 0, which polarity_mapping
# labels 'negative'; the variable is only referenced in a commented-out print.
sent140_test_pos = sent140_test[sent140_test['polarity'] == 0]

sent140_feat = preprocess_all(sent140_test)



def polarity_mapping(numb):
    """Translate a numeric polarity code into its text label.

    0 maps to 'negative', 2 to 'neutral', and any other value to 'positive'.
    """
    if numb == 0:
        return 'negative'
    return 'neutral' if numb == 2 else 'positive'


# Featurize the preprocessed test tweets together with their text labels.
test_set = nltk.classify.apply_features(contains_feature,
[(row['tweet'],polarity_mapping(row['polarity'])) for i,row in sent140_feat.iterrows()],labeled=True)

# Confusion-matrix counters for TextBlob, with 'positive' as the positive class.
# `errors`, `positive` and `actually_positive` are initialised but never updated.
errors = 0
positive = 0
actually_positive = 0
tp = 0
tn = 0
fp = 0
fn = 0
for i, row in sent140_test.iterrows():
    # TextBlob label from the sign of its polarity score; exactly 0 is neutral.
    tb = TextBlob(text=row['tweet'])
    pol_tb = tb.polarity
    if (pol_tb < 0):
        pol_tb = 'negative'
    elif (pol_tb > 0):
        pol_tb = 'positive'
    else:
        pol_tb = 'neutral'
        
    pol_gt = polarity_mapping(row['polarity'])
        
    # NOTE(review): pol_clf is computed for every row but not used in the
    # counters below, which score TextBlob only.
    tweet_data = pd.DataFrame({'tweet' : [row['tweet']]});
    pol_clf = clf.classify(contains_feature(preprocess_all(tweet_data).at[0,'tweet']));
    
    
    # A 'neutral' TextBlob prediction is counted as fn when the truth is
    # positive and as fp when the truth is negative.
    # NOTE(review): this makes fp/fn include neutral predictions, which affects
    # the precision figures below - confirm this is the intended definition.
    if (pol_gt == 'positive'):
        if (pol_tb == 'positive'):
            tp += 1
        else:
            fn += 1
    else:
        if (pol_tb == 'negative'):
            tn += 1
        else:
            fp += 1
    
# print("{:.02f}".format(100-(errors/len(sent140_test_pos))*100))
print("Positive Recall {:.02f}\nPositive Precision {:.02f}\nNegative Recall {:.02f}\nNegative Precision {:.02f}\nAccuracy {:.02f}".format((tp/(tp+fn))*100,(tp/(tp+fp))*100,(tn/(fp+tn))*100,(tn/(tn+fn))*100,((tp+tn)/(tp+tn+fp+fn))*100))
# print("train indices: {} to {}\ntest indices: {} to {}".format(traincv[0],traincv[len(traincv)-1],testcv[0],testcv[len(testcv)-1]))
# classifier = nltk.NaiveBayesClassifier.train(training_set)
# Accuracy of the already trained global `clf` on the same test set.
accuracy = nltk.classify.util.accuracy(clf, test_set)
print('\naccuracy: {:.02f}'.format(accuracy*100))


Positive Recall 79.12
Positive Precision 61.28
Negative Recall 48.59
Negative Precision 69.35
Accuracy 64.07



accuracy: 70.75


### creating the output datasets

In [19]:
# Load the YouTube comments; malformed lines are skipped (see the
# "Skipping line ..." messages in the output) instead of raising.
df_usa_comments= pd.read_csv("./youtube/UScomments.csv",error_bad_lines=False)
len(df_usa_comments)

b'Skipping line 114463: expected 4 fields, saw 5\n'
b'Skipping line 142494: expected 4 fields, saw 8\nSkipping line 189730: expected 4 fields, saw 6\nSkipping line 245216: expected 4 fields, saw 7\n'


b'Skipping line 388428: expected 4 fields, saw 5\n'


691400

In [42]:
# Rename the second column to 'tweet'; the classification loop further down
# reads the comment text via row['tweet'].
columns = df_usa_comments.columns.tolist()
columns[1] = 'tweet'
df_usa_comments.columns = columns
print(df_usa_comments.columns)

Index(['video_id', 'tweet', 'likes', 'replies'], dtype='object')


In [52]:
# One dict per classified comment; filled by the classification loop.
classified = []

def reverrse_polarity_mapping(text):
    """Translate a text label back into its numeric polarity code.

    'negative' maps to 0, 'neutral' to 2, and any other value to 4.
    """
    if text == 'negative':
        return 0
    return 2 if text == 'neutral' else 4

def save_progress(classified):
    """Write the classified comments and five filtered subsets to CSV files.

    Nothing is written when fewer than 10 records are available. Otherwise
    six files are (re)written under 'output/':

    * all_data.csv    - every record
    * no_neutrals.csv - neither prediction is neutral (2)
    * nn_agree.csv    - no neutral and both predictions equal
    * nn_disagree.csv - no neutral and the predictions differ
    * agree.csv       - both predictions equal
    * disagree.csv    - the predictions differ

    Parameters
    ----------
    classified : list of dict
        Records with the keys 'video_id', 'comment', 'pol_textblob' and
        'pol_nlpWarriers'.
    """
    if len(classified) < 10:
        return

    columns = ['video_id', 'comment', 'pol_textblob', 'pol_nlpWarriers']

    def has_neutral(item):
        """True when either prediction is the neutral code 2."""
        return 2 in [item['pol_textblob'], item['pol_nlpWarriers']]

    def agrees(item):
        """True when TextBlob and our classifier predict the same code."""
        return item['pol_textblob'] == item['pol_nlpWarriers']

    subsets = [
        ('output/all_data.csv', lambda item: True),
        ('output/no_neutrals.csv', lambda item: not has_neutral(item)),
        ('output/nn_agree.csv', lambda item: not has_neutral(item) and agrees(item)),
        ('output/nn_disagree.csv', lambda item: not has_neutral(item) and not agrees(item)),
        ('output/agree.csv', agrees),
        ('output/disagree.csv', lambda item: not agrees(item)),
    ]

    # Create the target directory on first use instead of failing in to_csv.
    os.makedirs('output', exist_ok=True)

    for path, keep in subsets:
        rows = [item for item in classified if keep(item)]
        # Passing columns= keeps the column order and also works for an empty
        # subset; selecting columns from pd.DataFrame([]) raises KeyError.
        pd.DataFrame(rows, columns=columns).to_csv(path)



# Counters: comments where the two predictions differ, where they differ and
# neither is neutral, and where neither is neutral.
disagree = 0
disagree_no_neutral = 0
no_neutral = 0
for i, row in df_usa_comments.iterrows():
    # cont_from = 70000
    # if (i < cont_from):
    #     continue
    # elif (i == cont_from):
    #     print("continue from {}".format(i))

    # Early checkpoint attempt at index label 20 (save_progress itself skips
    # writing when it receives fewer than 10 records).
    if (i == 20):
        save_progress(classified)

    # TextBlob label from the sign of its polarity score; exactly 0 is neutral.
    # str() guards against comment cells that are not strings.
    tb = TextBlob(text=str(row['tweet']))
    pol_tb = tb.polarity
    if (pol_tb < 0):
        pol_tb = 'negative'
    elif (pol_tb > 0):
        pol_tb = 'positive'
    else:
        pol_tb = 'neutral'

    # Our classifier is run on a one-row frame built from the comment text.
    tweet_data = pd.DataFrame({'tweet': [str(row['tweet'])]})
    pol_clf = clf.classify(contains_feature(preprocess_all(tweet_data).at[0, 'tweet']))

    classified.append({'video_id': row['video_id'],
                       'comment': row['tweet'],
                       'pol_textblob': reverrse_polarity_mapping(pol_tb),
                       'pol_nlpWarriers': reverrse_polarity_mapping(pol_clf)
                       })

    if ('neutral' not in [pol_tb, pol_clf]):
        no_neutral += 1

    if (pol_tb != pol_clf):
        disagree += 1
        if ('neutral' not in [pol_tb, pol_clf]):
            disagree_no_neutral += 1

    # if ((i % 500) == 0):
    #     print(row['tweet'], pol_clf, pol_tb)

    # Progress line and checkpoint every 10000 index labels.
    if ((i % 10000) == 0):
        print(i, disagree, no_neutral, disagree_no_neutral, (disagree_no_neutral / max(no_neutral, 1)) * 100)
        save_progress(classified)

# NOTE(review): unlike the progress line above, this final print does not guard
# against no_neutral == 0 (or an empty frame) and would raise ZeroDivisionError.
print(disagree, (disagree / len(df_usa_comments)) * 100, disagree_no_neutral, (disagree_no_neutral / no_neutral) * 100)

save_progress(classified)


0 1 0 0 0.0


10000 6717 5916 2632 44.4895199459094


20000 13220 11851 5070 42.78119989874272


30000 19806 17801 7606 42.72793663277344


40000 26368 23739 10106 42.57129617928304


50000 33020 29701 12720 42.82684084710952


60000 39649 35568 15216 42.7800269905533


70000 46239 41453 17691 42.67724893252599


80000 52794 47310 20103 42.49207355738744


90000 59421 53119 22539 42.43114516463036


100000 66102 58955 25056 42.500212026121616


110000 72711 64920 27630 42.56007393715342


120000 79400 70708 30107 42.57934038581207


130000 86180 76462 32641 42.68917893855772


140000 92936 82242 35177 42.77254930570755


150000 99713 88006 37718 42.85844146989978


160000 106548 93674 40221 42.937207763093284


170000 113380 99386 42765 43.02919928360131


180000 120117 105111 45227 43.027846752480706


190000 126518 111263 47780 42.94329651366582


200000 133178 116951 50128 42.86239536216022


210000 139672 123031 52702 42.83635831619673


220000 146310 128849 55158 42.80824841481113


230000 152811 134882 57692 42.772200886700965


240000 159370 140826 60195 42.744237569766945


250000 165963 146851 62813 42.773287209484444


260000 172567 152826 65392 42.78853074738592


270000 179201 158711 67911 42.789094643723494


280000 185833 164640 70472 42.80369290573372


290000 192642 170393 73034 42.86208940508119


300000 199407 176252 75658 42.92603771872092


310000 206007 182123 78129 42.89902977657957


320000 212850 187761 80610 42.932238324252644


330000 219409 193766 83174 42.92497135720405


340000 226173 199603 85775 42.97280101000486


350000 232702 205705 88406 42.97707882647481


360000 239507 211533 91039 43.037729337739265


370000 246096 217656 93751 43.07301429779101


380000 252818 223532 96349 43.10300091262101


390000 259311 229661 98971 43.0943869442352


400000 265922 235693 101614 43.11286291913633


410000 272635 241616 104250 43.14697702138931


420000 279259 247531 106789 43.14166710432229


430000 285766 253557 109322 43.11535473286086


440000 292306 259475 111780 43.07929472974275


450000 298740 265530 114269 43.034308741008545


460000 305281 271443 116723 43.00092468768765


470000 312006 277324 119329 43.0287317361642


480000 318433 283359 121791 42.981165235619834


490000 325087 289146 124232 42.96514563576878


500000 331617 295152 126768 42.95007318263132


510000 338145 300992 129136 42.90346587284712


520000 344736 306845 131580 42.88158516514853


530000 351339 312842 134180 42.890660461191274


540000 357914 318642 136555 42.85530469931773


550000 364452 324664 139115 42.84891457014021


560000 370998 330437 141434 42.80210751217327


570000 377604 336432 144035 42.8125148618443


580000 384291 342153 146443 42.80044307663531


590000 390935 348033 148967 42.80255033287073


600000 397545 353816 151360 42.77929771406607


610000 404063 359846 153908 42.770518499580376


620000 410750 365571 156320 42.760503431617934


630000 417264 371551 158814 42.74352646070122


640000 423970 377317 161286 42.74548986661084


650000 430721 383059 163779 42.75555462735505


660000 437513 388718 166230 42.76364871191969


670000 444095 394513 168607 42.73800863342906


680000 450849 400198 171046 42.740343529952675


690000 457389 406141 173529 42.726294562725755


458324 66.28926815157651 173874 42.726133431625506


### Analyzing our labelled versions of the output datasets

In [66]:
# Load the comments on which TextBlob and our classifier agreed.
df= pd.read_csv("./output/agree.csv",error_bad_lines=False)
len(df)

233076

In [67]:
# Keep the second half of the agree set, shuffle it (no random_state, so the
# order differs between runs) and write it out - presumably for manual
# labelling, since the cells below read a 'pol_human' column; verify.
# Note: this rebinds `df`, so re-running the cell halves the frame again.
df = df[len(df)//2:]
print(len(df))
df_shuffled = df.sample(frac=1)
df_shuffled.to_csv("output/agree_shuffled_dennis.csv")

116538


In [89]:
# Combine the two labelled "disagree" files into one frame.
test_set_youtube = pd.read_csv("./output/disagree_shuffled_dennis.csv",error_bad_lines=False)
test_set_youtube_2 = pd.read_csv("./output/disagree_shuffled_joe.csv",error_bad_lines=False)
test_set_youtube = pd.concat([test_set_youtube,test_set_youtube_2])
# Drop the first two columns - presumably index columns left over from earlier
# to_csv calls; verify against the files.
test_set_youtube.drop(test_set_youtube.columns[[0,1]], axis= 1, inplace=True)

def percentage(num,of):
    """Express ``num`` as a percentage of ``of``."""
    fraction = num / of
    return fraction * 100

# Score both tools against the human label, ignoring rows labelled neutral (2).
no_neutral = 0
correct_clf = 0
correct_tb = 0
for i, row in test_set_youtube.iterrows():
    if row['pol_human'] != 2:
        no_neutral += 1
        # Because of the elif, our classifier is only credited when TextBlob
        # is wrong; in the "disagree" files the two predictions are expected
        # to differ, so at most one of them can match the human label.
        if row['pol_textblob'] == row['pol_human']:
            correct_tb += 1
        elif row['pol_nlpWarriers'] == row['pol_human']:
            correct_clf += 1
            
print("Textblob|Our_classifier comparison\nTestset size: {} without neutral: {}\n"
      "Textblob Accuracy: {:.02f}\nOur Classifier Accuracy: {:.02f}".format(
    len(test_set_youtube),no_neutral,percentage(correct_tb,no_neutral),percentage(correct_clf,no_neutral)
))

Textblob|Our_classifier comparison
Testset size: 197 without neutral: 122
Textblob Accuracy: 25.41
Our Classifier Accuracy: 59.02


In [94]:
# Combine the two labelled "agree" files into one frame (rebinds test_set_youtube).
test_set_youtube = pd.read_csv("./output/agree_shuffled_dennis.csv",error_bad_lines=False)
test_set_youtube_2 = pd.read_csv("./output/agree_shuffled_joe.csv",error_bad_lines=False)
test_set_youtube = pd.concat([test_set_youtube,test_set_youtube_2])
# Drop the first two columns - presumably index columns left over from earlier
# to_csv calls; verify against the files.
test_set_youtube.drop(test_set_youtube.columns[[0,1]], axis= 1, inplace=True)

def percentage(num,of):
    """Express ``num`` as a percentage of ``of``."""
    fraction = num / of
    return fraction * 100

# Accuracy against the human label on rows not labelled neutral (2). Only
# pol_textblob is compared; in the "agree" files our classifier's prediction
# is expected to be the same value.
no_neutral = 0
correct = 0
for i, row in test_set_youtube.iterrows():
    if row['pol_human'] != 2:
        no_neutral += 1
        if row['pol_textblob'] == row['pol_human']:
            correct += 1
    
            
print("Textblob|Our_classifier accuracy when in agreement\nTestset size: {} without neutral: {}\n"
      "Accuracy: {:.02f}".format(
    len(test_set_youtube),no_neutral,percentage(correct,no_neutral)
))

Textblob|Our_classifier accuracy when in agreement
Testset size: 199 without neutral: 169
Accuracy: 90.53
