In [19]:
import re
import nltk
import operator
import numpy as np
import pandas as pd
import seaborn as sns
import collections as ct

from bs4 import BeautifulSoup
from nltk import pos_tag,word_tokenize
from textblob import TextBlob
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn import naive_bayes ,svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, StratifiedKFold,train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from gensim.models.keyedvectors import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec



%matplotlib inline

In [20]:
def ReadFile(inFile):
    with open(inFile, "r") as f:
        content = f.readlines()   
    content = [x.strip() for x in content]
    return (content)

In [21]:
def ExFeatures(df):
    tweet = df.Body
    df['len'] = [len(t) for t in df.Body]
    def count_URL(tweet):
        URL_count = 0
        for word in tweet.split() :
            if word.startswith('http'):
                URL_count+=1
        return URL_count
    df['URL_count'] = df["Body"].apply(count_URL)
    
    def pos_count(tweet):
        pos_count = 0
        for i in tweet.split():
            if i in positive_Words:
                pos_count+=1
        return pos_count
    df["positive_words_count"] = df["Body"].apply(pos_count)
    
    def neg_count(tweet):
        neg_count = 0
        for i in tweet.split():
            if i in negative_Words:
                neg_count+=1
        return neg_count 
    df["negative_words_count"] = df["Body"].apply(neg_count)
    
    def no_of_dots(tweet):
        liste = [val for c, val in ct.Counter(tweet).items() if c in '.' ]
        if liste == []: liste=[0]
        return liste[0]
    df['no_of_dots'] = df["Body"].apply(no_of_dots)
    
    def no_of_exclmarks(tweet):
        return sum(1 for c in tweet if c in '!' )
    df['no_of_exclmarks'] = df["Body"].apply(no_of_exclmarks)
    
    def no_of_quesmarks(tweet):
        return sum(1 for c in tweet if c in '?' )
    df['no_of_quesmarks'] = df["Body"].apply(no_of_quesmarks)
    
    def no_of_specialcharacters(tweet):
        return sum(1 for c in tweet if c in '$^~*"+=<>%&' )
    df['no_of_specialcharacters'] = df["Body"].apply(no_of_specialcharacters)
    
    def no_of_mentions(tweet):
        return sum(1 for c in tweet if c in '@' )
    df['no_of_mentions'] = df["Body"].apply(no_of_mentions)
    
    def no_of_hashtags(tweet):
        return sum(1 for c in tweet if c in '#' )
    df['no_of_hashtags'] = df["Body"].apply(no_of_hashtags)
    
    def no_of_UpperCase(tweet):
        return sum(1 for c in tweet if c.isupper())
    df['no_of_UpperCase'] = df["Body"].apply(no_of_UpperCase)
    
    def number_of_quotes(tweet):
        return sum(1 for c in tweet if c in '"' )
    df['number_of_quotes'] = df["Body"].apply(number_of_quotes)
    
    def countPosEmoji(tweet):
        numEmoji = 0
        for x in posEmoji:
            if (x[0] in tweet):
                numEmoji += tweet.count(x[0])
        return numEmoji
    df["posEmoji"] = df["Body"].apply(countPosEmoji)    
    
    def countNegEmoji(tweet):
        numEmoji = 0
        for x in negEmoji:
            if (x[0] in tweet):
                numEmoji += tweet.count(x[0])
        return numEmoji
    df["negEmoji"] = df["Body"].apply(countNegEmoji)

    def getPolarity(tweet):
        polarity = TextBlob(tweet).sentiment.polarity
        return polarity
    df["textBlobPolarity"] = df["Body"].apply(getPolarity)

    def getSubjectivity(tweet):
        subjectivity = TextBlob(tweet).sentiment.subjectivity
        return subjectivity
        
    df["textBlobSubjectivity"] = df["Body"].apply(getSubjectivity)   
    
# POS features     
    def is_adjective(tag):
        if tag == 'JJ' or tag == 'JJR' or tag == 'JJS':
            return True
        else:
            return False
    def adjective_count(tweet):
        return sum(is_adjective(i) for i in list(map(lambda x : x[1],pos_tag(tweet.split()))))
    df["adjective_count"] = df["Body"].apply(adjective_count)

    def is_adverb(tag):
        if tag == 'RB' or tag == 'RBR' or tag == 'RBS':
            return True
        else:
            return False
    def adverb_count(tweet):
        return sum(is_adverb(i) for i in list(map(lambda x : x[1],pos_tag(tweet.split()))))
    df["adverb_count"] = df["Body"].apply(adverb_count)

    def is_noun(tag):
        if tag == 'NN' or tag == 'NNS' or tag == 'NNP' or tag == 'NNPS':
            return True
        else:
            return False
    def noun_count(tweet):
        return sum(is_noun(i) for i in list(map(lambda x : x[1],pos_tag(tweet.split()))))
    df["noun_count"] = df["Body"].apply(noun_count)
    def is_verb(tag):
        if tag == 'VB' or tag == 'VBD' or tag == 'VBG' or tag == 'VBN' or tag == 'VBP' or tag == 'VBZ':
            return True
        else:
            return False
    def verb_count(tweet):
        return sum(is_verb(i) for i in list(map(lambda x : x[1],pos_tag(tweet.split()))))
    df["verb_count"] = df["Body"].apply(verb_count)
    return df

In [22]:
##POS sequence Extraction
def getPOSSeq(tweet,seqlen):
    POS = pos_tag(tweet.split())
    results = []
    for i in range(len(POS)-seqlen+1):
        buffPOS = ''
        for j in range(seqlen):
            buffPOS += POS[i+j][1]
        results.append(buffPOS)
    return(results)

def getTopPOSSequence(tweetslist,seqlen,n_top):
    topNSeqs = []
    POSdict = {'buff':0}
    seqN = 3
    for tweet in tweetslist:
        seqs = getPOSSeq(tweet,seqlen)
        for seq in seqs:
            if seq in POSdict.keys():
                POSdict[seq] += 1
            else:
                POSdict[seq] = 1
    sortedPOSSeq = sorted(POSdict.items(), key=operator.itemgetter(1))[-100:]
    sortedPOSSeqlist = []
    for x in sortedPOSSeq:
        sortedPOSSeqlist.append(x[0])
    return sortedPOSSeqlist

def POSSequenceCount(tweet,seqlen,seq):
    num = getPOSSeq(tweet,seqlen).count(seq)
    if(num):
        return num
    else:
        return 0


In [23]:
# Cleaning the tweets by removing URLs, usernames (mentions) , numbers and special characters 
def clean(tweet,emoticons):
    def Replace_unicoded(tweet):
        soup = BeautifulSoup(tweet, 'lxml')
        souped = soup.get_text()
        souped = ' '.join(re.sub("u2019","'", souped).split())
        souped = ' '.join(re.sub("u002c"," ", souped).split())
        return souped

    tweet = Replace_unicoded(tweet)
    
    def replaceEmoticons(tweet,emoticons):
        for emotion in emoticons:
            tweet = tweet.replace(emotion[0], " "+emotion[1]+" ",100)
        return tweet
    tweet = replaceEmoticons(tweet,emoticons)
    
    def Clean_tweet1(tweet):
        return ' '.join(re.sub("(\w+:\/\/\S+)|(&)|(%)|($)|(@[A-Za-z0-9_]+)|([^0-9A-Za-z_ \t])", "", tweet).split())
    
    tweet = Clean_tweet1(tweet)

    def Remove_numbers(tweet):
        return ' '.join(re.sub("(\s?[0-9]+\.?[0-9]*)", "", tweet).split())
    
    tweet = Remove_numbers(tweet)
    return tweet

In [24]:
positive_Words = ReadFile('positive-words.txt')
negative_Words = ReadFile('negative-words.txt')
posEmoji = [x.split(',') for x in ReadFile('posEmotions.txt')]
negEmoji = [x.split(',') for x in ReadFile('negEmotions.txt')]
emoticons = [x.split(',') for x in ReadFile('emotions.txt')]
positive_Words = positive_Words[35:]
negative_Words = negative_Words[35:]

In [25]:
# reading Dataset
tweets = pd.read_csv("twitter-2015+2013train.csv",names=['ID','Type','Body'])
tweets = tweets[['Type','Body']]
tweets.head()

Unnamed: 0,Type,Body
0,positive,Gas by my house hit $3.39!!!! I\u2019m going t...
1,negative,Theo Walcott is still shit\u002c watch Rafa an...
2,negative,its not that I\u2019m a GSP fan\u002c i just h...
3,negative,Iranian general says Israel\u2019s Iron Dome c...
4,neutral,Tehran\u002c Mon Amour: Obama Tried to Establi...


In [26]:
tweets = ExFeatures(tweets)
tweets.head(3)

Unnamed: 0,Type,Body,len,URL_count,positive_words_count,negative_words_count,no_of_dots,no_of_exclmarks,no_of_quesmarks,no_of_specialcharacters,...,no_of_UpperCase,number_of_quotes,posEmoji,negEmoji,textBlobPolarity,textBlobSubjectivity,adjective_count,adverb_count,noun_count,verb_count
0,positive,Gas by my house hit $3.39!!!! I\u2019m going t...,70,0,0,0,2,4,0,1,...,5,0,1,0,0.5,1.0,1,0,7,2
1,negative,Theo Walcott is still shit\u002c watch Rafa an...,81,0,0,0,1,0,0,0,...,5,0,0,0,0.0,0.0,1,1,7,1
2,negative,its not that I\u2019m a GSP fan\u002c i just h...,90,0,0,1,2,0,0,0,...,6,0,0,0,-0.8,0.9,0,2,8,2


In [27]:
tweets['Body'] = tweets['Body'].apply(clean,emoticons=emoticons)
tweets.head(3)

Unnamed: 0,Type,Body,len,URL_count,positive_words_count,negative_words_count,no_of_dots,no_of_exclmarks,no_of_quesmarks,no_of_specialcharacters,...,no_of_UpperCase,number_of_quotes,posEmoji,negEmoji,textBlobPolarity,textBlobSubjectivity,adjective_count,adverb_count,noun_count,verb_count
0,positive,Gas by my house hit Im going to Chapel Hill on...,70,0,0,0,2,4,0,1,...,5,0,1,0,0.5,1.0,1,0,7,2
1,negative,Theo Walcott is still shit watch Rafa and John...,81,0,0,0,1,0,0,0,...,5,0,0,0,0.0,0.0,1,1,7,1
2,negative,its not that Im a GSP fan i just hate Nick Dia...,90,0,0,1,2,0,0,0,...,6,0,0,0,-0.8,0.9,0,2,8,2


In [28]:
test = pd.read_csv('new_english_test.csv',names=['ID','Body'])
test.head(3)

Unnamed: 0,ID,Body
0,id,tweet
1,218775148495515649,Musical awareness: Great Big Beautiful Tomorro...
2,258965201766998017,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...


In [29]:
test = ExFeatures(test)
test.head(3)

Unnamed: 0,ID,Body,len,URL_count,positive_words_count,negative_words_count,no_of_dots,no_of_exclmarks,no_of_quesmarks,no_of_specialcharacters,...,no_of_UpperCase,number_of_quotes,posEmoji,negEmoji,textBlobPolarity,textBlobSubjectivity,adjective_count,adverb_count,noun_count,verb_count
0,id,tweet,5,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0.0,0,0,1,0
1,218775148495515649,Musical awareness: Great Big Beautiful Tomorro...,87,0,0,0,0,0,0,0,...,6,0,0,0,0.4125,0.4625,1,2,7,3
2,258965201766998017,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...,141,1,0,1,2,0,0,0,...,8,0,0,0,0.0,0.0,0,0,13,0


In [30]:
test['Body'] = test['Body'].apply(clean,emoticons=emoticons)
test.head(3)

Unnamed: 0,ID,Body,len,URL_count,positive_words_count,negative_words_count,no_of_dots,no_of_exclmarks,no_of_quesmarks,no_of_specialcharacters,...,no_of_UpperCase,number_of_quotes,posEmoji,negEmoji,textBlobPolarity,textBlobSubjectivity,adjective_count,adverb_count,noun_count,verb_count
0,id,tweet,5,0,0,0,0,0,0,0,...,0,0,0,0,0.0,0.0,0,0,1,0
1,218775148495515649,Musical awareness Great Big Beautiful Tomorrow...,87,0,0,0,0,0,0,0,...,6,0,0,0,0.4125,0.4625,1,2,7,3
2,258965201766998017,On Radiofm Fri Oct Labour analyst Shawn Hattin...,141,1,0,1,2,0,0,0,...,8,0,0,0,0.0,0.0,0,0,13,0


In [96]:
seqlen = 4
topPOSSequence = getTopPOSSequence(tweets['Body'].values,seqlen,100)
for seq in topPOSSequence:    
    tweets[seq] = tweets["Body"].apply(POSSequenceCount,seqlen=seqlen,seq=seq)
    test[seq] = test["Body"].apply(POSSequenceCount,seqlen=seqlen,seq=seq)

In [97]:
vectorizer = CountVectorizer(min_df=4,max_df=0.8,ngram_range= (1,2))
vectorized_train = vectorizer.fit_transform(tweets['Body'])
vectorized_trainDF = pd.DataFrame(vectorized_train.toarray(),columns=vectorizer.get_feature_names())
vectorized_trainDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10173 entries, 0 to 10172
Columns: 10242 entries, aa to zulu
dtypes: int64(10242)
memory usage: 794.9 MB


In [98]:
vectorized_test = vectorizer.transform(test['Body'])
vectorized_testDF = pd.DataFrame(vectorized_test.toarray(),columns=vectorizer.get_feature_names())
vectorized_testDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3036 entries, 0 to 3035
Columns: 10242 entries, aa to zulu
dtypes: int64(10242)
memory usage: 237.2 MB


In [99]:
final_train = pd.concat([tweets['Type'],tweets.drop(['Type','Body'],axis=1) ,vectorized_trainDF],axis=1)
final_train.head()

Unnamed: 0,Type,len,URL_count,positive_words_count,negative_words_count,no_of_dots,no_of_exclmarks,no_of_quesmarks,no_of_specialcharacters,no_of_mentions,...,zap,zap all,zap catch,zap nba,zimmerman,zombie,zombies,zone,zuckerman,zulu
0,positive,70,0,0,0,2,4,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,negative,81,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,negative,90,0,0,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,negative,135,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,neutral,143,1,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [100]:
final_test = pd.concat([test.drop(['Body'],axis=1) ,vectorized_testDF],axis=1)
final_test.head()

Unnamed: 0,ID,len,URL_count,positive_words_count,negative_words_count,no_of_dots,no_of_exclmarks,no_of_quesmarks,no_of_specialcharacters,no_of_mentions,...,zap,zap all,zap catch,zap nba,zimmerman,zombie,zombies,zone,zuckerman,zulu
0,id,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,218775148495515649,87,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,258965201766998017,141,1,0,1,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,262926411352903682,141,0,1,0,3,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,254948834910818305,132,0,0,0,1,0,0,3,2,...,0,0,0,0,0,0,0,0,0,0


## Word Embedding

In [36]:
#glove2word2vec(glove_input_file="E:/glove.twitter.27B.200d.txt", word2vec_output_file="glove_vectors.txt")
glove_model = KeyedVectors.load_word2vec_format("glove_vectors.txt", binary=False)

In [37]:
def featureVec(words, model, num_features):
    # Pre-initialising empty numpy array for speed
    featureVec = np.zeros(num_features,dtype="float64")
    nwords = 0
    
    #Converting Index2Word which is a list to a set for better speed in the execution.
    index2word_set = set(model.wv.index2word)
    
    for word in  words:
        if word in index2word_set:
            nwords = nwords + 1
            featureVec = np.add(featureVec,model[word])
    
    # Dividing the result by number of words to get average
    featureVec = np.divide(featureVec, nwords)
    return featureVec

# Function for calculating the average feature vector
def getAvgFeatureVecs(tweets, model, num_features):
    counter = 0
    tweetFeatureVecs = np.zeros((len(tweets),num_features),dtype="float64")
    for tweet in tweets:  
        if counter%1000. == 0.:
            print ("Tweet %d of %d" % (counter, len(tweets)))
        tweetFeatureVecs[counter] = featureVec(tweet, model, num_features)
        counter = counter+1
        
    return tweetFeatureVecs

In [38]:
trainDataVecs = getAvgFeatureVecs(tweets['Body'], glove_model, 200)

Tweet 0 of 10173




Tweet 1000 of 10173
Tweet 2000 of 10173
Tweet 3000 of 10173
Tweet 4000 of 10173
Tweet 5000 of 10173
Tweet 6000 of 10173
Tweet 7000 of 10173
Tweet 8000 of 10173
Tweet 9000 of 10173
Tweet 10000 of 10173


In [39]:
testDataVecs = getAvgFeatureVecs(test['Body'], glove_model, 200)

Tweet 0 of 3036




Tweet 1000 of 3036
Tweet 2000 of 3036
Tweet 3000 of 3036


In [40]:
trainDataVecsDF=pd.DataFrame(trainDataVecs,columns = ['W2vFeature %s_' %i for i in range(trainDataVecs.shape[1])])
trainDataVecsDF.shape

(10173, 200)

In [41]:
testDataVecsDF=pd.DataFrame(testDataVecs,columns = ['W2vFeature %s_' %i for i in range(testDataVecs.shape[1])])
testDataVecsDF.shape

(3036, 200)

In [82]:
AFINN_list = [x.split('\t') for x in ReadFile('AFINN.txt')]
def AFINN_POL(tweet):
        AFINNN = []
        for x in AFINN_list:
            if(x[0] in tweet):
                AFINNN.append(int(x[1]))
        return sum(AFINNN)

tweets["AFINN_Polarity"] = tweets["Body"].apply(AFINN_POL)
test["AFINN_Polarity"] = test["Body"].apply(AFINN_POL)


In [83]:
tweets["AFINN_Polarity"].head()

0    0
1   -6
2    1
3    2
4   -2
Name: AFINN_Polarity, dtype: int64

In [101]:
bow_w2v = pd.concat([final_train, trainDataVecsDF,tweets["AFINN_Polarity"]], axis=1)
bow_w2v = pd.concat([final_train,tweets["AFINN_Polarity"]], axis=1)

bow_w2v.shape

#import collections
#duplicate = [item for item, count in collections.Counter(rrrr).items() if count > 1]
#bow_w2v.drop(duplicate,axis=1)
#rrrr = list(bow_w2v)
#print (len([item for item, count in collections.Counter(rrrr).items() if count > 1]))

(10173, 10465)

In [102]:
bow_w2v_test = pd.concat([final_test,testDataVecsDF,test["AFINN_Polarity"]], axis=1)
bow_w2v_test = pd.concat([final_test,test["AFINN_Polarity"]], axis=1)

bow_w2v_test.shape

(3036, 10465)

In [103]:
bow_w2vcopy = bow_w2v.copy()
bow_w2v_testcopy = bow_w2v_test.copy()
bow_w2vcopy.fillna(0, inplace=True)
bow_w2v_testcopy.fillna(0, inplace=True)

In [104]:

clf_gs = LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

#clf_gs = XGBClassifier(nthread=-1,n_estimators=201,max_depth=20,objective="multi:softmax",learning_rate=.12)
clf_gs.fit(bow_w2vcopy.drop(["Type","no_of_mentions"],axis=1), bow_w2vcopy['Type'])
#clf_gs.fit(final_train.drop(["Type","no_of_mentions"],axis=1), bow_w2vcopy['Type'])

#clf_gs.fit()

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [105]:
predicted = clf_gs.predict(bow_w2v_testcopy.drop(["ID","no_of_mentions"],axis=1))
#predicted = clf_gs.predict(final_test.drop(["ID","no_of_mentions"],axis=1))
predicted = predicted[1:]

In [106]:
results = pd.read_csv('new_english_test.csv')
results = results.assign(sentiment=predicted)
results.drop(['tweet'], axis=1)
results.to_csv(path_or_buf="results.csv",columns=['id','sentiment'],index=False)