# Text Sentiment Classification

## Preprocessing

First let's import the useful packages:

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
from pprint import pprint
from nltk.tokenize import WordPunctTokenizer
from wordcloud import WordCloud
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from time import time
from textblob import TextBlob
import csv as CSV

We used two different datasets as training sets: the “Sentiment140” dataset from Stanford University and the full training set giving to us from CrowdAI. Let's first import the Stanford dataset and delete the extra columns:

In [26]:
cols = ['sentiment','id','date','query_string','user','text']

data_frame_stanford = pd.read_csv("./data/stanford_train.csv",header=None, names=cols,encoding="ISO-8859–1")
data_frame_stanford.drop(['id','date','query_string','user'],axis=1,inplace=True)

data_frame_stanford.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


Now, we define a function which will clean the tweets, by removing @, urls and hashtags

In [27]:
def tweet_cleaner(text):
    
    tok = WordPunctTokenizer() #tokenizer
    pat1 = r'@[A-Za-z0-9]+' #users
    pat2 = r'https?://[A-Za-z0-9./]+' #urls
    combined_pat = r'|'.join((pat1, pat2)) 
    soup = BeautifulSoup(text, 'lxml') #BeautifulSoup to save computotational 
    souped = soup.get_text()
    stripped = re.sub(combined_pat,'', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    #remove if more than 2 consecutrive identive letters
    #re.sub(r'((\w)\2{2,})', r"\2", text)
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tok.tokenize(lower_case)
    
    return (" ".join(words)).strip()

We can now clean the Stanford dataset:

In [4]:
clean_text = []
for i in range(len(data_frame_stanford.text)):
    if((i+1)%100000 == 0 ):
        print("%d tweets of %d have been cleaned" % (i+1, len(data_frame_stanford.text)))                                                                  
    clean_text.append(tweet_cleaner(data_frame_stanford['text'][i]))


100000 tweets of 1600000 have been cleaned
200000 tweets of 1600000 have been cleaned
300000 tweets of 1600000 have been cleaned
400000 tweets of 1600000 have been cleaned
500000 tweets of 1600000 have been cleaned
600000 tweets of 1600000 have been cleaned
700000 tweets of 1600000 have been cleaned
800000 tweets of 1600000 have been cleaned
900000 tweets of 1600000 have been cleaned
1000000 tweets of 1600000 have been cleaned
1100000 tweets of 1600000 have been cleaned
1200000 tweets of 1600000 have been cleaned
1300000 tweets of 1600000 have been cleaned
1400000 tweets of 1600000 have been cleaned
1500000 tweets of 1600000 have been cleaned
1600000 tweets of 1600000 have been cleaned


And have a look at the cleaned tweets:

In [5]:
data_frame_stanford_cleaned = pd.DataFrame(clean_text, columns=['text'])
data_frame_stanford_cleaned['sentiment'] = data_frame_stanford.sentiment

data_frame_stanford_cleaned.head()

Unnamed: 0,text,sentiment
0,awww that s a bummer you shoulda got david car...,0
1,is upset that he can t update his facebook by ...,0
2,i dived many times for the ball managed to sav...,0
3,my whole body feels itchy and like its on fire,0
4,no it s not behaving at all i m mad why am i h...,0


In [6]:
data_frame_stanford_cleaned.tail()

Unnamed: 0,text,sentiment
1599995,just woke up having no school is the best feel...,4
1599996,thewdb com very cool to hear old walt intervie...,4
1599997,are you ready for your mojo makeover ask me fo...,4
1599998,happy th birthday to my boo of alll time tupac...,4
1599999,happy charitytuesday,4


A "positive sentiment" is noted 4, let's remplace this value by 1:

In [7]:
data_frame_stanford_cleaned.sentiment[data_frame_stanford_cleaned.sentiment == 4] = 1
data_frame_stanford_cleaned.tail()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,text,sentiment
1599995,just woke up having no school is the best feel...,1
1599996,thewdb com very cool to hear old walt intervie...,1
1599997,are you ready for your mojo makeover ask me fo...,1
1599998,happy th birthday to my boo of alll time tupac...,1
1599999,happy charitytuesday,1


Now, let's load and clean the CrowdAI dataset:

In [28]:
cols = ['text', 'sentiment']

#Load negative sentiment training set
neg = pd.read_csv('data/train_neg_full.txt',sep= "\n" ,header=None, names=cols)
neg = neg.loc[:, cols[::-1]]
neg['sentiment']= 0

#Load positive sentiment training set
pos = pd.read_csv('data/train_pos_full.txt',sep= "\n" ,header=None, names=cols)
pos = pos.loc[:, cols[::-1]]
pos['sentiment']= 1

#Merge and delete <usr> and <url>
frames = [neg, pos] 
data_frame_crowdai = pd.concat(frames, ignore_index = True)
data_frame_crowdai = data_frame_crowdai.drop_duplicates(['text'], keep='first')
data_frame_crowdai['text'] = data_frame_crowdai['text'].str.replace('<user>', '')
data_frame_crowdai['text'] = data_frame_crowdai['text'].str.replace('<url>', '')
data_frame_crowdai['text'] = data_frame_crowdai['text'].str.replace('#', '')
data_frame_crowdai = data_frame_crowdai.reset_index(drop=True)

data_frame_crowdai.head()

Unnamed: 0,sentiment,text
0,0,vinco tresorpack 6 ( difficulty 10 of 10 objec...
1,0,glad i dot have taks tomorrow ! ! thankful sta...
2,0,1-3 vs celtics in the regular season = were fu...
3,0,i could actually kill that girl i'm so sorry ...
4,0,i find that very hard to believe im afraid


And clean it:

In [29]:
clean_text = []
for i in range(len(data_frame_crowdai.text)):
    if((i+1)%100000 == 0 ):
        print("%d tweets of %d have been cleaned" % (i+1, len(data_frame_crowdai.text)))                                                                  
    clean_text.append(tweet_cleaner(data_frame_crowdai['text'][i]))

100000 tweets of 2226938 have been cleaned
200000 tweets of 2226938 have been cleaned
300000 tweets of 2226938 have been cleaned
400000 tweets of 2226938 have been cleaned
500000 tweets of 2226938 have been cleaned
600000 tweets of 2226938 have been cleaned
700000 tweets of 2226938 have been cleaned
800000 tweets of 2226938 have been cleaned
900000 tweets of 2226938 have been cleaned
1000000 tweets of 2226938 have been cleaned
1100000 tweets of 2226938 have been cleaned
1200000 tweets of 2226938 have been cleaned
1300000 tweets of 2226938 have been cleaned
1400000 tweets of 2226938 have been cleaned
1500000 tweets of 2226938 have been cleaned


KeyboardInterrupt: 

In [10]:
data_frame_crowdai_cleaned = pd.DataFrame(clean_text, columns = ['text'])
data_frame_crowdai_cleaned['sentiment'] = data_frame_crowdai.sentiment

# Convert positive labels from 4 to 1
data_frame_crowdai_cleaned.sentiment[data_frame_crowdai_cleaned.sentiment == 4] = 1
data_frame_crowdai_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


Unnamed: 0,text,sentiment
0,vinco tresorpack difficulty of object disassem...,0
1,glad i dot have taks tomorrow thankful startho,0
2,vs celtics in the regular season were fucked i...,0
3,i could actually kill that girl i m so sorry,0
4,i find that very hard to believe im afraid,0


We can now merge both datasets and remove duplicates:

In [11]:
#data_frame_tot = pd.Series.append(data_frame_stanford_cleaned,data_frame_crowdai_cleaned)
data_frame_tot = data_frame_crowdai_cleaned
data_frame_tot = data_frame_tot.drop_duplicates(['text'], keep = 'first')
data_frame_tot = data_frame_tot.reset_index(drop = True)
data_frame_tot.dropna(inplace = True)
data_frame_tot.reset_index(drop = True, inplace = True)
data_frame_tot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2142594 entries, 0 to 2142593
Data columns (total 2 columns):
text         object
sentiment    int64
dtypes: int64(1), object(1)
memory usage: 32.7+ MB


And save it as a csv file:

In [12]:
data_frame_tot.to_csv('./data/crowdai_cleaned_train.csv')

## Training of the classifier

Load data frame:

In [13]:
data_frame = pd.read_csv('./data/crowdai_cleaned_train.csv')
data_frame.head()

Unnamed: 0.1,Unnamed: 0,text,sentiment
0,0,vinco tresorpack difficulty of object disassem...,0
1,1,glad i dot have taks tomorrow thankful startho,0
2,2,vs celtics in the regular season were fucked i...,0
3,3,i could actually kill that girl i m so sorry,0
4,4,i find that very hard to believe im afraid,0


In [14]:
x = data_frame.text.astype('U')
y = data_frame.sentiment
SEED = 2000
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.02, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state = SEED)
print("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train), (len(x_train[y_train == 0]) / (len(x_train)*1.))*100, (len(x_train[y_train == 1]) / (len(x_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_validation),(len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,(len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),(len(x_test[y_test == 0]) / (len(x_test)*1.))*100,(len(x_test[y_test == 1]) / (len(x_test)*1.))*100))

Train set has total 2099742 entries with 50.38% negative, 49.62% positive
Validation set has total 21426 entries with 49.90% negative, 50.10% positive
Test set has total 21426 entries with 50.22% negative, 49.78% positive


Check TextBlob build-in classifier result:

In [15]:
tbresult = [TextBlob(i).sentiment.polarity for i in x_validation]
tbpred = [0 if n < 0 else 1 for n in tbresult]
conmat = np.array(confusion_matrix(y_validation, tbpred, labels=[1,0]))
confusion = pd.DataFrame(conmat, index=['positive', 'negative'],
                         columns=['predicted_positive','predicted_negative'])
print("Accuracy Score: {0:.2f}%".format(accuracy_score(y_validation, tbpred)*100))
print("-"*80)
print("Confusion Matrix\n")
print(confusion)
print("-"*80)
print("Classification Report\n")
print(classification_report(y_validation, tbpred))

Accuracy Score: 57.46%
--------------------------------------------------------------------------------
Confusion Matrix

          predicted_positive  predicted_negative
positive                9611                1123
negative                7992                2700
--------------------------------------------------------------------------------
Classification Report

             precision    recall  f1-score   support

          0       0.71      0.25      0.37     10692
          1       0.55      0.90      0.68     10734

avg / total       0.63      0.57      0.53     21426



In [22]:
def accuracy_summary(pipeline, x_train, y_train, x_test, y_test):
    if len(x_test[y_test == 0]) / (len(x_test)*1.) > 0.5:
        null_accuracy = len(x_test[y_test == 0]) / (len(x_test)*1.)
    else:
        null_accuracy = 1. - (len(x_test[y_test == 0]) / (len(x_test)*1.))
    t0 = time()
    sentiment_fit = pipeline.fit(x_train, y_train)
    y_pred = sentiment_fit.predict(x_test)
    train_test_time = time() - t0
    accuracy = accuracy_score(y_test, y_pred.round())
    print("null accuracy: {0:.2f}%".format(null_accuracy*100))
    print("accuracy score: {0:.2f}%".format(accuracy*100))
    if accuracy > null_accuracy:
        print("model is {0:.2f}% more accurate than null accuracy".format((accuracy-null_accuracy)*100))
    elif accuracy == null_accuracy:
        print("model has the same accuracy with the null accuracy")
    else:
        print("model is {0:.2f}% less accurate than null accuracy".format((null_accuracy-accuracy)*100))
    print("train and test time: {0:.2f}s".format(train_test_time))
    print("-"*80)
    return accuracy, train_test_time
cvec = CountVectorizer()

#Regressiontype
linear_classifier = SGDClassifier()

n_features = np.arange(70000,130001,10000)
def nfeature_accuracy_checker(vectorizer=cvec, n_features=n_features, stop_words=None, ngram_range=(1, 1), classifier=linear_classifier):
    result = []
    print(classifier)
    print("\n")
    for n in n_features:
        vectorizer.set_params(stop_words=stop_words, max_features=n, ngram_range=ngram_range)
        checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', classifier)
        ])
        print("Validation result for {} features".format(n))
        nfeature_accuracy,tt_time = accuracy_summary(checker_pipeline, x_train, y_train, x_validation, y_validation)
        result.append((n,nfeature_accuracy,tt_time))
    return result


In [23]:
cvec = CountVectorizer()
cvec.fit(x)
neg_doc_matrix = cvec.transform(x[data_frame.sentiment == 0])
pos_doc_matrix = cvec.transform(x[data_frame.sentiment == 1])
neg_tf = np.sum(neg_doc_matrix,axis=0)
pos_tf = np.sum(pos_doc_matrix,axis=0)
neg = np.squeeze(np.asarray(neg_tf))
pos = np.squeeze(np.asarray(pos_tf))
term_freq_df = pd.DataFrame([neg,pos],columns=cvec.get_feature_names()).transpose()

In [24]:
print("RESULT FOR TRIGRAM WITH STOP WORDS\n")
feature_result_tg = nfeature_accuracy_checker(ngram_range=(1, 3))

RESULT FOR TRIGRAM WITH STOP WORDS

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)


Validation result for 70000 features




null accuracy: 50.10%
accuracy score: 81.73%
model is 31.63% more accurate than null accuracy
train and test time: 232.67s
--------------------------------------------------------------------------------
Validation result for 80000 features




null accuracy: 50.10%
accuracy score: 81.82%
model is 31.72% more accurate than null accuracy
train and test time: 244.22s
--------------------------------------------------------------------------------
Validation result for 90000 features


KeyboardInterrupt: 

In [None]:
plt.plot(nfeatures_plot_tg.nfeatures, nfeatures_plot_tg.validation_accuracy,label='trigram')
plt.title("3gram test result : Accuracy")
plt.xlabel("Number of features")
plt.ylabel("Validation set accuracy")

In [6]:
df = pd.read_csv('158INVERSE.csv')
df.Prediction[df.Prediction == 1] = -2
df.Prediction[df.Prediction == -1] = 1
df.Prediction[df.Prediction == -2] = -1
df.to_csv('predictions.csv')