# Text Sentiment Classification

## Preprocessing

First let's import the useful packages:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
from pprint import pprint
from nltk.tokenize import WordPunctTokenizer
from wordcloud import WordCloud
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from time import time
from textblob import TextBlob
import csv as CSV

We used two different datasets as training sets: the “Sentiment140” dataset from Stanford University and the full training set giving to us from CrowdAI. Let's first import the Stanford dataset and delete the extra columns, and initialize the sentiment field with the good value (actually, the positive are equal to 4, we want them equal to 1):

In [2]:
cols = ['sentiment','id','date','query_string','user','text']

data_frame_stanford = pd.read_csv("./data/stanford_train.csv",header=None, names=cols,encoding="ISO-8859–1")
data_frame_stanford.drop(['id','date','query_string','user'],axis=1,inplace=True)

data_frame_stanford.sentiment[data_frame_stanford.sentiment == 4] = 1

data_frame_stanford.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


Then let's import the CrowdAI dataset and initizalise the sentiment field (=0 for the negative and =1 for the positive)

In [3]:
cols = ['text', 'sentiment']

#Load negative sentiment training set
neg_crowd = pd.read_csv('data/train_neg_full.txt',sep= "\n" ,header=None, names=cols)
neg_crowd = neg_crowd.loc[:, cols[::-1]]
neg_crowd['sentiment']= 0

#Load positive sentiment training set
pos_crowd = pd.read_csv('data/train_pos_full.txt',sep= "\n" ,header=None, names=cols)
pos_crows = pos_crowd.loc[:, cols[::-1]]
pos_crowd['sentiment']= 1

We combine now the different dataset

In [4]:
frames = [neg_crowd, data_frame_stanford, pos_crowd]
train = pd.concat(frames, ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  


Now, we define a function which will clean the tweets, by removing @, < user >, urls and all caracteres that are not a letter

In [5]:
def clean(text):
    
    tok = WordPunctTokenizer() #tokenize
    del_user1 = r'@[A-Za-z0-9]+' #users
    del_user2 = r'<user>'
    del_url1 = r'https?://[A-Za-z0-9./]+' #urls
    del_url2 = r'<url>'
    combined_del = r'|'.join((del_user1, del_user2, del_url1, del_url2)) 
    soup = BeautifulSoup(text, 'lxml') #BeautifulSoup to save computotational 
    souped = soup.get_text()
    stripped = re.sub(combined_del,'', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    #remove if more than 2 consecutrive identive letters (optional)
    #re.sub(r'((\w)\2{2,})', r"\2", text)
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    words = tok.tokenize(lower_case)
    
    return (" ".join(words)).strip()

We can now clean the dataset:

In [6]:
clean_text = []
for i in range(len(train.text)):                                                          
    clean_text.append(clean(train['text'][i]))
    
data_frame_cleaned = pd.DataFrame(clean_text, columns=['text'])
data_frame_cleaned['sentiment'] = train.sentiment

data_frame_cleaned.head()

Unnamed: 0,text,sentiment
0,vinco tresorpack difficulty of object disassem...,0
1,glad i dot have taks tomorrow thankful startho,0
2,vs celtics in the regular season were fucked i...,0
3,i could actually kill that girl i m so sorry,0
4,i find that very hard to believe im afraid,0


We can now delete the duplicate tweet and the tweet with no more text

In [7]:
data_frame_tot = data_frame_cleaned.drop_duplicates(['text'], keep = 'first')
data_frame_tot = data_frame_tot.reset_index(drop = True)
data_frame_tot.dropna(inplace = True)
data_frame_tot.reset_index(drop = True, inplace = True)
data_frame_tot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3655817 entries, 0 to 3655816
Data columns (total 2 columns):
text         object
sentiment    int64
dtypes: int64(1), object(1)
memory usage: 55.8+ MB


And have a look at the cleaned tweets:

In [8]:
data_frame_tot.head()

Unnamed: 0,text,sentiment
0,vinco tresorpack difficulty of object disassem...,0
1,glad i dot have taks tomorrow thankful startho,0
2,vs celtics in the regular season were fucked i...,0
3,i could actually kill that girl i m so sorry,0
4,i find that very hard to believe im afraid,0


In [9]:
data_frame_tot.tail()

Unnamed: 0,text,sentiment
3655812,a warning sign rt the negativity you bleed out...,1
3655813,ff too thank youuu,1
3655814,i just love shumpa that s my girl,1
3655815,the best way to start a day no matter what hap...,1
3655816,frenchieswant dtou i m not from french but don...,1


We save now the cleaned dataset in a csv file

In [10]:
data_frame_tot.to_csv('./data/dataset_cleaned_train.csv')

## Training of the classifier and validation

Load data frame:

In [11]:
data_frame = pd.read_csv('./data/dataset_cleaned_train.csv')
data_frame.head()

Unnamed: 0.1,Unnamed: 0,text,sentiment
0,0,vinco tresorpack difficulty of object disassem...,0
1,1,glad i dot have taks tomorrow thankful startho,0
2,2,vs celtics in the regular season were fucked i...,0
3,3,i could actually kill that girl i m so sorry,0
4,4,i find that very hard to believe im afraid,0


Here we decided to split the data as follow : 

- 98% training data
- 2% test data

In [12]:
#Split data
x = data_frame.text.astype('U')
y = data_frame.sentiment
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.02)

#Regressiontype
#linear_classifier = SGDClassifier()
linear_classifier = LogisticRegression()
#linear_classifier = Ridge()

#Create count vectorizer and fit it to data
count_vectorizer = CountVectorizer()
count_vectorizer.fit(x)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

We can now set the parameters of the vectorizer for a certain number of words and differt size of N-grams:

In [13]:
#n_gram_range = (1, 1)
#n_gram_range = (1, 2)
n_gram_range = (1, 3)
maximum_number_of_words = 100000
count_vectorizer.set_params(max_features=maximum_number_of_words, ngram_range=n_gram_range)


CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=100000, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

Training:

In [14]:
pipeline = Pipeline([('vectorizer', count_vectorizer),('classifier', linear_classifier)])
sentiment_fit = pipeline.fit(x_train, y_train)

Prediction:

In [15]:
y_pred = sentiment_fit.predict(x_test)
accuracy = accuracy_score(y_test, y_pred.round())
print("Accuracy = ", accuracy)

Accuracy =  0.825772392193334


Note that this result is obtained by merging the Stanford dataset to the CrowdAI dataset. Higher accuracy is achieved by taking only the CrowdAI dataset.

## Cleaning of Test data for Run.py

In [4]:
cols = ['text', 'sentiment']
test_df = pd.read_csv('data/test_data.txt',sep= "\n" ,header=None, names=cols)

In [12]:
clean_text = []
for i in range(len(test_df.text)):                                                          
    clean_text.append(clean(test_df['text'][i]))
    
test_frame_cleaned = pd.DataFrame(clean_text, columns=['text'])
test_frame_cleaned.head()

Unnamed: 0,text
0,sea doo pro sea scooter sports with the portab...
1,shucks well i work all week so now i can t com...
2,i cant stay away from bug thats my baby
3,no ma am lol im perfectly fine and not contagi...
4,whenever i fall asleep watching the tv i alway...


In [13]:
test_frame_cleaned.to_csv('data/test_data_cleaned.csv')