# Dataset Import

In [8]:
import pandas as pd
import numpy as np

In [9]:
import csv

In [10]:
# Load the tab-separated training file; it has no header row, so columns come back unnamed.
# quoting=csv.QUOTE_NONE: the message column is free text, and with the default quote
# handling a stray double quote inside a message is treated as a field delimiter, which
# can silently merge several lines into one record. Treat quotes as ordinary characters.
data_frame = pd.read_csv("training.txt", delimiter="\t", header=None, quoting=csv.QUOTE_NONE)

In [11]:
# Give the two positional columns meaningful names: label first, text second.
column_labels = ['Sentiment', 'Message']
data_frame.columns = column_labels

In [12]:
# Preview the first rows to confirm the Sentiment/Message columns were labelled as intended.
data_frame.head()

Unnamed: 0,Sentiment,Message
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [13]:
# Split the frame by label value so the class balance can be inspected below.
sentiment_labels = data_frame['Sentiment']
one = data_frame[sentiment_labels == 1]
zeros = data_frame[sentiment_labels == 0]

In [14]:
# (rows, columns) of the subset whose Sentiment equals 1.
one.shape

(3943, 2)

In [15]:
# (rows, columns) of the subset whose Sentiment equals 0.
zeros.shape

(2975, 2)

In [16]:
# Summary statistics of the numeric columns; the mean of a 0/1 label is the share of 1s.
data_frame.describe()

Unnamed: 0,Sentiment
count,6918.0
mean,0.569962
std,0.495117
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


# Model building

## Libraries Import

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer

# Tokenise on runs of ASCII letters/digits, so punctuation and whitespace are dropped.
token = RegexpTokenizer(r'[a-zA-Z0-9]+')

# Bag-of-words features: lowercased unigrams with English stop words removed.
# token_pattern=None because a custom tokenizer is supplied; leaving the default
# pattern in place makes recent scikit-learn releases warn that it is ignored.
cv = CountVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 1),
    tokenizer=token.tokenize,
    token_pattern=None,
)

# NOTE(review): the vocabulary is learned from every message, including rows that
# later end up in test splits. Consider fitting inside a Pipeline on training data only.
text_counts = cv.fit_transform(data_frame['Message'].values)

## For training the model and Cross-Validation

In [18]:
def trainAndScore(model, X, y, test_sizes=None, random_state=0):
    """Fit ``model`` on a series of hold-out splits and print its test scores.

    For every test-set fraction the data is split once, the model is refitted on
    the training part and scored on the held-out part. The list of scores, their
    mean and their maximum are printed; nothing is returned.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``
        Refitted in place on every split.
    X, y : features and labels accepted by ``train_test_split``.
    test_sizes : iterable of float, optional
        Test-set fractions to evaluate. Defaults to 0.05, 0.10, ..., 0.95.
    random_state : int, optional
        Seed forwarded to ``train_test_split`` (default 0).

    Raises
    ------
    ValueError
        If ``test_sizes`` is given but empty.
    """
    # Imported here so the function does not depend on a later notebook cell
    # having already brought train_test_split into the namespace.
    from sklearn.model_selection import train_test_split

    if test_sizes is None:
        test_sizes = [k / 20.0 for k in range(1, 20)]
    else:
        test_sizes = list(test_sizes)
    if not test_sizes:
        raise ValueError("test_sizes must contain at least one value")

    scores = []
    for ts in test_sizes:
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=ts, random_state=random_state
        )
        clf = model.fit(X_train, y_train)
        scores.append(clf.score(X_test, y_test))
    print("scores : ", scores)
    print("Mean : ", np.array(scores).mean())
    print("Max : ", max(scores))

### Splitting the data into train and test sets

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the count-vectorised messages for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    text_counts,
    data_frame['Sentiment'],
    test_size=0.3,
    random_state=1,
)

# Text Classification Model using Naive Bayes classifier 

In [20]:
from sklearn.naive_bayes import MultinomialNB

from sklearn import metrics

# Fit multinomial Naive Bayes on the training counts (fit returns the estimator itself).
clf = MultinomialNB()
clf = clf.fit(X_train, y_train)

# Predict labels for the held-out rows and report accuracy as a percentage.
predicted = clf.predict(X_test)
accuracy_pct = metrics.accuracy_score(y_test, predicted)*100
print("Naive Bayes classifier Accuracy:", accuracy_pct, "%")

Naive Bayes classifier Accuracy: 99.03660886319847 %


## Model Performance - Precision and Recall

In [21]:
from sklearn.metrics import average_precision_score
# Average precision summarises the whole precision-recall curve, so it must be given a
# continuous score per sample. Hard 0/1 predictions collapse the curve to one threshold.
# Use the predicted probability of the positive class (label 1) instead.
positive_column = list(clf.classes_).index(1)
positive_scores = clf.predict_proba(X_test)[:, positive_column]
average_precision = average_precision_score(y_test, positive_scores)

print('Average precision-recall score: {0:0.2f}'.format(average_precision))

Average precision-recall score: 0.99


In [22]:
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt
from inspect import signature

# The curve is traced by sweeping a threshold over continuous scores; passing hard 0/1
# predictions yields only a degenerate three-point curve. Score each held-out sample
# with the predicted probability of the positive class (label 1).
pr_positive_column = list(clf.classes_).index(1)
pr_scores = clf.predict_proba(X_test)[:, pr_positive_column]
precision, recall, _ = precision_recall_curve(y_test, pr_scores)

# In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
step_kwargs = ({'step': 'post'}
               if 'step' in signature(plt.fill_between).parameters
               else {})
plt.step(recall, precision, color='b', alpha=0.2,
         where='post')
plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))
# Render the figure explicitly so the cell does not echo the Text(...) repr of the title.
plt.show()

Text(0.5, 1.0, '2-class Precision-Recall curve: AP=0.99')

In [23]:
# Score Naive Bayes on the count features across the full range of test-set fractions.
model = MultinomialNB()
trainAndScore(model, X=text_counts, y=data_frame['Sentiment'])

scores :  [0.9826589595375722, 0.9841040462427746, 0.98747591522158, 0.9869942196531792, 0.9872832369942196, 0.9884393063583815, 0.9876135425268373, 0.9859104046242775, 0.9849068721901092, 0.9855449551893611, 0.9844940867279894, 0.9853047458443749, 0.9837669557482767, 0.9841007639892628, 0.9841973405280401, 0.9821138211382113, 0.9790851895936066, 0.9760719447567047, 0.9774836452152746]
Mean :  0.984081576425265
Max :  0.9884393063583815


# Text Classification Model using TF-IDF

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF weighted features with the vectorizer's default tokenisation and settings.
tf = TfidfVectorizer()
message_texts = data_frame['Message']
text_tf = tf.fit_transform(message_texts)

In [25]:
# Hold out 10% of the TF-IDF features for evaluation; this rebinds the earlier split variables.
X_train, X_test, y_train, y_test = train_test_split(
    text_tf,
    data_frame['Sentiment'],
    test_size=0.10,
    random_state=1,
)

In [26]:
# Fit multinomial Naive Bayes on the current training split (fit returns the estimator itself).
clf = MultinomialNB()
clf = clf.fit(X_train, y_train)

# Predict labels for the held-out rows and report accuracy as a percentage.
predicted = clf.predict(X_test)
accuracy_pct = metrics.accuracy_score(y_test, predicted)*100
print("Naive Bayes classifier Accuracy:", accuracy_pct, "%")

Naive Bayes classifier Accuracy: 97.83236994219652 %


In [27]:
# Build the Naive Bayes estimator explicitly rather than relying on whatever the global
# `model` currently holds: other cells rebind that name to different estimators, so
# re-running this cell out of order would silently score the wrong model.
trainAndScore(MultinomialNB(), text_tf, data_frame['Sentiment'])

scores :  [0.9595375722543352, 0.9653179190751445, 0.9730250481695568, 0.976878612716763, 0.9803468208092485, 0.9821772639691715, 0.9797687861271677, 0.9797687861271677, 0.9788053949903661, 0.9783174327840416, 0.9763469119579501, 0.9780775716694773, 0.9775405826106293, 0.9756349370225067, 0.9751397186355752, 0.9725383920505871, 0.9709233123618433, 0.9694877147904287, 0.9621177544500228]
Mean :  0.9743026596090518
Max :  0.9821772639691715


# Logistic Regression Model

In [28]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults, scored on the count features
# across the full range of test-set fractions.
model = LogisticRegression()
trainAndScore(model, X=text_counts, y=data_frame['Sentiment'])



scores :  [0.9884393063583815, 0.9913294797687862, 0.9932562620423893, 0.9934971098265896, 0.9953757225433526, 0.9932562620423893, 0.9946325350949629, 0.993135838150289, 0.9926140012845215, 0.9924833766984678, 0.9913272010512484, 0.9915682967959528, 0.9902156993551257, 0.9907082386950238, 0.9884370784351513, 0.9857271906052394, 0.9863968712803944, 0.9849044483700016, 0.9818956336528222]
Mean :  0.9904842395816363
Max :  0.9953757225433526


# Random Forest Model

In [29]:
from sklearn.ensemble import RandomForestClassifier

## Finding the best parameters

In [30]:
from sklearn.model_selection import GridSearchCV

# Base estimator for the search. n_estimators and max_features set here are overridden
# by the grid below. random_state is fixed so repeated runs select the same parameters.
rfc = RandomForestClassifier(
    n_jobs=-1,
    max_features='sqrt',
    n_estimators=50,
    oob_score=True,
    random_state=0,
)

# 'auto' is intentionally not listed for max_features: for a random forest classifier it
# was only an alias of 'sqrt' (so it doubled the work for no new information), and it is
# rejected by scikit-learn 1.3 and later.
param_grid = {
    'n_estimators': [30, 150],
    'max_depth': [2, 40],
    'min_samples_split': [3, 20],
    'max_features': ['sqrt', 'log2'],
}

# Exhaustive search over the grid with 5-fold cross-validation on the count features.
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5)
CV_rfc.fit(text_counts, data_frame['Sentiment'])
print(CV_rfc.best_params_)

{'max_depth': 40, 'max_features': 'sqrt', 'min_samples_split': 20, 'n_estimators': 30}


In [31]:
# Larger forest (150 trees, small min_samples_split) scored on the count features
# across the full range of test-set fractions.
model = RandomForestClassifier(
    n_estimators=150,
    max_depth=40,
    min_samples_split=3,
    max_features='sqrt',
)
trainAndScore(model, X=text_counts, y=data_frame['Sentiment'])

scores :  [0.9826589595375722, 0.9841040462427746, 0.9855491329479769, 0.9855491329479769, 0.9884393063583815, 0.9884393063583815, 0.9896779521056978, 0.9898843930635838, 0.9871547848426461, 0.9887250650477016, 0.9886990801576873, 0.9894001445434835, 0.9897709584167222, 0.9876109849266983, 0.9884370784351513, 0.9848238482384823, 0.9858867539534093, 0.9841014934960656, 0.9794614331355546]
Mean :  0.9867565186713656
Max :  0.9898843930635838


In [32]:
# Smaller forest (30 trees, min_samples_split=20) scored on the count features.
# max_features='sqrt' replaces the former 'auto': for a random forest classifier the two
# select the same number of features, but 'auto' is rejected by scikit-learn 1.3 and later.
model = RandomForestClassifier(
    n_estimators=30,
    max_depth=40,
    min_samples_split=20,
    max_features='sqrt',
)
trainAndScore(model, text_counts, data_frame['Sentiment'])

scores :  [0.9855491329479769, 0.9826589595375722, 0.9884393063583815, 0.9841040462427746, 0.9872832369942196, 0.9869942196531792, 0.9880264244426095, 0.9844653179190751, 0.9858702633269107, 0.9820757444348077, 0.98580814717477, 0.9884365213201638, 0.9882143651323104, 0.9861655998348131, 0.9849682019656967, 0.9808491418247516, 0.9814657371195374, 0.9804079010759595, 0.9730716567777271]
Mean :  0.984465996004381
Max :  0.9884393063583815
