**Importing packages**

In [98]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

**Reading the dataset**

In [99]:
features = ['Unnamed: 0' , 'Unnamed: 0.1' , 'Unnamed: 0.1.1' , 'Unnamed: 0.1.1.1' , 'Unnamed: 0.1.1.1.1' , 'ID' , 'label' , 'statement' , 'subject' , 'speaker' , 'speaker_job' , 'state_info' , 'party_affiliation' , 'barely_true_counts' , 'false_counts' , 'half_true_counts' , 'mostly_true_counts' , 'pants_on_fire_counts' , 'context' , 'sentiment' , 'sentiment_score' , 'sentiment_magnitude' , 'anger' , 'fear' , 'joy' , 'disgust' , 'sad' , 'speaker_id' , 'list' , 'sentiment_code']
path_train = 'train_final.csv'
Train_Data = pd.read_csv(path_train, usecols = features)
path_test = 'test_final.csv'
Test_Data = pd.read_csv(path_test, usecols = features)
# convert label to a numerical variable (# false equal to 0 , true equal to 1)
Train_Data['label_num'] = Train_Data.label.map({'half-true':1, 'false':0, 'mostly-true':1, 'true':1, 'barely-true':0, 'pants-fire':0})
Test_Data['label_num'] = Test_Data.label.map({'half-true':1, 'false':0, 'mostly-true':1, 'true':1, 'barely-true':0, 'pants-fire':0})
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
X_train = Train_Data.statement
y_train = Train_Data.label_num
X_test = Test_Data.statement
y_test = Test_Data.label_num

**Vectorizing our dataset**

In [100]:
# instantiate the vectorizer
vect = CountVectorizer()
# fit and transform training data
X_train_dtm = vect.fit_transform(X_train)
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)

**Building a model**

In [101]:
# instantiate a Multinomial Naive Bayes model
nb = MultinomialNB()
# train the model 
%time nb.fit(X_train_dtm, y_train)
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)

CPU times: user 7.56 ms, sys: 0 ns, total: 7.56 ms
Wall time: 10.2 ms


**Evaluating the model**

In [102]:
# calculate : Accuracy , Recall , Precision , F1 measure
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_class)
Precision = confusion_matrix[0][0]/(confusion_matrix[0][0]+confusion_matrix[0][1])
Recall = confusion_matrix[0][0]/(confusion_matrix[0][0]+confusion_matrix[1][0])
Accuracy = (confusion_matrix[0][0]+confusion_matrix[1][1])/(confusion_matrix[0][0]+confusion_matrix[1][0]+confusion_matrix[0][1]+confusion_matrix[1][1])
F1_measure = (2*Precision*Recall)/(Recall+Precision)
# Print : Confusion matrix , Accuracy , Recall , Precision , F1 measure
print("Confusion matrix is = \n",confusion_matrix,"\n","----------")
print("Accuracy is = \n",100*Accuracy," %","\n","----------")
print("Recall is = \n",Recall,"\n","----------")
print("Precision is = \n",Precision,"\n","----------")
print("F1 measure is = \n",F1_measure)

Confusion matrix is = 
 [[263 290]
 [202 512]] 
 ----------
Accuracy is = 
 61.16811365430151  % 
 ----------
Recall is = 
 0.5655913978494623 
 ----------
Precision is = 
 0.4755877034358047 
 ----------
F1 measure is = 
 0.5166994106090373
