In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [67]:
# Load the tab-separated, header-less file and hold out 20% of the rows for testing.
# With header=None pandas labels the columns 0 and 1; column 0 is used as the
# model input and column 1 as the target in the split below.
DATA_PATH = '~/Desktop/sentiment labelled sentences/imdb_labelled.txt'  # TODO: point at a project-relative data directory
SEED = 42  # fixed seed so a fresh "Restart & Run All" reproduces the same split and metrics

data = pd.read_csv(DATA_PATH, delimiter="\t", header=None)
# NOTE(review): confirm len(data) matches the number of lines in the raw file --
# pandas' default quote handling can merge lines that contain unbalanced double quotes.
X_train, X_test, y_train, y_test = train_test_split(
    data[0], data[1], test_size=0.2, random_state=SEED
)

In [68]:
# Bag-of-words step: learn the vocabulary from the training sentences only,
# then encode those same sentences as token-count vectors.
count_vect = CountVectorizer()
count_vect.fit(X_train)
X_train_counts = count_vect.transform(X_train)

# Quick sanity checks: the shape of the count matrix, and the vocabulary entry
# for the token 'sad' (dict.get returns None when the token was never seen).
print(X_train_counts.shape)
print(count_vect.vocabulary_.get('sad'))

(598, 2623)
None


In [69]:
# TF-IDF step: fit the transformer on the training counts, then re-weight
# those counts. The shape is printed to confirm it matches the count matrix.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
print(X_train_tfidf.shape)

(598, 2623)


In [70]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training features.
# Hyperparameters are collected in one place so they are easy to spot and tune.
nb_params = {'alpha': 0.6, 'fit_prior': True}
clf = MultinomialNB(**nb_params).fit(X_train_tfidf, y_train)

In [71]:
# Model testing: predict labels for the held-out sentences.
# Only transform() is called on the vectorizer and the TF-IDF transformer here
# (never fit), so the test data is encoded with what was fitted on X_train.
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
# One predicted label per row of X_test_tfidf, from the classifier trained above.
predicted = clf.predict(X_test_tfidf)

In [72]:
# Model evaluation: accuracy is the fraction of held-out rows whose predicted
# label equals the true label (mean of an element-wise boolean comparison).
is_correct = predicted == y_test
accuracy = np.mean(is_correct)
print('Accuracy: ', accuracy)

Accuracy:  0.76


In [46]:
# Build the text report for the held-out predictions first, then print it
# under a heading.
report_text = metrics.classification_report(y_test, predicted)
print('Classification Report: ', '\n')
print(report_text)

Classification Report:  

             precision    recall  f1-score   support

          0       0.71      0.78      0.75       102
          1       0.81      0.74      0.77       123

avg / total       0.76      0.76      0.76       225



In [14]:
# Confusion matrix for the held-out predictions. y_test is passed as the true
# labels, so rows correspond to true classes and columns to predicted classes.
conf_matrix = metrics.confusion_matrix(y_test, predicted)
print('Confusion Matrix: ', '\n')
print(conf_matrix)

Confusion Matrix:  

[[81 19]
 [33 92]]
