In [55]:
# Import relevant packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# scikit-learn: classifier, pipeline, data splitting, metrics and text feature extraction
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Datasets
from sklearn.datasets import fetch_20newsgroups
import utils

In [56]:
# Experiment switches:
#   news    -- when True, later cells use the 20 Newsgroups data instead of `data`.
#   dataset -- name used to locate the preprocessed pickle under ../data/.
news, dataset = True, 'amazon'

# Load the preprocessed frame for `dataset`.
# NOTE(review): read_pickle unpickles the file, so only point it at trusted data.
data = pd.read_pickle(
    f'../data/{dataset}_preprocessed'
)

In [57]:
# Build X_train / X_test / Y_train / Y_test for the selected experiment.
if news:
    # Labels come from scikit-learn's predefined 20 Newsgroups train/test
    # subsets (fetched or read from the local scikit-learn cache).
    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')
    Y_train = newsgroups_train.target
    Y_test = newsgroups_test.target

    # The text comes from separately preprocessed pickles; only the
    # 'review' column is used as model input.
    X_train = pd.read_pickle('../data/news_train_preprocessed')['review']
    X_test = pd.read_pickle('../data/news_test_preprocessed')['review']

    # Text and labels are loaded from two independent sources, so verify
    # that they line up before any model is trained on them.
    if len(X_train) != len(Y_train) or len(X_test) != len(Y_test):
        raise ValueError(
            'Preprocessed news text does not match the 20 Newsgroups labels: '
            f'train {len(X_train)} vs {len(Y_train)}, '
            f'test {len(X_test)} vs {len(Y_test)}'
        )
else:
    # Hold out 20% of the preprocessed frame, using 'review' as the input
    # text and 'sentiment' as the label; the seed keeps the split repeatable.
    X_train, X_test, Y_train, Y_test = train_test_split(
        data['review'],
        data['sentiment'],
        test_size=0.2,
        random_state=5,
    )

In [58]:
# Text classifier assembled from three default-configured stages:
#   'vect'  -- token counts per document
#   'tfidf' -- TF-IDF re-weighting of those counts
#   'clf'   -- multinomial naive Bayes on the weighted features
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [59]:
# Fit the pipeline on the training split, then label the test split.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
predicted = text_clf.fit(X_train, Y_train).predict(X_test)

# Print per-class precision, recall, F1 and support for the test split.
print(metrics.classification_report(y_true=Y_test, y_pred=predicted))

              precision    recall  f1-score   support

           0       0.84      0.55      0.67       319
           1       0.79      0.73      0.76       389
           2       0.83      0.70      0.76       394
           3       0.69      0.80      0.74       392
           4       0.85      0.84      0.85       385
           5       0.90      0.79      0.84       395
           6       0.95      0.71      0.81       390
           7       0.88      0.94      0.91       396
           8       0.95      0.95      0.95       398
           9       0.95      0.94      0.94       397
          10       0.93      0.98      0.95       399
          11       0.64      0.97      0.77       396
          12       0.83      0.64      0.72       393
          13       0.92      0.84      0.88       396
          14       0.88      0.93      0.90       394
          15       0.55      0.97      0.70       398
          16       0.63      0.96      0.76       364
          17       0.92    