In [13]:
# Import relevant packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# scikit-learn: classifier, pipeline, data splitting, metrics and text feature extraction
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.datasets import fetch_20newsgroups

In [14]:
# Run configuration. `news` is the switch read by the train/test preparation cell:
# when True that cell uses the 20 Newsgroups data, otherwise it splits `data`.
news = True
# Name of the preprocessed corpus; interpolated into the pickle path below.
dataset = 'imdb'
# Loaded unconditionally, i.e. also when `news` is True and the split cell does
# not read `data` at all.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
data = pd.read_pickle(f'../data/{dataset}_preprocessed')

In [15]:
# Show the loaded `data` object as this cell's output.
data

Unnamed: 0,review,sentiment
0,one review ha mention watch 1 Oz episod youll ...,1
1,wonder littl product film techniqu veri unassu...,1
2,thought thi wa wonder way spend time hot summe...,1
3,basic famili littl boy jake think zombi hi clo...,0
4,petter mattei love time money visual stun film...,1
...,...,...
49995,thought thi movi right good job wasnt creativ ...,1
49996,bad plot bad dialogu bad act idiot direct anno...,0
49997,cathol taught parochi elementari school nun ta...,0
49998,Im go disagre previou comment side maltin thi ...,0


In [16]:
# Build X_train / X_test (texts) and Y_train / Y_test (labels) for the corpus
# selected by the `news` flag.
if news:
    # Labels come from scikit-learn's 20 Newsgroups loader, using its
    # predefined 'train' and 'test' subsets.
    newsgroups_train = fetch_20newsgroups(subset='train')
    newsgroups_test = fetch_20newsgroups(subset='test')
    Y_train = newsgroups_train.target
    Y_test = newsgroups_test.target

    # Texts come from separately preprocessed pickles; only the 'review'
    # column is used as model input.
    # NOTE(review): unpickling can execute arbitrary code — only load trusted files.
    X_train = pd.read_pickle('../data/news_train_preprocessed')['review']
    X_test = pd.read_pickle('../data/news_test_preprocessed')['review']

    # Texts and labels are read from two independent sources, so fail early
    # with a clear message if they do not even agree in length.
    # NOTE(review): this does not prove the row order matches — confirm that
    # the preprocessing step preserved the loader's document order.
    if len(X_train) != len(Y_train) or len(X_test) != len(Y_test):
        raise ValueError(
            'Preprocessed news texts and fetch_20newsgroups labels differ in length: '
            f'train {len(X_train)} vs {len(Y_train)}, '
            f'test {len(X_test)} vs {len(Y_test)}'
        )
else:
    # Hold out 20% of the rows of `data` for testing; the fixed seed keeps the
    # split identical across runs.
    X_train, X_test, Y_train, Y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=5)

In [17]:
# Text-classification pipeline: raw text -> token counts -> tf-idf weights ->
# random forest with 100 trees.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # A random forest is stochastic; seed it so the reported metrics are
    # reproducible on re-run. 5 is the same seed this notebook passes to
    # train_test_split.
    ('clf', RandomForestClassifier(n_estimators=100, random_state=5)),
])

In [18]:
# Fit the pipeline on the training split; fit() returns the fitted pipeline,
# so the test-split predictions can be taken from the same expression.
predicted = text_clf.fit(X_train, Y_train).predict(X_test)

# Per-class precision, recall, F1 and support on the test split.
print(metrics.classification_report(y_true=Y_test, y_pred=predicted))

              precision    recall  f1-score   support

           0       0.71      0.60      0.65       319
           1       0.60      0.74      0.66       389
           2       0.68      0.78      0.73       394
           3       0.66      0.67      0.66       392
           4       0.77      0.76      0.76       385
           5       0.79      0.72      0.75       395
           6       0.69      0.89      0.78       390
           7       0.87      0.80      0.83       396
           8       0.91      0.91      0.91       398
           9       0.87      0.93      0.90       397
          10       0.91      0.94      0.92       399
          11       0.89      0.90      0.90       396
          12       0.72      0.54      0.62       393
          13       0.85      0.72      0.78       396
          14       0.86      0.88      0.87       394
          15       0.62      0.91      0.74       398
          16       0.65      0.87      0.74       364
          17       0.94    

In [19]:
# Sanity check: dimensions of the training inputs.
X_train.shape

(11314,)

In [20]:
# Sanity check: dimensions of the test inputs.
X_test.shape

(7532,)