## Import libraries

In [11]:
import numpy as np
import pandas as pd

In [12]:
df = pd.read_csv('./TextFiles/moviereviews.tsv',sep='\t')



print('Missing values: ')
print(df.isnull().sum(),'\n')

print('Categories: ',df['label'].unique(),'\n')


print('Rate of each category: ')
print(df['label'].value_counts())

Missing values: 
label      0
review    35
dtype: int64 

Categories:  ['neg' 'pos'] 

Rate of each category: 
neg    1000
pos    1000
Name: label, dtype: int64


## Data Preprocessing

#### Drop Missing Reviews

In [13]:
df.dropna(inplace=True)
print('Missing values: ')
print(df.isnull().sum(),'\n')

Missing values: 
label     0
review    0
dtype: int64 



#### Finding Empty Reviews

In [14]:
blanks=[]
for i, lb, rv in df.itertuples():
    if rv.isspace():
        blanks.append(i)
print('Index of empty reviews: ',blanks)
df.drop(blanks,inplace=True)

Index of empty reviews:  [57, 71, 147, 151, 283, 307, 313, 323, 343, 351, 427, 501, 633, 675, 815, 851, 977, 1079, 1299, 1455, 1493, 1525, 1531, 1763, 1851, 1905, 1993]


In [15]:
df.head()

Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


#### Features

In [16]:
X = df['review']

#### Labels

In [17]:
y = df['label']

#### Data Splitting

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,random_state=0)

#### Shape of train and test data

In [19]:
print('train data shape:', X_train.shape)
print('test data shape:', X_test.shape)

train data shape: (1356,)
test data shape: (582,)


## Classifier ML model pipeline

#### Training and Prediction

In [9]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

model = RandomForestClassifier(n_estimators=50)

model.fit(X_train,y_train)

predictions = model.predict(X_test)

#### Confusion Matrix and Results

In [10]:
from sklearn import metrics
df_conf_mat= pd.DataFrame(metrics.confusion_matrix(y_test, predictions), index=['ham','spam'], columns=['ham','spam'])
print(df_conf_mat,'\n')

clf_report = metrics.classification_report(y_test, predictions)
print(clf_report,'\n')

acc = metrics.accuracy_score(y_test,predictions)
print('Model accuracy: ', acc*100)

       ham  spam
ham   1370    81
spam   104   117 

              precision    recall  f1-score   support

         ham       0.93      0.94      0.94      1451
        spam       0.59      0.53      0.56       221

    accuracy                           0.89      1672
   macro avg       0.76      0.74      0.75      1672
weighted avg       0.88      0.89      0.89      1672
 

Model accuracy:  88.93540669856459
