In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics
import random

In [13]:
# Read the tab-separated movie review file into a DataFrame.
# ar_reviews_100k.tsv is noted here as another input file -- presumably it can
# be swapped in by editing the path below (TODO confirm it has the same columns).
df = pd.read_csv(
    'moviereviews.tsv',
    sep='\t',
)

In [14]:
# Print the plain-text representation of the loaded DataFrame for a first look
# at its columns and row count.
print(df)

     label                                             review
0      neg  how do films like mouse hunt get into theatres...
1      neg  some talented actresses are blessed with a dem...
2      pos  this has been an extraordinary year for austra...
3      pos  according to hollywood movies made in last few...
4      neg  my first press screening of 1998 and already i...
...    ...                                                ...
1995   pos  i like movies with albert brooks , and i reall...
1996   pos  it might surprise some to know that joel and e...
1997   pos  the verdict : spine-chilling drama from horror...
1998   pos  i want to correct what i wrote in a former ret...
1999   pos  a couple of months ago , when i first download...

[2000 rows x 2 columns]


In [15]:
# Number of rows currently in the DataFrame (first element of its shape).
df.shape[0]

2000

In [16]:
# Count the missing values in each column.
df.isna().sum()

label      0
review    35
dtype: int64

In [17]:
# Remove every row that has a missing value in any column. The DataFrame is
# modified in place, so `df` keeps referring to the same object afterwards.
df.dropna(axis=0, how='any', inplace=True)
# Rows remaining after the removal.
df.shape[0]

1965

In [18]:
# Collect the index labels of reviews that hold no visible text.
#
# - Iterate the 'review' column directly, so the scan does not depend on the
#   DataFrame having exactly two columns.
# - isinstance() is the idiomatic type check; non-string entries are skipped.
# - `not review.strip()` is true for whitespace-only strings and also for the
#   empty string, which str.isspace() would report as False.
blanks = [
    idx
    for idx, review in df['review'].items()
    if isinstance(review, str) and not review.strip()
]

print(len(blanks), 'blanks: ', blanks)

27 blanks:  [57, 71, 147, 151, 283, 307, 313, 323, 343, 351, 427, 501, 633, 675, 815, 851, 977, 1079, 1299, 1455, 1493, 1525, 1531, 1763, 1851, 1905, 1993]


In [19]:
# Remove the rows whose index labels were collected in `blanks`, in place.
df.drop(index=blanks, inplace=True)

In [20]:
# Features are the review texts; targets are the labels.
X, y = df['review'], df['label']

# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [21]:
# Pipeline: TF-IDF vectorisation followed by a multinomial Naive Bayes classifier.
text_clf_nb = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ]
)

# Train on the training split, then predict labels for the held-out split.
# Pipeline.fit returns the fitted pipeline, which allows the calls to be chained.
predictions = text_clf_nb.fit(X_train, y_train).predict(X_test)

# Report accuracy and the per-class precision / recall / F1 table.
print(f"Accuracy : {metrics.accuracy_score(y_test, predictions)}")
print("Classification Report :")
print(metrics.classification_report(y_test, predictions))

Accuracy : 0.7640625
Classification Report :
              precision    recall  f1-score   support

         neg       0.69      0.93      0.79       308
         pos       0.91      0.61      0.73       332

    accuracy                           0.76       640
   macro avg       0.80      0.77      0.76       640
weighted avg       0.80      0.76      0.76       640



In [22]:
# Pipeline: TF-IDF vectorisation followed by a linear support vector classifier.
text_clf_lsvc = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', LinearSVC()),
    ]
)

# Train on the training split, then predict labels for the held-out split.
# Pipeline.fit returns the fitted pipeline, which allows the calls to be chained.
predictions = text_clf_lsvc.fit(X_train, y_train).predict(X_test)

# Report accuracy and the per-class precision / recall / F1 table.
print(f"Accuracy : {metrics.accuracy_score(y_test, predictions)}")
print("Classification Report :")
print(metrics.classification_report(y_test, predictions))



Accuracy : 0.846875
Classification Report :
              precision    recall  f1-score   support

         neg       0.84      0.84      0.84       308
         pos       0.85      0.85      0.85       332

    accuracy                           0.85       640
   macro avg       0.85      0.85      0.85       640
weighted avg       0.85      0.85      0.85       640

