In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from string import digits

In [2]:
# Load the training and validation splits from CSV. Later cells read the
# 'text' (input) and 'class' (target) columns of both frames.
train, valid = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/valid.csv')
)

In [3]:
def remove_digits(s: str) -> str:
    """Return a copy of ``s`` with every ASCII digit (``0``-``9``) deleted.

    All other characters, including whitespace and punctuation, are kept
    in their original order.

    Args:
        s: The text to clean.

    Returns:
        ``s`` without any of the characters listed in ``string.digits``.
    """
    # The third argument of str.maketrans lists the characters to delete.
    digit_deleter = str.maketrans('', '', digits)
    return s.translate(digit_deleter)

In [4]:
# Strip digits from the review text of both splits so that numbers do not
# end up as vocabulary terms in the vectorizer below.
for frame in (train, valid):
    frame['text'] = frame['text'].apply(remove_digits)

In [5]:
# Binary bag-of-words over unigrams: text is lower-cased, English stop words
# are dropped, and min_df=2 discards terms seen in fewer than two documents.
vectorizer = CountVectorizer(
    binary=True,
    lowercase=True,
    min_df=2,
    ngram_range=(1, 1),
    stop_words='english',
)

# Targets for both splits.
train_labels, valid_labels = train['class'], valid['class']

# The vocabulary is learned from the training split only; the validation
# split is projected onto that same vocabulary.
train_features = vectorizer.fit_transform(train['text'])
valid_features = vectorizer.transform(valid['text'])

In [6]:
# Sanity check: (number of training documents, size of the fitted vocabulary).
train_features.shape

(9131, 8446)

In [7]:
# Fit a Bernoulli Naive Bayes classifier on the binary term features and
# predict the class of every validation document.
model = BernoulliNB().fit(train_features, train_labels)
valid_preds = model.predict(valid_features)

# Per-class precision / recall / F1 on the validation split.
report = classification_report(valid_labels, valid_preds)
print(report)

# Trailing expression: overall validation accuracy, shown as the cell output.
accuracy_score(valid_labels, valid_preds)

              precision    recall  f1-score   support

    negative       0.71      0.74      0.72       403
    positive       0.85      0.83      0.84       739

   micro avg       0.80      0.80      0.80      1142
   macro avg       0.78      0.79      0.78      1142
weighted avg       0.80      0.80      0.80      1142



0.8003502626970228

In [63]:
# Collect the validation rows the model got wrong (text + true class) as an
# independent copy, keeping the original index labels of `valid`.
is_misclassified = valid_labels != valid_preds
bad_entries = valid.loc[is_misclassified, ['text', 'class']].copy()

In [64]:
# Misclassified reviews whose true class is 'negative'
# (i.e. the model predicted something else for them).
bad_entries.loc[bad_entries['class'].eq('negative'), 'text']

5       Situated at half minute walking distance from ...
9       I had never visited this place yet but, eager ...
24      Pretty decent party place in camac street, pan...
25      Quite a good and affordable place for a quick ...
40      Good place for snacks and light items. I order...
55      I actually wanted to give it 3.5 star but thei...
75      If you are craving for some quality biriyani a...
77      Good food... great service\nFood quality is no...
89      Dude it's Christmas time, specially winter😍😍. ...
93      A very old and a well known dhaba, I must say....
101     Located at sodepur. Good place for casual meet...
104     Our visit to Irish house turned out to be a de...
127     Ordered Special Chicken Biryani and Chicken Re...
141     been here several times. wow momo was one of m...
147     During my school days.. i used to frequently v...
183     Have been here quite a few times here since it...
189     Poochkawala, just 2 minutes walk from Girish P...
203     Have o

In [83]:
# Show the full text of one misclassified review. 1021 is a row *label*
# carried over from `valid`'s index (not a position), so this lookup only
# works if that row is among the misclassified entries.
bad_entries.loc[1021, 'text']

"Ordered Fish in Garlic Sauce and Kung Pao Chicken. Food quantity is good and quality is satisfactory but even after giving directions that there should be no chillies in the food, the Kung Pao Chicken was alot spicier compared to what i've had in other places."