In [32]:
#Importing necessary libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [33]:
#Load the restaurant reviews; the file is tab-separated, hence sep='\t'
df = pd.read_csv('Restaurant_Reviews.tsv',sep='\t')

In [34]:
#Inspect the loaded frame: a 'Review' text column and a 'Liked' label column
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [35]:
#Lower-case every review so tokens differing only in case are treated alike (overwrites 'Review')
df['Review'] = df['Review'].str.lower()

In [36]:
#Build the English stop-word lookup once, as a set, for fast membership tests
#NOTE(review): NLTK's English list appears to include negations such as "not"/"no";
#dropping them may hurt sentiment classification - confirm this is intended
stop_words = set(stopwords.words("english"))

In [37]:
#Function to remove stopwords
def remove_stopwords(text):
    """Return ``text`` with every token that appears in ``stop_words`` dropped.

    The text is split with ``nltk.word_tokenize`` and the surviving tokens are
    joined back together with single spaces. Relies on the notebook-level
    ``stop_words`` set.
    """
    tokens = nltk.word_tokenize(text)
    kept_tokens = filter(lambda token: token not in stop_words, tokens)
    return " ".join(kept_tokens)

In [38]:
#Strip stop words from every review (overwrites the 'Review' column)
df['Review'] = df['Review'].apply(remove_stopwords)

In [39]:
#Notebook-level WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()

In [40]:
#Function for lemmatization
def my_lemmatizer(a):
    """Lemmatize every token of ``a`` and return them joined by single spaces.

    The text is split with ``nltk.word_tokenize``. The notebook-level
    ``lemmatizer`` (created in the cell above) is reused rather than building a
    new ``WordNetLemmatizer`` on every call, which previously shadowed that
    instance and left it unused.

    NOTE(review): ``lemmatize`` is called without a part-of-speech tag, so NLTK
    falls back to its default POS - confirm that is sufficient for verbs and
    adjectives in the reviews.
    """
    return ' '.join(lemmatizer.lemmatize(token) for token in nltk.word_tokenize(a))

In [41]:
#Lemmatize every review (overwrites the 'Review' column)
df['Review'] = df['Review'].apply(my_lemmatizer)

In [42]:
#Function for removing numbers
def remove_numbers(x):
    """Drop purely numeric tokens from ``x`` and re-join the rest with spaces.

    A token is removed only when ``str.isdigit()`` is true for the whole token,
    so mixed tokens such as '40min' are kept.
    """
    non_numeric_tokens = [token for token in word_tokenize(x) if not token.isdigit()]
    return ' '.join(non_numeric_tokens)

In [43]:
#Remove all-digit tokens from every review (overwrites the 'Review' column)
df['Review'] = df['Review'].apply(remove_numbers)

In [44]:
#Separate the model input (cleaned review text) from the target (Liked label)
df_x = df['Review']
df_y = df['Liked']

In [45]:
#Dividing data into training and testing datasets (33% held out, fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.33, random_state=42)

In [46]:
#Bag-of-words vectorizer with default settings
vectorizer = CountVectorizer()

In [47]:
#Learn the vocabulary from the training split only, so it contains no words seen only in test reviews
vectorizer.fit(X_train)

In [48]:
#Preview the sparse document-term matrix for the training reviews (displayed only, not stored)
vectorizer.transform(X_train)

<670x1418 sparse matrix of type '<class 'numpy.int64'>'
	with 3700 stored elements in Compressed Sparse Row format>

In [49]:
#Look at the first 100 vocabulary terms learned by the vectorizer
vectorizer.get_feature_names_out()[0:100]

array(['10', '40min', '4ths', '5lb', '70', '85', 'absolute', 'absolutely',
       'absolutley', 'accident', 'accomodate', 'accordingly',
       'accountant', 'acknowledged', 'actual', 'actually', 'added',
       'affordable', 'afternoon', 'ago', 'ala', 'all', 'allergy',
       'almost', 'alone', 'also', 'although', 'always', 'amazing',
       'ambiance', 'ambience', 'amount', 'ample', 'and', 'andddd',
       'another', 'anymore', 'anyone', 'anything', 'anytime', 'anyway',
       'anyways', 'apart', 'apologize', 'apology', 'app', 'appalling',
       'apparently', 'appealing', 'appetite', 'appetizer', 'apple',
       'area', 'arepas', 'aria', 'around', 'array', 'arrives', 'arriving',
       'ask', 'asked', 'asking', 'assure', 'ate', 'atmosphere',
       'attached', 'attack', 'attention', 'attentive', 'attitude', 'auju',
       'authentic', 'average', 'avocado', 'avoid', 'avoided', 'away',
       'awesome', 'awful', 'awkward', 'awkwardly', 'baby', 'back',
       'bacon', 'bad', 'bagel', '

In [50]:
#Encode the training reviews as a sparse document-term count matrix
X_train_vector = vectorizer.transform(X_train)

In [51]:
#Dense NumPy array of training counts. Recomputed from the vectorizer instead of calling
#.toarray() on X_train_vector itself, so re-running this cell does not fail on an
#already-dense array.
#NOTE(review): scikit-learn's LogisticRegression also accepts sparse input, so densifying
#may be unnecessary - confirm
X_train_vector = vectorizer.transform(X_train).toarray()

In [52]:
#Encode the test reviews with the vocabulary fitted on the training split, as a dense array
X_test_vector = vectorizer.transform(X_test).toarray()

In [53]:
#Display the dense training count matrix as a sanity check
X_train_vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [54]:
#Display the dense test count matrix as a sanity check
X_test_vector

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [55]:
#Encoder for the target labels
le = LabelEncoder()

In [56]:
#Inspect the training labels
y_train

703    1
311    0
722    1
629    1
0      1
      ..
106    1
270    1
860    1
435    0
102    1
Name: Liked, Length: 670, dtype: int64

In [57]:
#Learn the set of label classes from the training labels
le.fit(y_train)

In [58]:
#Preview the encoded training labels (displayed only, not stored)
le.transform(y_train)

array([1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0,

In [59]:
#Encode the labels of both splits with the encoder fitted on the training labels
y_train_encoded = le.transform(y_train)
y_test_encoded = le.transform(y_test)

In [60]:
#Logistic regression classifier with default hyper-parameters
lr = LogisticRegression()

#fit() returns the estimator itself, so lr_model and lr refer to the same fitted model
lr_model = lr.fit(X_train_vector, y_train_encoded)

#Predict encoded labels for the held-out reviews (computed once; the earlier duplicate
#predict call whose result was discarded has been removed)
lr_model_predicted = lr.predict(X_test_vector)

In [61]:
#Accuracy score of the model on the held-out test split
accuracy_score(y_test_encoded, lr_model_predicted)

0.7787878787878788