In [1]:
import pandas as pd

In [2]:
# Load the tab-separated reviews file (read_table splits on tabs by default).
df = pd.read_table('../../datasets/Restaurant_Reviews.tsv')

In [3]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
r1 = 'Wow... Loved this place.'
# Cleaning plan for a single review, carried out step by step in the cells below:
#   1. Lowercase the text.
#   2. Replace every character that is not a-z with a space.
#   3. Drop English stopwords.
#   4. Stem each remaining word down to its root form.

In [5]:
# Step 1: fold the sample review to lowercase.
r1 = str.lower(r1)
r1

'wow... loved this place.'

In [6]:
import re

In [7]:
# Step 2: every character outside a-z (punctuation, digits, spaces) becomes a space.
r1 = re.compile('[^a-z]').sub(' ', r1)
r1

'wow    loved this place '

In [8]:
r1.split()

['wow', 'loved', 'this', 'place']

In [9]:
import nltk

In [10]:
# The stopword corpus is a separate download from the nltk package itself.
# On an environment where it has never been fetched, `words()` raises
# LookupError, so download it once and retry.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

In [11]:
# Step 3: keep only the tokens that are not English stopwords.
words = list(filter(lambda token: token not in stopwords, r1.split()))
words

['wow', 'loved', 'place']

In [12]:
stemmer = nltk.stem.PorterStemmer()
stemmer.stem('loving')

'love'

In [13]:
stemmer.stem('loved')

'love'

In [14]:
stemmer.stem('lovely')

'love'

In [15]:
stemmer.stem('tasty')

'tasti'

In [16]:
# Steps 3 + 4: drop stopwords, then stem every token that survives.
words = list(map(stemmer.stem,
                 filter(lambda token: token not in stopwords, r1.split())))
words

['wow', 'love', 'place']

In [17]:
' '.join(words)

'wow love place'

In [18]:
def preprocess(r1, keep_words=()):
    """Clean one review into a space-separated string of stemmed tokens.

    Steps: lowercase the text, replace every character outside a-z with a
    space, split on whitespace, drop stopwords, stem what remains.

    Parameters
    ----------
    r1 : str
        Raw review text.
    keep_words : iterable of str, optional
        Lowercase words to retain even when they appear in ``stopwords``.
        With the default stopword list, 'not' is removed, so
        'Crust is not good.' becomes 'crust good'; pass ``('not',)`` to keep
        the negation. Empty by default, which reproduces the plain stopword
        filter.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.

    Notes
    -----
    Uses the notebook-level ``re``, ``stopwords`` and ``stemmer`` objects.
    """
    # Set membership instead of scanning the stopword list once per token.
    dropped = set(stopwords) - set(keep_words)
    cleaned = re.sub('[^a-z]', ' ', r1.lower())
    return ' '.join(
        stemmer.stem(word) for word in cleaned.split() if word not in dropped
    )

In [19]:
preprocess('Wow... Loved this place.')

'wow love place'

In [20]:
preprocess('Crust is not good.')

'crust good'

In [21]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [22]:
# Run the cleaning function over every review in the 'Review' column.
preprocessed_reviews = df['Review'].map(preprocess)

In [23]:
preprocessed_reviews

0                                         wow love place
1                                             crust good
2                                     tasti textur nasti
3      stop late may bank holiday rick steve recommen...
4                                select menu great price
                             ...                        
995                        think food flavor textur lack
996                               appetit instantli gone
997                         overal impress would go back
998    whole experi underwhelm think go ninja sushi n...
999    wast enough life pour salt wound draw time too...
Name: Review, Length: 1000, dtype: object

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [68]:
# Learn a unigram + bigram vocabulary from the cleaned reviews.
vectorizer = CountVectorizer(ngram_range=(1, 2)).fit(preprocessed_reviews)
vectorizer

CountVectorizer(ngram_range=(1, 2))

In [70]:
# Number of distinct unigram/bigram features. `vocabulary_` is available on
# every scikit-learn release, whereas `get_feature_names()` was deprecated in
# 1.0 and removed in 1.2.
len(vectorizer.vocabulary_)

5634

In [71]:
# Encode the cleaned reviews as n-gram counts and densify into a NumPy array.
bow_table = vectorizer.transform(preprocessed_reviews).toarray()

In [72]:
bow_table.shape

(1000, 5634)

In [73]:
y = df['Liked']

In [74]:
from sklearn.ensemble import RandomForestClassifier
# Random forests are stochastic; fix the seed so the fitted model and the
# predictions made with it are the same on every run of the notebook.
model = RandomForestClassifier(random_state=42)
model.fit(bow_table, y)

RandomForestClassifier()

In [76]:
# Accuracy on the same rows the model was fitted on, i.e. training accuracy.
model.score(X=bow_table, y=y)

0.997

In [77]:
# Hand-written reviews used to spot-check the trained model.
reviews = [
    'Good Service',
    'lovely food, awesome ambience,must visit',
    'Bueger is Delicious but veg wrap not good in taste',
    'good taste but the worst service',
    'Not bad !',
    'Fantastic location',
]

In [78]:
# Clean the hand-written reviews with the same function used on the dataset.
X_test = list(map(preprocess, reviews))
X_test

['good servic',
 'love food awesom ambienc must visit',
 'bueger delici veg wrap good tast',
 'good tast worst servic',
 'bad',
 'fantast locat']

In [79]:
# Encode with the vectorizer fitted earlier so the columns line up with bow_table.
X_test_counts = vectorizer.transform(X_test)
X_test_vectors = X_test_counts.toarray()
X_test_vectors.shape

(6, 5634)

In [80]:
# Predicted 'Liked' label (0 or 1) for each hand-written review, in order.
model.predict(X=X_test_vectors)

array([1, 1, 1, 0, 0, 1], dtype=int64)