In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk

In [18]:
# Read the raw reviews CSV into a DataFrame and preview the first rows.
reviews_csv = "./Data/Food_reviews/fitchai.csv"
df = pd.read_csv(reviews_csv)
df.head()

Unnamed: 0,Reviews
0,awesome taste
1,nothing
2,ok
3,taste is good
4,Totally tasteless


In [19]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

#### Text Cleaning

In [20]:
# Build `corpus`: one cleaned, stemmed string per row of df['Reviews'].

# Loop-invariant helpers are created once up front. Rebuilding the stopword
# set for every word (and the stemmer for every row) is pure wasted work.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []

# Iterate over the column values rather than df['Reviews'][i], so the loop does
# not depend on the index being exactly 0..n-1. Missing reviews are treated as
# empty text instead of crashing the regex substitution.
for raw_review in df['Reviews'].fillna('').astype(str):

    # replace every non-letter with a space, lower-case, split on whitespace
    words = non_letters.sub(' ', raw_review).lower().split()

    # drop English stopwords and reduce the remaining words to their stems
    stems = [ps.stem(word) for word in words if word not in stop_words]

    # rejoin the stems into a single string and collect it
    corpus.append(' '.join(stems))

In [21]:
# Store the cleaned text alongside the raw reviews (row-for-row) and preview.
cleaned_column = pd.Series(corpus, index=df.index)
df['cleaned_reviews'] = cleaned_column
df.head()

Unnamed: 0,Reviews,cleaned_reviews
0,awesome taste,awesom tast
1,nothing,noth
2,ok,ok
3,taste is good,tast good
4,Totally tasteless,total tasteless


#### Tokenization and Bag-of-Words Vectorization

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: one column per term, capped at the 1500 terms that
# occur most often across the corpus.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)

# Dense (rows x terms) array of counts.
X = term_counts.toarray()

# Target: the second column of df, selected by position.
# NOTE(review): in the previews shown in this notebook the column at position 1
# is the review text ('Reviews') and no sentiment/label column is visible, so
# the classifier would be trained to predict review strings. Confirm the
# intended target column before trusting any downstream result.
y = df.iloc[:, 1].values

#### Splitting the dataset into training and testing

In [23]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. random_state pins the shuffle so the
# split, and everything computed from it, is the same on every fresh run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

#### Model Fitting

In [25]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 501 trees using the entropy split criterion.
# random_state is fixed so the randomness used while building the trees is
# repeatable and the fitted model is the same on every fresh run.
model = RandomForestClassifier(
    n_estimators=501, criterion='entropy', random_state=42
)

model.fit(X_train, y_train)


#### Prediction

In [29]:
# Predict a target value for every held-out row using the fitted model.
y_pred = model.predict(X_test)

# Show the predictions as the cell output.
y_pred


array(['total tasteless', 'noth', 'noth', 'tast awesom healthi', 'noth',
       'worest tea tast also good',
       'awesom tast one tri enjoy authent tast indian chai',
       'worest tea tast also good'], dtype=object)

In [34]:
# Show the held-out true targets for a side-by-side comparison with y_pred.
y_test

array(['salt spice bread sweet never tasteless omelett',
       'super healthi tasti fitchai',
       'tri golden milkshak tulasi milkshak', 'awesom tast',
       'wonder tea love', 'tast good',
       'one best chai shop kondapur raghavendra coloni area tri fit chai',
       'tea tast like burnt'], dtype=object)

#### Evaluation (Confusion Matrix)

In [32]:
from sklearn.metrics import confusion_matrix

# Cross-tabulate the held-out true targets against the model's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)


In [33]:
# Display the confusion matrix computed from y_test and y_pred.
cm

array([[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int64)