In [175]:
# import libraries

import pandas as pd 
import numpy as np
import nltk
from nltk.corpus import stopwords
import os

In [176]:
# import data 

df = pd.read_csv('Restaurant_Reviews.tsv',sep = "\t")
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [177]:
# check missing values 

df.isnull().sum()

Review    0
Liked     0
dtype: int64

In [178]:
# shape 

df.shape

(1000, 2)

In [179]:
# count 

df['Liked'].value_counts()

0    500
1    500
Name: Liked, dtype: int64

In [180]:
# seperate feature and target 

X = df.Review
y = df.Liked

In [181]:
# text cleaning 

import string 

def text_cleaning(text):
    
    # remove punchuation 
    clean = []
    for char in text:
        if char not in string.punctuation:
            clean.append(char)
            
    clean = " ".join(clean)
    
      
    cleaning = []
    
    for word in clean.split():
        if word.lower() not in stopwords.words('english'):
            cleaning.append(word.lower())
    
            
    return cleaning

In [182]:
data = text_cleaning(X)
data

['wow...',
 'loved',
 'place.',
 'crust',
 'good.',
 'tasty',
 'texture',
 'nasty.',
 'stopped',
 'late',
 'may',
 'bank',
 'holiday',
 'rick',
 'steve',
 'recommendation',
 'loved',
 'it.',
 'selection',
 'menu',
 'great',
 'prices.',
 'getting',
 'angry',
 'want',
 'damn',
 'pho.',
 'honeslty',
 'taste',
 'fresh.)',
 'potatoes',
 'like',
 'rubber',
 'could',
 'tell',
 'made',
 'ahead',
 'time',
 'kept',
 'warmer.',
 'fries',
 'great',
 'too.',
 'great',
 'touch.',
 'service',
 'prompt.',
 'would',
 'go',
 'back.',
 'cashier',
 'care',
 'ever',
 'say',
 'still',
 'ended',
 'wayyy',
 'overpriced.',
 'tried',
 'cape',
 'cod',
 'ravoli,',
 'chicken,',
 'cranberry...mmmm!',
 'disgusted',
 'pretty',
 'sure',
 'human',
 'hair.',
 'shocked',
 'signs',
 'indicate',
 'cash',
 'only.',
 'highly',
 'recommended.',
 'waitress',
 'little',
 'slow',
 'service.',
 'place',
 'worth',
 'time,',
 'let',
 'alone',
 'vegas.',
 'like',
 'all.',
 'burrittos',
 'blah!',
 'food,',
 'amazing.',
 'service',
 '

In [183]:
from sklearn.feature_extraction.text import CountVectorizer

In [184]:
bow_transformer = CountVectorizer()

In [185]:
data =bow_transformer.fit_transform(data)


In [186]:
data.shape

(5826, 1970)

In [187]:
bow_transformer.vocabulary_

{'wow': 1949,
 'loved': 1008,
 'place': 1288,
 'crust': 411,
 'good': 740,
 'tasty': 1712,
 'texture': 1725,
 'nasty': 1130,
 'stopped': 1651,
 'late': 953,
 'may': 1049,
 'bank': 130,
 'holiday': 836,
 'rick': 1446,
 'steve': 1643,
 'recommendation': 1402,
 'it': 909,
 'selection': 1517,
 'menu': 1072,
 'great': 754,
 'prices': 1332,
 'getting': 722,
 'angry': 61,
 'want': 1879,
 'damn': 423,
 'pho': 1274,
 'honeslty': 839,
 'taste': 1708,
 'fresh': 690,
 'potatoes': 1320,
 'like': 978,
 'rubber': 1463,
 'could': 381,
 'tell': 1717,
 'made': 1023,
 'ahead': 41,
 'time': 1755,
 'kept': 931,
 'warmer': 1883,
 'fries': 696,
 'too': 1770,
 'touch': 1779,
 'service': 1530,
 'prompt': 1344,
 'would': 1947,
 'go': 733,
 'back': 121,
 'cashier': 280,
 'care': 271,
 'ever': 584,
 'say': 1495,
 'still': 1645,
 'ended': 564,
 'wayyy': 1897,
 'overpriced': 1215,
 'tried': 1792,
 'cape': 267,
 'cod': 334,
 'ravoli': 1384,
 'chicken': 306,
 'cranberry': 396,
 'mmmm': 1095,
 'disgusted': 491,
 'pret

In [188]:
token = bow_transformer.get_feature_names()
token

['00',
 '10',
 '100',
 '11',
 '12',
 '15',
 '17',
 '1979',
 '20',
 '2007',
 '23',
 '30',
 '30s',
 '35',
 '40',
 '40min',
 '45',
 '4ths',
 '5lb',
 '70',
 '85',
 '90',
 '99',
 'absolute',
 'absolutely',
 'absolutley',
 'accident',
 'accommodations',
 'accomodate',
 'accordingly',
 'accountant',
 'ache',
 'acknowledged',
 'across',
 'actual',
 'actually',
 'added',
 'affordable',
 'afternoon',
 'again',
 'ago',
 'ahead',
 'airline',
 'airport',
 'ala',
 'albondigas',
 'all',
 'allergy',
 'almonds',
 'almost',
 'alone',
 'also',
 'although',
 'always',
 'amazing',
 'ambiance',
 'ambience',
 'amount',
 'ample',
 'and',
 'andddd',
 'angry',
 'annoying',
 'another',
 'anticipated',
 'anymore',
 'anyone',
 'anything',
 'anytime',
 'anyway',
 'anyways',
 'apart',
 'apologize',
 'apology',
 'app',
 'appalling',
 'apparently',
 'appealing',
 'appetite',
 'appetizer',
 'appetizers',
 'apple',
 'approval',
 'are',
 'area',
 'arepas',
 'aria',
 'around',
 'array',
 'arrived',
 'arrives',
 'arriving'

In [189]:
# Convert data 

title_bow = bow_transformer.transform(X)
title_bow.toarray().shape

(1000, 1970)