### Importing libraries

In [1]:
import numpy as np
import pandas as pd

### Importing dataset 

In [2]:
# Read the complete Yelp review dump into memory and preview its first rows.
dataset = pd.read_csv(filepath_or_buffer='yelp_review.csv')
dataset.head(n=5)

Unnamed: 0,review_id,user_id,business_id,stars,date,text,useful,funny,cool
0,vkVSCC7xljjrAI4UGfnKEQ,bv2nCi5Qv5vroFiqKGopiw,AEx2SYEUJmTxVVB18LlCwA,5,2016-05-28,Super simple place but amazing nonetheless. It...,0,0,0
1,n6QzIUObkYshz4dz2QRJTw,bv2nCi5Qv5vroFiqKGopiw,VR6GpWIda3SfvPC-lg9H3w,5,2016-05-28,Small unassuming place that changes their menu...,0,0,0
2,MV3CcKScW05u5LVfF6ok0g,bv2nCi5Qv5vroFiqKGopiw,CKC0-MOWMqoeWf6s-szl8g,5,2016-05-28,Lester's is located in a beautiful neighborhoo...,0,0,0
3,IXvOzsEMYtiJI0CARmj77Q,bv2nCi5Qv5vroFiqKGopiw,ACFtxLv8pGrrxMm6EgjreA,4,2016-05-28,Love coming here. Yes the place always needs t...,0,0,0
4,L_9BTb55X0GDtThi6GlZ6w,bv2nCi5Qv5vroFiqKGopiw,s2I_Ni76bjJNK9yG60iD-Q,4,2016-05-28,Had their chocolate almond croissant and it wa...,0,0,0


In [3]:
# (rows, columns) of the full dump, before any filtering.
dataset.shape

(5261668, 9)

In [4]:
# Keep only the rows whose index label is 5220000 or higher.
is_fresh_row = dataset.index >= 5220000
dataset = dataset[is_fresh_row]

In [5]:
# (rows, columns) after the index filter; the column count is unchanged.
dataset.shape

(41668, 9)

In [6]:
# List the column labels to locate the review text column.
dataset.columns

Index(['review_id', 'user_id', 'business_id', 'stars', 'date', 'text',
       'useful', 'funny', 'cool'],
      dtype='object')

In [7]:
# Keep only the review text, selected by name instead of by position.
# A positional `iloc[:, [5]]` depends on the CSV column order and raises
# IndexError if this cell is re-run on the already-reduced one-column frame.
# `.copy()` detaches the result from the filtered frame so that columns can
# be added to it later without a SettingWithCopyWarning.
dataset = dataset.loc[:, ['text']].copy()

In [8]:
# Preview the frame after column selection (only `text` should remain).
dataset.head()

Unnamed: 0,text
5220000,This place has a really great atmosphere and i...
5220001,Had a super fun experience. It was exhilaratin...
5220002,It's great. I dont think you can really mess u...
5220003,"Considering just the food, I have to say that ..."
5220004,Its a first come first serve restaurant where ...


### Data cleaning

In [9]:
# Text-cleaning toolkit: regular expressions, the NLTK stopword corpus and
# the Porter stemmer.
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Fetch the stopword corpus; NLTK reports when the package is already
# up to date locally.
nltk.download('stopwords')

# English stopwords with "not" taken out of the list, so that "not" is kept
# in the cleaned reviews (presumably to preserve negation for sentiment).
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')

# Stemmer instance shared by the cleaning loop.
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to C:\Users\Digvijay
[nltk_data]     Mohite\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Build the cleaned corpus: one space-joined string of stemmed, lowercased,
# stopword-free tokens per review, in the same order as the rows of `dataset`.

# Build the stopword lookup set once; it does not change inside the loop, so
# rebuilding it for every word was wasted work.
stopword_set = set(all_stopwords)

corpus = []

# Iterate over the column values directly instead of a hard-coded
# range(5220000, 5261668) of index labels, so this cell follows whatever rows
# `dataset` currently holds and cannot drift from the filter applied earlier.
for text in dataset['text']:
    # Replace every non-letter character with a space, then lowercase.
    letters_only = re.sub('[^a-zA-Z]', ' ', text).lower()
    # Drop stopwords and reduce the remaining words to their stems.
    stemmed_tokens = [
        ps.stem(word)
        for word in letters_only.split()
        if word not in stopword_set
    ]
    corpus.append(' '.join(stemmed_tokens))

### Data transformation

In [11]:
# Loading BoW dictionary
from sklearn.feature_extraction.text import CountVectorizer
import pickle
cvFile='c1_BoW_Sentiment_Model.pkl'
# cv = CountVectorizer(decode_error="replace", vocabulary=pickle.load(open('./drive/MyDrive/Colab Notebooks/2 Sentiment Analysis (Basic)/3.1 BoW_Sentiment Model.pkl', "rb")))

# NOTE: unpickling can execute arbitrary code; only load a file you trust.
# The `with` block closes the file handle as soon as the object is loaded
# (the bare `open(...)` call left it open until garbage collection).
with open(cvFile, "rb") as cv_handle:
    cv = pickle.load(cv_handle)

In [12]:
# Encode the cleaned reviews with the loaded vectorizer, then convert the
# result to a dense array for the classifier.
bow_matrix = cv.transform(corpus)
X_fresh = bow_matrix.toarray()
X_fresh.shape

(41668, 1420)

### Predictions (via sentiment classifier)

In [13]:
# Restore the persisted sentiment classifier from disk.
# NOTE(review): joblib files are pickle-based; only load a file you trust.
import joblib

classifier = joblib.load(filename='c2_Classifier_Sentiment_Model')

In [14]:
# Predict one label per row of the feature matrix and print the label array.
y_pred = classifier.predict(X_fresh)
print(y_pred)

[4 5 5 ... 4 4 2]


In [15]:
# Attach the predictions as a new column. `assign` returns a new frame
# instead of writing into `dataset` in place; in-place assignment on a frame
# derived from a slice can trigger pandas' SettingWithCopyWarning.
# `y_pred` is aligned with the rows by position, so it must have one entry
# per row of `dataset`.
dataset = dataset.assign(predicted_label=y_pred.tolist())
dataset.head()

Unnamed: 0,text,predicted_label
5220000,This place has a really great atmosphere and i...,4
5220001,Had a super fun experience. It was exhilaratin...,5
5220002,It's great. I dont think you can really mess u...,5
5220003,"Considering just the food, I have to say that ...",1
5220004,Its a first come first serve restaurant where ...,5


In [16]:
# Optional export of the labelled reviews as a tab-separated file without the
# index column; currently disabled. Uncomment the next line to write the file.
# dataset.to_csv("c3_Predicted_Sentiments_Fresh_Dump.tsv", sep='\t', encoding='UTF-8', index=False)

In [17]:
# Display the labelled frame; pandas truncates the rendering to the first and
# last rows. The previous `head(5220000)` passed the starting index label as
# a row count, which only showed everything because it exceeds the number of
# rows in the frame.
dataset

Unnamed: 0,text,predicted_label
5220000,This place has a really great atmosphere and i...,4
5220001,Had a super fun experience. It was exhilaratin...,5
5220002,It's great. I dont think you can really mess u...,5
5220003,"Considering just the food, I have to say that ...",1
5220004,Its a first come first serve restaurant where ...,5
...,...,...
5261663,"Bought groupon $39 for 4 months from groupon, ...",1
5261664,"Spring rolls was pretty good, cod was a bit ra...",5
5261665,"Had a 8 dish set meal, was enough for 10 peopl...",4
5261666,"A small, cozy family run Authentic korean rest...",4
