In [1]:
import pandas as pd
from time import time
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
import re

# Task - 1

### Read Data

In [2]:
df = pd.read_csv('DS3_C3_S1_Yelp_Data_Practice.csv')

In [3]:
df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
0,9yKzy9PApeiPPOUJEtnvkg,2011-01-26,fWKvX83p0-ka4JS3dc6E5A,5,My wife took me here on my birthday for breakf...,review,rLtl8ZkDX5vH5nAx9C3q5Q,2,5,0
1,ZRJwVLyzEJq1VAihDhYiow,2011-07-27,IjZ33sJrzXqU-0X6U8NwyA,5,I have no idea why some people give bad review...,review,0a2KyEL0d3Yb1V6aivbIuQ,0,0,0
2,6oRAC4uyJCsJl1X0WZpVSA,2012-06-14,IESLBzqUCLdSzSqm0eCSxQ,4,love the gyro plate. Rice is so good and I als...,review,0hT2KtfLiobPvh6cDC8JQg,0,1,0
3,_1QQZuf4zZOyFCvXc0o6Vg,2010-05-27,G-WvGaISbqqaMHlNnByodA,5,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,uZetl9T0NcROGOyFfughhg,1,2,0
4,6ozycU1RpktNG2-1BroVtw,2012-01-05,1uJFq2r5QfJG_6ExMRCaGw,5,General Manager Scott Petello is a good egg!!!...,review,vYmM4KTsC8ZfQBg-j5MWkw,0,0,0


In [4]:
df.tail()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,cool,useful,funny
9995,VY_tvNUCCXGXQeSvJl757Q,2012-07-28,Ubyfp2RSDYW0g7Mbr8N3iA,3,First visit...Had lunch here today - used my G...,review,_eqQoPtQ3e3UxLE4faT6ow,1,2,0
9996,EKzMHI1tip8rC1-ZAy64yg,2012-01-18,2XyIOQKbVFb6uXQdJ0RzlQ,4,Should be called house of deliciousness!\n\nI ...,review,ROru4uk5SaYc3rg8IU7SQw,0,0,0
9997,53YGfwmbW73JhFiemNeyzQ,2010-11-16,jyznYkIbpqVmlsZxSDSypA,4,I recently visited Olive and Ivy for business ...,review,gGbN1aKQHMgfQZkqlsuwzg,0,0,0
9998,9SKdOoDHcFoxK5ZtsgHJoA,2012-12-02,5UKq9WQE1qQbJ0DJbc-B6Q,2,My nephew just moved to Scottsdale recently so...,review,0lyVoNazXa20WzUyZPLaQQ,0,0,0
9999,pF7uRzygyZsltbmVpjIyvw,2010-10-16,vWSmOhg2ID1MNZHaWapGbA,5,4-5 locations.. all 4.5 star average.. I think...,review,KSBFytcdjPKZgXKQnYQdkA,0,0,0


### Create def function for remove stop words and punctuation

In [5]:
import string
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.corpus import stopwords

def text_process(text):
    nopunc = [char for char in text if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [6]:
"""X = df['text'].apply(text_process)
X.head()"""

"X = df['text'].apply(text_process)\nX.head()"

In [7]:
new = ' '.join(df['text'])

In [8]:
x = text_process(new[:100000])
print(x)



### Create def function for convert list to sring

In [9]:
def listTostring(s):
    str1=" "
    return (str1.join(s))

In [10]:
"""X = X.apply(listTostring)
X.head()"""

'X = X.apply(listTostring)\nX.head()'

In [11]:
final = listTostring(x)
final



### Remove  special characters

In [12]:
pattern = r'[^a-zA-Z0-9\s]'

clean_text = re.sub(pattern, '', final)

print(clean_text)



### Remove extra white spaces 

In [13]:
text_new = re.sub(r'\s+',' ', clean_text)                    
                              
print('\nAfter removing extra White Spaces:\n\n',text_new)


After removing extra White Spaces:



### Split data into training and test sets

In [14]:
X_train, X_test, y_train, y_test = train_test_split(df['text'],df['stars'],test_size = 0.2, random_state = 42)

In [15]:
print(type(y_train))
print(type(y_test))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


In [16]:
print(len(y_train))
print(len(y_test))

8000
2000


### Vectorize movie reviews using TF-IDF

In [17]:
myvector = TfidfVectorizer()
myobject = myvector.fit_transform(X_train)

### Calculate the time taken to build Naive Bayes Classification Model

In [18]:
t = time()

clf = MultinomialNB()
clf.fit(myobject, y_train)

training_time = time() - t
print("train time: %0.3fs" % training_time)

train time: 0.023s


### Calculate the time taken for prediction

In [19]:
t = time()
y_pred = clf.predict(myobject)
test_time = time() - t
print("test time:  %0.3fs" % test_time)

test time:  0.022s


### Calculate accuracy, recall, f1-score, precision

In [20]:
print('Accuracy:', accuracy_score(y_test, y_pred[:2000]))
print('Recall:', recall_score(y_test, y_pred[:2000], average='weighted'))
print('F1-score:', f1_score(y_test, y_pred[:2000], average='weighted'))
print('Precision:', precision_score(y_test, y_pred[:2000],average='weighted'))

Accuracy: 0.374
Recall: 0.374
F1-score: 0.28817878097058225
Precision: 0.2614207704789867


  _warn_prf(average, modifier, msg_start, len(result))


# Task - 2 

### Split data into training and test sets

In [21]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(df['text'],df['stars'],test_size = 0.2, random_state = 42)

### Vectorize movie reviews using TF-IDF

In [22]:
myvector1 = TfidfVectorizer(sublinear_tf=True, max_df=0.5, min_df=1,stop_words=stopwords.words('english'),use_idf=True, smooth_idf=True)
myobject1 = myvector1.fit_transform(X_train1)

### Calculate the time taken to build Naive Bayes Classification Model

In [23]:
t = time()

clf1 = MultinomialNB()
clf1.fit(myobject1, y_train1)

training_time1 = time() - t
print("train time: %0.3fs" % training_time1)

train time: 0.018s


### Calculate the time taken for prediction

In [24]:
t = time()
y_pred1 = clf1.predict(myobject1)
test_time1 = time() - t
print("test time:  %0.3fs" % test_time1)

test time:  0.010s


### Confusion matrix

In [25]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test1, y_pred1[:2000])
cm

array([[  0,   0,   0, 104,  38],
       [  0,   0,   0, 124,  42],
       [  0,   0,   0, 218,  71],
       [  1,   0,   0, 547, 191],
       [  0,   0,   0, 466, 198]], dtype=int64)

### Calculate accuracy, recall, f1-score, precision

In [26]:
print('Accuracy:', accuracy_score(y_test1, y_pred1[:2000]))
print('Recall:', recall_score(y_test1, y_pred1[:2000], average='weighted'))
print('F1-score:', f1_score(y_test1, y_pred1[:2000], average='weighted'))
print('Precision:', precision_score(y_test1, y_pred1[:2000],average='weighted'))

Accuracy: 0.3725
Recall: 0.3725
F1-score: 0.29310547643735324
Precision: 0.26026417637651356


  _warn_prf(average, modifier, msg_start, len(result))


### Actual and predicted value in dataframe 

In [27]:
df = pd.DataFrame({'Actual':y_test1,'Predicted':y_pred1[:2000]})
df.head(10)

Unnamed: 0,Actual,Predicted
6252,4,4
4684,5,4
1731,3,4
4742,1,5
4521,5,4
6340,4,5
576,4,4
5202,4,4
6363,5,4
439,1,5
