In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
reviews_df= pd.read_csv('amazon_alexa.tsv', sep = '\t')

In [None]:
print(reviews_df)

In [None]:
reviews_df.info()

In [None]:
reviews_df.describe()

In [None]:
reviews_df['verified_reviews']

In [None]:
sns.heatmap(reviews_df.isnull(), yticklabels = False, cbar = False, cmap = 'Blues')

In [None]:
reviews_df.hist(bins = 30 , figsize = (13,5), color = 'r')

In [None]:
# Character count of every review.
# `.str.len()` is the vectorised equivalent of `.apply(len)`; missing reviews are
# treated as empty text first, so a NaN entry yields 0 instead of raising
# "TypeError: object of type 'float' has no len()".
reviews_df['length'] = reviews_df['verified_reviews'].fillna('').str.len()

In [None]:
reviews_df.head()

In [None]:
reviews_df['length'].plot(bins = 100, kind = 'hist')

In [None]:
reviews_df.length.describe()

In [None]:
#longest
reviews_df[reviews_df['length'] == 2851]['verified_reviews'].iloc[0]

In [None]:
#shortest
reviews_df[reviews_df['length'] == 1]['verified_reviews'].iloc[0]

In [None]:
#average/mean
reviews_df[reviews_df['length'] == 132]['verified_reviews'].iloc[0]

In [None]:
#postive reviews
postive = reviews_df[reviews_df['feedback']==1]

In [None]:
#negative reviews
negative = reviews_df[reviews_df['feedback']==0]

In [None]:
# Count of reviews per feedback class.
# The column is passed by name through `x=`/`data=`: recent seaborn releases bind
# the first positional argument to `data`, so passing the Series positionally no
# longer plots it as the x variable. This also matches the rating plot below.
sns.countplot(x='feedback', data=reviews_df)

In [None]:
sns.countplot(x = 'rating' , data = reviews_df)

In [None]:
plt.figure(figsize=(40,15))
sns.barplot(x = 'variation' , y = 'rating' , data = reviews_df, palette = 'deep')

In [None]:
sentences = reviews_df['verified_reviews'].tolist()
sentences

In [None]:
sentences_as_one_string = " ".join(sentences)

In [None]:
sentences_as_one_string

In [None]:
from wordcloud import WordCloud

# Word cloud built from the combined text of all reviews.
plt.figure(figsize=(20, 20))
all_reviews_cloud = WordCloud().generate(sentences_as_one_string)
plt.imshow(all_reviews_cloud)

In [None]:
negative_list = negative['verified_reviews'].tolist()
negative_list

In [None]:
negative_sentences_as_one_string = " ".join(negative_list)
negative_sentences_as_one_string 

In [None]:
plt.figure (figsize = (20,20))
plt.imshow(WordCloud().generate(negative_sentences_as_one_string))

In [None]:
reviews_df.head()

In [None]:
reviews_df = reviews_df.drop (['date', 'rating','length'],axis = 1)

In [None]:
variation_dummies = pd.get_dummies(reviews_df['variation'], drop_first = True)

In [None]:
print (variation_dummies)

In [None]:
reviews_df.drop(['variation'], axis =1 , inplace = True)

In [None]:
reviews_df = pd.concat([reviews_df,variation_dummies], axis =1)

In [None]:
print (reviews_df)

In [None]:
import string
string.punctuation

In [None]:
# Remove punctuation: delete every character that is neither a word character
# nor whitespace.
# - `regex=True` is explicit because pandas >= 2.0 defaults `str.replace` to
#   literal matching, under which this pattern would never match and no
#   punctuation would be removed.
# - The pattern is a raw string so `\w` / `\s` are not treated as (invalid)
#   Python string escapes.
# - Missing reviews become '' so the later `.split()` calls do not fail on NaN.
reviews_df['punctuaution_removed'] = (
    reviews_df['verified_reviews']
    .fillna('')
    .str.replace(r'[^\w\s]', '', regex=True)
)

In [None]:
reviews_df.head()

In [None]:
reviews_df.drop(['verified_reviews'], axis =1 , inplace = True)

In [None]:
reviews_df.head()

In [None]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words

In [None]:
# Remove English stop words from each review.
# - Each word is lower-cased for the comparison only: the stop-word list is
#   lowercase, so a case-sensitive test let capitalised stop words ("The", "It",
#   "I") through. The words that are kept retain their original casing.
# - The list is converted to a set once so each membership test is a hash lookup
#   instead of a linear scan.
stop_word_set = set(stop_words)
reviews_df['punctuaution_removed_cleaned'] = reviews_df['punctuaution_removed'].apply(
    lambda text: " ".join(
        word for word in text.split() if word.lower() not in stop_word_set
    )
)

In [None]:
reviews_df.head()

In [None]:
#value counts of words top 30
pd.Series(" ".join(reviews_df['punctuaution_removed_cleaned']).split()).value_counts()[:60]

In [None]:
# Additional tokens to filter out on top of the NLTK stop-word list.
other_stop_words = [
    'I', 'Im', 'We', 'It', 'test', 'this',
    'So', 'even', 'far', 'Its', 'this', 'also',
]
len(other_stop_words)

In [None]:
reviews_df['cleanview']=reviews_df['punctuaution_removed_cleaned'].apply(lambda x: " ".join(word for word in x.split() if word not in other_stop_words))

In [None]:
reviews_df.head()

In [None]:
from textblob import Word

In [None]:
reviews_df['lemmatized']=reviews_df['cleanview'].apply(lambda x: " ".join( Word(word).lemmatize() for word in x.split()))

In [None]:
reviews_df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
vectorizer = CountVectorizer()

In [None]:
reviews_count_vectorizer = vectorizer.fit_transform(reviews_df['lemmatized'])

In [None]:
# print (vectorizer.get_feature_names())

In [None]:
print (reviews_count_vectorizer.toarray())

In [None]:
reviews_count_vectorizer.shape

In [None]:
reviews_df.drop(['punctuaution_removed','punctuaution_removed_cleaned','cleanview','lemmatized'], axis = 1, inplace = True)

In [None]:
# Dense document-term counts as a DataFrame.
# The default integer column labels (0, 1, 2, ...) are converted to strings:
# after the concat with the string-named columns of `reviews_df`, a mix of int
# and str labels makes recent scikit-learn versions raise TypeError in `fit`
# (feature names must all be strings).
reviews = pd.DataFrame(reviews_count_vectorizer.toarray()).rename(columns=str)

In [None]:
reviews_df = pd.concat([reviews_df,reviews], axis = 1)

In [None]:
reviews_df.head()

In [None]:
X = reviews_df.drop(['feedback'], axis = 1)

In [None]:
y = reviews_df['feedback']

In [None]:
X.shape

In [None]:
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing.
# - `random_state` fixes the shuffle so the split, and every metric computed
#   from it, is reproducible across kernel restarts.
# - `stratify=y` keeps the proportion of each feedback class the same in the
#   train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
NB_classifier = MultinomialNB()
model = NB_classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
y_predict_train = model.predict(X_train)

In [None]:
y_predict_train

In [None]:
cm = confusion_matrix(y_train, y_predict_train)
sns.heatmap(cm, annot = True)

In [None]:
y_predict_test = model.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)
sns.heatmap(cm, annot = True)


In [None]:
print (classification_report(y_test, y_predict_test))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
# Logistic regression on the same features. `max_iter` is raised above the
# default of 100 to give the solver enough iterations to converge on this wide
# count matrix, rather than stopping early with a ConvergenceWarning.
# Note: this rebinds `model`, replacing the Naive Bayes model fitted earlier.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
print ('Acuracy = {}%'.format(100*accuracy_score(y_pred,y_test)))

In [None]:
cm = confusion_matrix(y_pred,y_test)
sns.heatmap(cm,annot = True)

In [None]:
print (classification_report (y_test, y_pred))