In [None]:
import pandas as pd
import random
import os
import json
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.utils import resample
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
%matplotlib inline

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, LSTM, Bidirectional

In [None]:
# patio = pd.read_csv('datasets/SciLearn/AmazonSales/patio_csv.csv')
# patio.head()

In [None]:
# Show every entry in the raw Amazon sales folder so the inputs can be eyeballed
files = list(os.listdir('datasets/SciLearn/AmazonSales'))
for entry_name in files:
    print(entry_name)

In [None]:
# Merge every per-category CSV export into one frame and cache the result on disk.
SALES_DIR = 'datasets/SciLearn/AmazonSales'

# os.listdir returns entries in arbitrary order and may contain entries that are
# not CSV files, so filter and sort to get a reproducible, readable input list.
files = sorted(file for file in os.listdir(SALES_DIR) if file.lower().endswith('.csv'))
if not files:
    raise FileNotFoundError('No CSV files found in ' + SALES_DIR)

# Read everything first and concatenate once: calling pd.concat inside the loop
# re-copies the accumulated frame on every iteration.
frames = [pd.read_csv(os.path.join(SALES_DIR, file)) for file in files]

# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each file's own row numbers.
combined_data = pd.concat(frames, ignore_index = True)

combined_data.to_csv('datasets/SciLearn/combined_Sales.csv', index = False)

In [None]:
# Peek at the last five rows to confirm the final file was appended
combined_data.tail(n = 5)

In [None]:
# Keep only the review text, the star rating and the product category.
# Rows without review text or category are dropped here: TfidfVectorizer cannot
# vectorize a missing (NaN) document and a missing category cannot serve as a label.
required_columns = (
    combined_data[['reviewText', 'overall', 'Category']]
    .dropna(subset = ['reviewText', 'Category'])
)
required_columns.head()

In [None]:
# Shuffle the rows so the per-file ordering does not leak into later steps.
# A fixed seed keeps the shuffle (and everything downstream) reproducible across runs.
required_columns = required_columns.sample(frac = 1, random_state = 42).reset_index(drop = True)

In [None]:
def sentiment(column):
    """Translate a review's star rating into a coarse sentiment label.

    Parameters
    ----------
    column : mapping-like
        Any object supporting ``column['overall']`` (e.g. a DataFrame row)
        whose ``'overall'`` entry is the numeric rating.

    Returns
    -------
    str
        ``'POSITIVE'`` for ratings of 4 and above, ``'NEGATIVE'`` for ratings
        of 2 and below, ``'NEUTRAL'`` for everything else (including NaN,
        which fails both comparisons).
    """
    rating = column['overall']
    if rating >= 4:
        return 'POSITIVE'
    if rating <= 2:
        return 'NEGATIVE'
    return 'NEUTRAL'

In [None]:
# Label every review row-by-row with the sentiment() helper, then preview the result
sentiment_labels = required_columns.apply(sentiment, axis = 1)
required_columns['sentiment'] = sentiment_labels
required_columns.head(n = 10)

In [None]:
# cloths = required_columns[required_columns['Category'] == 'Clothing']

In [None]:
# len(cloths)

## Prepare train/test data and pre-process text

In [None]:
# Features are the raw review texts, targets are the product categories
reviews = required_columns['reviewText']
categories = required_columns['Category']

# Hold out 20% of the reviews for testing; the fixed seed keeps the split stable
train_review, test_review, train_category, test_category = train_test_split(
    reviews,
    categories,
    test_size = 0.2,
    random_state = 42,
)

## Bag of Words Vectorization (TF-IDF weighted)

In [None]:
# Learn the vocabulary and IDF weights from the training reviews only,
# then reuse the fitted vectorizer to project the held-out reviews.
vectorizer = TfidfVectorizer()

train_review_vectors = vectorizer.fit_transform(train_review)

test_review_vectors = vectorizer.transform(test_review)

# train_review keeps the shuffled index labels produced by train_test_split, so
# train_review[0] looks up *label* 0, which may sit in the test split (KeyError)
# and is not the row behind train_review_vectors[0]. Use positional access.
print(train_review.iloc[0])
print(train_review_vectors[0].toarray())

## Classification

In [None]:
# Linear-kernel support vector machine trained on the TF-IDF review vectors
clf_svm = svm.SVC(kernel = 'linear').fit(train_review_vectors, train_category)

# Sanity check: predicted category for the first held-out review
first_test_vector = test_review_vectors[0]
print(clf_svm.predict(first_test_vector))

In [None]:
# Decision tree with a fixed seed so tie-breaking between splits is repeatable
clf_dec = DecisionTreeClassifier(random_state = 0)
clf_dec = clf_dec.fit(train_review_vectors, train_category)

# Sanity check: predicted category for the first held-out review
dec_first_prediction = clf_dec.predict(test_review_vectors[0])
print(dec_first_prediction)

In [None]:
# Gaussian naive Bayes is fed dense arrays here, so the sparse TF-IDF
# matrices are densified before fitting and predicting.
clf_naive = GaussianNB().fit(train_review_vectors.toarray(), train_category)

# Sanity check: predicted category for the first held-out review
first_test_dense = test_review_vectors[0].toarray()
print(clf_naive.predict(first_test_dense))

In [None]:
# Shallow random forest (depth-2 trees) with a fixed seed for repeatable results
clf_rand = RandomForestClassifier(max_depth = 2, random_state = 0)
clf_rand = clf_rand.fit(train_review_vectors, train_category)

# Sanity check: predicted category for the first held-out review
rand_first_prediction = clf_rand.predict(test_review_vectors[0])
print(rand_first_prediction)

In [None]:
# Small multi-layer perceptron: one hidden layer of 15 units, L-BFGS solver
clf_neural = MLPClassifier(
    solver = 'lbfgs',
    alpha = 1e-5,
    hidden_layer_sizes = (15,),
    random_state = 1,
).fit(train_review_vectors, train_category)

# Sanity check: predicted category for the first held-out review
neural_first_prediction = clf_neural.predict(test_review_vectors[0])
print(neural_first_prediction)

In [None]:
# k-nearest-neighbours classifier with scikit-learn's default settings
clf_neigh = KNeighborsClassifier().fit(train_review_vectors, train_category)

# Sanity check: predicted category for the first held-out review
neigh_first_prediction = clf_neigh.predict(test_review_vectors[0])
print(neigh_first_prediction)

In [None]:
# Hand-written reviews used to spot-check each trained classifier
test_sample = [
    'A very interesting reading I have never had before',
    'The tailor was awesome, I felt like the product was specifically meant for me',
    'The holder mounted perfectly on my weber grill.  Now I dont have to get my table messy with sauce from the brush.  It is also great for  storing my cooking tools and grill brush when Im not grilling',
    'The device mounted perfectly on the wall, the display was good, I could even watch games even from my kitchen',
    'This stuff is great on just about everything except popcorn (it doesnt stick at all). It has a slightly cheesy-buttery taste and I particularly like using it mixed with mashed potatoes, steamed vegetables and plain white rice. Yum!',
]

# Project the samples with the vectorizer fitted on the training reviews
test_sample1 = vectorizer.transform(test_sample)

In [None]:
# SVM predictions for the hand-written samples.
# This cell previously queried clf_neural, which a later cell already does,
# leaving clf_svm as the only trained model never checked on the samples.
test_sample1 = vectorizer.transform(test_sample)
print(clf_svm.predict(test_sample1))

In [None]:
# Decision-tree predictions for the hand-written samples
test_sample1 = vectorizer.transform(test_sample)
dec_sample_predictions = clf_dec.predict(test_sample1)
print(dec_sample_predictions)

In [None]:
# Gaussian naive Bayes predictions for the hand-written samples.
# test_sample is the list defined in the earlier sample cell, so it is not repeated here.
test_sample1 = vectorizer.transform(test_sample)

# The estimator object itself is not callable; predictions come from .predict(),
# which is given a dense array to match how clf_naive was fitted.
print(clf_naive.predict(test_sample1.toarray()))

In [None]:
# Random-forest predictions for the hand-written samples
test_sample1 = vectorizer.transform(test_sample)
rand_sample_predictions = clf_rand.predict(test_sample1)
print(rand_sample_predictions)

In [None]:
# Neural-network (MLP) predictions for the hand-written samples
test_sample1 = vectorizer.transform(test_sample)
neural_sample_predictions = clf_neural.predict(test_sample1)
print(neural_sample_predictions)

In [None]:
# k-nearest-neighbours predictions for the hand-written samples
test_sample1 = vectorizer.transform(test_sample)
neigh_sample_predictions = clf_neigh.predict(test_sample1)
print(neigh_sample_predictions)

### Saving the Models

In [None]:
# Make sure the output folder exists before any model is written; open(..., 'wb')
# does not create missing parent directories.
os.makedirs('./datasets/SciLearn/models/AmazonReviewCategory', exist_ok = True)

with open('./datasets/SciLearn/models/AmazonReviewCategory/clf_svm.pkl', 'wb') as f:
    pickle.dump(clf_svm, f)

# The classifiers only accept vectors produced by this exact fitted vectorizer,
# so persist it next to them; without it the saved models cannot be reused.
with open('./datasets/SciLearn/models/AmazonReviewCategory/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [None]:
# Serialize the fitted decision tree to disk
clf_dec_path = './datasets/SciLearn/models/AmazonReviewCategory/clf_dec.pkl'
with open(clf_dec_path, mode = 'wb') as model_file:
    pickle.dump(clf_dec, model_file)

In [None]:
# Serialize the fitted Gaussian naive Bayes model to disk
clf_naive_path = './datasets/SciLearn/models/AmazonReviewCategory/clf_naive.pkl'
with open(clf_naive_path, mode = 'wb') as model_file:
    pickle.dump(clf_naive, model_file)

In [None]:
# Serialize the fitted k-nearest-neighbours model to disk
clf_neigh_path = './datasets/SciLearn/models/AmazonReviewCategory/clf_neigh.pkl'
with open(clf_neigh_path, mode = 'wb') as model_file:
    pickle.dump(clf_neigh, model_file)

In [None]:
# Serialize the fitted MLP classifier to disk
clf_neural_path = './datasets/SciLearn/models/AmazonReviewCategory/clf_neural.pkl'
with open(clf_neural_path, mode = 'wb') as model_file:
    pickle.dump(clf_neural, model_file)

In [None]:
# Serialize the fitted random forest to disk
clf_rand_path = './datasets/SciLearn/models/AmazonReviewCategory/clf_rand.pkl'
with open(clf_rand_path, mode = 'wb') as model_file:
    pickle.dump(clf_rand, model_file)