# <b>Model creation</b>

## <u> Loading data and cleaning</u>

In [2]:
# Importing main libraries
import numpy as np
import pandas as pd
import pickle

In [3]:
# Read the comma-separated reviews file into `df`.
reviews_csv = "/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/amazon_reviews.csv"
df = pd.read_csv(reviews_csv, sep=',')

In [4]:
# Start by removing columns that are not used as features.
df = df.drop(columns=['userName', 'image', 'reviewTime'])

# Map the boolean 'verified' flag to 1/0, then keep only the verified rows.
df = df.replace({'verified': {True: 1, False: 0}})
is_verified = df['verified'] > 0
df = df.loc[is_verified]

In [5]:
# Group the columns that get removed next. For lack of time the more complex
# features are left out ("complex" meaning longer training times and longer
# hyper-parameter tuning).
nested_cols = ['description', 'feature']
unnecessary_features = ['verified', 'vote', 'itemName', 'brand']

# `df` loses the nested columns; `cln_df` is a further-reduced copy of it.
df = df.drop(columns=nested_cols)
cln_df = df.drop(columns=unnecessary_features)

In [6]:
# Parse the textual price (e.g. "$12.99") into a numeric column.
#
# regex=False makes every pattern a literal string: read as regular expressions,
# '$' is the end-of-string anchor and '.' matches any character.
price_text = cln_df['price'].str.replace('$', '', regex=False)

# Remove thousands separators but keep the decimal point. Stripping '.' (as this
# cell used to do) turned "12.99" into 1299 while "12" stayed 12, so prices with
# and without cents ended up on different scales.
price_text = price_text.str.replace(',', '', regex=False)

# Rows without any price are marked with the sentinel -1 before conversion, so
# only values that fail to parse come out of to_numeric as NaN.
price_text = price_text.fillna('-1')
cln_df['price'] = pd.to_numeric(price_text, errors='coerce')

  cln_df['price'] = cln_df['price'].str.replace('$','')
  cln_df['price'] = cln_df['price'].str.replace('.','')


In [7]:
# Take care of missing data: fill NaN values by linear interpolation,
# filling in the forward direction only.
cln_df = cln_df.interpolate(method='linear', limit_direction='forward')

## <u> Train/Test split & features encoding</u>

In [8]:
# First, take a random sample of the data: there are too many observations and
# learning on all of them would take far too long.
sample_size = 20000

# Sampling is without replacement and seeded, so the same rows are drawn each run.
cln_df = cln_df.sample(n=sample_size, random_state=71, replace=False)

In [9]:
# Target: the 'rating' column with every value cast to int.
y = cln_df['rating'].map(int)
# Features: every remaining column.
X = cln_df.drop(columns=['rating'])

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sampled rows as the test set; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [11]:
# Show (rows, columns) of the train and test feature frames.
(X_train.shape, X_test.shape)

((16000, 4), (4000, 4))

## Note: From this point on, the test set is not touched until the final evaluation.

In [12]:
from sklearn.preprocessing import LabelEncoder

# Encode the label ('rating'): the encoder learns its classes from the training
# labels, which are then replaced by their encoded values.
le = LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Scale the numeric features (here only 'price') with a min-max scaler that is
# fitted on the training rows.
mm = MinMaxScaler()
train_price_column = X_train['price'].to_numpy().reshape(-1, 1)  # one column, one row per sample
X_train_numeric = mm.fit_transform(train_price_column)

In [14]:
# Encoding 'category' using MultiLabelBinarizer: every distinct label found in
# the training rows becomes one indicator column.
# NOTE(review): this assumes each 'category' entry is an iterable of labels
# (e.g. a list). If the entries are plain strings as read from the CSV, each
# character would be treated as a label - TODO confirm.
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

X_train_cat = mlb.fit_transform(X_train['category'])

In [15]:
# Imports for encoding the textual features. The nltk stop-word corpus is
# downloaded before `stopwords` is imported.
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Encoding textual features: clean every training row, then build TF-IDF vectors.

textual_cols = ['summary', 'reviewText']
X_train_text = X_train[textual_cols]

ps = PorterStemmer()
nltk_stopwords = set(stopwords.words('english'))  # stop-word list from nltk


def _clean_row_text(row):
    """Return one cleaned, stemmed string for a row of the text columns.

    Non-letters become spaces, the text is lower-cased and split on whitespace,
    stop words are dropped, the remaining tokens are stemmed and re-joined with
    single spaces.
    """
    # NOTE(review): str(row) is the pandas text rendering of the row, not just
    # its values - it presumably also carries the column labels and a
    # Name/dtype footer, and may shorten long strings. Verify; any change must
    # be mirrored in the test-set encoding cell.
    letters_only = re.sub('[^a-zA-Z]', ' ', str(row))
    tokens = letters_only.lower().split()
    stems = [ps.stem(token) for token in tokens if token not in nltk_stopwords]
    return ' '.join(stems)


# One cleaned string per training row, in row order.
train_preproc_text = pd.DataFrame(
    [_clean_row_text(row) for _, row in X_train_text.iterrows()],
    columns=['preprocessed text'],
)

# Fit the TF-IDF vocabulary on the training text and vectorise it (dense array).
ngram_vectorizer = TfidfVectorizer(norm='l2')
x_train_ngram = ngram_vectorizer.fit_transform(train_preproc_text['preprocessed text']).toarray()


In [17]:
# Unite all train features into one dataframe
final_train = np.hstack((X_train_numeric
                         ,X_train_cat
                         ,x_train_ngram))

In [18]:
# Shapes of the assembled training matrix and of the label vector.
(final_train.shape, y_train.shape)

((16000, 7657), (16000,))

In [19]:
# store training sets to avoid computing them again
# Disabled: the code below is wrapped in a string literal, so nothing is
# written to disk; the cell only evaluates to that string.
# NOTE(review): the opening delimiter has four quotes, so the string itself
# starts with a stray "'" character.
''''
with open("/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/x_train.pkl", 'wb') as file:
    pickle.dump(final_train, file)

with open("/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/y_train.pkl", 'wb') as file:
    pickle.dump(y_train, file)
'''

'\'\nwith open("/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/x_train.pkl", \'wb\') as file:\n    pickle.dump(final_train, file)\n\nwith open("/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/y_train.pkl", \'wb\') as file:\n    pickle.dump(y_train, file)\n'

In [20]:
# load train sets form binary files
# Disabled: the loading code is wrapped in a string literal, so `final_train`
# and `y_train` are NOT re-read from the pickle files here; the cell only
# evaluates to that string.
'''
with open('/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/x_train.pkl', 'rb') as file:
  final_train = pickle.load(file)

with open('/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/y_train.pkl', 'rb') as file:
  y_train = pickle.load(file)
'''

"\nwith open('/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/x_train.pkl', 'rb') as file:\n  final_train = pickle.load(file)\n\nwith open('/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/y_train.pkl', 'rb') as file:\n  y_train = pickle.load(file)\n"

## <u>Running dummy classifier just to test</u>

In [21]:
from sklearn.dummy import DummyClassifier

# Baseline that always predicts the most frequent class of the training labels;
# the cell shows its accuracy on the training set.
dc = DummyClassifier(strategy="most_frequent").fit(final_train, y_train)
dc.score(final_train, y_train)

0.69625

## Model 1 - Random Forest

In [22]:
# Hyper-parameter tuning for the random forest.

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Seeded so that repeated runs build the same forest, in line with the seeded
# sampling and train/test split earlier in the notebook.
rfc = RandomForestClassifier(random_state=42)

# Search space: two candidate values per hyper-parameter.
params = {
  'n_estimators': [50,100],
  'max_depth': [50,200],
  'min_samples_split': [50,200],
  'min_samples_leaf': [50,200],
  'max_features': ['sqrt','log2'],
  'max_leaf_nodes':[500,3000],
}
# Disabled: the search is wrapped in a string literal, so it does not run and
# `model_1_grid` is never created; the cell only evaluates to that string.
# NOTE(review): the grid above has 2**6 = 64 combinations, fewer than n_iter=100.
'''
model_1_grid = RandomizedSearchCV(rfc,param_distributions=params,cv=5,n_iter=100)
model_1_grid.fit(final_train,y_train)
'''

'\nmodel_1_grid = RandomizedSearchCV(rfc,param_distributions=params,cv=5,n_iter=100)\nmodel_1_grid.fit(final_train,y_train)\n'

In [23]:
# Disabled: the dump of `model_1_grid` is wrapped in a string literal, so no
# rf_model.pkl file is written by this cell.
'''
with open("/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/rf_model.pkl", 'wb') as file:
    pickle.dump(model_1_grid, file)
    '''

'\nwith open("/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/rf_model.pkl", \'wb\') as file:\n    pickle.dump(model_1_grid, file)\n    '

## Model 2 - SVM

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# `svm` is the base estimator instance (default settings).
svm = SVC()

# Exhaustive grid: 5 values of C x 5 values of gamma x 2 kernels.
params = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['poly', 'rbf'],
}

# 5-fold cross-validated search over the whole grid, using all available cores.
model_2_grid = GridSearchCV(svm, param_grid=params, cv=5, n_jobs=-1)
model_2_grid.fit(final_train, y_train)

In [None]:

# Persist the fitted SVM grid search as a pickle file.
svm_model_path = "/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/svm_model.pkl"
with open(svm_model_path, 'wb') as model_file:
    pickle.dump(model_2_grid, model_file)

## Encode Test set

In [None]:
# Encode the test set with the transformers that were fitted on the training
# data: only transform() is called here, nothing is re-fitted.
y_test = le.transform(y_test)
X_test_numeric = mm.transform(X_test['price'].array.reshape(-1,1))
X_test_cat = mlb.transform(X_test['category'])

X_test_text = X_test[textual_cols]

# Clean the text exactly as the training cell does (same steps, same order),
# because `ngram_vectorizer` learned its vocabulary from text cleaned that way.
# The cleaned strings are collected in a list and the DataFrame is built once,
# instead of growing it one row at a time.
test_reviews = []
for _, raw in X_test_text.iterrows():
    raw_review = str(raw)
    review = re.sub('[^a-zA-Z]', ' ', raw_review)  # replace non-letters with spaces
    review = review.lower().split()  # tokenise on whitespace
    review = [ps.stem(word) for word in review if word not in nltk_stopwords]  # stop-word removal + stemming
    test_reviews.append(' '.join(review))  # back to one string per review

test_preproc_text = pd.DataFrame(test_reviews, columns=['preprocessed text'])

x_test_ngram = ngram_vectorizer.transform(test_preproc_text['preprocessed text']).toarray()

# Stack the TEST blocks. This cell previously stacked X_train_numeric,
# X_train_cat and x_train_ngram, which made `final_test` a copy of the training
# matrix whose row count could not match `y_test`.
final_test = np.hstack((X_test_numeric
                         ,X_test_cat
                         ,x_test_ngram))

# Guard against the features and labels getting out of step again.
assert final_test.shape[0] == y_test.shape[0], "test features and labels differ in length"

final_test.shape,y_test.shape

## <u>Comparing Models and choosing the best one</u>

In [None]:
# load models
# Disabled: the loading code is wrapped in a string literal, so neither `rf`
# nor `knn` is defined by this cell.
# NOTE(review): the paths inside the string point at the x_train / y_train
# pickles rather than at model files, and the second target is named `knn`
# although the notebook's second model is an SVM - fix before enabling.
'''
with open('/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/x_train.pkl', 'rb') as file:
  rf = pickle.load(file)

with open('/content/drive/MyDrive/Colab Notebooks/AMZ_recommendation_model/y_train.pkl', 'rb') as file:
  knn = pickle.load(file)
'''

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

In [None]:
# Estimators to compare. `rfc` (RandomForestClassifier) and `svm` (SVC) are the
# base estimator instances created in the model sections above; the name `rf`
# used here before is never assigned anywhere in the notebook, so the cell
# could not run. Both estimators are fitted below with the settings they were
# constructed with.
# NOTE(review): to compare tuned models instead, put the best estimators of the
# finished searches in this list.
models = [rfc, svm]

# average='binary' is only valid for a two-class target and raises on a
# multiclass one. With more than two classes, fall back to the
# support-weighted average of the per-class scores.
n_classes = len(np.unique(y_train))
averaging = 'binary' if n_classes == 2 else 'weighted'

# One record per model; the results frame is built once after the loop instead
# of being re-concatenated on every iteration.
result_rows = []
for model in models:
  model.fit(final_train, y_train)
  y_train_pred = model.predict(final_train)
  y_test_pred = model.predict(final_test)

  result_rows.append({
      'model_name': model.__class__.__name__,
      'train_acc': accuracy_score(y_train, y_train_pred),
      'test_acc': accuracy_score(y_test, y_test_pred),
      'test_precis': precision_score(y_test, y_test_pred, average=averaging),
      'test_recall': recall_score(y_test, y_test_pred, average=averaging),
  })

res_df = pd.DataFrame(result_rows,
                      columns=['model_name', 'train_acc', 'test_acc',
                               'test_precis', 'test_recall'])

# Best test accuracy first.
res_df.sort_values('test_acc', ascending=False)

# The winner is:

## <u>Feature Importance (if time permits)</u>