# Problem 2: Amazon Review Data (2018)
*Course*: DS807 \
*Authors*: August E. Wennerwald, Kasper Lin Hannberg, Oliver Klejst, Søren Pico, Thomas Fischer

## Modules and data

In [28]:
#Import relevant packages
import tensorflow as tf
import pandas as pd
import requests
import gzip
import io
import numpy as np
from sklearn.model_selection import train_test_split

import json
import random

import matplotlib.pyplot as plt
import seaborn as sns


## Change your file path to your own 

In [29]:
#Import datasets
file_All_beauty = "/Users/oliverrasmussen/Desktop/AMAZON/All_Beauty_5.json"
file_Fashion = "/Users/oliverrasmussen/Desktop/AMAZON/AMAZON_FASHION_5.json"
file_luxury_beauty = "/Users/oliverrasmussen/Desktop/AMAZON/Luxury_Beauty_5.json"

# Load the gzipped JSON file into a pandas DataFrame
all_beauty = pd.read_json(file_All_beauty, lines=True)
fashion =  pd.read_json(file_Fashion, lines=True)
luxury_beauty = pd.read_json(file_luxury_beauty, lines=True)

#Check shape of datasets:
print(all_beauty.shape,
      fashion.shape,
      luxury_beauty.shape
    )

ValueError: Expected object or value

In [30]:
print(all_beauty.head(3))

   overall  verified   reviewTime      reviewerID        asin  \
0        5      True   09 1, 2016  A3CIUOJXQ5VDQ2  B0000530HU   
1        5      True  11 14, 2013  A3H7T87S984REU  B0000530HU   
2        1      True  08 18, 2013  A3J034YH7UG4KT  B0000530HU   

                                               style  reviewerName  \
0  {'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...      Shelly F   
1  {'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...  houserules18   
2  {'Size:': ' 7.0 oz', 'Flavor:': ' Classic Ice ...          Adam   

                                          reviewText            summary  \
0                   As advertised. Reasonably priced         Five Stars   
1  Like the oder and the feel when I put it on my...  Good for the face   
2  I bought this to smell nice after I shave.  Wh...       Smells awful   

   unixReviewTime vote image  
0      1472688000  NaN   NaN  
1      1384387200  NaN   NaN  
2      1376784000  NaN   NaN  


In [31]:
all_beauty.dtypes

overall            int64
verified            bool
reviewTime        object
reviewerID        object
asin              object
style             object
reviewerName      object
reviewText        object
summary           object
unixReviewTime     int64
vote              object
image             object
dtype: object

In [32]:
#Now checking for missing values in column "reviewText and update the datasets accordingly

def missing_values(dataset):
    missing = []
    for column in dataset.columns:
        missing.append(dataset[column].isna().sum())
    return missing

print("Missing values in all_beauty:", missing_values(all_beauty))
print("Missing values in fashion:", missing_values(fashion))
print("Missing values in luxury_beauty:", missing_values(luxury_beauty))

Missing values in all_beauty: [0, 0, 0, 0, 0, 645, 0, 5, 5, 0, 4717, 5171]
Missing values in fashion: [0, 0, 0, 0, 0, 69, 0, 16, 0, 0, 2879, 3070]


NameError: name 'luxury_beauty' is not defined

In [None]:
#Drop rows with Missing values in column "reviewText"
all_beauty.dropna(subset=['reviewText'],inplace=True)
fashion.dropna(subset=['reviewText'],inplace=True)
luxury_beauty.dropna(subset=['reviewText'],inplace=True)

#Check updated shape of datasets:
print(all_beauty.shape,
      fashion.shape,
      luxury_beauty.shape
    )

In [None]:
#Drop duplicates review based on certain columns
all_beauty.drop_duplicates(subset=['overall', 'reviewText', 'asin', 'reviewerID'], inplace=True)
fashion.drop_duplicates(subset=['overall', 'reviewText', 'asin', 'reviewerID'], inplace=True)
luxury_beauty.drop_duplicates(subset=['overall', 'reviewText', 'asin', 'reviewerID'], inplace=True)

#Check updated shape of datasets:
print(all_beauty.shape,
      fashion.shape,
      luxury_beauty.shape
    )

In [None]:
#Convert text in "reviewText" to lower case
all_beauty['reviewText'] = all_beauty.reviewText.str.lower()
fashion['reviewText'] = fashion.reviewText.str.lower()
luxury_beauty['reviewText'] = luxury_beauty.reviewText.str.lower()

#Check first 5 columns of all_beauty to validate
print(all_beauty['reviewText'].head(5))

#check shape to validate rows are keeped
print(all_beauty.shape,
      fashion.shape,
      luxury_beauty.shape
    )

In [None]:
#Distribution analysis of our datasets:
datasets = [all_beauty, fashion, luxury_beauty]
colors = ['blue', 'red', 'green']

for i, dataset in enumerate(datasets):
    plt.hist(dataset['overall'], color=colors[i], edgecolor='black')
    plt.title(f'Histogram')
    plt.xlabel('Stars')
    plt.ylabel('Number of reviews')
    plt.show()

In [None]:
#Length of the textual reviews
#average, min and max lengths of reviews

def review_lengths(df):
    print(f'Average review length = {round(df["reviewText"].str.len().mean(),0)} characters')
    print(f'Min review length = {df["reviewText"].str.len().min()} characters')
    print(f'Max review length = {df["reviewText"].str.len().max()} characters')

review_lengths(all_beauty)
review_lengths(fashion)
review_lengths(luxury_beauty)



In [None]:
#The Textvectorization layer - to see the most frequent words in sentences.
max_tokens = 5214 #
output_sequence_length = 500 # to capture the average reviews in luxury_beauty
pad_to_max_tokens = True

encoder = tf.keras.layers.TextVectorization(max_tokens=max_tokens,
                                            output_sequence_length=output_sequence_length,
                                            pad_to_max_tokens=pad_to_max_tokens)


#Vocabulary and frequent words:
def vocabulary_words(dataset):
    text_dataset = dataset['reviewText']
    encoder.adapt(text_dataset)
    vocabulary = np.array(encoder.get_vocabulary())
    print(f'Vocabulary size = {len(vocabulary)}')
    print(f'20 most common words: {vocabulary[:20]}')

vocabulary_words(all_beauty)
vocabulary_words(fashion)
vocabulary_words(luxury_beauty)

In [None]:
#Now to check for similarities with luxury_beauty in the top 20:
all_beauty_words = ['', '[UNK]', 'the', 'i', 'and', 'it', 'a', 'this', 'to', 'is', 'my', 'of', 'for', 'in', 'that', 'but', 'love', 'product', 'hair', 'with']
fashion_words = ['', '[UNK]', 'i', 'and', 'the', 'a', 'for', 'these', 'shoes', 'my', 'them', 'to', 'are', 'they', 'comfortable', 'in', 'of', 'very', 'fit', 'is']
luxury_beauty_words = ['', '[UNK]', 'the', 'i', 'a', 'it', 'and', 'to', 'is', 'this', 'my', 'of', 'for', 'that', 'on', 'in', 'but', 'with', 'skin', 'not']

common_words_all_beauty = set(all_beauty_words) & set(luxury_beauty_words)
common_words_fashion = set(fashion_words) & set(luxury_beauty_words) 

print(f'Number of shared words with all_beauty: {len(common_words_all_beauty)}')
print(f'Common words: {common_words_all_beauty}')

print(f'Number of shared words with fashion: {len(common_words_fashion)}')
print(f'Common words: {common_words_fashion}')

In [None]:
#Based on the top 20, we remove words we are not interested in by adding the words to a list.
words = [#at first common words in the English language
    #based on all_beauty:
    '', 'to', 'with', 'this', 'it', '[UNK]', 'my', 'for', 'and', 'i', 'that',
    'the', 'a', 'is', 'of', 'in', 'but', 'have', 'on', 'so', 'as', 'you', 'was', 'use',
    'its', 'just', 'are', 'has', 'one', 'be', 'will',
    'or', 'using', 'they', 'when', 'if',
    'get', 'would', 'your', 'at', 'me',
    'from', 'out', 'used', 'find',
    'time',
    #based on fashion:
    'these', 'them', 
    'had', 'than', 'ive', 'day',
    'im', 'am',
    'because', 'an',
    #based on luxury_beauty
    'can', 'does', 'do',

    #secondly, we remove commonly used words associated with shoes
    #based on fashion:
    'shoes', 'comfortable', 'fit', 'shoe', 'size', 'feet', 'wear', 'pair', 'support',
    'nike', 'weight', 'lightweight',
    'training', 'running'
    ]

#Now we run our frequency test again with the filtered vocabulary 
#We run the function several times and for each time filter out the words we are not interested in:
#each line in "words" represent an iteration with the words left out.
def vocabulary_words_filtered(dataset):
    text_dataset = dataset['reviewText']
    encoder.adapt(text_dataset)
    vocabulary = np.array(encoder.get_vocabulary())
    vocabulary_filtered = [word for word in vocabulary if word not in words]
    print(f'Vocabulary size = {len(vocabulary_filtered)}')
    print(f'20 most common words: {vocabulary_filtered[:20]}')

vocabulary_words_filtered(all_beauty)
vocabulary_words_filtered(fashion)
vocabulary_words_filtered(luxury_beauty)

In [None]:
#Now to check for similarities with luxury_beauty in the top 20:
all_beauty_words = ['love', 'product', 'hair', 'great', 'not', 'like', 'very', 'good', 'skin', 'shampoo', 'body', 'scent', 'all', 'smell', 'more', 'smells', 'really', 'well', 'soap', 'no']
fashion_words = ['very', 'love', 'great', 'not', 'light', 'like', 'good', 'perfect', 'all', 'too', 'really', 'more', 'nice', 'feel', 'expected', 'super', 'color', 'work', 'no', 'dont']
luxury_beauty_words = ['skin', 'not', 'product', 'like', 'very', 'color', 'hair', 'more', 'really', 'face', 'all', 'great', 'love', 'good', 'well', 'after', 'products', 'dont', 'no', 'up']

common_words_all_beauty = set(all_beauty_words) & set(luxury_beauty_words)
common_words_fashion = set(fashion_words) & set(luxury_beauty_words) 

print(f'Number of shared words with all_beauty: {len(common_words_all_beauty)}')
print(f'Common words: {common_words_all_beauty}')

print(f'Number of shared words with fashion: {len(common_words_fashion)}')
print(f'Common words: {common_words_fashion}')

In [None]:
#Analyzing column "summary"
#Some data preparation - lower case words (we dont check for non values this time as this would remove rows containing text in "reviewText")

all_beauty['summary'] = all_beauty.summary.str.lower()
fashion['summary'] = fashion.summary.str.lower()
luxury_beauty['summary'] = luxury_beauty.summary.str.lower()

print(all_beauty['summary'].head(5))

In [None]:
#Length of texts in "summary":
def summary_lengths(df):
    print(f'Average review length = {round(df["summary"].str.len().mean(),0)} characters')
    print(f'Min review length = {df["summary"].str.len().min()} characters')
    print(f'Max review length = {df["summary"].str.len().max()} characters')

summary_lengths(all_beauty)
summary_lengths(fashion)
summary_lengths(luxury_beauty)

In [None]:
#Prevalent words in summaries:
max_tokens = 5214 #
output_sequence_length = 209 # to capture the average review length in luxury_beauty
pad_to_max_tokens = True

encoder = tf.keras.layers.TextVectorization(max_tokens=max_tokens,
                                            output_sequence_length=output_sequence_length,
                                            pad_to_max_tokens=pad_to_max_tokens)

#We run the frequency_test again by using the already existing function with a little modification as we are interested in column "summary"
#We can use the filtered vocabulary from textReview again.
def vocabulary_words_filtered(dataset):
    text_dataset = dataset['summary']
    #replace nan-values with " "
    text_dataset = text_dataset.replace(np.nan, " ")
    encoder.adapt(text_dataset)
    vocabulary = np.array(encoder.get_vocabulary())
    vocabulary_filtered = [word for word in vocabulary if word not in words]
    print(f'Vocabulary size = {len(vocabulary_filtered)}')
    print(f'20 most common words: {vocabulary_filtered[:20]}')

vocabulary_words_filtered(all_beauty)
vocabulary_words_filtered(fashion)
vocabulary_words_filtered(luxury_beauty)

In [None]:
#To check for similarities with luxury_beauty
all_beauty_words = ['stars', 'five', 'great', 'love', 'product', 'hair', 'good', 'scent', 'shampoo', 'best', 'body', 'favorite', 'wash', 'really', 'nice', 'like', 'shower', 'soap', 'smells', 'smell']
fashion_words = ['stars', 'five', 'love', 'great', 'good', 'four', 'very', 'not', 'perfect', 'nice', 'what', 'like', 'wide', 'three', 'sneakers', 'light', 'color', 'best', 'feel', 'super']
luxury_beauty_words = ['stars', 'great', 'five', 'color', 'love', 'not', 'skin', 'good', 'product', 'nice', 'very', 'works', 'like', 'scent', 'best', 'well', 'hair', 'cream', 'too', 'no']

common_words_all_beauty = set(all_beauty_words) & set(luxury_beauty_words)
common_words_fashion = set(fashion_words) & set(luxury_beauty_words) 

print(f'Number of shared words with all_beauty: {len(common_words_all_beauty)}')
print(f'Common words: {common_words_all_beauty}')

print(f'Number of shared words with fashion: {len(common_words_fashion)}')
print(f'Common words: {common_words_fashion}')

### Shallow learner 1

## Random Forest

In [33]:
from sklearn.metrics import mean_squared_error
from sklearn import ensemble
import tqdm
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from scipy.sparse import vstack
from sklearn.model_selection import cross_val_score

In [34]:
# Define a function to prepare the data
def prepare_data(df):
    df['reviewText'] = df['reviewText'].fillna('').astype(str)
    X = df['reviewText']
    y = df['overall']  # Subtract 1 to make labels 0-based
    return X, y

# Loading datasets
fashion = pd.read_csv('Training.csv')
luxury_beauty = pd.read_csv('luxury_beauty_dataset.csv')

# Preparing training and validation data from 'fashion'
X_train_fashion, y_train_fashion = prepare_data(fashion)

# Preparing test data from 'luxury_beauty'
X_test_luxury_beauty, y_test_luxury_beauty = prepare_data(luxury_beauty)

# Initialize a new TfidfVectorizer and fit-transform on both training and test data
tfidf = TfidfVectorizer()
X_train_vectorized = tfidf.fit_transform(X_train_fashion)
X_test_vectorized = tfidf.transform(X_test_luxury_beauty)

# Splitting into training and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(X_train_vectorized, y_train_fashion, test_size=0.2, random_state=42)

print(f"Training set: {X_train_split.shape}, {y_train_split.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test_vectorized.shape}, {y_test_luxury_beauty.shape}")

# Initialize and train the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_split, y_train_split)

# Predictions and evaluation on validation set
y_val_pred = rf_classifier.predict(X_val)
print("Validation Report:")
print(classification_report(y_val, y_val_pred))  

# Predictions and evaluation on test set
y_test_pred = rf_classifier.predict(X_test_vectorized)
print("Test Report:")
print(classification_report(y_test_luxury_beauty, y_test_pred))  


Training set: (5826, 6179), (5826,)
Validation set: (1457, 6179), (1457,)
Test set: (29955, 6179), (29955,)
Validation Report:
              precision    recall  f1-score   support

           1       1.00      0.95      0.98        44
           2       1.00      0.84      0.91        31
           3       1.00      0.82      0.90        87
           4       0.97      0.78      0.86       158
           5       0.95      1.00      0.97      1137

    accuracy                           0.96      1457
   macro avg       0.98      0.88      0.92      1457
weighted avg       0.96      0.96      0.96      1457

Test Report:
              precision    recall  f1-score   support

           1       0.67      0.00      0.00       986
           2       0.00      0.00      0.00      1379
           3       0.00      0.00      0.00      3534
           4       0.22      0.00      0.00      7114
           5       0.57      1.00      0.72     16942

    accuracy                           0.56  

In [35]:
# Basline without HPT
y_train_pred = rf_classifier.predict(X_train_split)
train_accuracy = accuracy_score(y_train_split, y_train_pred)
print(f"Accuracy on training dataset: {train_accuracy * 100:.2f}%")

# Predict and evaluate on validation set
y_val_pred = rf_classifier.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Accuracy on validation dataset: {val_accuracy * 100:.2f}%")

# Predict and evaluate on test set
y_test_pred = rf_classifier.predict(X_test_vectorized)
test_accuracy = accuracy_score(y_test_luxury_beauty, y_test_pred)
print(f"Accuracy on test dataset: {test_accuracy * 100:.2f}%")

Accuracy on training dataset: 99.55%
Accuracy on validation dataset: 95.74%
Accuracy on test dataset: 56.45%


In [36]:
# Parameter list
n_estimators_list = [10, 20, 200, 500]
min_samples_split_list = [10, 15, 20]
min_samples_leaf_list = [3, 5, 10, 15]

results = []

for n_estimators in tqdm.tqdm(n_estimators_list):
    for min_samples_split in min_samples_split_list:
        for min_samples_leaf in min_samples_leaf_list:
            rf_classifier = RandomForestClassifier(
                n_estimators=n_estimators,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                random_state=42
            )
            # Apply cross-validation
            cross_val_scores = cross_val_score(rf_classifier, X_train_split, y_train_split, cv=5)
            avg_accuracy = cross_val_scores.mean()

            results.append([avg_accuracy, n_estimators, min_samples_split, min_samples_leaf])

# Convert the results into a DataFrame
results_df = pd.DataFrame(results, columns=['Average Accuracy', 'n_estimators', 'min_samples_split', 'min_samples_leaf'])
print(results_df)


 25%|██▌       | 1/4 [00:18<00:54, 18.12s/it]


KeyboardInterrupt: 

In [None]:
# Test with RandomizedSearchCV. Not a better result.
# Define the parameter range for Random Search
param_dist = {
    'n_estimators': [10, 20, 200, 500],
    'min_samples_split': [10, 15, 20],
    'min_samples_leaf': [3, 5, 10, 15]
}

# Initialize RandomForestClassifier
rf_classifier = RandomForestClassifier(random_state=42)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(rf_classifier, param_distributions=param_dist, n_iter=10, cv=5, random_state=42)

# Perform Random Search
random_search.fit(X_train_split, y_train_split)

# Print the best parameters and corresponding accuracy
print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)

# Use the best model to predict on the validation set
best_rf_classifier = random_search.best_estimator_
y_val_pred = best_rf_classifier.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)

print(f'Validation Accuracy with Best Model: {accuracy}')


In [None]:
# Find the index of the row with the maximum average accuracy
max_index = results_df['Average Accuracy'].idxmax()

print("Index of the row with the maximum value in Average Accuracy:", max_index)

max_row = results_df.loc[max_index]
print("Row with the maximum value in Average Accuracy:")
print(max_row)

In [None]:
# Create a new model with the optimal hyperparameters
optimal_rf_classifier = RandomForestClassifier(
    n_estimators=500,
    min_samples_split=10,
    min_samples_leaf=3,
    random_state=42
)

# Train the model on the full training set
optimal_rf_classifier.fit(X_train_vectorized, y_train_fashion)

# Predict on the test set
y_test_pred = optimal_rf_classifier.predict(X_test_vectorized)


test_report = classification_report(y_test_luxury_beauty, y_test_pred, zero_division=1)
print("Test Report:")
print(test_report)


In [None]:
# Predict on the training set
y_train_pred = optimal_rf_classifier.predict(X_train_vectorized)
accuracy_train = accuracy_score(y_train_fashion, y_train_pred)
print(f'Accuracy on the training dataset with the optimal model: {round(accuracy_train * 100, 2)}%')

# Re-evaluate on the validation set with the chosen model
accuracy_val = accuracy_score(y_val, y_val_pred)
print(f'Accuracy on the validation dataset with the optimal model: {round(accuracy_val * 100, 2)}%')

# Calculate the accuracy on the test dataset
accuracy_test = accuracy_score(y_test_luxury_beauty, y_test_pred)
print(f'Accuracy on the test dataset with the optimal model: {round(accuracy_test * 100, 2)}%')


### Shallow learner 2

## SVM


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score


In [None]:
def prepare_data(df):
    df['reviewText'] = df['reviewText'].fillna('').astype(str)
    X = df['reviewText']
    y = df['overall'] 
    return X, y

# Loading datasets
fashion_df = pd.read_csv('Training.csv')
luxury_beauty_df = pd.read_csv('luxury_beauty_dataset.csv')

# Preparing training data (fashion)
X_train_fashion, y_train_fashion = prepare_data(fashion_df)

# Text vectorization for training data
tfidf = TfidfVectorizer()
X_train_vectorized = tfidf.fit_transform(X_train_fashion)

# Splitting into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_vectorized, y_train_fashion, test_size=0.2, random_state=42)

# Preparing test data (luxury beauty)
X_test_luxury_beauty, y_test_luxury_beauty = prepare_data(luxury_beauty_df)

# Vectorization of test data 
X_test_vectorized = tfidf.transform(X_test_luxury_beauty)

print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test_vectorized.shape}, {y_test_luxury_beauty.shape}")

In [None]:
# Initialize the SVM model with default hyperparameters or a baseline configuration
svm_model = svm.SVC()

# Train the SVM model on the training set
svm_model.fit(X_train, y_train)

# Predict and evaluate on the training set
y_train_pred = svm_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f"Baseline accuracy on training dataset: {train_accuracy * 100:.2f}%")

# Predict and evaluate on the validation set
y_val_pred = svm_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f"Baseline accuracy on validation dataset: {val_accuracy * 100:.2f}%")


In [None]:
# Grid search
kernels = ['linear', 'poly', 'rbf']
Cs = [0.1, 1.0, 10.0]
decision_functions = ['ovr', 'ovo']
degrees = [2, 3, 4]  # For the 'poly' kernel
gammas = [0.1, 1, 10, 'scale', 'auto']  # For the 'rbf' kernel

results = []

for kernel in kernels:
    for C in Cs:
        for decision_function in decision_functions:
            if kernel == 'poly':
                for degree in degrees:
                    svm_current = svm.SVC(kernel=kernel, C=C, degree=degree, decision_function_shape=decision_function, probability=True)
                    svm_current.fit(X_train, y_train)
                    y_val_hat = svm_current.predict(X_val)
                    accuracy = accuracy_score(y_val_hat, y_val)
                    results.append([accuracy, kernel, C, decision_function, degree, 'NA'])
            elif kernel == 'rbf':
                for gamma in gammas:
                    svm_current = svm.SVC(kernel=kernel, C=C, gamma=gamma, decision_function_shape=decision_function, probability=True)
                    svm_current.fit(X_train, y_train)
                    y_val_hat = svm_current.predict(X_val)
                    accuracy = accuracy_score(y_val_hat, y_val)
                    results.append([accuracy, kernel, C, decision_function, 'NA', gamma])
            else:
                # For 'linear' and other kernels where degree and gamma are not relevant
                svm_current = svm.SVC(kernel=kernel, C=C, decision_function_shape=decision_function, probability=True)
                svm_current.fit(X_train, y_train)
                y_val_hat = svm_current.predict(X_val)
                accuracy = accuracy_score(y_val_hat, y_val)
                results.append([accuracy, kernel, C, decision_function, 'NA', 'NA'])

# Convert the results into a DataFrame
results_df = pd.DataFrame(results, columns=['Accuracy', 'Kernel', 'C', 'Decision function', 'Degree', 'Gamma'])

# Sort the results
print(results_df.sort_values(by='Accuracy', ascending=False))


In [None]:
# Top accuracy
top_accuracy = results_df['Accuracy'].max()
top_models = results_df[results_df['Accuracy'] == top_accuracy]
print(top_models)


In [None]:
# POLY kernel with various hyperparameters
Cs = [0.1, 1.0, 10.0]  
degrees = [2, 3, 4, 5]  
results = []

# Train models with different combinations of C and degree
for C in Cs:
    for degree in degrees:
        svm_current = svm.SVC(kernel='poly', C=C, degree=int(degree))  # Ensure degree is an integer
        svm_current.fit(X_train, y_train)  # Training on the training set
        y_val_hat = svm_current.predict(X_val)  # Predicting on the validation set
        accuracy = accuracy_score(y_val_hat, y_val)  # Calculating accuracy
        results.append([accuracy, C, degree])  # Storing results

# Convert results to a DataFrame for analysis
results_df = pd.DataFrame(results, columns=['Accuracy', 'C', 'Degree'])

# Identify the best model configuration
best_model_config = results_df.sort_values(by='Accuracy', ascending=False).iloc[0]
best_C = best_model_config['C']
best_degree = int(best_model_config['Degree'])  # Ensure degree is an integer

# Initialize and train the best model configuration on the full training set
svm_best = svm.SVC(kernel='poly', C=best_C, degree=best_degree, probability=True)
svm_best.fit(X_train, y_train)  

# Predictions on the test set 
y_test_pred = svm_best.predict(X_test_vectorized)  

# Evaluate performance on the test set
accuracy_test = accuracy_score(y_test_luxury_beauty, y_test_pred)  # Evaluate accuracy on the test set
print(f'Accuracy on test dataset with the best model: {round(accuracy_test * 100, 2)}%')


In [None]:
# Predict on the training set
y_train_pred = svm_best.predict(X_train)
accuracy_train = accuracy_score(y_train, y_train_pred)
print(f'Accuracy on training dataset with the best model: {round(accuracy_train * 100, 2)}%')

# Re-evaluate on the validation set with the chosen model
y_val_pred = svm_best.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
print(f'Accuracy on validation dataset with the best model: {round(accuracy_val * 100, 2)}%')

# The accuracy on the test set has already been calculated
print(f'Accuracy on test dataset with the best model: {round(accuracy_test * 100, 2)}%')

In [None]:
# Compute and print the confusion matrix
cm = confusion_matrix(y_test_luxury_beauty, y_test_pred)
print("Confusion Matrix:")
print(cm)

In [None]:
# Compute precision and recall with specified averaging method and handling of zero division
precision = precision_score(y_test_luxury_beauty, y_test_pred, average='weighted', zero_division=0)
recall = recall_score(y_test_luxury_beauty, y_test_pred, average='weighted')

# Print precision and recall
print('Precision:', precision)
print('Recall:', recall)

In [None]:
# ROC and AUC

# Binarizing the output labels for multiclass
y_test_binarized = label_binarize(y_test_luxury_beauty, classes=np.unique(y_test_luxury_beauty))
n_classes = y_test_binarized.shape[1]

# Predict probabilities for each class
y_pred_prob = svm_best.predict_proba(X_test_vectorized)

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_pred_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plotting all ROC curves
plt.figure(figsize=(8, 6))
colors = ['blue', 'red', 'green', 'purple', 'orange', 'brown'] 
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label=f'ROC curve of class {i+1} (AUC = {roc_auc[i]:0.2f})')

plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic for multi-class')
plt.legend(loc="lower right")
plt.show()

# Print AUC scores for each class
for i in range(n_classes):
    print(f'AUC score for class {i+1}: {roc_auc[i]:0.2f}')


### How to Handle Class Imbalance

#### SMOTE

In [None]:
from sklearn import svm
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

# Data preparation
X_train_fashion, y_train_fashion = prepare_data(fashion_df)
X_test_luxury_beauty, y_test_luxury_beauty = prepare_data(luxury_beauty_df)

# Text vectorization
tfidf = TfidfVectorizer()
X_train_vectorized = tfidf.fit_transform(X_train_fashion)
X_test_vectorized = tfidf.transform(X_test_luxury_beauty)

# Splitting into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_vectorized, y_train_fashion, test_size=0.2, random_state=42)

# Define SVM model (assuming best_C and best_degree are already defined)
svm_model = svm.SVC(kernel='poly', C=best_C, degree=best_degree, probability=True)

# Training and evaluation on original data
svm_model.fit(X_train, y_train)
y_pred_original = svm_model.predict(X_test_vectorized)
print('Original SVM Classification report:\n', classification_report(y_test_luxury_beauty, y_pred_original, zero_division=1))

# Apply SMOTE
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

# Training and evaluation on SMOTE-treated data
svm_model.fit(X_train_resampled, y_train_resampled)
y_pred_resampled = svm_model.predict(X_test_vectorized)
print('SMOTE SVM Classification report:\n', classification_report(y_test_luxury_beauty, y_pred_resampled, zero_division=1))


In [None]:
# Predict on the training set and calculate accuracy
y_train_pred = svm_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print(f'Accuracy on training dataset: {train_accuracy:.2f}')

# Predict on the validation set and calculate accuracy
y_val_pred = svm_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print(f'Accuracy on validation dataset: {val_accuracy:.2f}')

# Predict on the test set and calculate accuracy (already done)
y_test_pred = svm_model.predict(X_test_vectorized)
test_accuracy = accuracy_score(y_test_luxury_beauty, y_test_pred)
print(f'Accuracy on test dataset: {test_accuracy:.2f}')

# Classification report for the test dataset
print('Test SVM Classification report:\n', classification_report(y_test_luxury_beauty, y_test_pred, zero_division=1))

# Apply SMOTE and retrain
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)
svm_model.fit(X_train_resampled, y_train_resampled)

# Predict and calculate accuracy on the test set with the SMOTE-treated model
y_test_pred_smote = svm_model.predict(X_test_vectorized)
test_accuracy_smote = accuracy_score(y_test_luxury_beauty, y_test_pred_smote)
print(f'Accuracy on test dataset with SMOTE: {test_accuracy_smote:.2f}')

# Classification report for the test dataset with SMOTE
print('SMOTE SVM Classification report:\n', classification_report(y_test_luxury_beauty, y_test_pred_smote, zero_division=1))


In [None]:
# Predictions on the training set with SMOTE-treated model
y_train_pred_smote = svm_model.predict(X_train_resampled)

# Predictions on the validation set with SMOTE-treated model
# Note: SMOTE is typically only applied to the training data, not the validation data
y_val_pred_smote = svm_model.predict(X_val)

# Accuracy calculations
train_accuracy_smote = accuracy_score(y_train_resampled, y_train_pred_smote)
val_accuracy_smote = accuracy_score(y_val, y_val_pred_smote)
test_accuracy_smote = accuracy_score(y_test_luxury_beauty, y_pred_resampled)

# Printing accuracies
print(f'Accuracy on training dataset with SMOTE: {train_accuracy_smote:.2f}')
print(f'Accuracy on validation dataset with SMOTE: {val_accuracy_smote:.2f}')
print(f'Accuracy on test dataset with SMOTE: {test_accuracy_smote:.2f}')


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from imblearn.over_sampling import SMOTE

# Count of samples in each class before SMOTE
class_counts_before = y_train.value_counts().sort_index()

# Apply SMOTE
sm = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = sm.fit_resample(X_train, y_train)

# Count of samples in each class after SMOTE
class_counts_after = pd.Series(y_train_resampled).value_counts().sort_index()

# Plot class balance before and after SMOTE
plt.figure(figsize=(10, 5))

# Before SMOTE
plt.subplot(1, 2, 1)
class_counts_before.plot(kind='bar', color='coral')
plt.title('Classes Before SMOTE')
plt.xlabel('Class')

plt.xticks(range(len(class_counts_before)), range(1, len(class_counts_before) + 1))  # Adjusting x-axis labels

# After SMOTE
plt.subplot(1, 2, 2)
class_counts_after.plot(kind='bar', color='teal')
plt.title('Classes After SMOTE')
plt.xlabel('Class')

plt.xticks(range(len(class_counts_after)), range(1, len(class_counts_after) + 1))  # Adjusting x-axis labels

plt.tight_layout()
plt.show()


In [None]:
# Number of data points before SMOTE
num_samples_before = X_train.shape[0]

# Number of data points after SMOTE
num_samples_after = X_train_resampled.shape[0]

print(f"Number of data points before SMOTE: {num_samples_before}")
print(f"Number of data points after SMOTE: {num_samples_after}")



In [None]:
# SVM model on the expanded training data after SMOTE
svm_model_after_smote = svm.SVC(kernel='poly', C=best_C, degree=best_degree, probability=True)
svm_model_after_smote.fit(X_train_resampled, y_train_resampled)

# Predict on the test dataset
y_pred_after_smote = svm_model_after_smote.predict(X_test_vectorized)

# Calculate the accuracy after SMOTE
accuracy_after_smote = accuracy_score(y_test_luxury_beauty, y_pred_after_smote)

# Compare the accuracy before and after SMOTE
print(f'Accuracy before SMOTE: {round(accuracy_test * 100, 2)}%')
print(f'Accuracy after SMOTE: {round(accuracy_after_smote * 100, 2)}%')


In [None]:
# SVM model on the resampled data (after SMOTE)
svm_model.fit(X_train_resampled, y_train_resampled)

# predictions on the test set
y_pred_resampled = svm_model.predict(X_test_vectorized)

# Evaluate the performance of the model on the test data
print('SVM with SMOTE Classification report:\n', classification_report(y_test_luxury_beauty, y_pred_resampled))


## RNN

In [None]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import text_to_word_sequence
from sklearn.model_selection import train_test_split
import json
from functools import lru_cache
import keras_tuner as kt
from keras import backend as K

In [None]:
All_beauty = "AMAZON/All_Beauty_5.json"
Fash = "AMAZON/AMAZON_FASHION_5.json"
Lux = "AMAZON/Luxury_Beauty_5.json"

In [None]:
def get_cols(json_path):
    data = []

    with open(json_path) as f:
        
        for line in f:
            doc = json.loads(line)
            #skip lines with None data, as it would be filtered out anyway
            if 'overall' in doc and 'reviewText' in doc and 'summary' in doc:
                lst = [doc['overall'], doc['reviewText'], doc['summary']]
                data.append(lst)
    return data


In [None]:
def create_full_df(paths):
    full = []
    for p in paths:
        full.extend(get_cols(p))
    return(full)

In [None]:
full_df = pd.DataFrame(create_full_df([All_beauty, Fash]), columns=['overall', 'reviewText', 'summary'])

In [None]:
plt.hist(full_df["overall"])
plt.title("Positive review skew")

plt.show()

In [None]:
full_df = data_resample


In [None]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Getting relevant stopword packages

In [None]:

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
stopwords.remove("not") #as with shallow learners, not might imply negative review
stopwords = set(stopwords)


# Loading lemmatizer

In [None]:
wnl = WordNetLemmatizer()
lemmatize = lru_cache(maxsize=50000)(wnl.lemmatize) #cahceing might increase performance
stemmed_list = []
words_to_lemmatize = ["n", "v", "r", "a"]

In [None]:
def preprocess_words(word_list, words_to_lemmatize, stopwords, lemmatizer):
    cleaned_list = []
    for i in word_list:
        new_string = ''.join((x for x in i if not x.isdigit()))
        word_tokens = word_tokenize(new_string)
        # converts the words in word_tokens to lower case and then checks whether 
        #they are present in stop_words or not
        filtered_sentence = [w for w in word_tokens if w.lower() not in stopwords]
        for j in words_to_lemmatize:
            stem = [lemmatizer(word, j) for word in filtered_sentence]
        #stems sentences after stopwords are removed
        cleaned_list.append(' '.join(stem))

    return cleaned_list

In [None]:
full_df["cleaned"] = preprocess_words(full_df["reviewText"], words_to_lemmatize, stopwords, lemmatize)

In [None]:
lux_df = pd.DataFrame(get_cols(Lux), columns=["overall", "reviewText", "summary"])
lux_df["negative"] = lux_df["overall"].apply(lambda x: 1 if x <= 3 else 0)
lux_df["cleaned"] = preprocess_words(lux_df["reviewText"], words_to_lemmatize, stopwords, lemmatize)

In [None]:
vocab_list = full_df["cleaned"]
max_tokens = 5214 
output_sequence_length = 209 # adapted from the shallow learner EDA
pad_to_max_tokens = True

encoder = tf.keras.layers.TextVectorization(max_tokens=max_tokens,
                                            output_sequence_length=output_sequence_length,
                                            pad_to_max_tokens=pad_to_max_tokens)



#Build a vocabulary of all string terms from tokens seen in the dataset.
encoder.adapt(list(vocab_list), batch_size=128)

#Retrieves the computed vocabulary
vocab = np.array(encoder.get_vocabulary()) 
print(f'The reviews vocabulary: \n{vocab}\n')
print(f'length of vocabulary: {len(vocab)}')

In [None]:
reviews = full_df["cleaned"]

In [None]:
max_length_train = len(max(reviews, key=len))

max_length = max_length_train


In [None]:
X_train, X_test, y_train, y_test = train_test_split(np.array(full_df['cleaned']), full_df['negative'], test_size=0.25, random_state=42)

In [None]:
def encode_and_pad(data, max_length):
    encoded_example = encoder(data).numpy() 
    padded_reviews = pad_sequences(encoded_example, maxlen=max_length,
                               padding='post')
    return padded_reviews

In [None]:
train_padded_reviews = encode_and_pad(X_train, max_length)
test_padded_reviews = encode_and_pad(X_test, max_length)

In [None]:
tf.random.set_seed(42)

def model_builder(hp):
    #clearing backend to help memory leakage
    K.clear_session()
    model = tf.keras.models.Sequential()
    hp_embedding_dim = hp.Choice('embedding_dimension', values=[128, 256]) 
    model.add(tf.keras.layers.Embedding(input_dim=len(vocab), 
                              output_dim=hp_embedding_dim,
                              input_length=max_length,
                              name="embedding"))
    hp_neurons = hp.Int("units_1", min_value=32, max_value=64, step=16)                         
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hp_neurons, return_sequences=True)))
    hp_neurons_2 = hp.Int("units_2", min_value=16, max_value=32, step=16)   
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(hp_neurons_2)))
    model.add(tf.keras.layers.Dropout(0.2))
    model.add(tf.keras.layers.Dense(32, activation='relu'))
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

    hp_learning_rate = hp.Choice('learning_rate', values=[0.0001, 0.001, 0.01])
    hp_weight_decay = hp.Choice('weight_decay', values=[0.001, 0.01])
    adam = tf.keras.optimizers.Adam(learning_rate=hp_learning_rate, weight_decay=hp_weight_decay)

    model.compile(optimizer=adam, loss='binary_crossentropy', 
                        metrics=["accuracy", tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
                        )
    return model

In [None]:
import math

print(f'Tuning will run approximately {round(50 * (math.log(50, 10) ** 2))} epochs')

In [None]:
tuner = kt.Hyperband(model_builder,
                     objective='val_loss',
                     max_epochs=30,
                     factor=5,
                     seed=42,
                     directory='Tuning results',
                     project_name='RNN hyperband tuning1')

In [None]:
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

tuner.search(train_padded_reviews, y_train, epochs=20, batch_size=256, validation_data=[test_padded_reviews, y_test], callbacks=[es])

# Get the optimal hyperparameters
best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]

print(f"""
Search complete! 
\nOptimal amount of neurons in the first biLSTM layer: {best_hps.get('units_1')}
\nOptimal amount of neurons in the second biLSTM layer: {best_hps.get('units_2')}
\nOptimal embedding dimension: {best_hps.get('embedding_dimension')} 
\nOptimal learning rate: {best_hps.get('learning_rate')}
\nOptimal weight decay: {best_hps.get('weight_decay')}
""")

## recording search results

encoded manually, so search doesn't have to be run again, if system crashes or similar

In [None]:
embedding_dim = 256

biLSTM1 = 32

biLSTM2 = 32

lr = 0.01

wd = 0.01

In [None]:
K.clear_session()
tf.random.set_seed(42)
tuned_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=len(vocab), 
                              output_dim=embedding_dim,
                              input_length=max_length,
                              name="embedding"),                      
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(biLSTM1, return_sequences=True), name="bidirectional_LSTM_1"),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(biLSTM2), name="bidirectional_LSTM_2"),
    tf.keras.layers.Dropout(0.2, name="dropout"),
    tf.keras.layers.Dense(32, activation='relu', name="Dense_layer"),
    tf.keras.layers.BatchNormalization(name="batch_norm"),
    tf.keras.layers.Dense(1, activation='sigmoid', name="output")
    ])
adam = tf.keras.optimizers.Adam(learning_rate=lr, weight_decay=wd)

tuned_model.compile(optimizer=adam, loss='binary_crossentropy', 
                        metrics=["accuracy", tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
                        )

tuned_model.summary()

In [None]:
import pydot
import graphviz

tf.keras.utils.plot_model(tuned_model, to_file="rnn.png",  show_shapes=True, show_layer_names=True)

# Training model

In [None]:
num_epochs = 20
es = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=num_epochs*0.2, restore_best_weights=True, verbose=1)

history = tuned_model.fit(train_padded_reviews, y_train, batch_size=64, epochs=num_epochs, validation_data=(test_padded_reviews, y_test), callbacks=[es])

In [None]:
val_min = min(history.history["val_loss"])

best_epoch = history.history["val_loss"].index(val_min)

plt.plot(history.history['loss'], label="loss")
plt.plot(history.history['val_loss'], label="val_loss")
plt.scatter(best_epoch, val_min, label=f'minimum loss: {round(val_min, 3)}', color="red")
plt.title('Tuned model performance')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend()
plt.show()

In [None]:
from sklearn.metrics import roc_curve
y_pred_keras = tuned_model.predict(test_padded_reviews).ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_pred_keras)

In [None]:
from sklearn.metrics import auc
auc_keras = auc(fpr_keras, tpr_keras)

In [None]:
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='tuned model (area = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for test data prediction')
plt.legend(loc='best')
plt.show()

In [None]:
encoded_example_test = encoder(lux_df["cleaned"]).numpy() 
val_padded_reviews = pad_sequences(encoded_example_test, maxlen=max_length,
                               padding='post')
val_labels = np.array(lux_df["negative"])

In [None]:
val_labels = np.array(lux_df["negative"])

# Evaluating model on unseen data

In [None]:
loss, acc, precision, recall = tuned_model.evaluate(val_padded_reviews, val_labels)

print(f'loss: {loss}\naccuracy: {acc}')

In [None]:
prediction = tuned_model.predict(val_padded_reviews)

In [None]:
y_pred_keras = prediction.ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(val_labels, y_pred_keras)
auc_score = auc(fpr_keras, tpr_keras)


In [None]:
y_pred_keras = prediction.ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(val_labels, y_pred_keras)
auc_score = auc(fpr_keras, tpr_keras)


plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='tuned model (area = {:.3f})'.format(auc_score))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for luxury beauty prediction')
plt.legend(loc='best')
plt.show()

In [None]:
tuned_model.save('Models/tuned_model.keras')

In [None]:
tf.random.set_seed(42)
GRU_model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=len(vocab), 
                              output_dim=128,
                              input_length=max_length,
                              name="embedding"),                      
    tf.keras.layers.Bidirectional(tf.keras.layers.GRU(16), name="bidirectional_GRU"),
    tf.keras.layers.Dropout(0.2, name="dropout"),
    tf.keras.layers.Dense(32, activation='relu', name="Dense_layer"),
    tf.keras.layers.BatchNormalization(name="batch_norm"),
    tf.keras.layers.Dense(1, activation='sigmoid', name="output")
    ])
adam = tf.keras.optimizers.Adam(learning_rate=0.001, weight_decay=wd)

GRU_model.compile(optimizer=adam, loss='binary_crossentropy', 
                        metrics=["accuracy", tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
                        )

GRU_model.summary()

In [None]:

tf.keras.utils.plot_model(GRU_model, to_file="rnn2.png",  show_shapes=True, show_layer_names=True)

In [None]:
num_epochs = 50
es = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=num_epochs*0.2, restore_best_weights=True, verbose=1)

GRU_history = GRU_model.fit(train_padded_reviews, y_train, batch_size=64, epochs=num_epochs, validation_data=(test_padded_reviews, y_test), callbacks=[es])

In [None]:
val_min = min(GRU_history.history["val_loss"])

best_epoch = GRU_history.history["val_loss"].index(val_min)

plt.plot(GRU_history.history['loss'], label="loss")
plt.plot(GRU_history.history['val_loss'], label="val_loss")
plt.scatter(best_epoch, val_min, label=f'minimum loss: {round(val_min, 3)}', color="red")
plt.title('GRU model performance')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend()
plt.show()

In [None]:
plt.plot(GRU_history.history['accuracy'], label="accuracy")
plt.plot(GRU_history.history['val_accuracy'], label="val_accuracy")
plt.vlines(best_epoch, ymin=min(history.history["accuracy"]), ymax=1, label="best epoch", linestyle="dotted", color="red")
plt.title("GRU model accuracy")
plt.legend()
plt.show()

In [None]:
y_pred_keras = GRU_model.predict(test_padded_reviews).ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(y_test, y_pred_keras)
auc_keras = auc(fpr_keras, tpr_keras)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='GRU model (area = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for test data prediction')
plt.legend(loc='best')
plt.show()

In [None]:
loss, acc, precision, recall = GRU_model.evaluate(val_padded_reviews, val_labels)

print(f'loss: {loss}\naccuracy: {acc}')

In [None]:
prediction = GRU_model.predict(val_padded_reviews)
y_pred_keras = prediction.ravel()
fpr_keras, tpr_keras, thresholds_keras = roc_curve(val_labels, y_pred_keras)
auc_score = auc(fpr_keras, tpr_keras)


plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='GRU model (area = {:.3f})'.format(auc_score))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve for luxury beauty prediction')
plt.legend(loc='best')
plt.show()

In [None]:
GRU_model.save('Models/GRU_model.keras')