In [1]:
import boto3
import pandas as pd
from sagemaker import get_execution_role

role = get_execution_role()

bucket = 'gcu-ml2-team2'
s3client = boto3.client('s3')

response = s3client.get_object(Bucket=bucket, Key='df_modcloth.csv')
df = pd.read_csv(response['Body'])
df.head()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
0,7443,Alex,4,2010-01-21 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
1,7443,carolyn.agan,3,2010-01-27 08:00:00+00:00,,,,Small,Dresses,,2012,0
2,7443,Robyn,4,2010-01-29 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
3,7443,De,4,2010-02-13 08:00:00+00:00,,,,Small,Dresses,,2012,0
4,7443,tasha,4,2010-02-18 08:00:00+00:00,,,Small,Small,Dresses,,2012,0


# Data Preprocessing

## 1. Feature Selection & Handling Missing Value

In [2]:
# Collaborate Filtering method uses - userId, itemId, rating
# Content Filtering method uses - fit, year, catetory, brand

# Drop features that will not be used
df.drop(['timestamp', 'size', 'user_attr', 'model_attr', 'split'], axis=1, inplace=True)

# Drop row with NaN as categorical value
df.dropna(subset=['fit', 'brand'], inplace=True)

df

Unnamed: 0,item_id,user_id,rating,fit,category,brand,year
280,21296,Petra,5,Just right,Bottoms,ModCloth,2013
281,21296,ejs,4,Slightly small,Bottoms,ModCloth,2013
292,21296,Brandi,5,Just right,Bottoms,ModCloth,2013
318,21296,Momo,5,Slightly small,Bottoms,ModCloth,2013
323,21296,Jackie,5,Just right,Bottoms,ModCloth,2013
...,...,...,...,...,...,...,...
99879,135555,tania,5,Just right,Outerwear,ModCloth,2016
99880,86073,Foucault,3,Slightly small,Outerwear,ModCloth,2017
99882,71607,nadgee,5,Just right,Outerwear,Jack by BB Dakota,2016
99884,154353,roshelle,5,Just right,Outerwear,ModCloth,2018


## 2. Encoding Categorical Data

In [3]:
# Check types of categorical data
print(df['user_id'].unique())
print(df['fit'].unique())
print(df['category'].unique())
print(df['brand'].unique())


# Convert categorcal value to numerical
# df['user_id'] = df['user_id'].astype('category').cat.codes
df['fit'] = df['fit'].astype('category').cat.codes  #['Just right': 0, 'Slightly small': 1, 'Slightly large': 2, 'Very small': 3, 'Very large': 4]
df['category'] = df['category'].astype('category').cat.codes  #['Bottoms': 0, 'Dresses': 1, 'Outerwear': 2, 'Tops': 3]
df['brand'] = df['brand'].astype('category').cat.codes

df

['Petra' 'ejs' 'Brandi' ... 'Foucault' 'nadgee' 'roshelle']
['Just right' 'Slightly small' 'Slightly large' 'Very small' 'Very large']
['Bottoms' 'Dresses' 'Outerwear' 'Tops']
['ModCloth' 'Retrolicious' 'Steve Madden' 'Ryu' 'Chi Chi London'
 'Out of Print' 'Kin Ship' 'Jack by BB Dakota' 'Pink Martini'
 'Miss Candyfloss' 'Emily and Fin' 'Daisey Natives' 'Hell Bunny' 'Banned'
 'Sugarhill Boutique' 'Wrangler' 'Wendy Bird' 'Pepaloves' 'Collectif'
 'Compania Fantastica' 'Closet London' 'Eliza J' 'BB Dakota' "Alice's Pig"
 'Louche' "Effie's Heart" 'Miss Patina' 'Mata Traders' "Rolla's" 'Yumi'
 'Blue Platypus']


Unnamed: 0,item_id,user_id,rating,fit,category,brand,year
280,21296,Petra,5,0,0,19,2013
281,21296,ejs,4,2,0,19,2013
292,21296,Brandi,5,0,0,19,2013
318,21296,Momo,5,2,0,19,2013
323,21296,Jackie,5,0,0,19,2013
...,...,...,...,...,...,...,...
99879,135555,tania,5,0,2,19,2016
99880,86073,Foucault,3,2,2,19,2017
99882,71607,nadgee,5,0,2,13,2016
99884,154353,roshelle,5,0,2,19,2018


# Model Based Filtering

In [4]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF

# Dataset Split: Training (70%), Validation (15%), Testing (15%)
train_data, temp_data = train_test_split(df, test_size=0.3, random_state=42)
validation_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

## Implement SVD directly (Not used SVD model)

In [5]:
# Create a user-item rating matrix (fill unrated values with 0)
train_user_item_matrix = train_data.pivot_table(index='user_id', columns='item_id', values='rating').fillna(0)
train_user_item_sparse = csr_matrix(train_user_item_matrix.values)

# Split the matrix into three using SVD
U_train, sigma_train, Vt_train = svds(train_user_item_sparse, k=50)
sigma_train = np.diag(sigma_train)

# Predict ratings and convert to DataFrame
predicted_ratings_train = np.dot(np.dot(U_train, sigma_train), Vt_train) + train_user_item_matrix.mean(axis=1).values[:, np.newaxis]
predicted_ratings_train_df = pd.DataFrame(predicted_ratings_train, index=train_user_item_matrix.index, columns=train_user_item_matrix.columns)

# Clip predicted ratings to be within the range of 0 to 5
predicted_ratings_train_df_clipped = predicted_ratings_train_df.clip(0, 5)

# User-item rating matrix -> validation and test set versions
validation_user_item_matrix = validation_data.pivot_table(index='user_id', columns='item_id', values='rating').fillna(0)
test_user_item_matrix = test_data.pivot_table(index='user_id', columns='item_id', values='rating').fillna(0)


## Model evaluation

### SVD evaluation

In [6]:
# Function to calculate Mean Squared Error (MSE) for predictions
def calculate_mse(actual_matrix, predicted_ratings_df):
    # Find common user and item IDs between actual and predicted matrices
    common_user_ids = actual_matrix.index.intersection(predicted_ratings_df.index)
    common_item_ids = actual_matrix.columns.intersection(predicted_ratings_df.columns)
    
    # Extract predicted and actual ratings for common user and item IDs
    predicted_ratings = predicted_ratings_df.loc[common_user_ids, common_item_ids]
    actual_ratings = actual_matrix.loc[common_user_ids, common_item_ids]
    
    # Calculate MSE using the predicted and actual ratings
    mse = mean_squared_error(actual_ratings.values.flatten(), predicted_ratings.values.flatten())
    return mse

# Calculate MSE for the validation and test datasets
mse_validation = calculate_mse(validation_user_item_matrix, predicted_ratings_train_df)
mse_test = calculate_mse(test_user_item_matrix, predicted_ratings_train_df)

# Print the MSE values for validation and test sets
print("MSE on Validation Set:", mse_validation)
print("MSE on Test Set:", mse_test)

MSE on Validation Set: 0.3118110467869254
MSE on Test Set: 0.3352843948003167


### SVD (without NaN) output

In [7]:
selected_user_id = 'Jennifer'

# Check if the selected user exists in the training user-item matrix
if selected_user_id in train_user_item_matrix.index:
    # Actual ratings for the selected user
    actual_ratings_user = train_user_item_matrix.loc[selected_user_id]

    # Predicted ratings (using the clipped DataFrame)
    predicted_ratings_user = predicted_ratings_train_df_clipped.loc[selected_user_id]

    # Create a comparison DataFrame
    comparison_df = pd.DataFrame({
        'Actual Rating': actual_ratings_user,
        'Predicted Rating': predicted_ratings_user
    })
    print(comparison_df)
else:
    print("Selected user not found in the training data.")


         Actual Rating  Predicted Rating
item_id                                 
6454               0.0          0.453942
21296              0.0          0.184445
27439              0.0          0.177644
28252              4.0          4.205979
35525              4.0          4.253872
...                ...               ...
154693             0.0          0.220126
154748             0.0          0.220126
154749             0.0          0.219042
155165             0.0          0.191091
155597             0.0          0.220126

[477 rows x 2 columns]


## SVD
Drop NaN values during the training process to train the model, and retain NaN values during the validation and testing phases.

In [8]:
# Leave unrated values as NaN (do not fill with 0 to avoid negative bias)
train_user_item_matrix = train_data.pivot_table(index='user_id', columns='item_id', values='rating', fill_value=np.nan)

# Perform SVD on all the data
train_user_item_sparse = csr_matrix(train_user_item_matrix.fillna(0).values)
U_train, sigma_train, Vt_train = svds(train_user_item_sparse, k=50)
sigma_train_diag_matrix = np.diag(sigma_train)

# Calculate predicted ratings
predicted_ratings = np.dot(np.dot(U_train, sigma_train_diag_matrix), Vt_train)

predicted_ratings_df = pd.DataFrame(predicted_ratings, index=train_user_item_matrix.index, columns=train_user_item_matrix.columns)

# Define the range for predicted ratings
predicted_ratings_df_clipped = predicted_ratings_df.clip(0, 5)


### SVD (with NaN) output

In [9]:
# Compare the actual ratings and predicted ratings for 'Jennifer'
# This code is similar to the previous one but uses a dataset with NaN values and performs the comparison.

selected_user_id = 'Jennifer'
if selected_user_id in train_user_item_matrix.index:
    # Actual ratings for the selected user
    actual_ratings_user = train_user_item_matrix.loc[selected_user_id]

    # Use a different DataFrame for predicted ratings
    predicted_ratings_user = predicted_ratings_df_clipped.loc[selected_user_id]

    # Create a comparison DataFrame
    comparison_df = pd.DataFrame({
        'Actual Rating': actual_ratings_user,
        'Predicted Rating': predicted_ratings_user
    })
    print(comparison_df)
else:
    print("Selected user not found in the training data.")


         Actual Rating  Predicted Rating
item_id                                 
6454               NaN      2.338159e-01
21296              NaN      0.000000e+00
27439              NaN      0.000000e+00
28252              4.0      3.985853e+00
35525              4.0      4.033746e+00
...                ...               ...
154693             NaN      4.412977e-16
154748             NaN      8.536050e-16
154749             NaN      0.000000e+00
155165             NaN      0.000000e+00
155597             NaN      4.409800e-17

[477 rows x 2 columns]


## Utilize SVD class using surprise library
When surprise is used, unrated parts are ignored and only rated parts are used for learning.

In [10]:
pip install scikit-surprise

Note: you may need to restart the kernel to use updated packages.


In [13]:
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split, GridSearchCV

reader = Reader(rating_scale=(0, 5))  # Set rating scale
# Convert DataFrame to the Dataset format of the surprise library using the Dataset.load_from_df() method
data = Dataset.load_from_df(df[['user_id', 'item_id', 'rating']], reader)

# Set hyperparameter grid
param_grid = {
    'n_factors': [50, 100, 150],  # Number of dimensions for SVD
    'n_epochs': [30, 40, 50, 60],  # Number of iterations
    'lr_all': [0.1, 0.01, 0.001, 0.0001],  # Learning rate
    'reg_all': [0.02, 0.05]
}

# Perform hyperparameter tuning using GridSearchCV
gs = GridSearchCV(SVD, param_grid, measures=['mse'], cv=3)
gs.fit(data)

# Print the best MSE result and the corresponding hyperparameters
print(f"Best MSE: {gs.best_score['mse']}")
print(f"Best params: {gs.best_params['mse']}")

# Train the SVD model with the optimal hyperparameters
model = gs.best_estimator['mse']

# Split the dataset and train the model
trainset, testset = train_test_split(data, test_size=0.3)
model.fit(trainset)

# Predictions on the test set
predictions = model.test(testset)

# Calculate MSE
mse = accuracy.mse(predictions)
print(f"MSE: {mse}")

# Choose any user for prediction validation
selected_user_id = 'Jennifer'
user_predictions = []
actual_ratings = []

# Extract actual and predicted ratings for the selected user
for uid, iid, true_r, est, _ in predictions:
    if uid == selected_user_id:
        user_predictions.append((iid, est))
        actual_ratings.append((iid, true_r))

# Convert predicted and actual ratings into DataFrames
predicted_ratings_df = pd.DataFrame(user_predictions, columns=['item_id', 'predicted_rating'])
actual_ratings_df = pd.DataFrame(actual_ratings, columns=['item_id', 'actual_rating'])

# Final DataFrame for comparison
final_comparison_df = pd.merge(actual_ratings_df, predicted_ratings_df, on='item_id')
print(final_comparison_df)


Best MSE: 1.0253040765198127
Best params: {'n_factors': 50, 'n_epochs': 50, 'lr_all': 0.001, 'reg_all': 0.05}
MSE: 1.0056
MSE: 1.0055567449339107
    item_id  actual_rating  predicted_rating
0    153463            5.0          4.367221
1    126993            5.0          4.397068
2    153520            4.0          4.224820
3    121554            5.0          3.897666
4    133926            5.0          4.323115
5     28252            4.0          4.347791
6    153597            5.0          4.440643
7    110820            5.0          4.275310
8     67022            5.0          4.738021
9    119314            5.0          4.428914
10    35525            4.0          3.864608
11    54222            5.0          4.375229
12   135345            4.0          4.393271
13   113643            5.0          4.235319
14   118020            5.0          4.593580
