# Data Exploration

In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

import boto3
from sagemaker import get_execution_role

# Fetch the SageMaker execution role for this notebook session.
# NOTE(review): `role` is not referenced by any later cell shown here — confirm it is still needed.
role = get_execution_role()

# S3 bucket that holds the dataset, and the client used to read from it.
bucket = 'gcu-ml2-team2'
s3client = boto3.client('s3')

# Download df_modcloth.csv from the bucket and parse the response body into a DataFrame.
response = s3client.get_object(Bucket=bucket, Key='df_modcloth.csv')
df = pd.read_csv(response['Body'])
df.head()

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


Unnamed: 0,item_id,user_id,rating,timestamp,size,fit,user_attr,model_attr,category,brand,year,split
0,7443,Alex,4,2010-01-21 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
1,7443,carolyn.agan,3,2010-01-27 08:00:00+00:00,,,,Small,Dresses,,2012,0
2,7443,Robyn,4,2010-01-29 08:00:00+00:00,,,Small,Small,Dresses,,2012,0
3,7443,De,4,2010-02-13 08:00:00+00:00,,,,Small,Dresses,,2012,0
4,7443,tasha,4,2010-02-18 08:00:00+00:00,,,Small,Small,Dresses,,2012,0


# Data Preprocessing

## 1. Feature Selection & Handling Missing Values

In [3]:
# Collaborative filtering would use: user_id, item_id, rating
# Content-based filtering (used below) uses: model_attr, category, brand, year

# Drop the columns that are not used as content features, then drop rows whose
# 'brand' is missing (a row without a brand cannot be one-hot encoded meaningfully).
#
# The result is built by reassignment rather than inplace=True, and
# errors='ignore' skips columns that are already gone, so this cell can be
# re-run without raising KeyError on the second execution.
df = (
    df.drop(
        columns=['user_id', 'timestamp', 'size', 'user_attr', 'fit', 'split'],
        errors='ignore',
    )
    .dropna(subset=['brand'])
)

df

Unnamed: 0,item_id,rating,model_attr,category,brand,year
280,21296,5,Small,Bottoms,ModCloth,2013
281,21296,4,Small,Bottoms,ModCloth,2013
292,21296,5,Small,Bottoms,ModCloth,2013
318,21296,5,Small,Bottoms,ModCloth,2013
323,21296,5,Small,Bottoms,ModCloth,2013
...,...,...,...,...,...,...
99879,135555,5,Small&Large,Outerwear,ModCloth,2016
99880,86073,3,Small&Large,Outerwear,ModCloth,2017
99882,71607,5,Small&Large,Outerwear,Jack by BB Dakota,2016
99884,154353,5,Small,Outerwear,ModCloth,2018


## 2. Group by item_id and calculate avg_rating

In [4]:
# Collapse the per-review rows into one row per item_id:
#   - rating       -> mean rating of the item (built-in 'mean' aggregation,
#                     which avoids calling a Python lambda once per group)
#   - other fields -> 'first' aggregation within the group
# NOTE(review): taking 'first' presumes model_attr/category/brand/year are
# constant for a given item_id — confirm against the raw data.
avg_df = df.groupby('item_id').agg({
    'rating': 'mean',
    'model_attr': 'first',
    'category': 'first',
    'brand': 'first',
    'year': 'first'
}).reset_index()


avg_df

Unnamed: 0,item_id,rating,model_attr,category,brand,year
0,6454,3.818182,Small&Large,Dresses,ModCloth,2017
1,21296,4.171760,Small,Bottoms,ModCloth,2013
2,27439,4.484456,Small&Large,Outerwear,Steve Madden,2018
3,28252,4.465909,Small,Dresses,Retrolicious,2015
4,35525,3.751613,Small,Dresses,Retrolicious,2014
...,...,...,...,...,...,...
512,154748,3.375000,Small,Bottoms,ModCloth,2018
513,154749,3.142857,Small,Bottoms,ModCloth,2018
514,154934,3.700000,Small,Tops,Kin Ship,2017
515,155165,4.666667,Small,Tops,Out of Print,2018


## 3. Encoding Categorical Data

In [5]:
# Show the distinct values of every categorical feature before encoding
categorical_cols = ['model_attr', 'category', 'brand', 'year']
for col_name in categorical_cols:
    print(df[col_name].unique())

# One-hot encode (vectorize) 'model_attr', 'category', 'brand', 'year' into 0/1
# columns, then keep only those encoded feature columns.
df_encoded = (
    pd.get_dummies(avg_df, columns=categorical_cols, prefix=categorical_cols, dtype=int)
    .drop(columns=['rating', 'item_id'])
)

print(df_encoded.shape)
df_encoded

['Small' 'Small&Large']
['Bottoms' 'Dresses' 'Outerwear' 'Tops']
['ModCloth' 'Retrolicious' 'Steve Madden' 'Ryu' 'Chi Chi London'
 'Out of Print' 'Kin Ship' 'Jack by BB Dakota' 'Pink Martini'
 'Miss Candyfloss' 'Emily and Fin' 'Daisey Natives' 'Hell Bunny' 'Banned'
 'Sugarhill Boutique' 'Wrangler' 'Wendy Bird' 'Pepaloves' 'Collectif'
 'Compania Fantastica' 'Closet London' 'Eliza J' 'BB Dakota' "Alice's Pig"
 'Louche' "Effie's Heart" 'Miss Patina' 'Mata Traders' "Rolla's" 'Yumi'
 'Blue Platypus']
[2013 2015 2018 2014 2016 2017 2019]
(517, 44)


Unnamed: 0,model_attr_Small,model_attr_Small&Large,category_Bottoms,category_Dresses,category_Outerwear,category_Tops,brand_Alice's Pig,brand_BB Dakota,brand_Banned,brand_Blue Platypus,...,brand_Wendy Bird,brand_Wrangler,brand_Yumi,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018,year_2019
0,0,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
513,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
514,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
515,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


# Modeling

## Calculate Similarity

In [6]:
# Pairwise cosine similarity between the rows of df_encoded (the one-hot
# feature rows built from avg_df); the result is a square matrix.
cosine_sim = cosine_similarity(df_encoded)

cosine_sim.shape

(517, 517)

In [7]:
# Wrap the similarity matrix in a DataFrame whose rows and columns are both
# labelled by item_id, so similarities can be looked up by item.
cosine_sim_df = pd.DataFrame(
    data=cosine_sim,
    index=avg_df['item_id'],
    columns=avg_df['item_id'],
)
print(cosine_sim_df.shape)
cosine_sim_df.head()

(517, 517)


item_id,6454,21296,27439,28252,35525,40141,40899,48629,54062,54222,...,154546,154555,154667,154668,154693,154748,154749,154934,155165,155597
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6454,1.0,0.25,0.25,0.25,0.25,0.5,0.5,0.25,0.0,0.5,...,0.25,0.5,0.0,0.0,0.25,0.25,0.25,0.25,0.0,0.25
21296,0.25,1.0,0.0,0.25,0.25,0.25,0.5,0.25,0.25,0.5,...,0.25,0.5,0.25,0.25,0.75,0.75,0.75,0.25,0.25,0.25
27439,0.25,0.0,1.0,0.0,0.0,0.0,0.25,0.0,0.75,0.25,...,0.25,0.0,0.25,0.25,0.25,0.25,0.25,0.0,0.25,0.0
28252,0.25,0.25,0.0,1.0,0.75,0.75,0.0,0.5,0.25,0.25,...,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25
35525,0.25,0.25,0.0,0.75,1.0,0.75,0.0,0.5,0.25,0.0,...,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25,0.25


## Content-Based Recommendation

In [11]:
# Get the top-k (default 10) most similar items for target_item_id
def item_recommendations(target_item_id, k=10):
    """Return the k items most similar to ``target_item_id``.

    Similarities are read from the ``target_item_id`` column of the global
    ``cosine_sim_df``; the matching item attributes are read by row position
    from the global ``avg_df`` (``cosine_sim_df`` is built from ``avg_df``, so
    row positions line up).

    Parameters
    ----------
    target_item_id
        Label of the item to find neighbours for; must be a column of
        ``cosine_sim_df``.
    k : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``item_id, rating, brand, category, model_attr, year``, ordered
        from most to least similar, with a fresh 0..n-1 index. The target item
        itself is never included.

    Raises
    ------
    ValueError
        If ``k`` is negative.
    KeyError
        If ``target_item_id`` is not present in the similarity matrix.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    if target_item_id not in cosine_sim_df.columns:
        raise KeyError(f"item_id {target_item_id!r} is not in the similarity matrix")

    # Row positions ranked from most to least similar to the target.
    similarities = cosine_sim_df.loc[:, target_item_id].to_numpy()
    ranked_positions = similarities.argsort()[::-1]

    # Exclude the target by position instead of assuming it is ranked first:
    # items with identical features also score 1.0, and the order among ties
    # is not guaranteed, so slicing [1:k + 1] could return the target itself
    # and drop a genuine perfect match.
    is_target = cosine_sim_df.index == target_item_id
    recom_idx = ranked_positions[~is_target[ranked_positions]][:k]

    recommendations = (
        avg_df.iloc[recom_idx][['item_id', 'rating', 'brand', 'category', 'model_attr', 'year']]
        .reset_index(drop=True)
    )

    return recommendations

## Recommendation Result

In [12]:
# Show the 10 items most similar to item_id = 27439
item_recommendations(27439)

Unnamed: 0,item_id,rating,brand,category,model_attr,year
0,152153,4.432432,Wrangler,Outerwear,Small&Large,2018
1,54062,4.0,Steve Madden,Outerwear,Small,2018
2,86074,3.90566,ModCloth,Outerwear,Small&Large,2018
3,129096,4.25,Steve Madden,Outerwear,Small,2018
4,153540,3.9,Hell Bunny,Outerwear,Small,2018
5,137285,4.175258,Chi Chi London,Dresses,Small&Large,2018
6,129113,4.505618,Steve Madden,Outerwear,Small,2016
7,154064,4.776471,ModCloth,Dresses,Small&Large,2018
8,146244,3.903226,Banned,Tops,Small&Large,2018
9,152856,3.888889,ModCloth,Outerwear,Small,2018
