## Import Library

In [4]:
import tensorflow as tf
import keras.layers
import os
import glob
import sys
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import re
import missingno
from sklearn.model_selection import train_test_split

In [5]:

tf.__version__

'2.12.0'

## Choose df

In [6]:
df = pd.read_json('./dataset/modcloth_final_data.json', lines=True)

## Remove whitespace

In [7]:
df.columns = [i.replace(" ","_") for i in df.columns]

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82790 entries, 0 to 82789
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   item_id         82790 non-null  int64  
 1   waist           2882 non-null   float64
 2   size            82790 non-null  int64  
 3   quality         82722 non-null  float64
 4   cup_size        76535 non-null  object 
 5   hips            56064 non-null  float64
 6   bra_size        76772 non-null  float64
 7   category        82790 non-null  object 
 8   bust            11854 non-null  object 
 9   height          81683 non-null  object 
 10  user_name       82790 non-null  object 
 11  length          82755 non-null  object 
 12  fit             82790 non-null  object 
 13  user_id         82790 non-null  int64  
 14  shoe_size       27915 non-null  float64
 15  shoe_width      18607 non-null  object 
 16  review_summary  76065 non-null  object 
 17  review_text     76065 non-null 

In [9]:
df.drop(['cup_size','bra_size'],axis=1)

Unnamed: 0,item_id,waist,size,quality,hips,category,bust,height,user_name,length,fit,user_id,shoe_size,shoe_width,review_summary,review_text
0,123373,29.0,7,5.0,38.0,new,36,5ft 6in,Emily,just right,small,991571,,,,
1,123373,31.0,13,3.0,30.0,new,,5ft 2in,sydneybraden2001,just right,small,587883,,,,
2,123373,30.0,7,2.0,,new,,5ft 7in,Ugggh,slightly long,small,395665,9.0,,,
3,123373,,21,5.0,,new,,,alexmeyer626,just right,fit,875643,,,,
4,123373,,18,5.0,,new,,5ft 2in,dberrones1,slightly long,small,944840,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82785,807722,,8,4.0,,outerwear,,5ft 8in,Jennifer,just right,fit,727820,8.5,average,Cute jacket!,Cute jacket!
82786,807722,,12,5.0,,outerwear,,5ft 5in,Kelli,slightly long,small,197040,,,It's a beautiful jacket.,It's a beautiful jacket. I love how it's knit ...
82787,807722,,12,5.0,36.0,outerwear,,5ft 4in,elacount,just right,fit,102493,,,I love this blazer. It is,I love this blazer. It is a great office piece...
82788,807722,,12,4.0,,outerwear,,5ft 3in,jennaklinner,just right,fit,756491,,,I love this blazer!! I wo,I love this blazer!! I wore it yesterday and g...


## Check data that can be converted into numeric

In [10]:
df.bust.unique()
df.height.unique()
df.length.unique()
df.shoe_width.unique()

array([nan, 'wide', 'average', 'narrow'], dtype=object)

## Normalize data

In [11]:
def normalize_bust(data):
    try :
        if pd.notnull(data) : ##standard size 37-39
            if "-" in data:
                assert len(data.split("-")) == 2
                return np.mean([int(num) for num in data.split("-")])
            else :
                return int(data)
    except Exception as e :
        return None
    return None

def normalize_height(data): ##Convert into cms
    if pd.notnull(data):
        try:
            return (int(data[0])*30.48) + (int(data[4:-2])*2.54)
        except:
            return (int(data[0])*30.48)
    return None

In [12]:
df["bust"] = df["bust"].apply(lambda x:normalize_bust(x))
df["height"] = df["height"].apply(lambda x: normalize_height(x))

In [13]:
df.describe()

Unnamed: 0,item_id,waist,size,quality,hips,bra_size,bust,height,user_id,shoe_size
count,82790.0,2882.0,82790.0,82722.0,56064.0,76772.0,11854.0,81683.0,82790.0,27915.0
mean,469325.22917,31.319223,12.661602,3.949058,40.358501,35.972125,37.499241,165.471906,498849.564718,8.145818
std,213999.803314,5.302849,8.271952,0.992783,5.827166,3.224907,4.635117,7.245308,286356.969459,1.336109
min,123373.0,20.0,0.0,1.0,30.0,28.0,20.0,91.44,6.0,5.0
25%,314980.0,28.0,8.0,3.0,36.0,34.0,34.0,160.02,252897.75,7.0
50%,454030.0,30.0,12.0,4.0,39.0,36.0,36.0,165.1,497913.5,8.0
75%,658440.0,34.0,15.0,5.0,43.0,38.0,40.0,170.18,744745.25,9.0
max,807722.0,50.0,38.0,5.0,60.0,48.0,59.0,241.3,999972.0,38.0


## Check Outlier

removing outlier the values beyond [<(Q1- 1.5IQR), >Q3+1.5IQR)]

In [14]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
Q1.drop(["item_id","user_id"], inplace =True)
Q3.drop(["item_id","user_id"], inplace = True)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

  Q1 = df.quantile(0.25)
  Q3 = df.quantile(0.75)


In [15]:
# series_align = pd.Series([11, 12, 13, 14, 15,17,18], index=['waist','size','quality','hips','bra_size','bust','height'])
# df, series_align = df.align(series_align,axis=1, copy=False)
df = df[~((df< (lower_bound)) |(df > (upper_bound))).any(axis=1)]
# df = pd.DataFrame(df)
# reference_df = pd.DataFrame(index=df.index, columns=df.columns)
# df, reference_df = df.align(reference_df, axis=1, copy=False)
# df = df[(df >= lower_bound) & (df <= upper_bound)]


  df = df[~((df< (lower_bound)) |(df > (upper_bound))).any(axis=1)]


In [16]:
df.reset_index(drop=True, inplace=True)

## Hadling missing value

In [17]:
missing_data = pd.DataFrame({'num_missing': (df.isnull().sum()/df.shape[0])*100})
missing_data.sort_values("num_missing", ascending=False, inplace = True)
missing_data
     

Unnamed: 0,num_missing
waist,96.371036
bust,85.001403
shoe_width,77.200393
shoe_size,65.21611
hips,31.618019
review_text,8.005894
review_summary,8.005894
cup_size,7.325288
bra_size,6.994106
height,1.211058


After analysis th numb of missing value, the column that can be analyze by review text is shoe size

In [18]:
shoe_review = df[np.logical_and(pd.notnull(df["review_text"]), pd.notnull(df["shoe_size"]))][["user_name","shoe_size","review_text"]]
shoe_review.shape

(22958, 3)

In [19]:
count = np.sum([True if len(re.findall(r"shoes|shoe",x)) else False for x in shoe_review.review_text])

num_true = count/shoe_review.shape[0]
print(num_true)

num_user_shoe = np.sum(shoe_review.groupby('user_name')['shoe_size'].unique().apply(lambda x: len(x))>1)
print(num_user_shoe) ## differing sho_sizes not complete information

num_user_all_df = np.sum(df.groupby('user_name')['shoe_size'].unique().apply(lambda x: len(x))>1)
print(num_user_all_df) ##700 new data 


0.003745970903388797
585
1201


Note : Handling the others column

In [20]:
df.review_summary.fillna("Unknown", inplace=True)
df.review_text.fillna("Unknown", inplace=True)

In [21]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')
df[['height','quality']] = imputer.fit_transform(df[['height','quality']])

In [22]:
df["length"] = df.length.fillna(df['length'].value_counts().index[0])

In [23]:
print(df.hips[12])

36.0


In [24]:
df['hips'].fillna('Unknown', inplace=True)
df['hips'] = pd.to_numeric(df['hips'], errors='coerce')
bins = [-2, 0, 31, 37, 40, 44, 75]
labels = ['Unknown', 'XS', 'S', 'M', 'L', 'XL']
df['hips'] = pd.cut(df['hips'], bins=bins, labels=labels)

In [25]:
df.drop('review_summary', axis=1, inplace = True)

In [26]:
name_id = df.groupby("user_name")['user_id'].apply(lambda x: len(np.unique(x))).reset_index()
id_name = df.groupby("user_id")['user_name'].apply(lambda x: len(np.unique(x))).reset_index()

In [27]:
df.user_name = df.user_name.apply(lambda x: x.lower())

## Making model

In [38]:
data = pd.read_csv('./dataset/processed_data.csv')
data = data.drop("Unnamed: 0",axis =1)
data  = data.drop("review_text",axis=1)
train, val = train_test_split(data, test_size=0.2)
user_categorical_features = ["user_name","hips","cup_size"]
user_numerical_features = ["height","bra_size"]
item_categorical_features = ["item_id", "category", "length"]
item_numerical_features = ["size","quality"]

preprocessing data

In [39]:
from sklearn import preprocessing
data_num = preprocessing.MinMaxScaler().fit(train[["height","bra_size","size","quality"]])

In [40]:
train[["height","bra_size","size","quality"]] = data_num.transform(train[["height","bra_size","size","quality"]])
val[["height","bra_size","size","quality"]] = data_num.transform(val[["height","bra_size","size","quality"]])

In [41]:
for col in user_categorical_features + item_categorical_features:
  df[col] = df[col].astype(str) 

In [42]:

numeric_users = {
    data_col : tf.feature_column.numeric_column(data_col) \
          for data_col in user_numerical_features
}
numeric_items = {
    data_col : tf.feature_column.numeric_column(data_col) \
          for data_col in item_numerical_features
}


hips = tf.feature_column.categorical_column_with_vocabulary_list(
      'hips', df.hips.unique().tolist())
cup_size = tf.feature_column.categorical_column_with_vocabulary_list(
      'cup_size', df.cup_size.unique().tolist())
user_name = tf.feature_column.categorical_column_with_vocabulary_list(
      'user_name', df.user_name.unique().tolist())



item_id = tf.feature_column.categorical_column_with_vocabulary_list(
      'item_id', df.item_id.unique().tolist())
category = tf.feature_column.categorical_column_with_vocabulary_list(
      'category',  df.category.unique().tolist())
length = tf.feature_column.categorical_column_with_vocabulary_list(
      'length', df.length.unique().tolist())

hips_embedding = tf.feature_column.embedding_column(hips, dimension=5)
cup_size_embedding = tf.feature_column.embedding_column(cup_size, dimension=5)
user_name_embedding = tf.feature_column.embedding_column(user_name, dimension=50)
item_id_embedding = tf.feature_column.embedding_column(item_id, dimension=50)
category_embedding = tf.feature_column.embedding_column(category, dimension=5)
length_embedding = tf.feature_column.embedding_column(length, dimension=5)

cat_users = {
    'hips' : hips_embedding,
    'cup_size' : cup_size_embedding,
    'user_name': user_name_embedding
}

cat_items = {
    'item_id' : item_id_embedding,
    'category' : category_embedding,
    'length': length_embedding
}

# input_user = {
#     colname : tf.feature_column(
#         tf.feature_column.numeric_column(colname, shape=(), dtype='float32')
#     )
#     for colname in numeric_users.keys()
# }
# input_user.update({
#     colname : tf.feature_column(
#         tf.feature_column.categorical_column_with_vocabulary_list(colname, shape=(),  dtype='string')
#     )
#           for colname in cat_users.keys()
# })

# input_items = {
#     colname : tf.feature_column(
#         tf.feature_column.numeric_column(colname, shape=(), dtype = 'float32')
#     )
#           for colname in numeric_items.keys()
# }

# input_items.update({
#     colname : tf.feature_column(
#         tf.feature_column.categorical_column_with_vocabulary_list(colname, shape=(),  dtype='string')
#     )
#           for colname in cat_items.keys()
# })
# input_user = {
#     colname : tf.keras.layers.Input(name=colname, shape=(), dtype='float32') \
#           for colname in numeric_users.keys()
# }
# input_user.update({
#     colname_update : tf.keras.layers.Input(name=colname_update, shape=(),  dtype='string') \
#           for colname_update in cat_users.keys()
# })

# input_items = {
#     colname_items : tf.keras.layers.Input(name=colname_items, shape=(), dtype = 'float32') \
#           for colname_items in numeric_items.keys()
# }

# input_items.update({
#     colname_items : tf.keras.layers.Input(name=colname_items, shape=(),  dtype='string') \
#           for colname_items in cat_items.keys()
# })

input_user = {
    colname : tf.keras.layers.Input(name=colname, shape=(), dtype='float32') \
          for colname in numeric_users.keys()
}
input_user.update({
    colname : tf.keras.layers.Input(name=colname, shape=(),  dtype='string') \
          for colname in cat_users.keys()
})

input_items = {
    colname : tf.keras.layers.Input(name=colname, shape=(), dtype = 'float32') \
          for colname in numeric_items.keys()
}

input_items.update({
    colname : tf.keras.layers.Input(name=colname, shape=(),  dtype='string') \
          for colname in cat_items.keys()
})


In [43]:
# Create a feature layer
feature_layer_users = tf.keras.layers.DenseFeatures(numeric_users.values())(input_user)
feature_layer_items = tf.keras.layers.DenseFeatures(numeric_items.values())(input_items)

In [44]:
LABELS =  np.array(["fit","small","large"])

In [48]:
import copy
from tensorflow import keras
def convert_df(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('fit')
  labels = labels.apply(lambda x:x == LABELS)
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  #prefetching was giving some trouble on google colab,
  #there might be some issue with some gdfs, hence not here
  return ds

In [49]:
class SkipCon(keras.layers.Layer):
  def __init__(self, size, reduce = True, deep = 3, skip_when=0, activation="relu", **kwargs):
    """
    @Params
    size = size of dense layer
    deep = the depth of network in one SkipCon block call
    skip_when =  if a skip connection is required, pass 1
    activation = by default using relu, in the paper authors have used tanh(no reasons again)
    """    
    super().__init__(**kwargs)
    self.activation = keras.activations.get(activation) # used to combine
    # skip connections and cascaded dense layers
    self.main_layers =[]
    self.skip_when = skip_when #to be used in call as a control
    if reduce:
      for _ in range(deep):
        self.main_layers.extend([
          keras.layers.Dense(size, activation=activation, 
                              use_bias=True),
          keras.layers.BatchNormalization()])

        # Reduce the input size by two each time, if the
        # network is to be designed deeper and narrow
        size = size/2
    else:
      for _ in range(deep):
        self.main_layers.extend([
        keras.layers.Dense(size, activation=activation, 
                            use_bias=True),
        keras.layers.BatchNormalization()])
        
    self.skip_layers = []
    if skip_when > 0:
      if reduce:
        size = size*2 # since the size of skipped connection  
                      # should match with cascaded dense
      self.skip_layers = [
          keras.layers.Dense(size, activation=activation, 
                          use_bias=True),
          keras.layers.BatchNormalization()]

  def call(self, inputs):
    Z = inputs
    for layer in self.main_layers:
      Z = layer(Z)
    if not self.skip_when:
      return self.activation(Z)
    skip_Z = inputs
    for layer in self.skip_layers:
      skip_Z = layer(skip_Z)
    return self.activation(Z + skip_Z)

In [50]:
batch_size = 512 
# train.drop(['bra_size','cup_size'], axis=1)
# val.drop(['bra_size','cup_size'], axis=1)
train_ds = convert_df(train, batch_size=batch_size)
val_ds = convert_df(val, shuffle=False, batch_size=batch_size)

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type numpy.ndarray).