**Install library**

In [51]:
pip install surprise



**Import libraries and load dataset and split into train and test data**

In [0]:
from surprise import Dataset
from surprise import Reader
import pandas as pd
import numpy as np
import os, io
from surprise import SVD
from surprise import accuracy
import csv

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
data = pd.read_json('renttherunway_final_data.json', lines=True)
train, test = train_test_split(data, test_size=0.2)

**Preprocessing functions defined**

In [0]:
def preprocessAge(data):
  bins = pd.IntervalIndex.from_tuples([(5, 15), (15, 30), (30, 50),(50,65),(65,100)])
  x = pd.cut(data['age'].to_list(), bins)
  x.categories = ['1','2','3','4','5']
  data['age_bins']  = x
  age_Preprocessed_data = data[~data['age_bins'].isnull()]
  print ('{0} rows got dropped after age preprocessing'.format(str(data.shape[0] - age_Preprocessed_data.shape[0])))
  return age_Preprocessed_data

In [0]:
def preprocessBustsize(age_Preprocessed_data):
  data_bust_size_not_null = age_Preprocessed_data[~age_Preprocessed_data['bust size'].isnull()]
  data_bust_size_not_null['bust size'] = data_bust_size_not_null['bust size'].apply(lambda x : (int(x[:2]), x[2:]))
  data_bust_size_not_null.shape
  data_bust_size_not_null[['bust Size1', 'bust']] = pd.DataFrame(data_bust_size_not_null['bust size'].tolist(), index=data_bust_size_not_null.index)
  bustDict = {'a': 1, 'aa': 1, 'b': 2, 'c': 3, 'd': 4, 
            'd+': 5, 'dd': 5, 'ddd/e': 6,
           'f':7 , 'g': 8, 'h': 9, 'i': 10, 'j': 11}
  data_bust_size_not_null['bust1'] = data_bust_size_not_null['bust'].map(bustDict)
# Calculatign model of bust size based on age grp.
  def funct(df):
    return df.mode()
  x = data_bust_size_not_null[['age_bins','bust Size1','bust1']].groupby('age_bins').apply(funct)
  x.reset_index(inplace= True, drop = True)
  data_bust_size_null = age_Preprocessed_data[age_Preprocessed_data['bust size'].isnull()]
  data_bust_size_null = pd.merge(data_bust_size_null, x, how ='left', left_on = 'age_bins', right_on = 'age_bins')
  data_bust_size_cleansed = data_bust_size_not_null.append(data_bust_size_null)
  return data_bust_size_cleansed

In [0]:
def preprocessHeight(data_bust_size_cleansed):
  data_bust_size_cleansed['heightCM'] = data_bust_size_cleansed['height'].apply(lambda x : (int(x.split("\'")[0]) * 30.48) + (int(x.split("\'")[1][:-1]) * 2.54) 
                                                        if type(x) == str else x)
  data_bust_size_cleansed['heightCM'].fillna((data_bust_size_cleansed['heightCM'].mean()), inplace=True)
  return data_bust_size_cleansed

In [0]:
def preprocessWeight(data_bust_size_cleansed):
  data_bust_size_cleansed['weightLbs'] = data_bust_size_cleansed['weight'].apply(lambda x : int(x[:-3])                                                        if type(x) == str else x)
  data_bust_size_cleansed['weightLbs'].fillna((data_bust_size_cleansed['weightLbs'].mean()), inplace=True)
  return data_bust_size_cleansed

In [0]:
def preprocessRentedFor(data_bust_size_cleansed):
  data_bust_size_cleansed['rented for'].fillna('other', inplace=True)
  return data_bust_size_cleansed

In [0]:
def preprocessBodytype(data_bust_size_cleansed):
  # Calculating model of body type based on age grp.
  data_body_type_not_null = data_bust_size_cleansed[~data_bust_size_cleansed['body type'].isnull()]
  def func(df):
    return df.mode()
  x = data_body_type_not_null[['bust Size1','body type']].groupby('bust Size1').apply(func)
  x.reset_index(inplace= True, drop = True)
  # print(x)
  data_body_type_null = data_bust_size_cleansed[data_bust_size_cleansed['body type'].isnull()]
  data_body_type_null = pd.merge(data_body_type_null, x, how ='left', left_on = 'bust Size1', right_on = 'bust Size1')
  # print(data_body_type_null)
  data_body_type_cleansed = data_body_type_not_null.append(data_body_type_null,sort=True)
  # print(data_body_type_cleansed)
  data_body_type_cleansed['body type'].fillna(data_body_type_cleansed['body type_y'], inplace=True)
  del data_body_type_cleansed['body type_x']
  del data_body_type_cleansed['body type_y']
  return data_body_type_cleansed

In [0]:
def createFinalDataframe(cleansedData):
  final_df=cleansedData.copy()
  final_df=final_df.drop(['rented for','rating','review_date','category','age_bins','bust','bust size','fit','height','item_id','review_summary','user_id','weight','review_text'],axis=1)
  cleanup_nums = {"body type":     {"hourglass": 1, "straight & narrow": 2, "pear": 3,"athletic": 4, "full bust": 5,"petite": 6, "apple": 7}}
  final_df.replace(cleanup_nums, inplace=True)
  return final_df

In [0]:
def createFinalTestDataframe(cleansedData):
  final_df=cleansedData.copy()
  final_df=final_df.drop(['rating','review_date','category','age_bins','bust','bust size','fit','height','item_id','review_summary','weight','review_text'],axis=1)
  cleanup_nums = {"body type":     {"hourglass": 1, "straight & narrow": 2, "pear": 3,"athletic": 4, "full bust": 5,"petite": 6, "apple": 7}}
  final_df.replace(cleanup_nums, inplace=True)
  return final_df

**Call functions for train data**

In [62]:
age_Preprocessed_data=preprocessAge(train)
data_bust_size_cleansed=preprocessBustsize(age_Preprocessed_data)
data_bust_size_cleansed=preprocessHeight(data_bust_size_cleansed)
data_bust_size_cleansed=preprocessWeight(data_bust_size_cleansed)
data_bust_size_cleansed=preprocessRentedFor(data_bust_size_cleansed)
cleansedData=preprocessBodytype(data_bust_size_cleansed)
final_df=createFinalDataframe(cleansedData)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


895 rows got dropped after age preprocessing


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
of pandas will change to not sort by default.

To accept the future behavior, pa

**Classifier to predict body type**

In [0]:
feature_names = ['age', 'bust Size1','bust1','size','heightCM','weightLbs']
X = final_df[feature_names]
y = final_df['body type']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [64]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'
     .format(logreg.score(X_train, y_train)))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'
     .format(logreg.score(X_test, y_test)))



Accuracy of Logistic regression classifier on training set: 0.44
Accuracy of Logistic regression classifier on test set: 0.45


In [65]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of K-NN classifier on training set: 0.64
Accuracy of K-NN classifier on test set: 0.50


In [66]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, 
                               bootstrap = True,
                               max_features = 'sqrt')
model.fit(X_train, y_train)
print('Accuracy of Random forest classifier on training set: {:.2f}'
     .format(model.score(X_train, y_train)))
print('Accuracy of Random forest classifier on test set: {:.2f}'
     .format(model.score(X_test, y_test)))

Accuracy of Random forest classifier on training set: 0.89
Accuracy of Random forest classifier on test set: 0.58


**Call functions on test data**

In [67]:
age_Preprocessed_testdata=preprocessAge(test)
testdata_bust_size_cleansed=preprocessBustsize(age_Preprocessed_testdata)
testdata_bust_size_cleansed=preprocessHeight(testdata_bust_size_cleansed)
testdata_bust_size_cleansed=preprocessWeight(testdata_bust_size_cleansed)
testdata_bust_size_cleansed=preprocessRentedFor(testdata_bust_size_cleansed)
cleansedTestData=preprocessBodytype(testdata_bust_size_cleansed)
final_test_df=createFinalTestDataframe(cleansedTestData)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_index

191 rows got dropped after age preprocessing


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [0]:
feature_names = ['age', 'bust Size1','bust1','size','heightCM','weightLbs','rented for','user_id']
testdata = final_test_df[feature_names]

**For each test data instance, predict body type using classifier, then cluster based on body type and rented for, then predict rating for each item id, then recommend highest rating item id**

In [69]:
for i in range(5):#len(testdata)
    age = testdata.iloc[i,0]
    bustSize1 = testdata.iloc[i,1]
    bust1 = testdata.iloc[i,2]
    size = testdata.iloc[i,3]
    heightCM = testdata.iloc[i,4]
    weightLbs = testdata.iloc[i,5]
    xnew=[[age,bustSize1,bust1,size,heightCM,weightLbs]]
    bodytypenumber=model.predict(xnew)
    #print("Predicted body type:",bodytypenumber)
    bodytypearray=["NaN","hourglass", "straight & narrow", "pear","athletic", "full bust","petite", "apple"]
    bodytype=np.asanyarray(bodytypearray)[bodytypenumber]
    print("Predicted body type:",bodytype)
    cluster_df=cleansedData.copy()
    groupedbybodytype = cluster_df.groupby('body type')
    for name,group in groupedbybodytype:
      if(bodytype==name):
        #print(name)
        #print(group)
        grouped_df=group.copy()
    rentedFor=testdata.iloc[i,6]
    groupedbybodytype = grouped_df.groupby('rented for')
    for name,group in groupedbybodytype:
      if(rentedFor==name):
        #print(name)
        #print(group)
        cf_df=group.copy()
    cf_df=cf_df[['user_id','item_id','rating']].copy()
    reader = Reader(rating_scale=(1,10))
    traindata=Dataset.load_from_df(cf_df,reader)
    trainingSet = traindata.build_full_trainset()
    svdAlgo = SVD(n_factors=200,n_epochs=50)
    svdAlgo.fit(trainingSet)
    itemid=cf_df['item_id'].unique()
    ratingOutput = pd.DataFrame(columns=['item_id', 'rating'])
    for i in range(len(itemid)):
      user=cf_df.iloc[i,0]
      item=cf_df.iloc[i,1]
      output=svdAlgo.predict(user,item,r_ui=None,clip=True,verbose=False)
      ratingOutput = ratingOutput.append({'item_id': item, 'rating': output.est}, ignore_index=True)
    ratingOutput=ratingOutput.sort_values('rating',ascending=False)
    outputdf=ratingOutput.head(10).copy()
    #print("Top 10 recommended items with their predicted ratings:")
    #outputdf.head()
    rawdf = pd.read_json ('renttherunway_final_data.json',lines=True)
    itemID=outputdf.iloc[1,0].astype('int64')
    print("Recommended itemID:",itemID)
    filter = rawdf['item_id']==itemID
    rawdf.where(filter, inplace = True)
    print("Recommended category:",rawdf.head(1)['category'].astype('str'))

Predicted body type: ['straight & narrow']
Recommended itemID: 304354
Recommended category: 0    nan
Name: category, dtype: object
Predicted body type: ['petite']
Recommended itemID: 1133906
Recommended category: 0    nan
Name: category, dtype: object
Predicted body type: ['petite']
Recommended itemID: 123793
Recommended category: 0    nan
Name: category, dtype: object
Predicted body type: ['athletic']
Recommended itemID: 889239
Recommended category: 0    nan
Name: category, dtype: object
Predicted body type: ['athletic']
Recommended itemID: 268562
Recommended category: 0    nan
Name: category, dtype: object
