# RTR Data Cleaning & Feature Engineering Notebook

In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
# Load the raw reviews; the archive holds one JSON record per line.
rtr_data = pd.read_json(
    'rent_the_runway_data.zip',
    lines=True,
)

In [3]:
# Peek at the first five rows of the raw frame.
rtr_data.head(n=5)

Unnamed: 0,age,body type,bust size,category,fit,height,item_id,rating,rented for,review_date,review_summary,review_text,size,user_id,weight
0,28.0,hourglass,34d,romper,fit,"5' 8""",2260466,10.0,vacation,"April 20, 2016",So many compliments!,An adorable romper! Belt and zipper were a lit...,14,420272,137lbs
1,36.0,straight & narrow,34b,gown,fit,"5' 6""",153475,10.0,other,"June 18, 2013",I felt so glamourous!!!,I rented this dress for a photo shoot. The the...,12,273551,132lbs
2,116.0,,,sheath,fit,"5' 4""",1063761,10.0,party,"December 14, 2015",It was a great time to celebrate the (almost) ...,This hugged in all the right places! It was a ...,4,360448,
3,34.0,pear,34c,dress,fit,"5' 5""",126335,8.0,formal affair,"February 12, 2014",Dress arrived on time and in perfect condition.,I rented this for my company's black tie award...,8,909926,135lbs
4,27.0,athletic,34b,gown,fit,"5' 9""",616682,10.0,wedding,"September 26, 2016",Was in love with this dress !!!,I have always been petite in my upper body and...,12,151944,145lbs


In [4]:
# Summarise each column's dtype and non-null count to see what needs cleaning.
rtr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192544 entries, 0 to 192543
Data columns (total 15 columns):
age               191584 non-null float64
body type         177907 non-null object
bust size         174133 non-null object
category          192544 non-null object
fit               192544 non-null object
height            191867 non-null object
item_id           192544 non-null int64
rating            192462 non-null float64
rented for        192534 non-null object
review_date       192544 non-null object
review_summary    192544 non-null object
review_text       192544 non-null object
size              192544 non-null int64
user_id           192544 non-null int64
weight            162562 non-null object
dtypes: float64(2), int64(3), object(10)
memory usage: 22.0+ MB


### Next Steps for Rent The Runway:
1. change column name spaces to underscores
2. convert ```review_date``` to datetime
3. determine if ```rented_for``` is categorical
4. determine if ```body_type``` is categorical
5. determine if ```fit``` is categorical
6. convert ```weight``` to pounds as an int64
7. find size chart data on RTR and add that to each user
8. Change ```bust_size``` to ```band_size``` and ```cup_size``` and make ```band_size``` an int
9. look for ```item_id``` on the RTR site (_result: it is not on the site_)
10. convert ```height``` to inches as an int

In [5]:
# Even dress sizes 0 through 24, followed by the open-ended '24+' label.
rtr_dress_sizes = [*range(0, 25, 2), '24+']

In [6]:
# Body-type labels, spelled exactly as they appear in the `body_type` column
# ('hourglass' and 'pear'), so membership checks against the data succeed.
rtr_body_types = [
    'apple',
    'athletic',
    'full bust',
    'hourglass',
    'pear',
    'petite',
    'straight & narrow',
]

In [7]:
# Jean sizes 24 through 40 inclusive.
rtr_jean_sizes = [*range(24, 41)]

In [8]:
# Even band sizes from 28 to 48.
bust_sizes = [*range(28, 50, 2)]
# Cup labels in the order they are listed here.
cup_sizes = [
    'aa', 'a', 'b', 'c', 'd', 'dd',
    'ddd/e', 'f', 'g', 'h', 'i', 'j',
]
# Every band/cup combination as a single string such as '34d'.
rtr_bust_sizes = [
    f'{band}{letter}'
    for band in bust_sizes
    for letter in cup_sizes
]

In [9]:
# Swap the spaces in column labels for underscores so columns work as attributes.
rtr_data = rtr_data.rename(
    columns={
        'body type': 'body_type',
        'bust size': 'bust_size',
        'rented for': 'rented_for',
    }
)

In [10]:
# Show the column labels to confirm the rename took effect.
rtr_data.columns

Index(['age', 'body_type', 'bust_size', 'category', 'fit', 'height', 'item_id',
       'rating', 'rented_for', 'review_date', 'review_summary', 'review_text',
       'size', 'user_id', 'weight'],
      dtype='object')

In [11]:
# Keep the original text date and add a parsed datetime column beside it.
rtr_data['review_datetime'] = pd.to_datetime(rtr_data['review_date'])

In [12]:
# Distinct rental occasions, to judge whether the column is categorical.
rtr_data['rented_for'].unique()

array(['vacation', 'other', 'party', 'formal affair', 'wedding', 'date',
       'everyday', 'work', nan, 'party: cocktail'], dtype=object)

In [13]:
# Distinct body-type labels, to judge whether the column is categorical.
rtr_data['body_type'].unique()

array(['hourglass', 'straight & narrow', nan, 'pear', 'athletic',
       'full bust', 'petite', 'apple'], dtype=object)

In [14]:
# Distinct fit labels, to judge whether the column is categorical.
rtr_data['fit'].unique()

array(['fit', 'small', 'large'], dtype=object)

In [15]:
def convert_lbs_to_int(pounds):
    """Convert a weight such as ``'137lbs'`` to an integer number of pounds.

    Parameters
    ----------
    pounds : str, number, None or NaN
        A string ending in ``'lbs'``, an already-numeric weight, or a missing value.

    Returns
    -------
    int or float
        The weight as an ``int``; ``np.nan`` when the value is missing or is a
        string that does not end in ``'lbs'``.

    Raises
    ------
    ValueError
        If the text before ``'lbs'`` is not an integer.
    """
    if isinstance(pounds, str):
        text = pounds.strip()
        if text.endswith('lbs'):
            return int(text[:-3])
        # A string without the expected unit suffix is treated as missing.
        return np.nan
    # pd.isna also covers None, which np.isnan rejects with a TypeError.
    if pd.isna(pounds):
        return np.nan
    # Already numeric (for example when the cell is re-run on a converted column):
    # keep the value instead of discarding it.
    return int(pounds)

In [16]:
# Replace the text weights with numeric pounds.
rtr_data['weight'] = rtr_data['weight'].apply(convert_lbs_to_int)

In [17]:
def separate_band_and_cup(bust_size):
    """Insert a comma between the band and cup parts of a bust size.

    The first two characters are taken as the band and the remainder as the cup,
    so ``'34d'`` becomes ``'34,d'``.

    Parameters
    ----------
    bust_size : str, None or NaN
        A size such as ``'34d'``, or a missing value.

    Returns
    -------
    str, tuple or None
        ``'band,cup'`` for a string; ``(np.nan, np.nan)`` for a missing value;
        ``None`` for any other input.
    """
    if isinstance(bust_size, str):
        # NOTE(review): assumes a two-character band — confirm no three-digit bands occur.
        return bust_size[:2] + ',' + bust_size[2:]
    # pd.isna also covers None, which np.isnan rejects with a TypeError.
    if pd.isna(bust_size):
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed.
        return np.nan, np.nan
    return None

In [18]:
# Spot-check the helper on the first row only; there is no need to run it
# over the whole column just to look at one value.
separate_band_and_cup(rtr_data.bust_size[0])

'34,d'

In [19]:
# Show the current column labels before adding the band/cup columns.
rtr_data.columns

Index(['age', 'body_type', 'bust_size', 'category', 'fit', 'height', 'item_id',
       'rating', 'rented_for', 'review_date', 'review_summary', 'review_text',
       'size', 'user_id', 'weight', 'review_datetime'],
      dtype='object')

In [21]:
# Split each "band,cup" string into two columns. n=1 stops at the first comma,
# so the result never has more than the two columns that are labelled below.
bust_cup = (
    rtr_data.bust_size
    .apply(separate_band_and_cup)
    .str.split(',', n=1, expand=True)
)
bust_cup.columns = ['band_size', 'cup_size']

In [22]:
# Attach the band and cup columns. assign() overwrites columns of the same name,
# so re-running this cell does not append duplicate band_size/cup_size columns.
rtr_data = rtr_data.assign(
    band_size=bust_cup['band_size'],
    cup_size=bust_cup['cup_size'],
)

In [23]:
def convert_to_int(band):
    """Convert a band size to an integer, passing missing values through as NaN.

    Parameters
    ----------
    band : str, number, None or NaN
        A band size such as ``'34'``, an already-numeric band, or a missing value.

    Returns
    -------
    int or float
        The band as an ``int``, or ``np.nan`` when the value is missing.

    Raises
    ------
    ValueError
        If a string value is not an integer.
    """
    if isinstance(band, str):
        return int(band)
    # An equality test against NaN is always False, so use pd.isna,
    # which also recognises None.
    if pd.isna(band):
        return np.nan
    # Already numeric (for example when the cell is re-run on a converted column):
    # keep the value instead of discarding it.
    return int(band)

In [24]:
# Turn the text band sizes into numbers.
rtr_data['band_size'] = rtr_data['band_size'].apply(convert_to_int)

In [26]:
# An optional feet part followed by an optional inches part, e.g. 5' 8", 5ft 8in or 5'.
_HEIGHT_PATTERN = re.compile(
    r"""^\s*
        (?:(?P<feet>\d+)\s*(?:'|ft\.?|feet|foot))?
        \s*
        (?:(?P<inches>\d+)\s*(?:"|in\.?|inch(?:es)?))?
        \s*$""",
    re.VERBOSE | re.IGNORECASE,
)


def convert_feetinches_to_inches(distance):
    """Convert a height such as ``5' 8"`` to a total number of inches.

    Parameters
    ----------
    distance : str, number, None or NaN
        A feet/inches string, an already-numeric height, or a missing value.

    Returns
    -------
    int or float
        Total inches as an ``int`` for a parseable string; ``np.nan`` for a
        missing value or a string that is not a feet/inches height; numeric
        input is returned unchanged.
    """
    if isinstance(distance, str):
        match = _HEIGHT_PATTERN.match(distance)
        # Both parts are optional in the pattern, so also reject a match that
        # captured neither feet nor inches (for example an empty string).
        if match is None or (match['feet'] is None and match['inches'] is None):
            return np.nan
        feet = int(match['feet'] or 0)
        inches = int(match['inches'] or 0)
        return 12 * feet + inches
    # pd.isna also covers None, which np.isnan rejects with a TypeError.
    if pd.isna(distance):
        return np.nan
    # NOTE(review): numeric input is assumed to be in inches already (for example when
    # the cell is re-run on a converted column), so it is passed through — confirm.
    return distance

In [27]:
# Replace the feet/inches text with a numeric height in inches.
rtr_data['height'] = rtr_data['height'].apply(convert_feetinches_to_inches)

In [28]:
# Re-check dtypes and non-null counts after the conversions above.
rtr_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 192544 entries, 0 to 192543
Data columns (total 18 columns):
age                191584 non-null float64
body_type          177907 non-null object
bust_size          174133 non-null object
category           192544 non-null object
fit                192544 non-null object
height             191867 non-null float64
item_id            192544 non-null int64
rating             192462 non-null float64
rented_for         192534 non-null object
review_date        192544 non-null object
review_summary     192544 non-null object
review_text        192544 non-null object
size               192544 non-null int64
user_id            192544 non-null int64
weight             162562 non-null float64
review_datetime    192544 non-null datetime64[ns]
band_size          174133 non-null float64
cup_size           174133 non-null object
dtypes: datetime64[ns](1), float64(5), int64(3), object(9)
memory usage: 26.4+ MB


### Clean RTR:
1. Check for duplicate reviews
2. Check for garbage reviews, e.g. a review with no user information or text but including a rating

In [77]:
# Collect every row that is an exact duplicate of another row, keeping all copies
# so the repeated reviews can be inspected.
# NOTE(review): this cell used to display `df`, which is never defined in the
# notebook; the frame below is assumed to be what it referred to — confirm.
rtr_duplicates = rtr_data[rtr_data.duplicated(keep=False)]
rtr_duplicates.head(4)

Unnamed: 0,age,body_type,bust_size,category,fit,height,item_id,rating,rented_for,review_date,review_summary,review_text,size,user_id,weight,review_datetime,band_size,cup_size
483,34.0,pear,34c,sheath,fit,64.0,1384766,10.0,party,"September 20, 2016",I rented this dress for a black & white party....,This dress runs very tight in the waist. Also...,12,61928,135.0,2016-09-20,34.0,c
639,34.0,pear,34c,sheath,fit,64.0,1384766,10.0,party,"September 20, 2016",I rented this dress for a black & white party....,This dress runs very tight in the waist. Also...,12,61928,135.0,2016-09-20,34.0,c
705,42.0,pear,36d,dress,fit,66.0,1522253,8.0,other,"April 9, 2015",tons of compliments. Very nice dress,You can dress this up or down. Great for vaca...,20,952829,165.0,2015-04-09,36.0,d
1146,53.0,hourglass,36d,dress,fit,62.0,1707988,10.0,other,"August 9, 2017",Felt like a Runway Model!,The colors of this dress are absolutely beauti...,16,188164,132.0,2017-08-09,36.0,d


In [84]:
# Count the rows that share user, item and review text with at least one other row
# (every member of a duplicate group is counted).
rtr_data.duplicated(
    subset=['user_id', 'item_id', 'review_text'],
    keep=False,
).sum()

438

In [None]:
# Summary statistics for reviewers whose recorded age is above 80.
rtr_data.loc[rtr_data['age'] > 80].describe()

In [None]:
# rtr_data.to_csv('rtr_clean.csv')