# Feature Engineering

In [1]:
import ast
import json

import pandas as pd
import numpy as np

In [2]:
# Load the raw training set. The file is read inside a context manager so the
# handle is closed as soon as parsing finishes (the previous version left it
# open for the lifetime of the kernel).
with open('dataset/raw/train.json') as train:
    # json.load returns the parsed document; the code below reads its
    # 'data' and 'columns' entries.
    data_train_aux = json.load(train)

# Build the frame from the records under 'data', then label the columns with
# the names stored under 'columns'.
data_train = pd.json_normalize(data_train_aux, record_path='data')
data_train.columns = data_train_aux['columns']
data_train.head()

Unnamed: 0,_id,average_rating,number_of_reviews,brand,category,crawled_at,description,images,out_of_stock,avg_delivery_time_days,pid,product_details,seller,sub_category,fabrication_time,title,actual_price
0,53df9662-e500-569c-946e-0c8d215a72cd,3.2,26,East I,Clothing and Accessories,2021-02-10 21:17:28,Navy Blue Printed Boxers Has An Inner Elasti...,['https://rukminim1.flixcart.com/image/128/128...,False,8,BXRFTZF7JGX75DAW,"[{'Color': 'Dark Blue'}, {'Fabric': 'Pure Cott...",ZIYAA,Innerwear and Swimwear,653,Printed Men Boxer (Pack of 1),849.0
1,d0142842-84f7-537d-a06f-d85b76488a5f,4.0,33,dream o,Clothing and Accessories,2021-02-11 01:02:46,smiley printed tshirt on round neck cotton tshirt,['https://rukminim1.flixcart.com/image/128/128...,False,12,TSHFWQM96UHR6A4Q,"[{'Type': 'Round Neck'}, {'Sleeve': 'Short Sle...",Dream Onn Creations,Topwear,668,Printed Men Round Neck Orange T-Shirt,699.0
2,79c8f0d7-30b1-5dd4-9f2f-2fe97782b027,3.9,32,Free Authori,Clothing and Accessories,2021-02-11 00:43:37,Free Authority Presents this Crew Neck Yellow ...,['https://rukminim1.flixcart.com/image/128/128...,False,11,SWSFWCXH2WF6ZYRB,"[{'Color': 'Yellow'}, {'Fabric': 'Polycotton'}...",BioworldMerchandising,Winter Wear,53,Full Sleeve Graphic Print Men Sweatshirt,1499.0
3,0531c28c-7c50-5fbd-9ce3-a7cae3243ad5,3.8,31,HUMBE,Clothing and Accessories,2021-02-10 21:22:10,Cotton Blend FabricCollar / Polo Neck White & ...,['https://rukminim1.flixcart.com/image/128/128...,False,11,TSHFHQH3HKDAGGK9,"[{'Type': 'Polo Neck'}, {'Sleeve': 'Short Slee...",HUMBERT,Topwear,510,"Solid Men Polo Neck Light Blue, White T-Shirt ...",1699.0
4,d604baad-472e-5c18-86a3-7b46d4a890c2,2.4,20,Rose We,Clothing and Accessories,2021-02-10 23:36:36,undefined,['https://rukminim1.flixcart.com/image/128/128...,False,5,TSHFW9CJZSYUU6UX,"[{'Type': 'Round Neck'}, {'Sleeve': 'Short Sle...",Rupalcollectionjaipur,Topwear,496,Printed Men Round Neck White T-Shirt,599.0


## Helper Functions

In [3]:
def str2dict(x):
    """Parse the string form of ``x`` as a Python literal.

    Args:
        x: Any object; it is converted with ``str`` before parsing.

    Returns:
        The value produced by ``ast.literal_eval``. If anything goes wrong
        while converting or parsing, the error is printed and an empty list
        is returned instead (best-effort parsing).
    """
    try:
        parsed = ast.literal_eval(str(x))
    except Exception as err:
        # Best effort: report the problem and fall back to "no details".
        print(err)
        return []
    else:
        return parsed

def junta_dict(dict_list):
    """Merge an iterable of dicts into a single dict.

    Args:
        dict_list: Iterable of mappings exposing ``.items()``.

    Returns:
        A new dict holding every key/value pair. When a key appears in more
        than one input dict, the value from the last one wins.
    """
    return {
        key: value
        for single in dict_list
        for key, value in single.items()
    }

In [4]:
def categorize_sleeve(fabric):
    """Map a raw sleeve description onto one of a few fixed labels.

    Args:
        fabric: Raw value to classify. Despite the parameter name, the checks
            below look for sleeve descriptions.

    Returns:
        The first of 'Full Sleeve', 'Short Sleeve', 'Half Sleeve' or
        'Sleeveless' contained in ``fabric`` (case-sensitive), 'Outra' when
        none is contained, or the string 'nan' when ``fabric`` does not
        support the ``in`` operator (e.g. a float NaN).
    """
    # Order matters: the first label found is returned.
    sleeve_labels = ('Full Sleeve', 'Short Sleeve', 'Half Sleeve', 'Sleeveless')
    try:
        for label in sleeve_labels:
            if label in fabric:
                return label
    except TypeError:
        return 'nan'
    return 'Outra'

In [5]:
def categorizar_cortes(cut):
    """Map a raw fit/cut description onto one of a few fixed labels.

    The match is a case-insensitive substring search; the first rule that
    matches wins, in the order listed below.

    Args:
        cut: Raw fit description. Expected to be a string; other values
            (e.g. float NaN, None) are treated as missing.

    Returns:
        The label of the first matching rule, 'Outro' when no rule matches,
        or ``np.nan`` when ``cut`` cannot be processed as text.
    """
    # (substring to look for, label to return), checked top to bottom.
    # NOTE(review): the 'nan' rule is a plain substring test, so any text
    # containing "nan" maps to 'nan' -- kept as-is to preserve behavior.
    rules = (
        ('nan', 'nan'),
        ('full sleeve', 'Full Sleeve'),
        ('slim', 'Slim'),
        ('tapered', 'Tapered'),
        ('loose', 'Loose'),
        ('tailored', 'Tailored'),
        ('jogger', 'Jogger Fit'),
        ('relaxed', 'Relaxed'),
        ('skinny', 'Skinny Fit'),
        ('compression', 'Compression'),
        ('straight', 'Straight Fit'),
        ('athletic', 'Athletic'),
        ('smart', 'Smart Fit'),
    )
    try:
        cut = cut.lower()
        for needle, label in rules:
            if needle in cut:
                return label
        return 'Outro'
    except (AttributeError, TypeError):
        # Non-text input (no .lower(), or a value that cannot be searched
        # with a str). Only these are swallowed, so KeyboardInterrupt and
        # other unrelated errors are no longer hidden by a bare except.
        return np.nan
    

In [21]:
def fill_actual_price(row, price_lookup=None):
    """Return the row's ``actual_price``, falling back to a grouped price.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'actual_price', 'seller', 'sub_category' and 'brand'.
        price_lookup: Optional object with a ``.get`` method keyed by
            ``(seller, sub_category, brand)`` tuples. When omitted, the
            notebook-level ``mean_prices`` is used, which keeps existing
            ``df.apply(fill_actual_price, axis=1)`` calls working unchanged.

    Returns:
        ``row['actual_price']`` when it is not missing; otherwise whatever
        ``price_lookup.get`` returns for the row's key (``None`` by default
        when the key is absent).
    """
    if pd.notna(row['actual_price']):
        return row['actual_price']
    if price_lookup is None:
        # Resolved at call time so the global only has to exist when a
        # fallback is actually needed.
        price_lookup = mean_prices
    return price_lookup.get((row['seller'], row['sub_category'], row['brand']))

## Feature Engineering

In [6]:
# Parse every product_details value with str2dict, then merge each parsed
# list into a single record so that every attribute becomes its own column.
list_of_dict = data_train['product_details'].apply(str2dict)
df = pd.DataFrame(list(map(junta_dict, list_of_dict)))

In [7]:
# Put the parsed attribute columns side by side with the original columns.
df1 = pd.concat(objs=(data_train, df), axis='columns')
df1.head(2)

Unnamed: 0,_id,average_rating,number_of_reviews,brand,category,crawled_at,description,images,out_of_stock,avg_delivery_time_days,...,Foot Coverage,Bust in inch,Shoulder in inch,Sleeve in inch,Pleated,Design,Weave type,Fabric care,Width,Height
0,53df9662-e500-569c-946e-0c8d215a72cd,3.2,26,East I,Clothing and Accessories,2021-02-10 21:17:28,Navy Blue Printed Boxers Has An Inner Elasti...,['https://rukminim1.flixcart.com/image/128/128...,False,8,...,,,,,,,,,,
1,d0142842-84f7-537d-a06f-d85b76488a5f,4.0,33,dream o,Clothing and Accessories,2021-02-11 01:02:46,smiley printed tshirt on round neck cotton tshirt,['https://rukminim1.flixcart.com/image/128/128...,False,12,...,,,,,,,,,,


In [8]:
# Create a few features from the parsed product details.
df1['Fabric'] = df1['Fabric'].str.lower()
# 'Fabric' is already lower-cased, so a plain literal substring test is enough.
df1['cotton'] = df1['Fabric'].str.contains('cotton', regex=False)
# str.lstrip('Pack of ') strips any leading run of the characters
# {'P','a','c','k',' ','o','f'} rather than the literal prefix, which can eat
# the start of unrelated values. Remove only a leading "Pack of" instead.
df1['Pack of'] = df1['Pack of'].str.replace(r'^\s*Pack of\s*', '', regex=True)
df1['Sleeve'] = df1['Sleeve'].apply(categorize_sleeve)
df1['Fit'] = df1['Fit'].apply(categorizar_cortes)
# Keep only the first color when several are listed; non-string values
# (e.g. NaN) are passed through untouched.
df1['Color'] = df1['Color'].apply(lambda x: x.split(', ')[0] if isinstance(x, str) else x)

In [9]:
# Fill NaN in actual_price with the mean price of products sharing the same pid.
media_precos_pid = df1.groupby('pid')['actual_price'].mean()
# Vectorized lookup instead of a row-wise apply: map each pid to its group
# mean and use it only where the price is missing. Rows whose pid has no
# group mean (e.g. a missing pid) simply stay NaN instead of raising KeyError.
df1['actual_price'] = df1['actual_price'].fillna(df1['pid'].map(media_precos_pid))

In [19]:
# Fill NaN in actual_price with the mean of products that share the same
# seller, sub-category and brand. Work on a copy so df1 is left untouched.
df2 = df1.copy()
mean_prices = df2.groupby(by=['seller', 'sub_category', 'brand'])['actual_price'].mean()
df2['actual_price'] = df2.apply(func=fill_actual_price, axis='columns')

In [37]:
# 49 NaN values are still left after both fill steps.
pd.isna(df2['actual_price']).sum()

49

In [35]:
# Show the rows that still have no price, limited to the first 17 columns
# (the columns present in the raw data before the parsed attributes).
df2.loc[df2['actual_price'].isna()].iloc[:, :17]

Unnamed: 0,_id,average_rating,number_of_reviews,brand,category,crawled_at,description,images,out_of_stock,avg_delivery_time_days,pid,product_details,seller,sub_category,fabrication_time,title,actual_price
308,4171dda4-57ef-5571-816d-d76d1a29000d,2.5,21,Thug Li,Clothing and Accessories,2021-02-10 23:52:36,Slouchy beanie is head-hugging round close-fit...,[],False,5,CAPEZHFHVXQHFDA6,"[{'Fabric': 'Wool Blend'}, {'Color': 'Multicol...",ATABZZONE4.6Seller changed. Check for any chan...,Clothing Accessories,236,BEANIE Cap (Pack of 2),
466,1b46ea33-1954-55dc-b54e-fff3a47a2b07,2.9,24,saltla,Clothing and Accessories,2021-02-11 00:26:09,undefined,['https://rukminim1.flixcart.com/image/128/128...,True,7,JCKFZJWZNCGGZGSE,"[{'Color': 'Blue'}, {'Fabric': 'Denim'}, {'Pat...",undefined,Winter Wear,721,Full Sleeve Washed Men Denim Jacket,
1606,3438f7c2-8d11-5256-a037-b4f2237a2e31,0.0,0,Asa,Clothing and Accessories,2021-02-10 22:56:47,ASABAs Fon Long Sleeve milange cotton shirt fo...,['https://rukminim1.flixcart.com/image/128/128...,True,4,SHTFENBEHZZUCAM7,"[{'Pack of': '1'}, {'Model Name': 'milange cot...",undefined,Topwear,658,Men Regular Fit Solid Cut Away Collar Formal S...,
2168,68d8310b-ca0c-5df9-ac4d-462952e7ebb5,0.0,0,undefined,Clothing and Accessories,2021-02-10 20:52:57,undefined,['https://rukminim1.flixcart.com/image/128/128...,False,4,VESFYUAFKUFJMAGF,"[{'Pattern': 'Solid'}, {'Sleeve': 'Half Sleeve...",LION LEVEL(Not Enough Ratin,Innerwear and Swimwear,225,lion level Men Vest,
3142,3e840d5d-2514-5041-b679-bc2c90fe9874,2.7,22,Solid Styl,Clothing and Accessories,2021-02-10 20:23:32,Grey Slim Fit Partywear Non Lined Denim Blazer...,['https://rukminim1.flixcart.com/image/128/128...,True,6,BZRFUXVMRDYBJB96,"[{'Color': 'Grey'}, {'Fabric': 'Denim'}, {'Pat...",undefined,"Blazers, Waistcoats and Suits",328,Self Design Single Breasted Casual Men Full Sl...,
3207,9574815d-2e52-5ab4-8d82-8f34e5179664,3.5,29,yellowvib,Clothing and Accessories,2021-02-10 23:42:25,undefined,['https://rukminim1.flixcart.com/image/128/128...,False,9,CAPFHSVH3M6DBFZE,"[{'Fabric': 'cotton'}, {'Color': 'Blue'}, {'St...",firstly online,Clothing Accessories,520,cricket Cap,
3529,664c0669-27ee-5e28-a151-ad19014dc464,4.0,33,Dex,Clothing and Accessories,2021-02-10 22:22:57,undefined,['https://rukminim1.flixcart.com/image/128/128...,True,12,PYJFWZFZRSTN63ZH,"[{'Pattern': 'Checkered'}, {'Color': 'Multicol...",undefined,Sleepwear,77,Men Pyjama (Pack of 3),
3533,6083623f-869c-5759-b59a-bea507b72593,0.0,0,fashion mu,Clothing and Accessories,2021-02-11 00:33:45,undefined,['https://rukminim1.flixcart.com/image/128/128...,False,4,JCKFZD6HTWQF8YFB,"[{'Color': 'Blue'}, {'Fabric': 'Polyester Visc...",AB INTERNATIONAL(Not Enough Ratin,Winter Wear,679,Sleeveless Self Design Men Nehru Jacket,
4463,a4d4d379-d3a6-5509-b222-745f14dfb10c,3.6,30,R,Clothing and Accessories,2021-02-10 21:54:17,Self Design Unisex Baseball NY Logo Stylish Co...,['https://rukminim1.flixcart.com/image/128/128...,False,10,CAPF8QM3PFA2ZCQ8,"[{'Fabric': 'Polycotton'}, {'Color': 'Black'},...",A ONE QUALITY,Clothing Accessories,329,Solid New BLACK Design Half Net for Boy girls Cap,
5514,d617ca68-d544-535f-9e6b-0450d2b8bfcd,4.5,37,PixF,Clothing and Accessories,2021-02-10 23:49:56,This winter Indian homegrown brand PIXFAB brin...,['https://rukminim1.flixcart.com/image/128/128...,False,15,SWSFY5ZHEJ2HYWDG,"[{'Color': 'Black'}, {'Fabric': 'Cotton Fleece...",PixFab,Winter Wear,118,Full Sleeve Printed Men Sweatshirt,
