# Imports

In [659]:
import pandas as pd
import numpy as np
import ast

import matplotlib.pyplot as plt

from sklearn.preprocessing import TargetEncoder
from sklearn.model_selection import train_test_split

plt.style.use('ggplot')

pd.options.display.max_columns = 999
pd.options.display.max_rows = 999


import warnings

warnings.filterwarnings('ignore')

## Load Data

In [712]:
path = '/home/edilson07/projects/hackday_6/datasets/'

df_raw = pd.read_json(path + 'train.json', orient='split')

## Criar numero de caracteristicas

In [713]:
df_raw['product_details_count'] = df_raw['product_details'].apply(lambda x: len(ast.literal_eval(x)) )

## Transformar detalhes dos produtos em colunas

In [714]:
df_raw['product_details'] = df_raw['product_details'].apply(lambda x: ast.literal_eval(x))

In [715]:
all_keys = set()
for data in df_raw['product_details']:
    for item in data:
        all_keys.update(item.keys())

# Para cada chave no conjunto, crie uma coluna no DataFrame e preencha com os valores correspondentes
for key in all_keys:
    df_raw[key] = df_raw['product_details'].apply(lambda x: next((item[key] for item in x if key in item), np.nan))

# Remova a coluna 'product_details' original
df_raw.drop('product_details', axis=1, inplace=True)

In [717]:
df_raw.rename(columns={'Pack of': 'Pack Of 2'}, inplace=True)

# Preencha os valores NaN na coluna 'Pack Of' com os valores da coluna 'Pack Of' 2
#f_raw['Pack Of'].fillna(df_raw['Pack Of 2'], inplace=True)

#df_raw.drop('Pack Of 2', axis=1, inplace=True)

In [718]:
df_raw.head()

Unnamed: 0,_id,average_rating,number_of_reviews,brand,category,crawled_at,description,images,out_of_stock,avg_delivery_time_days,pid,seller,sub_category,fabrication_time,title,actual_price,product_details_count,Unnamed: 18,International Warranty,Faded,Pack Of 2,Minimum Age,Reversible,Closure,Width,Stretchable,Shoulder in inch,Hip in inch,Occasion,Ideal For,Length,Generic Name,Length Type,Waistband,Weight,Pockets,Unnamed: 37,Season,Style Code,Distressed,Lining Material,Tanning Process,Thumb Hole,Pack Of,Knit Type,Fabric Care,Length in inch,Pleats,Other Dimensions,Fastener,Sales Package,Model Number,Other Features,Inseam Length,Care instructions,Maximum Age,Pattern,Material,Coat Type,Country of Origin,Weave Type,Type for Flats,Covered in Warranty,Other Details,Hem,Sleeve Type,Top Closure,Width in inch,Model Details,Animal Source,Sole Material,Vents,Brand,Pocket Type,Rise,Technology Used,Fabric Details,Pleated,Clasp Type,Tip Shape,Fabric,Neck,Warranty Summary,Bottom Fabric,Shade,Hooded,Bottom Type,Package contains,Strap Material,Top Type,Dupatta Length,Stitching Type,Belt Loops,Brand Color,Sleeve,Fly,Placket,Domestic Warranty,Character,Clasp Material,Upper Pattern,Not Covered in Warranty,Top Fabric,Inside Leg in inch,Neck Type,Series,Weave type,Sleeve in inch,Number of Contents in Sales Package,Region,Heel Pattern,Shoe Length,Bust in inch,Design,Inner Material,Thigh in inch,Care Instructions,Brand Fit,Lining,Foot Coverage,Cuff,Collar,Outer Material,Model Name,Rise in inch,Width at Base,Height,Waist in inch,Fabric care,Suitable For,Warranty Service Type,Secondary Color,Leather Type,Color,School Shoe,Fit,Bottom Length,Type,Style,Top Length,Size,Alteration Required
0,53df9662-e500-569c-946e-0c8d215a72cd,3.2,26,East I,Clothing and Accessories,2021-02-10 21:17:28,Navy Blue Printed Boxers Has An Inner Elasti...,['https://rukminim1.flixcart.com/image/128/128...,False,8,BXRFTZF7JGX75DAW,ZIYAA,Innerwear and Swimwear,653,Printed Men Boxer (Pack of 1),849.0,6,,,,1.0,,,,,,,,,,,,,,,,,,EIBXCO053,,,,,,,,,,,,1 boxer,,,,,,Printed,,,,,,,,,,,,,,,,,,,,,,,,Pure Cotton,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Dark Blue,,,,,,,,
1,d0142842-84f7-537d-a06f-d85b76488a5f,4.0,33,dream o,Clothing and Accessories,2021-02-11 01:02:46,smiley printed tshirt on round neck cotton tshirt,['https://rukminim1.flixcart.com/image/128/128...,False,12,TSHFWQM96UHR6A4Q,Dream Onn Creations,Topwear,668,Printed Men Round Neck Orange T-Shirt,699.0,16,,,,1.0,,No,,,,,,,Men,,,,,,,,,orange,,,,,,,Do not Iron on print/embroidery/embellishment,,,,,pack of 1,,,,,,Printed,,,,,,,,,Narrow,,,,,,,,,,,,,,,Cotton Blend,,,,,,,,,,,,,ORANGE,Short Sleeve,,,,,,,,,,Round Neck,,,,,,,,,,,,,,,,,,,,,,,,,Western Wear,,,,,,Regular,,Round Neck,,,M,
2,79c8f0d7-30b1-5dd4-9f2f-2fe97782b027,3.9,32,Free Authori,Clothing and Accessories,2021-02-11 00:43:37,Free Authority Presents this Crew Neck Yellow ...,['https://rukminim1.flixcart.com/image/128/128...,False,11,SWSFWCXH2WF6ZYRB,BioworldMerchandising,Winter Wear,53,Full Sleeve Graphic Print Men Sweatshirt,1499.0,13,,,,,,No,,,,,,Casual,,,,,,,,,,STY-20-21-001595,,,,,,,"Hand Wash, Reverse and Dry",,,,,1 Terry Cotton Sweatshirt for Men,,,,,,Graphic Print,,,,,,,Star Wars printed Yellow Sweatshirt for Men,,,,,,,,,,,,,,,,,Polycotton,Crew Neck,,,,No,,,,,,,,,Full Sleeve,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Western Wear,,,,Yellow,,,,,,,,
3,0531c28c-7c50-5fbd-9ce3-a7cae3243ad5,3.8,31,HUMBE,Clothing and Accessories,2021-02-10 21:22:10,Cotton Blend FabricCollar / Polo Neck White & ...,['https://rukminim1.flixcart.com/image/128/128...,False,11,TSHFHQH3HKDAGGK9,HUMBERT,Topwear,510,"Solid Men Polo Neck Light Blue, White T-Shirt ...",1699.0,18,,,,2.0,,No,,,,,,,Men,,,,,,,,,HU2002PO-HS-IB-WH,,,,,,,Gentle Machine Wash,,,,,2 pc T-shirts,,,,,,Solid,,,,,,,Polo T-shirts Combo,,Wide,,,,,,,,,,,,,,,Cotton Blend,,,,,,,,,,,,,Multi-Color,Short Sleeve,,,,,,,,,,Polo Neck,,,,,,,,,,,,,Regular,,,,,,,,,,,,Western Wear,,,,,,Regular,,Polo Neck,,,L,
4,d604baad-472e-5c18-86a3-7b46d4a890c2,2.4,20,Rose We,Clothing and Accessories,2021-02-10 23:36:36,undefined,['https://rukminim1.flixcart.com/image/128/128...,False,5,TSHFW9CJZSYUU6UX,Rupalcollectionjaipur,Topwear,496,Printed Men Round Neck White T-Shirt,599.0,15,,,,1.0,,,,,,,,,Men,,T Shirts,,,,,,,RC021_White,,,,,,,Regular Machine Wash,,,,,,,,,,,Printed,,,India,,,,,,,,,,,,,,,,,,,,,Cotton Blend,,,,,,,,,,,,,White,Short Sleeve,,,,,,,,,,Round Neck,,,,,,,,,,,,,,,,,,,,,,,,,Western Wear,,,,,,Regular,,Round Neck,,,S,


In [666]:
df_raw.isna().sum()

_id                                        0
average_rating                             0
number_of_reviews                          0
brand                                      0
category                                   0
crawled_at                                 0
description                                0
images                                     0
out_of_stock                               0
avg_delivery_time_days                     0
pid                                        0
seller                                     0
sub_category                               0
fabrication_time                           0
title                                      0
actual_price                            3496
product_details_count                      0
                                       23073
International Warranty                 23219
Faded                                  22608
Minimum Age                            23295
Reversible                             11108
Closure   

## Escolher as colunas com menos NaN

In [667]:
details_to_maintain = ['pid', '_id',
                       'average_rating',
                       'number_of_reviews',
                       'brand',
                       'category',
                       'crawled_at',
                       'description',
                       'images',
                       'out_of_stock',
                       'avg_delivery_time_days',
                       'pid',
                       'seller',
                       'sub_category',
                       'fabrication_time',
                       'title',
                       'actual_price',
                       'product_details_count',
                       "Fabric",
                        "Fabric Care",
                        "Hooded",
                        "Pockets",
                        "Pack Of",
                        "Secondary Color",
                        "Style Code",
                        "Ideal For",
                        "Reversible",
                        "Neck",
                        "Generic Name",
                        "Brand Fit",
                        "Pattern",
                        "Sleeve",
                        "Country of Origin",
                        "Size",
                        "Fit",
                        "Number of Contents in Sales Package",
                        "Suitable For",
                        "Occasion",
                        "Color",
                        "Neck Type"]

df_raw = df_raw[details_to_maintain]

# Data Description

In [668]:
df1 = df_raw.copy()

In [669]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 23309 entries, 0 to 23308
Data columns (total 40 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   pid                                  23309 non-null  object        
 1   _id                                  23309 non-null  object        
 2   average_rating                       23309 non-null  float64       
 3   number_of_reviews                    23309 non-null  int64         
 4   brand                                23309 non-null  object        
 5   category                             23309 non-null  object        
 6   crawled_at                           23309 non-null  datetime64[ns]
 7   description                          23309 non-null  object        
 8   images                               23309 non-null  object        
 9   out_of_stock                         23309 non-null  bool          
 10  avg_delivery_ti

In [670]:
df1.describe()

Unnamed: 0,average_rating,number_of_reviews,crawled_at,avg_delivery_time_days,fabrication_time,actual_price,product_details_count
count,23309.0,23309.0,23309,23309.0,23309.0,19813.0,23309.0
mean,3.367322,27.767386,2021-02-10 22:55:49.561242624,9.933502,366.663563,1480.980568,13.4385
min,0.0,0.0,2021-02-10 20:11:51,3.0,1.0,150.0,1.0
25%,3.1,26.0,2021-02-10 21:30:50,7.0,184.0,895.0,11.0
50%,3.7,31.0,2021-02-10 23:01:37,10.0,369.0,1256.0,14.0
75%,4.1,34.0,2021-02-11 00:17:54,13.0,550.0,1799.0,17.0
max,5.0,41.0,2021-02-11 01:31:55,19.0,729.0,12999.0,25.0
std,1.15757,9.555809,,3.568971,210.255811,967.82465,4.517272


## Criar colunas Numero de Imagens

In [671]:
df1['images'] = df1['images'].apply(lambda x: ast.literal_eval(x))

In [672]:
df1['number_images'] = df1['images'].apply(lambda x: len(x))

In [673]:
#Coluna Fabric
df1['has_cotton']    = df1['Fabric'].str.contains('cott', case=False, na=False)
df1['has_polyester'] = df1['Fabric'].str.contains('poly', case=False, na=False)
df1['has_lycra']     = df1['Fabric'].str.contains('lycr', case=False, na=False)

#Coluna Brand Fit
df1['is_regular'] = df1['Brand Fit'].str.contains('reg', case=False, na=False)
df1['is_slim']    = df1['Brand Fit'].str.contains('slim', case=False, na=False)
df1['is_fit']     = df1['Brand Fit'].str.contains('fit', case=False, na=False)

## Dropar colunas complicadas/descessarias

In [674]:
cols_drop = ['crawled_at','description','title','images']

In [675]:
df1 = df1.drop(cols_drop, axis=1)

### NAN das coluna price

In [676]:
#brand_avg_actual_price = df1.loc[:,['brand','actual_price']].groupby(['brand']).mean().reset_index()

In [677]:
#brand_avg_actual_price

In [678]:
#df1 = df1.merge(brand_avg_actual_price, on='brand', suffixes=('', '_avg'))
#df1['actual_price'].fillna(df1['actual_price_avg'], inplace=True)
#df1.drop(columns=['actual_price_avg'], inplace=True)

In [679]:
df1 = df1.dropna(subset='actual_price')

In [680]:
df1 = df1.fillna(0)

In [681]:
df1.isna().sum()

pid                                    0
_id                                    0
average_rating                         0
number_of_reviews                      0
brand                                  0
category                               0
out_of_stock                           0
avg_delivery_time_days                 0
pid                                    0
seller                                 0
sub_category                           0
fabrication_time                       0
actual_price                           0
product_details_count                  0
Fabric                                 0
Fabric Care                            0
Hooded                                 0
Pockets                                0
Pack Of                                0
Secondary Color                        0
Style Code                             0
Ideal For                              0
Reversible                             0
Neck                                   0
Generic Name    

In [682]:
df1.shape

(19813, 43)

## Encoders

In [683]:
df1.nunique()

pid                                    18718
_id                                    19813
average_rating                            38
number_of_reviews                         32
brand                                    301
category                                   3
out_of_stock                               2
avg_delivery_time_days                    17
pid                                    18718
seller                                   460
sub_category                              23
fabrication_time                         729
actual_price                             636
product_details_count                     25
Fabric                                   198
Fabric Care                              422
Hooded                                     3
Pockets                                   98
Pack Of                                   14
Secondary Color                           51
Style Code                             15745
Ideal For                                  6
Reversible

In [684]:
df1.columns

Index(['pid', '_id', 'average_rating', 'number_of_reviews', 'brand',
       'category', 'out_of_stock', 'avg_delivery_time_days', 'pid', 'seller',
       'sub_category', 'fabrication_time', 'actual_price',
       'product_details_count', 'Fabric', 'Fabric Care', 'Hooded', 'Pockets',
       'Pack Of', 'Secondary Color', 'Style Code', 'Ideal For', 'Reversible',
       'Neck', 'Generic Name', 'Brand Fit', 'Pattern', 'Sleeve',
       'Country of Origin', 'Size', 'Fit',
       'Number of Contents in Sales Package', 'Suitable For', 'Occasion',
       'Color', 'Neck Type', 'number_images', 'has_cotton', 'has_polyester',
       'has_lycra', 'is_regular', 'is_slim', 'is_fit'],
      dtype='object')

In [685]:
df1.dtypes

pid                                     object
_id                                     object
average_rating                         float64
number_of_reviews                        int64
brand                                   object
category                                object
out_of_stock                              bool
avg_delivery_time_days                   int64
pid                                     object
seller                                  object
sub_category                            object
fabrication_time                         int64
actual_price                           float64
product_details_count                    int64
Fabric                                  object
Fabric Care                             object
Hooded                                  object
Pockets                                 object
Pack Of                                 object
Secondary Color                         object
Style Code                              object
Ideal For    

## PreProcessing

In [686]:
X = df1.drop(['pid','actual_price'], axis=1).copy()
y = df1['actual_price'].copy()

In [687]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [688]:
#ONE HOT ENCODER
onehot = ['category','out_of_stock']
X_train = pd.get_dummies(X_train, columns=onehot)
X_val = pd.get_dummies(X_val, columns=onehot)

In [689]:


#TARGET ENCODER
te_cols = ['_id','brand','seller','sub_category',"Fabric",
    "Fabric Care",
    "Hooded",
    "Pockets",
    "Pack Of",
    "Secondary Color",
    "Style Code",
    "Ideal For",
    "Reversible",
    "Neck",
    "Generic Name",
    "Brand Fit",
    "Pattern",
    "Sleeve",
    "Country of Origin",
    "Size",
    "Fit",
    "Number of Contents in Sales Package",
    "Suitable For",
    "Occasion",
    "Color",
    "Neck Type"]

X_train[te_cols] = X_train[te_cols].astype(str)
X_val[te_cols] = X_val[te_cols].astype(str)

te = TargetEncoder(target_type='continuous')

# Ajuste o encoder aos dados de treinamento
X_train[te_cols] = te.fit_transform(X_train[te_cols], y_train)

# Transforme os dados de validação usando o encoder treinado com os dados de treinamento
X_val[te_cols] = te.transform(X_val[te_cols])



## Dropar ID E variável resposta

In [690]:
# from sklearn.ensemble import RandomForestRegressor

# # Crie o modelo Random Forest
# rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1) 

# # Treine o modelo
# rf.fit(X, y)

# # Obtenha a importância das características
# feature_importance = rf.feature_importances_

# # Crie um DataFrame para facilitar a visualização
# feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importance})

# # Classifique as características com base na importância
# feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# # Crie o gráfico de barras
# plt.figure(figsize=(10, 6))
# plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
# plt.xlabel('Importância da Característica')
# plt.ylabel('Característica')
# plt.title('Importância das Características - Random Forest')
# plt.show()

In [691]:
# from sklearn.feature_selection import RFECV

# rfecv = RFECV(estimator=rf, step=1, cv=3, verbose=2)

# rfecv.fit(X_train, y_train)

In [692]:
# # Ajuste o seletor de características aos dados de treinamento


# selected_features_mask = rfecv.support_

# # Obtenha o nome das características selecionadas a partir das colunas do DataFrame original
# selected_features = X.columns[selected_features_mask]

# # Exiba os nomes das características selecionadas
# print("Características selecionadas:", selected_features)

In [693]:
best_features = ['average_rating', 'number_of_reviews', 'brand',
       'avg_delivery_time_days', 'seller', 'sub_category', 'fabrication_time',
       'product_details_count', 'Fabric', 'Fabric Care', 'Hooded', 'Pockets',
       'Pack Of', 'Secondary Color', 'Style Code', 'Ideal For', 'Reversible',
       'Neck', 'Generic Name', 'Brand Fit', 'Pattern', 'Sleeve',
       'Country of Origin', 'Size', 'Fit',
       'Number of Contents in Sales Package', 'Occasion', 'Color', 'Neck Type',
       'number_images', 'has_polyester']

# Machine Learning

In [694]:
X_train

Unnamed: 0,_id,average_rating,number_of_reviews,brand,avg_delivery_time_days,seller,sub_category,fabrication_time,product_details_count,Fabric,Fabric Care,Hooded,Pockets,Pack Of,Secondary Color,Style Code,Ideal For,Reversible,Neck,Generic Name,Brand Fit,Pattern,Sleeve,Country of Origin,Size,Fit,Number of Contents in Sales Package,Suitable For,Occasion,Color,Neck Type,number_images,has_cotton,has_polyester,has_lycra,is_regular,is_slim,is_fit,"category_Bags, Wallets & Belts",category_Clothing and Accessories,category_Footwear,out_of_stock_False,out_of_stock_True
21651,1477.916404,3.7,31,1141.412002,10,984.085040,2577.209234,487,6,1492.779263,1306.960716,1404.258144,1436.681422,1467.031900,1466.740820,1477.916404,1624.043981,1325.942312,1420.564759,1770.500696,1567.643883,1635.475894,1501.532715,1318.714307,1713.242756,1621.902535,1495.166436,1329.467178,1422.587493,1960.348876,1713.111269,4,True,False,False,False,False,False,False,True,False,True,False
20460,1482.265931,4.2,35,2668.720681,13,2335.638151,1298.259245,456,18,1378.179494,1558.772174,1407.553076,1440.144803,1430.807227,1469.918765,1482.265931,1390.490030,1625.181070,1424.842908,1582.919018,1170.459172,1279.356314,1170.754163,1581.857065,1159.686233,1305.453449,1500.126334,1551.727928,1426.562465,1193.086935,1132.530833,6,True,False,False,True,False,True,False,True,False,True,False
11783,1477.916404,4.5,37,1010.356597,15,1010.356597,950.247429,163,5,1152.694912,1306.960716,1404.258144,1436.681422,1467.031900,1466.740820,1477.916404,1624.043981,1325.942312,1420.564759,1579.641186,1567.643883,1558.141899,1501.532715,1577.690397,1713.242756,1621.902535,1495.166436,1329.467178,1422.587493,1337.297887,1713.111269,0,False,False,False,False,False,False,False,True,False,True,False
5488,1468.690931,4.0,33,2410.783507,12,2371.359710,1986.198600,541,6,2193.276262,1276.063392,1394.469935,1427.244866,1468.983922,1454.859700,1468.690931,1614.317648,1310.417276,1410.283800,1572.895579,1558.296332,1588.458637,1482.172470,1571.499675,1700.810214,1614.501729,1484.056241,1313.987992,1415.211933,2020.473700,1700.537687,6,False,True,False,False,False,False,False,True,False,True,False
19484,1472.332334,4.1,34,1575.352195,13,1575.352195,1291.892353,387,15,1481.942952,1199.510142,1397.647016,1429.038487,1416.915785,1461.995294,1472.332334,1624.504176,1610.223798,1412.763349,1628.770183,1562.202199,1500.912062,1170.480118,1315.941615,1710.014973,999.000000,1488.054038,1537.906934,1417.673941,1977.804407,1709.737186,4,True,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13278,1475.213013,3.4,28,1601.366759,9,1601.366759,1295.436781,263,20,1491.246839,1684.379203,1399.126814,1433.994018,1608.881918,1514.883542,1475.213013,1382.739953,1619.012300,1416.599913,1576.698899,1185.675066,1578.720977,1257.208785,1576.557909,1148.157294,1306.728949,1492.259794,1545.339096,1421.173837,1186.710307,1125.063859,5,True,False,False,True,False,True,False,True,False,True,False
14078,1477.916404,3.9,32,1199.000000,11,1199.000000,1295.716592,401,18,1380.392774,1568.335914,1404.258144,1436.681422,1429.955474,1466.740820,1477.916404,1386.765120,1325.942312,1420.564759,1041.619422,1224.844360,1271.824064,1172.999660,1318.714307,1217.082967,1299.805558,1495.166436,1545.883513,1422.587493,1189.495352,1120.855506,2,True,False,False,True,False,False,False,True,False,True,False
6357,1477.916404,3.2,26,1251.162211,8,1255.764266,1295.716592,637,17,1492.779263,1568.335914,1404.258144,1436.681422,1429.955474,1466.740820,1477.916404,1386.765120,1615.472649,1420.564759,1041.619422,1567.643883,1271.824064,1258.559203,1318.714307,1177.344509,1412.017948,1495.166436,1545.883513,1422.587493,1189.495352,1120.855506,6,True,False,False,False,False,False,False,True,False,True,False
1003,1482.265931,3.4,28,954.358511,9,836.425290,1298.259245,315,18,1378.179494,1680.585591,1407.553076,1440.144803,1430.807227,1469.918765,1482.265931,1390.490030,1625.181070,1424.842908,1042.385337,1577.289625,1640.294465,1170.754163,1319.502163,1123.743955,1305.453449,1500.126334,1551.727928,1426.562465,1193.086935,1132.530833,4,True,False,False,False,False,False,False,True,False,True,False


In [695]:
X_val

Unnamed: 0,_id,average_rating,number_of_reviews,brand,avg_delivery_time_days,seller,sub_category,fabrication_time,product_details_count,Fabric,Fabric Care,Hooded,Pockets,Pack Of,Secondary Color,Style Code,Ideal For,Reversible,Neck,Generic Name,Brand Fit,Pattern,Sleeve,Country of Origin,Size,Fit,Number of Contents in Sales Package,Suitable For,Occasion,Color,Neck Type,number_images,has_cotton,has_polyester,has_lycra,is_regular,is_slim,is_fit,"category_Bags, Wallets & Belts",category_Clothing and Accessories,category_Footwear,out_of_stock_False,out_of_stock_True
4414,1475.283722,4.1,34,832.713926,13,832.426372,1295.115731,399,18,897.583064,747.772929,1400.608325,1433.42326,1423.286182,1463.655811,500.000000,1383.517539,1616.399592,1417.011271,1039.667137,1566.372347,1630.084281,1171.850822,1313.368740,1120.994332,1402.123844,1491.931292,1543.974676,1420.644757,1188.615773,1127.544847,6,True,False,False,False,False,False,False,True,False,True,False
15317,1475.283722,2.7,22,1393.594102,6,1395.033889,1295.115731,610,18,1376.474817,1572.832463,1400.608325,1433.42326,1423.286182,1463.655811,1475.283722,1383.517539,1616.399592,1417.011271,1039.667137,1317.695916,1370.127497,1171.850822,1313.368740,1220.693023,1402.123844,1491.931292,1543.974676,1420.644757,1188.615773,1127.544847,2,True,False,False,False,True,False,False,True,False,True,False
2609,1475.283722,3.4,28,2145.833778,9,1499.000000,1295.115731,512,17,1492.045028,1297.208865,1400.608325,1433.42326,1423.286182,1463.655811,1475.283722,1383.517539,1616.399592,1417.011271,1039.667137,1102.368076,1630.084281,1171.850822,1313.368740,1120.994332,1402.123844,1491.931292,1543.974676,1420.644757,1188.615773,1331.990069,5,True,False,False,False,True,True,False,True,False,True,False
6767,1475.283722,3.6,30,1194.488347,10,1196.349687,1745.259651,360,13,1376.474817,1201.393292,1400.608325,1433.42326,1467.537397,1463.655811,1475.283722,1383.517539,1319.845946,1793.920001,1576.270678,1566.372347,1582.203207,1952.868123,1575.051174,1712.101922,1621.184544,1303.592333,1324.361636,1913.624966,1820.248289,1711.879309,4,True,False,False,False,False,False,False,True,False,True,False
10020,1475.283722,1.9,16,1080.685525,4,1080.685525,1999.578671,254,12,1376.474817,1201.393292,1400.608325,1433.42326,1423.286182,1463.655811,1428.497370,1624.081331,1319.845946,1417.011271,1576.270678,1566.372347,1630.084281,1495.824594,1575.051174,1712.101922,2100.249076,1491.931292,1256.921907,1913.624966,1572.882521,1711.879309,2,True,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17564,1475.283722,3.4,28,1353.259978,9,1351.356359,858.016191,674,7,1492.045028,1297.208865,1400.608325,1433.42326,1612.860477,1463.655811,459.000000,1624.081331,1319.845946,1417.011271,1224.984596,1566.372347,1269.805944,1495.824594,1313.368740,1712.101922,1621.184544,1491.931292,1324.361636,1420.644757,1333.458441,1711.879309,5,True,False,False,False,False,False,False,True,False,True,False
14819,1475.283722,3.7,31,1191.748224,10,1191.748224,1295.115731,238,20,1492.045028,1675.449750,1400.608325,1433.42326,1423.286182,1463.655811,1475.283722,1383.517539,1616.399592,1417.011271,1039.667137,1185.545288,1614.268408,1171.850822,1313.368740,1212.678685,1302.295253,1491.931292,1543.974676,1420.644757,1188.615773,1331.990069,5,True,False,False,True,False,True,False,True,False,True,False
1674,1475.283722,4.5,37,2403.447320,15,2675.849391,1295.115731,707,16,2232.924621,1675.449750,1400.608325,1433.42326,1423.286182,1463.655811,1475.283722,1383.517539,1616.399592,1417.011271,1576.270678,1226.449527,1582.203207,1171.850822,1575.051174,1147.897102,1302.295253,1491.931292,1543.974676,1420.644757,1188.615773,1331.990069,6,False,True,False,True,False,False,False,True,False,True,False
9844,1475.283722,2.8,23,1027.503695,6,1024.272457,1295.115731,452,19,1492.045028,999.000000,1400.608325,1433.42326,1612.860477,1463.655811,1475.283722,1383.517539,1658.884869,1417.011271,1039.667137,1226.449527,1269.805944,1171.850822,1313.368740,1212.678685,1302.295253,1491.931292,1543.974676,1420.644757,1188.615773,1127.544847,4,True,False,False,True,False,False,False,True,False,True,False


In [696]:
y_pred

array([1364.34637868, 2652.32693357, 2112.96099046, ..., 8392.95619048,
       1301.96149278, 2164.45814546])

In [697]:
X_train = X_train[best_features]
X_val = X_val[best_features]

In [698]:
X_train.shape, X_val.shape

((15850, 31), (3963, 31))

## XGBoost

In [699]:
from xgboost import XGBRegressor

xgb = XGBRegressor()

xgb.fit(X_train, y_train)

In [700]:
y_pred_val = xgb.predict(X_val)

In [701]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, mean_squared_error 

def smape(y_pred, y_test):

    smape = 100 / len(y_test) * np.sum(2 * np.abs(y_pred - y_test) / (np.abs(y_test) + np.abs(y_pred)))
    return smape

In [702]:
print(f'MAE: {mean_absolute_error(y_pred_val, y_val)}')
print(f'MAPE: {mean_absolute_percentage_error(y_pred_val, y_val)}')
print(f'RMSE: {mean_squared_error(y_pred_val, y_val, squared=False)}')
print(f'SMAPE: {smape(y_pred_val, y_val)}')


MAE: 210.51748266034795
MAPE: 0.13707989477429078
RMSE: 400.28868601190334
SMAPE: 13.718850077861488


## Catboost

In [703]:
from catboost import CatBoostRegressor

cat = CatBoostRegressor(verbose=False)

cat.fit(X_train, y_train)
y_pred_cat = cat.predict(X_val)

In [704]:
print(f'MAE: {mean_absolute_error(y_pred_cat, y_val)}')
print(f'MAPE: {mean_absolute_percentage_error(y_pred_cat, y_val)}')
print(f'RMSE: {mean_squared_error(y_pred_cat, y_val, squared=False)}')
print(f'SMAPE: {smape(y_pred_cat, y_val)}')

MAE: 218.80088955282756
MAPE: 0.14558772787959734
RMSE: 391.3110151897406
SMAPE: 14.626490863183143


In [705]:
X_train.dtypes

average_rating                         float64
number_of_reviews                        int64
brand                                  float64
avg_delivery_time_days                   int64
seller                                 float64
sub_category                           float64
fabrication_time                         int64
product_details_count                    int64
Fabric                                 float64
Fabric Care                            float64
Hooded                                 float64
Pockets                                float64
Pack Of                                float64
Secondary Color                        float64
Style Code                             float64
Ideal For                              float64
Reversible                             float64
Neck                                   float64
Generic Name                           float64
Brand Fit                              float64
Pattern                                float64
Sleeve       

In [706]:
import inflection

X_train.columns = X_train.columns.map(lambda x: inflection.parameterize(x, separator='_'))
X_val.columns = X_val.columns.map(lambda x: inflection.parameterize(x, separator='_'))
X.columns = X.columns.map(lambda x: inflection.parameterize(x, separator='_'))

In [707]:
X_train.dtypes

average_rating                         float64
number_of_reviews                        int64
brand                                  float64
avg_delivery_time_days                   int64
seller                                 float64
sub_category                           float64
fabrication_time                         int64
product_details_count                    int64
fabric                                 float64
fabric_care                            float64
hooded                                 float64
pockets                                float64
pack_of                                float64
secondary_color                        float64
style_code                             float64
ideal_for                              float64
reversible                             float64
neck                                   float64
generic_name                           float64
brand_fit                              float64
pattern                                float64
sleeve       

## LightGBM

In [708]:
from lightgbm import LGBMRegressor

lgb = LGBMRegressor()

lgb.fit(X_train, y_train)

y_pred_lgb = lgb.predict(X_val)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001455 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2847
[LightGBM] [Info] Number of data points in the train set: 15850, number of used features: 31
[LightGBM] [Info] Start training from score 1475.283722


In [709]:
print(f'MAE: {mean_absolute_error(y_pred_lgb, y_val)}')
print(f'MAPE: {mean_absolute_percentage_error(y_pred_lgb, y_val)}')
print(f'RMSE: {mean_squared_error(y_pred_lgb, y_val, squared=False)}')
print(f'SMAPE: {smape(y_pred_lgb, y_val)}')

MAE: 224.6561553493578
MAPE: 0.14927834566818418
RMSE: 396.5012497632079
SMAPE: 15.048515349780857


## Random Forest

In [710]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_jobs=-1)

rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_val)

In [711]:
print(f'MAE: {mean_absolute_error(y_pred_rf, y_val)}')
print(f'MAPE: {mean_absolute_percentage_error(y_pred_rf, y_val)}')
print(f'RMSE: {mean_squared_error(y_pred_rf, y_val, squared=False)}')
print(f'SMAPE: {smape(y_pred_rf, y_val)}')

MAE: 168.56611910169065
MAPE: 0.1036211593280569
RMSE: 380.3848566162192
SMAPE: 10.425038064039937


In [547]:
X_train

Unnamed: 0,average_rating,number_of_reviews,brand,avg_delivery_time_days,seller,sub_category,fabrication_time,product_details_count,fabric,fabric_care,hooded,pockets,pack_of,secondary_color,style_code,ideal_for,reversible,neck,generic_name,brand_fit,pattern,sleeve,country_of_origin,size,fit,number_of_contents_in_sales_package,occasion,color,neck_type,number_images,has_polyester
644,3.9,32,1113.426499,11,1095.281995,1315.592292,228,14,1375.597083,1370.178702,1408.400849,1437.743465,1435.612192,1463.788294,1474.431454,1384.527721,1323.144241,1419.712847,1564.233248,1258.039118,1305.163774,1196.312739,1565.647949,1230.834552,1308.427711,1489.214792,1421.426311,1203.490235,1162.158884,6,False
22417,2.5,21,1299.000000,5,1299.000000,1967.822462,349,11,2474.905591,1280.777548,1404.660418,1435.383813,1432.516314,1461.080003,1473.173263,1389.831369,1611.142518,1418.479981,2448.362177,1554.825579,1265.098981,1475.595747,1568.970170,1689.920498,1597.064425,1487.497946,1422.624266,1838.306778,1689.743737,4,False
7834,3.7,31,2633.550652,10,2656.236340,1962.108537,285,9,1497.963622,2774.219296,1402.879841,2068.039894,1447.252033,1462.007790,1471.027877,1605.112440,1318.712790,1416.814351,1569.141712,1549.778673,1303.777688,1464.555469,1568.281479,1681.820011,1586.502063,1486.977032,1422.047067,1923.105518,1681.542843,5,False
22020,1.0,8,737.110220,3,741.499975,881.722681,404,6,1496.016424,1280.777548,1404.660418,1435.383813,1824.317435,1461.080003,1473.173263,1609.046750,1321.889330,1418.479981,1569.116301,1554.825579,1454.754352,1475.595747,1568.970170,1689.920498,1597.064425,1487.497946,1422.624266,1332.329470,1689.743737,3,False
1153,0.0,0,1146.551592,4,1792.446114,974.380222,283,4,986.318655,1280.777548,1404.660418,1435.383813,1453.671482,1461.080003,1473.173263,1609.046750,1321.889330,1418.479981,1569.116301,1554.825579,1265.098981,1475.595747,1568.970170,1689.920498,1597.064425,1487.497946,1422.624266,1207.743666,1689.743737,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11964,4.3,35,1535.801382,14,1513.888428,1313.083203,69,15,1377.324694,1578.448526,1404.660418,1435.383813,1432.516314,1461.080003,1499.000000,1389.831369,1611.142518,1418.479981,1569.116301,1554.825579,1502.172739,1199.891373,1568.970170,1242.032119,1306.642885,1487.497946,1422.624266,1207.743666,1165.839151,5,False
21575,4.4,36,2119.164121,15,1789.493399,2405.559220,228,14,2210.136684,3026.282830,1402.879841,1434.666092,1432.836525,1462.007790,1471.027877,1388.737000,1554.295913,1416.814351,2632.551539,1549.778673,1608.418947,1934.068928,1310.388839,1681.820011,1586.502063,1486.977032,1422.047067,3117.579412,1681.542843,5,True
5390,3.2,26,1206.566642,8,1219.218081,1194.794145,284,13,1248.357411,1205.551530,1404.660418,1435.383813,1453.671482,1461.080003,1473.173263,1389.831369,1321.889330,1418.479981,1569.116301,1554.825579,1291.602031,1475.595747,1568.970170,1689.920498,1597.064425,1487.497946,1401.781190,1947.231213,1689.743737,4,False
860,4.0,33,990.370166,12,990.334786,1313.083203,466,18,1208.891132,1205.551530,1404.660418,1435.383813,1824.317435,1461.080003,1473.173263,1389.831369,1611.142518,1418.479981,1569.116301,1224.305865,1623.473722,1937.369507,1568.970170,1242.032119,1306.642885,1487.497946,1422.624266,1207.743666,1165.839151,5,True


Unnamed: 0,average_rating,number_of_reviews,brand,avg_delivery_time_days,seller,sub_category,fabrication_time,fabric_care,hooded,pockets,pack_of,secondary_color,style_code,ideal_for,reversible,neck,generic_name,pattern,sleeve,country_of_origin,size,fit,number_of_contents_in_sales_package,suitable_for,occasion,color,neck_type,number_images,has_cotton,has_polyester,has_lycra,is_regular,is_slim,is_fit,category_bags_wallets_belts,category_clothing_and_accessories,category_footwear,out_of_stock_false,out_of_stock_true
0,3.2,26,1087.055334,8,1087.055334,852.604367,653,1291.550546,1405.899643,1436.178900,1434.894554,1471.155557,1481.557504,1626.659012,1321.300045,1421.320784,1581.245413,1282.190942,1498.372832,1578.669607,1720.259013,1629.907692,1498.876721,1329.356086,1428.405241,1976.278614,1720.260716,5,True,False,False,False,False,False,False,True,False,True,False
1,4.0,33,707.794245,12,707.794245,1301.508013,668,1090.182884,1404.917419,1440.769472,1438.655676,1469.699846,1482.540252,1390.730330,1629.613756,1420.716573,1579.344609,1280.516506,1259.183792,1579.057719,1208.151576,1305.953177,1500.119080,1556.643222,1426.162426,1190.137370,1137.053555,2,True,False,False,False,False,False,False,True,False,True,False
2,3.9,32,1099.111675,11,1074.289445,2536.886840,53,1809.936664,2375.521554,1440.769472,1459.081435,1469.699846,1482.540252,1631.372201,1629.613756,2144.977064,1579.344609,1288.108926,1989.802282,1579.057719,1716.646459,1621.428401,1500.119080,1556.643222,1901.630783,2095.760289,1716.500511,5,True,True,False,False,False,False,False,True,False,True,False
3,3.8,31,1034.767961,11,1035.157129,1293.679566,510,1572.095149,1405.899643,1436.178900,1613.018171,1471.155557,1481.557504,1392.170338,1627.947624,1421.320784,1581.245413,1627.274458,1259.329349,1578.669607,1209.714693,1304.132885,1498.876721,1551.892425,1428.405241,1184.172685,1324.677614,5,True,False,False,True,False,False,False,True,False,True,False
4,2.4,20,829.551101,5,660.790017,1297.721210,496,1207.797965,1402.300726,1437.047268,1423.602659,1462.916513,1476.104851,1378.776927,1323.659617,1418.908249,1042.017930,1282.441152,1256.504303,1320.729747,1149.443563,1297.446849,1492.055654,1545.237862,1419.525465,1189.344266,1128.744392,5,True,False,False,False,False,False,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23303,2.5,21,1291.227209,5,1282.128192,1297.721210,552,1571.796661,1402.300726,1437.047268,1423.602659,1462.916513,1476.104851,1378.776927,1615.695298,1418.908249,1042.017930,1622.912837,1966.305764,1320.729747,1209.373621,1409.215774,1492.055654,1545.237862,1419.525465,1189.344266,1128.744392,5,True,False,False,False,False,False,False,True,False,True,False
23304,4.1,34,1145.708775,13,550.374474,2533.805362,501,1309.763549,1407.568476,1441.386514,1476.722090,1471.967698,870.000000,1391.737754,1328.107686,717.184766,716.649183,1284.046140,1969.586902,1320.835429,1722.208062,1630.732612,851.446212,1332.657470,1428.960554,1193.792687,1721.896481,4,True,False,False,False,False,False,False,True,False,True,False
23306,0.0,0,1793.160869,4,1793.160869,1712.912443,687,1206.087207,1404.917419,1440.769472,1459.081435,1355.326254,1999.000000,1631.372201,1320.445004,2470.353443,1889.427395,1636.402259,1477.253735,1325.121549,1716.646459,1621.428401,1500.119080,1320.255356,1426.162426,2062.922359,1716.500511,5,False,False,False,False,False,False,False,True,False,True,False
23307,0.0,0,898.436843,4,898.436843,1299.813289,647,1581.762594,1407.568476,1441.386514,1430.785210,1471.967698,1483.935142,1391.737754,1624.961718,1424.121942,1042.651103,1289.265430,1262.082339,1320.835429,1194.141380,1302.919436,1499.589203,1553.159968,1428.960554,1193.792687,1130.878584,6,True,False,False,False,False,False,False,True,False,True,False


In [39]:
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## Fine Tuning Optuna

In [537]:
import optuna

In [543]:
def objective(trial):
    # Defina os parâmetros que você deseja otimizar
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 10, 30)
    min_samples_split = trial.suggest_float('min_samples_split', 0.1, 1.0)
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.1, 0.5)
    max_features = trial.suggest_categorical('max_features', ['auto', 'sqrt', 'log2'])

    # Crie o modelo Random Forest com os parâmetros otimizados
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=True,  # Mantenha os outros parâmetros inalterados
        ccp_alpha=0.0,
        criterion='squared_error',
        max_leaf_nodes=None,
        max_samples=None,
        min_impurity_decrease=0.0,
        min_weight_fraction_leaf=0.0,
        n_jobs=-1,
        oob_score=False,
        random_state=None,
        verbose=0,
        warm_start=False
    )
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    smape_error = smape(y_pred, y_val)
    
    return smape_error

In [544]:
study = optuna.create_study(direction='minimize')

study.optimize(objective, n_trials=50)

[I 2023-10-21 16:06:22,781] A new study created in memory with name: no-name-c7618c13-1828-4d55-9740-5c28ab67e685
[I 2023-10-21 16:06:23,347] Trial 0 finished with value: 36.81348999592606 and parameters: {'n_estimators': 150, 'max_depth': 24, 'min_samples_split': 0.6041830449554761, 'min_samples_leaf': 0.14522864321300158, 'max_features': 'log2'}. Best is trial 0 with value: 36.81348999592606.
[I 2023-10-21 16:06:23,836] Trial 1 finished with value: 43.83022377866186 and parameters: {'n_estimators': 172, 'max_depth': 16, 'min_samples_split': 0.9636601906912733, 'min_samples_leaf': 0.3114813606269792, 'max_features': 'sqrt'}. Best is trial 0 with value: 36.81348999592606.
[I 2023-10-21 16:06:24,187] Trial 2 finished with value: 43.810493442588296 and parameters: {'n_estimators': 125, 'max_depth': 25, 'min_samples_split': 0.9632750458207889, 'min_samples_leaf': 0.4975305190001079, 'max_features': 'sqrt'}. Best is trial 0 with value: 36.81348999592606.
[I 2023-10-21 16:06:24,527] Trial 3

InvalidParameterError: The 'max_features' parameter of RandomForestRegressor must be an int in the range [1, inf), a float in the range (0.0, 1.0], a str among {'log2', 'sqrt'} or None. Got 'auto' instead.

In [332]:
rf.fit(X, y)

## Previsao No test(fazer as mesmas transformações do treino)

In [548]:
X_test = pd.read_json(path + 'test.json', orient='split')

In [549]:
X_test['product_details_count'] = X_test['product_details'].apply(lambda x: len(ast.literal_eval(x)) )

In [550]:
X_test['product_details'] = X_test['product_details'].apply(lambda x: ast.literal_eval(x))

In [551]:
all_keys = set()
for data in X_test['product_details']:
    for item in data:
        all_keys.update(item.keys())

# Para cada chave no conjunto, crie uma coluna no DataFrame e preencha com os valores correspondentes
for key in all_keys:
    X_test[key] = X_test['product_details'].apply(lambda x: next((item[key] for item in x if key in item), np.nan))

# Remova a coluna 'product_details' original
X_test.drop('product_details', axis=1, inplace=True)

In [552]:
X_test.rename(columns={'Pack of': 'Pack Of 2'}, inplace=True)
# Preencha os valores NaN na coluna 'Pack Of' com os valores da coluna 'Pack Of' 2
X_test['Pack Of'].fillna(X_test['Pack Of 2'], inplace=True)

# Agora você pode descartar a coluna 'Pack Of 2' se desejar
X_test.drop('Pack Of 2', axis=1, inplace=True)

In [553]:
X_test['images'] = X_test['images'].apply(lambda x: ast.literal_eval(x))

In [554]:
X_test['number_images'] = X_test['images'].apply(lambda x: len(x))

In [555]:
details_to_maintain_test = ['_id',
                       'average_rating',
                       'number_of_reviews',
                       'brand',
                       'category',
                       'crawled_at',
                       'description',
                       'images',
                       'out_of_stock',
                       'avg_delivery_time_days',
                       'pid',
                       'seller',
                       'sub_category',
                       'fabrication_time',
                       'title',
                       #'actual_price',
                       "Fabric",
                        "Fabric Care",
                        "Hooded",
                        "Pockets",
                        "Pack Of",
                        "Secondary Color",
                        "Style Code",
                        "Ideal For",
                        "Reversible",
                        "Neck",
                        "Generic Name",
                        "Brand Fit",
                        "Pattern",
                        "Sleeve",
                        "Country of Origin",
                        "Size",
                        "Fit",
                        "Number of Contents in Sales Package",
                        "Suitable For",
                        "Occasion",
                        "Color",
                        "Neck Type",
                        'number_images',
                        'product_details_count'
                           ]

In [556]:
X_test = X_test[details_to_maintain_test]

In [557]:
#Coluna Fabric
X_test['has_cotton']    = X_test['Fabric'].str.contains('cott', case=False, na=False)
X_test['has_polyester'] = X_test['Fabric'].str.contains('poly', case=False, na=False)
X_test['has_lycra']     = X_test['Fabric'].str.contains('lycr', case=False, na=False)

#Coluna Brand Fit
X_test['is_regular'] = X_test['Brand Fit'].str.contains('reg', case=False, na=False)
X_test['is_slim']    = X_test['Brand Fit'].str.contains('slim', case=False, na=False)
X_test['is_fit']     = X_test['Brand Fit'].str.contains('fit', case=False, na=False)

In [558]:
X_test.shape

(5600, 45)

In [559]:
X_test.head()

Unnamed: 0,_id,average_rating,number_of_reviews,brand,category,crawled_at,description,images,out_of_stock,avg_delivery_time_days,pid,seller,sub_category,fabrication_time,title,Fabric,Fabric Care,Hooded,Pockets,Pack Of,Secondary Color,Style Code,Ideal For,Reversible,Neck,Generic Name,Brand Fit,Pattern,Sleeve,Country of Origin,Size,Fit,Number of Contents in Sales Package,Suitable For,Occasion,Color,Neck Type,number_images,product_details_count,has_cotton,has_polyester,has_lycra,is_regular,is_slim,is_fit
0,35d28961-170f-515d-90d4-55ee978a8afa,4.8,40,Mo,Clothing and Accessories,2021-02-10 21:18:47,Accentuate your formal wardrobe by buying this...,[https://rukminim1.flixcart.com/image/128/128/...,False,17,SHTFDYZGEXPED7YS,KKSONS,Topwear,32,Men Slim Fit Printed Button Down Collar Casual...,Cotton Blend,"Cold water wash only, Gentle Machine Wash",,,1.0,,11,,No,,Shirt,,Printed,Full Sleeve,India,,Slim,,Western Wear,,White,,4,14,True,False,False,False,False,False
1,5089618f-7eec-571b-84b6-41c3b43a2ed4,3.7,31,True Bl,Clothing and Accessories,2021-02-10 22:12:20,undefined,[https://rukminim1.flixcart.com/image/128/128/...,False,10,TROFM47EVWRYK7ZU,KAPSONSRETAILPVTLTD,Bottomwear,318,Slim Fit Men Brown Cotton Blend Trousers,Cotton Blend,Gentle Machine Wash,,,1.0,,,,,,Trousers,,Solid,,India,,Slim Fit,,Western Wear,Casual,Brown,,5,14,True,False,False,False,False,False
2,84716375-2ea3-5376-bca1-ecef76b0aa87,3.2,26,vims rai,Clothing and Accessories,2021-02-11 01:16:36,undefined,[https://rukminim1.flixcart.com/image/128/128/...,False,8,FABFZAMPGDW3BKGX,WHITE SKY,Fabrics,411,Rayon Printed Shirt Fabric (Unstitched),Rayon,Regular Machine Wash,,,,,Rayon Printed Shirt Fabric (Unstitched) s-155,Men,,,Fabric,,Printed,,India,,,,,Formal,Gold,,4,15,False,False,False,False,False,False
3,84120873-2b67-5f2c-9074-803e52e278f7,3.6,30,True Bl,Clothing and Accessories,2021-02-10 22:21:32,undefined,[https://rukminim1.flixcart.com/image/128/128/...,False,10,BZRFNMGPNBQTYHMG,KAPSONSRETAILPVTLTD,"Blazers, Waistcoats and Suits",431,Solid Single Breasted Formal Men Full Sleeve B...,Polyester,,,,1.0,,20318126204,,,,Blazer,,Solid,Full Sleeve,India,,,,,Formal,Blue,,5,11,False,True,False,False,False,False
4,96238f84-3bb8-5ec1-b2f4-fe481ba35faa,2.5,21,ECKO Unl,Clothing and Accessories,2021-02-10 22:32:00,ECKO Unltd YD CHECK Cotton Woven Slim Fit IND...,[https://rukminim1.flixcart.com/image/128/128/...,False,5,SHTFV5G6PZDQZEVZ,SandSMarketing,Topwear,512,Men Slim Fit Checkered Cut Away Collar Casual ...,Cotton Blend,Gentle Machine Wash,,,1.0,,EKSH001497,,No,,Shirt,,Checkered,Full Sleeve,India,,Slim,,Western Wear,,Brown,,5,13,True,False,False,False,False,False


In [560]:
X_test = X_test.drop(cols_drop, axis=1)
X_test = X_test.drop('pid', axis=1)

In [561]:
X_test = pd.get_dummies(X_test, columns=onehot)
X_test[te_cols] = te.transform(X_test[te_cols])

In [562]:
X_test.shape

(5600, 43)

In [563]:
X_test = X_test[best_features]

In [564]:
X_test.columns = X_test.columns.map(lambda x: inflection.parameterize(x, separator='_'))

In [565]:
X_test

Unnamed: 0,average_rating,number_of_reviews,brand,avg_delivery_time_days,seller,sub_category,fabrication_time,product_details_count,fabric,fabric_care,hooded,pockets,pack_of,secondary_color,style_code,ideal_for,reversible,neck,generic_name,brand_fit,pattern,sleeve,country_of_origin,size,fit,number_of_contents_in_sales_package,occasion,color,neck_type,number_images,has_polyester
0,4.8,40,1448.529978,17,1492.392079,1313.973719,32,14,1494.032586,1471.668584,1471.668584,1471.668584,1433.338691,1471.668584,699.000000,1471.668584,1611.767665,1471.668584,1644.928768,1471.668584,1296.356750,1932.401538,1319.394221,1471.668584,1435.568932,1471.668584,1471.668584,1526.115711,1471.668584,4,False
1,3.7,31,2604.845754,10,2785.690634,1964.134989,318,14,1494.032586,1585.319130,1471.668584,1471.668584,1433.338691,1471.668584,1471.668584,1471.668584,1471.668584,1471.668584,2122.069750,1471.668584,1610.420131,1471.668584,1319.394221,1471.668584,2114.327934,1471.668584,1864.870112,1659.081426,1471.668584,5,False
2,3.2,26,1204.640000,8,1219.889861,1194.734769,411,15,1256.315928,1203.008121,1471.668584,1471.668584,1471.668584,1471.668584,1471.668584,1386.126367,1471.668584,1471.668584,1346.479741,1471.668584,1296.356750,1471.668584,1319.394221,1471.668584,1471.668584,1471.668584,1379.115121,1351.370525,1471.668584,4,False
3,3.6,30,2604.845754,10,2785.690634,3499.777676,431,11,2196.397428,1471.668584,1471.668584,1471.668584,1433.338691,1471.668584,1471.668584,1471.668584,1471.668584,1471.668584,4122.239463,1471.668584,1610.420131,1932.401538,1319.394221,1471.668584,1471.668584,1471.668584,1379.115121,2015.999399,1471.668584,5,True
4,2.5,21,1285.419997,5,1272.123859,1313.973719,512,13,1494.032586,1585.319130,1471.668584,1471.668584,1433.338691,1471.668584,1471.668584,1471.668584,1611.767665,1471.668584,1644.928768,1471.668584,1480.570762,1932.401538,1319.394221,1471.668584,1435.568932,1471.668584,1471.668584,1659.081426,1471.668584,5,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5595,3.4,28,1623.621571,9,1622.429351,1313.973719,8,18,1494.032586,1268.810918,1471.668584,1471.668584,1603.164574,1471.668584,1471.668584,1386.126367,1611.767665,1471.668584,1471.668584,1231.379065,1490.104789,1196.135202,1471.668584,1172.614328,1309.100233,1471.668584,1471.668584,1471.668584,1164.333257,5,False
5596,4.2,35,1129.281095,13,2197.899494,1708.707887,72,5,1372.900270,1471.668584,1471.668584,1471.668584,1471.668584,1471.668584,1471.668584,1386.126367,1471.668584,1471.668584,1471.668584,1471.668584,1471.668584,1471.668584,1471.668584,1471.668584,1471.668584,1328.481759,1808.089336,1471.668584,1471.668584,5,False
5597,0.0,0,8599.000000,4,8599.000000,3499.777676,378,14,1678.620375,3734.969095,1471.668584,2299.840259,1433.338691,1471.668584,1471.668584,1471.668584,1471.668584,1471.668584,4122.239463,1471.668584,1480.570762,1932.401538,1319.394221,1471.668584,1471.668584,1471.668584,8599.000000,1813.001319,1471.668584,5,False
5598,3.3,27,1332.849595,8,1324.020497,1313.973719,524,14,1494.032586,1203.008121,1471.668584,1471.668584,1433.338691,1471.668584,1471.668584,1386.126367,1471.668584,1471.668584,1471.668584,1382.258658,1296.356750,1196.135202,1471.668584,1199.286925,1309.100233,1471.668584,1471.668584,1471.668584,1334.841625,4,False


## Predict

In [566]:
#X_test = X_test.drop(['brand_fit','fabric'], axis=1)

In [567]:
## same order
train_column_order = X_train.columns
X_test = X_test[train_column_order]

In [568]:
y_pred = rf.predict(X_test)


## submissao

In [569]:
X_test = pd.read_json(path + 'test.json', orient='split')

In [570]:
X_test['actual_price'] = y_pred

In [571]:
cols_sub = ['pid','actual_price']
df_submission = X_test[cols_sub]

In [572]:
df_submission.to_csv('nona_submissao.csv', index=False)

In [573]:
df_submission

Unnamed: 0,pid,actual_price
0,SHTFDYZGEXPED7YS,1364.346379
1,TROFM47EVWRYK7ZU,2652.326934
2,FABFZAMPGDW3BKGX,2112.960990
3,BZRFNMGPNBQTYHMG,5224.328525
4,SHTFV5G6PZDQZEVZ,1580.395238
...,...,...
5595,TSHFHHYFYUDABZAV,1529.379385
5596,DHTFZ3WE9XGWEH3H,1748.636144
5597,BZRFZQC6NXA5QV7F,8392.956190
5598,TSHFMKUGFR6HSP7E,1301.961493
