In [None]:
!pip install eli5

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

import eli5
from eli5.sklearn import PermutationImportance

from ast import literal_eval
from tqdm.notebook import tqdm

In [None]:
cd "/content/drive/My Drive/Colab Notebooks/dw_matrix"

In [None]:
# Read the shoes data set, relative to the current working directory.
df = pd.read_csv(
    filepath_or_buffer='data/men_shoes.csv',
    low_memory=False,
)

In [None]:
def run_model(feats, model=None):
    """Cross-validate a regressor on selected columns of the global ``df``.

    Parameters
    ----------
    feats : list of str
        Column names of ``df`` that form the feature matrix.
    model : estimator, optional
        Regressor handed to ``cross_val_score``. When omitted, a new
        ``DecisionTreeRegressor(max_depth=5)`` is created for this call.

    Returns
    -------
    tuple
        ``(mean, std)`` of the per-fold ``neg_mean_absolute_error`` scores;
        the target is the ``prices_amountmin`` column.
    """
    if model is None:
        # Build the default per call instead of sharing one instance that
        # was created when the function was defined.
        model = DecisionTreeRegressor(max_depth=5)

    X = df[feats].values
    y = df['prices_amountmin'].values

    scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error')
    return np.mean(scores), np.std(scores)

In [None]:
# Integer-encode the lower-cased brand text, then cross-validate the default
# model using that single column.
brand_lower = df['brand'].map(lambda value: str(value).lower())
df['brand_cat'] = brand_lower.factorize()[0]
run_model(['brand_cat'])

(-58.133398968282776, 4.206122611474276)

In [None]:
# Same single-feature run, this time with a seeded random forest.
model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
run_model(feats=['brand_cat'], model=model)

(-57.31783843165656, 4.181246596160967)

In [None]:
# Look at the first few raw entries of the `features` column.
df['features'].head().values

array(['[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SERVUS BY HONEYWELL"]},{"key":"manufacturer_part_number","value":["ZSR101BLMLG"]}]',
       '[{"key":"Gender","value":["Men"]},{"key":"Color","value":["Black"]},{"key":"Shipping Weight (in pounds)","value":["0.45"]},{"key":"Condition","value":["New"]},{"key":"Brand","value":["SER

In [None]:
# One raw `features` string, used to try out literal_eval by hand.
str_dict = '[{"key":"Gender","value":["Men"]},{"key":"Shoe Size","value":["M"]},{"key":"Shoe Category","value":["Men\'s Shoes"]},{"key":"Color","value":["Multicolor"]},{"key":"Manufacturer Part Number","value":["8190-W-NAVY-7.5"]},{"key":"Brand","value":["Josmo"]}]'

# Parse it and show the first value of the first entry.
first_entry = literal_eval(str_dict)[0]
first_entry['value'][0]

'Men'

In [None]:
def parse_features(x):
    """Turn one raw ``features`` cell into a flat ``{key: value}`` dict.

    The cell is expected to hold the text of a list of
    ``{"key": ..., "value": [...]}`` entries. Keys and values are lower-cased
    and stripped; only the first element of each ``value`` list is kept, and
    a key that occurs more than once keeps its last value.

    Parameters
    ----------
    x : str, float or None
        Raw cell content. ``None``, NaN and the text ``'nan'`` count as missing.

    Returns
    -------
    dict
        Parsed features; empty when the cell is missing.
    """
    # Missing cell: None, or anything whose text form is 'nan' (float NaN).
    if x is None or str(x) == 'nan':
        return {}

    output_dict = {}
    # Un-escape \" before evaluating the text as a Python literal.
    for item in literal_eval(x.replace('\\"', '"')):
        values = item['value']
        if not values:
            # Nothing to record for this key; skip it rather than fail on [0].
            continue

        key = item['key'].lower().strip()
        output_dict[key] = values[0].lower().strip()

    return output_dict

# Parse every raw features cell into a dict, one dict per row.
df['features_parsed'] = df.features.map(parse_features)

In [None]:
# Gather every distinct feature name that appears in any parsed record.
keys = set()
for parsed in df['features_parsed']:
    keys.update(parsed)

len(keys)

476

In [None]:
def get_name_feat(key):
    """Return the DataFrame column name used for feature ``key``."""
    prefix = 'feat_'
    return prefix + key

# One 'feat_<key>' column per discovered key; rows whose parsed dict lacks
# the key get NaN.
for key in tqdm(keys):
    df[get_name_feat(key)] = df['features_parsed'].map(
        lambda parsed: parsed.get(key, np.nan)
    )

In [None]:
# Inspect the column labels of df at this point in the notebook.
df.columns

Index(['id', 'asins', 'brand', 'categories', 'colors', 'count', 'dateadded',
       'dateupdated', 'descriptions', 'dimension',
       ...
       'feat_mechanic', 'feat_features', 'feat_bridge/temple size:',
       'feat_theme', 'feat_model', 'feat_suitable for',
       'feat_domestic shipping', 'feat_construction', 'feat_finish',
       'feat_pronation'],
      dtype='object', length=526)

In [None]:
# Percentage of rows that have a non-null value, per feature key.
# Counting the non-null mask directly avoids copying a filtered version of
# the whole frame for every key just to read its row count.
row_count = df.shape[0]
keys_stat = {}
for key in keys:
    filled = int(df[get_name_feat(key)].notnull().sum())
    keys_stat[key] = filled / row_count * 100

In [None]:
# Keep only the keys that are filled in for more than 30% of the rows.
{name: share for name, share in keys_stat.items() if share > 30}

{'brand': 48.62691466083151,
 'color': 47.784463894967175,
 'gender': 50.17505470459519,
 'manufacturer part number': 36.252735229759296,
 'material': 34.9070021881838}

In [None]:
# Integer-encode every 'feat_<key>' column into 'feat_<key>_cat'.
# factorize() gives missing values the code -1 by default, so rows without a
# feature share that code.
# The loop runs over all keys, so it already produces the brand, color,
# gender, manufacturer part number, material, sport and style codes; separate
# per-column assignments beforehand would only be overwritten here.
for key in keys:
    feat_name = get_name_feat(key)
    df[feat_name + '_cat'] = df[feat_name].factorize()[0]

In [None]:
# Lower-case the original brand column (as text), then show the shape of the
# rows where it equals the brand parsed from the features column.
df['brand'] = df['brand'].map(lambda raw_brand: str(raw_brand).lower())
df.loc[df['brand'] == df['feat_brand']].shape

(8846, 1002)

In [None]:
# Seed the forest so this score is repeatable from run to run.
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
run_model(["brand_cat"], model)

(-57.264681845174984, 4.156836219234714)

In [None]:
# Select the integer-encoded columns by their '_cat' suffix. A substring test
# would also match raw text columns that merely contain '_cat' (for example
# 'feat_catalog'), which then have to be excluded by hand.
feats_cat = [col for col in df.columns if col.endswith('_cat')]

In [None]:
feats = ["brand_cat", 'feat_brand_cat', 'feat_gender_cat', 'feat_material_cat', 'feat_style_cat', 'feat_sport_cat']
feats += feats_cat
# Drop duplicates but keep first-seen order: list(set(...)) orders the names
# by string hash, which can differ between interpreter sessions and so
# changes the column order fed to the model.
feats = list(dict.fromkeys(feats))

# Fixed seed so the cross-validation result is repeatable.
model = RandomForestRegressor(max_depth=5, n_estimators=100, random_state=0)
results = run_model(feats, model)

In [None]:
# Fit a single seeded forest on all rows and rank the features by
# permutation importance.
X = df[feats].values
y = df['prices_amountmin'].values

m = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=0)
m.fit(X, y)

# Cross-validation (mean, std) stored in `results` by an earlier cell.
print(results)

perm = PermutationImportance(m, random_state=1)
perm.fit(X, y)
eli5.show_weights(perm, feature_names=feats)

(-57.6558320037952, 4.2487963195656455)


Weight,Feature
0.2440  ± 0.0130,brand_cat
0.0998  ± 0.0115,feat_material_cat
0.0123  ± 0.0010,feat_weight_cat
0.0119  ± 0.0010,feat_brand_cat
0.0109  ± 0.0032,feat_adjustable_cat
0.0096  ± 0.0010,feat_fabric content_cat
0.0061  ± 0.0018,feat_resizable_cat
0.0057  ± 0.0002,feat_shoe category_cat
0.0047  ± 0.0009,feat_color_cat
0.0043  ± 0.0005,feat_fabric material_cat


In [None]:
# Relative frequency of each brand value.
df.brand.value_counts(normalize=True)

nike                       0.097210
puma                       0.033315
ralph lauren               0.028775
vans                       0.021116
new balance                0.020295
                             ...   
habit                      0.000055
1031                       0.000055
nissun                     0.000055
gitzo                      0.000055
polo sport ralph lauren    0.000055
Name: brand, Length: 1732, dtype: float64

In [None]:
# Five randomly drawn parsed-feature dicts from rows whose brand is 'nike'.
df.loc[df['brand'] == 'nike', 'features_parsed'].sample(5).values

array([{'sport': 'soccer', 'condition': 'new with box', 'type': 'cleats'},
       {'gender': 'men', 'shoe category': 'mens shoes', 'color': 'wolf grey/ivory/mystic green', 'casual & dress shoe style': 'athletic sneakers', 'manufacturer part number': '644843032', 'brand': 'nike', 'shoe width': 'medium (d, m)'},
       {'condition': 'new with tags', 'type': 'socks', 'sub type': 'over-the-calf'},
       {'condition': 'new with box', 'type': 'cleats'},
       {'material': 'suede', 'gender': 'men', 'shoe size': '11 d(m) us', 'size': '11 d(m) us', 'shoe category': 'men', 'color': 'challenge red/game royal/white/metallic gold', 'model': '091207264258', 'brand': 'nike', 'manufacturer_part_number': '313171-674-11'}],
      dtype=object)

In [None]:
# Distinct raw values found in the `weight` column.
df.weight.unique()

In [None]:
# Stage this notebook file in git (shell command, path relative to the current working directory).
!git add matrix_one/day5.ipynb