In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import re
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import make_scorer
from fuzzywuzzy import fuzz
import string
import xgboost as xgb
from xgboost import cv, XGBRegressor

In [2]:
shoes_train = pd.read_csv('data/shoes_train.csv')
shoes_test = pd.read_csv('data/shoes_test.csv')

In [3]:
def group_similar_values(data, threshold=80):
    grouped_brands = {}
    
    for brand in data:
        matched = False
        
        # Check if the brand is similar to any existing group
        for group in grouped_brands:
            similarity = fuzz.token_sort_ratio(brand, group)
            
            # If similarity is above the threshold, add brand to the group
            if similarity >= threshold:
                grouped_brands[group].append(brand)
                matched = True
                break
        
        # If the brand doesn't match any existing group, create a new group
        if not matched:
            grouped_brands[brand] = [brand]
    
    return grouped_brands

In [4]:
# brand
shoes_train['brand'].fillna('unbranded',inplace=True)
shoes_train['brand'] = shoes_train['brand'].map(str.lower)
shoes_train['brand'] = shoes_train['brand'].map(lambda x: 'unbranded' if x=='non' else x)
data = list(shoes_train['brand'])
grouped_brands = group_similar_values(data)
for group in grouped_brands:
    grouped_brands[group] = set(grouped_brands[group])

In [5]:
# style
shoes_train['style'].fillna('unstyle',inplace=True)
shoes_train['style'] = shoes_train['style'].map(str.lower)
shoes_train['style'] = shoes_train['style'].map(lambda x: 'pump' if x=='pumps, classics' else x)
shoes_train['style'] = shoes_train['style'].map(lambda x: 'ballet' if x=='ballet flats' else x)
shoes_train['style'] = shoes_train['style'].map(lambda x: 'loafers' if x=='loafers, moccasins' or x=='loafers & moccasins' or x=='moccasins' else x)
data = list(shoes_train['style'])
grouped_styles = group_similar_values(data)
for group in grouped_styles:
    grouped_styles[group] = set(grouped_styles[group])

In [6]:
# material
shoes_train['material'].fillna('unmaterial',inplace=True)
shoes_train['material'] = shoes_train['material'].map(str.lower)
data = list(shoes_train['material'])
grouped_materials = group_similar_values(data)
for group in grouped_materials:
    grouped_materials[group] = set(grouped_materials[group])

In [7]:
# color
shoes_train['color'] = shoes_train['color'].fillna(shoes_train['colour']).fillna(shoes_train['main_colour'])
shoes_train['color'].fillna('uncolor',inplace=True)
shoes_train['color'] = shoes_train['color'].map(str.lower)
shoes_train['color'] = shoes_train['color'].map(lambda x: 'uncolor' if x=='does not apply' else x)
data = list(shoes_train['color'])
grouped_colors = group_similar_values(data)
for group in grouped_colors:
    grouped_colors[group] = set(grouped_colors[group])

In [8]:
# percent missing values
percent_missing = shoes_train.isnull().sum() * 100 / len(shoes_train)
percent_missing

Unnamed: 0                        0.000000
id                                0.000000
title                             0.000000
price                             0.000000
brand                             0.000000
style                             0.000000
heel_type                        90.285714
heel_height                      61.628571
width                            79.228571
shoe_width                       84.014286
material                          0.000000
occasion                         87.821429
country_region_of_manufacture    84.107143
lining_material                  98.864286
upper_material                   79.457143
shoe_size                        97.607143
toe_shape                        88.285714
model                            83.542857
year_of_manufacture              99.000000
size                             79.578571
colour                           82.814286
color                             0.000000
main_colour                      93.207143
lining     

In [9]:
# Removing unnecessary columns/// or with really high missing values
def func1(df):
    df.drop(['heel_type', 'width','shoe_width','shoe_size','size','year_of_manufacture',
                     'lining','sole','platform_height','n_sold','n_watchers'], axis=1, inplace=True)
    return df

In [10]:
def better_name(df, col_name):
    df[col_name] = df[col_name].map(lambda x: str(x).translate(str.maketrans('', '', string.punctuation)))
    return(df)

In [11]:
def value_grouped(value, grouped_values):
    for group in grouped_values:
        if value in grouped_values[group]:
            return group

def replace_ampersand(string):
    if '&' in string:
        # print(string)
        result = string.replace(' ', '')
        result = result.replace('&', ' and ')
        # print(result)
        return result
    else:
        return string
    
def replace_dot(string):
    if '.' in string:
        # print(string)
        result = string.replace(' ', '')
        result = result.replace('.', '')
        # print(result)
        return result
    else:
        return string
    
def get_brand_from_title(row, brands1):
    if row['brand'] == 'unbranded':
        for brand in brands1:
            if (brand in row['title']) and len(brand) > 1 and brand != 'on':
                    return brand
        return 'unbranded'
    else:
        return row['brand']

In [12]:
# brand 2
def brand_func2(df, p1, p2, grouped_brands):
    df['brand'].fillna('unbranded',inplace=True)
    df['brand'] = df['brand'].map(str.lower)
    df['brand'] = df['brand'].map(lambda x: 'unbranded' if x=='non' else x)
    # data = list(shoes_train['brand'])
    # grouped_brands = group_similar_values(data)
    df['brand'] = df['brand'].apply(value_grouped, args=(grouped_brands,))
    df['title'] = df['title'].map(str.lower)

    brands1 = list(df['brand'].value_counts().head(p1).index)
    brands2 = ['demonia', 'mocassino', 'sanuk', 'damen', 'pin up', 'pleaser', 
                   'christian loubutin', 'laurent', 'blahnik', 'vuitton', 'orthaheel']
    brands1.extend(brands2)
    df['brand'] = df.apply(get_brand_from_title,args=(brands1,), axis=1)
    # df['brand'] = df['brand'].apply(value_grouped, args=(grouped_brands,))
    top_shoes_brands = list(df['brand'].value_counts().head(p2).index)
    df['brand'] = df['brand'].map(lambda x: x if x in top_shoes_brands else 'unbranded')
    return df

In [13]:
# style
def style_func(df, head, grouped_styles):
    # better_name(df, "style")
    df['style'].fillna('unstyle',inplace=True)
    df['style'] = df['style'].map(str.lower)
    df['style'] = df['style'].map(lambda x: 'pump' if x=='pumps, classics' else x)
    df['style'] = df['style'].map(lambda x: 'ballet' if x=='ballet flats' else x)
    df['style'] = df['style'].map(lambda x: 'loafers' if x=='loafers, moccasins' or x=='loafers & moccasins' or x=='moccasins' else x)
    # data = list(shoes_train['style'])
    # grouped_brands = group_similar_values(data)
    df['style'] = df['style'].apply(value_grouped, args=(grouped_styles,))
    top_shoes_styles = list(df['style'].value_counts().head(head).index)
    df['style'] = df['style'].map(lambda x: x if x in top_shoes_styles else 'unstyle')
    return df

In [14]:
# heel height
def heel_height_func(df):
    better_name(df, 'heel_height')
    def heel_height(text):
        if 'flat' in text:
            return 'flat'
        elif 'low' in text:
            return 'low'
        elif 'mid' in text or 'med' in text:
            return 'mid'
        elif 'high' in text:
            return 'high'
        return 'unheight'
    df['heel_height'].fillna('unheight',inplace=True)
    df['heel_height'] = df['heel_height'].map(str.lower)
    df['heel_height'] = df['heel_height'].map(lambda x: re.sub('[^A-Za-z0-9 ]+', '', x))
    df['heel_height'] = df['heel_height'].apply(heel_height)
    return df

In [15]:
# material
def material_func(df, head, grouped_materials):
    # better_name(df, 'material')
    df['material'] = df['material'].fillna(df['lining_material']).fillna(df['upper_material'])
    df.drop(['lining_material', 'upper_material'], axis=1, inplace=True)
    df['material'].fillna('unmaterial',inplace=True)
    df['material'] = df['material'].map(str.lower)
    # data = list(shoes_train['material'])
    # grouped_materials = group_similar_values(data)
    df['material'] = df['material'].apply(value_grouped, args=(grouped_materials,))
    top_shoes_materials = list(df['material'].value_counts().head(head).index) #top10
    df['material'] = df['material'].map(lambda x: x if x in top_shoes_materials else 'unmaterial')
    return df

In [16]:
# color
def color_func(df, head, grouped_colors):
    # better_name(df, 'color')
    df['color'] = df['color'].fillna(df['colour']).fillna(df['main_colour'])
    df.drop(['colour', 'main_colour'], axis=1, inplace=True)
    df['color'].fillna('uncolor',inplace=True)
    df['color'] = df['color'].map(str.lower)
    df['color'] = df['color'].map(lambda x: 'uncolor' if x=='does not apply' or x=='see picture' else x)
    df['color'] = df['color'].apply(value_grouped, args=(grouped_colors,))
    top_shoes_colors = list(df['color'].value_counts().head(head).index)
    df['color'] = df['color'].map(lambda x: x if x in top_shoes_colors else 'uncolor')
    return df

In [17]:
# condiiton + category
def condition_category_func(df):
    better_name(df, 'condition')
    better_name(df, 'category')
    df['condition'].fillna('uncondition',inplace=True)
    df['category'].fillna('uncategory',inplace=True)
    return df

In [18]:
# vintage
def vintage_func(df):
    better_name(df, 'vintage')
    def vintage(text):
        no = ['No','Nein','Not','Non']
        yes = ['Yes','Ja','Sì']
        if text in no:
            return 'no'
        elif text in yes:
            return 'yes'
        return 'unvivntage'
    df['vintage'] = df['vintage'].apply(vintage)
    return df

In [19]:
# occasion
def occasion_func(df):
    def occasion(text):
        if 'casual' in text:
            return 'casual'
        elif 'party' in text or 'cocktail' in text or 'clubwaer' in text:
            return 'party'
        elif 'work' in text:
            return 'work'
        elif 'all' in text or 'any' in text or 'versatile' in text or 'various' in text or 'everyday' in text:
            return 'all'
        elif 'wedding' in text:
            return 'wedding'
        return 'unoccasion'
    df['occasion'].fillna('unoccasion',inplace=True)
    df['occasion'] = df['occasion'].map(str.lower)
    df['occasion'] = df['occasion'].map(lambda x: re.sub('[^a-z0-9 ]+', ' ', x))
    df['occasion'] = df['occasion'].apply(occasion)
    return df

In [20]:
# location
def location_func(df):
    better_name(df, 'location')
    def location(text):
        if 'japan' in text:
            return 'japan'
        elif 'united kingdom' in text:
            return 'uk'
        elif 'united states' in text:
            return 'usa'
        elif 'germany' in text:
            return 'germany'
        elif 'poland' in text:
            return 'poland'
        elif 'china' in text:
            return 'china'
        elif 'italy' in text:
            return 'italy'
        elif 'france' in text:
            return 'france'
        elif 'australia' in text:
            return 'australia'
        return 'unlocation'
    df['location'].fillna('unlocation',inplace=True)
    df['location'] = df['location'].map(str.lower)
    df['location'] = df['location'].apply(location)
    return df


In [21]:
# country_region_of_manufacture
def country_region_of_manufacture_func(df):
    better_name(df, 'country_region_of_manufacture')
    df['country_region_of_manufacture'].fillna('unknown',inplace=True)
    df['country_region_of_manufacture'] = df['country_region_of_manufacture'].map(str.lower)
    top_shoes_regions = list(df['country_region_of_manufacture'].value_counts().head(10).index)
    df['country_region_of_manufacture'] = df['country_region_of_manufacture'].map(lambda x: x if x in top_shoes_regions else 'unknown')
    return df

In [22]:
# title + seller_notes
def title_seller_notes_func(df):
    df['title'].fillna('untitle',inplace=True)
    df['seller_notes'].fillna('unseller_notes',inplace=True)
    df['title'] = df['title'].map(lambda x: str.lower(x) if type(x)==str else '')
    df['seller_notes'] = df['seller_notes'].map(lambda x: str.lower(x) if type(x)==str else '')
    def title_seller_notes(row):
        pos = ['new','good','great']
        neg = ['used','preowned','wear','scuffs','scrathces']
        x = 0
        if 'new' in row['title']:
            x += 1
        if 'new' in row['seller_notes']:
            x += 1
        for word1 in neg:
            for word2 in neg:
                if word1 in row['title'] and word2 in row['seller_notes']:
                    x -= 2
                if word1 in row['title'] != word2 in row['seller_notes']:
                    x -= 1
        return x
    df['title_seller_notes_score'] = df.apply(title_seller_notes,axis=1)
    df['title_seller_notes_score'].value_counts()
    return df

In [23]:
def func2(df):
    df.drop(['title','toe_shape','model','closure','pattern','theme','fastening','seller_notes',
            'Unnamed: 0'],axis=1,inplace=True)
    df.fillna(0,inplace=True)
    return df

In [24]:
def clear_data(df, p1, p2, style, material, color, grouped_brands, grouped_styles, grouped_materials, grouped_colors):
    df1 = func1(df)
    df2 = brand_func2(df1, p1, p2, grouped_brands)
    df3 = style_func(df2,style, grouped_styles)
    df4 = heel_height_func(df3)
    df5 = material_func(df4,material, grouped_materials)
    df6 = color_func(df5,color, grouped_colors)
    df7 = condition_category_func(df6)
    df8 = vintage_func(df7)
    df9 = occasion_func(df8)
    df10 = location_func(df9)
    df11 = country_region_of_manufacture_func(df10)
    df12 = title_seller_notes_func(df11)
    df13 = func2(df12)
    return df13

# Model assesment for us

## CV

In [25]:
def get_ohe(df):
    temp_df = pd.DataFrame(data=ohe.transform(df[['brand','style','heel_height','material','occasion','country_region_of_manufacture',
                                                  'color','vintage','location','condition','category']]), columns=ohe.get_feature_names_out())
    df.drop(columns=['brand','style','heel_height','material','occasion','country_region_of_manufacture',
                                                  'color','vintage','location','condition','category'], axis=1, inplace=True)
    df = pd.concat([df.reset_index(drop=True), temp_df], axis=1)
    return df

In [26]:
#### DONT RUN THIS BLOCK - VERY LONG CV ####

# performance_metrics = {}
# x_range = [50, 75, 100, 125, 150, 175, 200]
# y_range = [0, 10, 25, 50, 75]
# params = {
#     'objective': 'reg:squarederror',
#     'eval_metric': 'rmse'
# }
# num_rounds = 150
# shoes_train = pd.read_csv('shoes_train.csv')
# kf = KFold(n_splits=5, random_state=42, shuffle=True)
# shoes_train['price'] = shoes_train['price'].apply(np.log)
# ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
# mse = make_scorer(mean_squared_error)
# for p1 in x_range:
#     for p2 in x_range:
#         for style in y_range:
#             for material in y_range:
#                 for color in y_range:
#                     shoes_train1 = shoes_train.copy()
#                     df = clear_data(shoes_train1, p1,p2,style,material,color, grouped_brands, grouped_styles, grouped_materials,grouped_colors)
                    
#                     X = df.loc[:,df.columns != 'price']
#                     y = df['price']
                    
#                     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


#                     ohe.fit(X_train[['brand','style','heel_height','material','occasion','country_region_of_manufacture',
#                                                   'color','vintage','location','condition','category']])
                    
#                     X_train = get_ohe(X_train)
#                     X_test = get_ohe(X_test)
                    
#                     xgb = XGBRegressor(n_jobs = -1,random_state=42)
                    
#                     scores = cross_val_score(xgb, X_train, y_train, cv=kf, scoring=mse, error_score='raise')

#                     name = f"{p1}_{p2}_{style}_{material}_{color}"
#                     avg_score = np.sqrt(np.mean(scores))
#                     print(f"{name}__{avg_score}")                                       
#                     performance_metrics[name] = avg_score

### The best rmse mean from above:
#### 175_200_50_10_0 -> 0.499
#### 200_200_75_10_15 -> 0.5004
#### 125_200_75_10_10 -> 0.5008
#### 175_200_75_75_25 -> 0.5011

In [33]:
shoes_train = pd.read_csv('data/shoes_train.csv')
df = shoes_train
# df = clear_data(shoes_train, 175, 200, 50, 10, 0, grouped_brands, grouped_styles, grouped_materials,grouped_colors)

In [34]:
df['price'] = df['price'].apply(np.log)
X = df.loc[:,df.columns != 'price']
y = df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) #can add random_state=42

X_train = clear_data(X_train, 175, 200, 50, 10, 0, grouped_brands, grouped_styles, grouped_materials,grouped_colors)
X_test = clear_data(X_test, 175, 200, 50, 10, 0, grouped_brands, grouped_styles, grouped_materials,grouped_colors)



ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe.fit(X_train[['brand','style','heel_height','material','occasion','country_region_of_manufacture',
                                                  'color','vintage','location','condition','category']])

In [35]:
X_train = get_ohe(X_train)
X_test = get_ohe(X_test)

#### Gradient boost test

In [36]:
gb = HistGradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

0.5063481612416031

#### XGBoost test

In [42]:
xgb = XGBRegressor()
xgb.fit(X_train,y_train)
y_pred = xgb.predict(X_test)
np.sqrt(mean_squared_error(y_test, y_pred))

0.49790942860003923

### XGB is the winner!

# Model for giora

In [50]:
shoes_train = pd.read_csv('data/shoes_train.csv')
shoes_test = pd.read_csv('data/shoes_test.csv')
train = clear_data(shoes_train, 175, 200, 50, 10, 0, grouped_brands, grouped_styles, grouped_materials,grouped_colors)
test = clear_data(shoes_test, 175, 200, 50, 10, 0, grouped_brands, grouped_styles, grouped_materials,grouped_colors)

In [51]:
train['price'] = train['price'].apply(np.log)

In [52]:
X = train.loc[:,train.columns != 'price']
y = train['price']

ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')
ohe.fit(X[['brand','style','heel_height','material','occasion','country_region_of_manufacture',
           'color','vintage','location','condition','category']])


In [53]:
def get_ohe(df):
    temp_df = pd.DataFrame(data=ohe.transform(df[['brand','style','heel_height','material','occasion','country_region_of_manufacture',
                                                  'color','vintage','location','condition','category']]), columns=ohe.get_feature_names_out())
    df.drop(columns=['brand','style','heel_height','material','occasion','country_region_of_manufacture',
                                                  'color','vintage','location','condition','category'], axis=1, inplace=True)
    df = pd.concat([df.reset_index(drop=True), temp_df], axis=1)
    return df

X = get_ohe(X)
test = get_ohe(test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['brand','style','heel_height','material','occasion','country_region_of_manufacture',


In [54]:
xgb = XGBRegressor()
xgb.fit(X, y)

In [55]:
shoes_test['pred_price'] = xgb.predict(test)

In [56]:
shoes_test.loc[:,['id','pred_price']].to_csv('model07.csv',index=False)