<center><h1>HSE FCS SE ML</h1></center>
<center><h1>"Fast sold post prediction" Kaggle competition</h1></center>
<center><h2>Team name: Turbo 3D</h2></center>
<center><h3>Daniil Kraynov, Dmitry Strokov, Danil Kolesnikov</h3></center>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

%matplotlib inline

# Read Data

In [None]:
# Load the tab-separated training set; the 'Unnamed: 0' column (presumably a
# leftover saved index — confirm) is dropped right after reading.
df = pd.read_csv('train.tsv', sep='\t').drop(columns='Unnamed: 0')

In [None]:
df.head()

In [None]:
df.info()

# Preprocessing "properties" feature

In [None]:
import json

def parse_properties(jstr_orig):
    """Parse one raw ``properties`` string into Python objects.

    The raw text is turned into JSON by swapping single and double quotes.
    Before the swap, a fixed list of double-quoted brand names that contain an
    apostrophe is replaced by the placeholder ``'_'`` (the apostrophe would
    otherwise break the swap), and the literal escape text ``\\xa0`` is
    replaced by ``_``.

    Parameters
    ----------
    jstr_orig : str
        Raw content of one ``properties`` cell.

    Returns
    -------
    object
        Whatever ``json.loads`` yields for the normalised string.

    Raises
    ------
    json.JSONDecodeError
        If the normalised string is still not valid JSON. The offending string
        is printed first so that a new special case can be added.
    """
    # (old, new) pairs applied in order before the quote swap.
    replacements = (
        ("\"Cee'd\"", "'_'"),
        ("\\xa0", "_"),
        ("\"Levi's\"", "'_'"),
        ("\"Victoria's Secret\"", "'_'"),
        ("\"O'Stin\"", "'_'"),
        ("\"Carter's\"", "'_'"),
        ("\"Colin's\"", "'_'"),
    )
    jstr = jstr_orig
    for old, new in replacements:
        jstr = jstr.replace(old, new)

    # Swap ' and " through a temporary marker that cannot clash with either.
    jstr = jstr.replace('"', '!@#$').replace("'", '"').replace('!@#$', "'")

    try:
        return json.loads(jstr)
    except json.JSONDecodeError:
        # Show the string that failed, then let the original error propagate
        # instead of parsing a second time just to raise it again.
        print(jstr)
        raise

min_usage = 50

def preprocess_properties(df_source):
    """Expand the ``properties`` column into one ``slug_<slug_id>`` column per property.

    Every row's ``properties`` string is parsed with ``parse_properties`` into
    a list of dicts carrying ``slug_id`` and ``value_id``. Properties that
    occur fewer than the module-level ``min_usage`` times over the whole frame
    are ignored. For the remaining ones a column ``slug_<slug_id>`` is added,
    holding the row's ``value_id`` or ``-1`` where the row lacks that property.

    Rows are addressed by position, so the result is correct for any index on
    ``df_source`` (not only a fresh ``RangeIndex``).

    Parameters
    ----------
    df_source : pandas.DataFrame
        Frame with a ``properties`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df_source`` without ``properties`` plus the slug
        columns, in order of first appearance.
    """
    s = df_source.drop(columns=['properties'])

    props_col = df_source['properties'].apply(parse_properties).values
    n_rows = len(props_col)

    # First pass: how many times each property occurs in the whole frame.
    all_props = dict()
    for props in props_col:
        for prop in props:
            prop_name = "slug_" + prop['slug_id']
            all_props[prop_name] = all_props.get(prop_name, 0) + 1

    # Second pass: collect each frequent property as a plain Python list and
    # attach them all at once; per-cell DataFrame writes are far slower.
    new_columns = dict()
    for row, props in enumerate(props_col):
        if row % 20000 == 0:
            print("Done: {:.2f}%".format(row / n_rows * 100))
        for prop in props:
            prop_name = "slug_" + prop['slug_id']
            if all_props[prop_name] < min_usage:
                continue
            values = new_columns.get(prop_name)
            if values is None:
                # Keep values of an already existing column of the same name.
                if prop_name in s.columns:
                    values = s[prop_name].tolist()
                else:
                    values = [-1] * n_rows
                new_columns[prop_name] = values
            values[row] = prop["value_id"]

    if not new_columns:
        return s

    slug_df = pd.DataFrame(new_columns, index=s.index)
    already_there = [name for name in new_columns if name in s.columns]
    if already_there:
        s = s.drop(columns=already_there)
    return pd.concat([s, slug_df], axis=1)

In [None]:
df2 = preprocess_properties(df)

In [None]:
df2.head()

In [None]:
df2.info()

# Study/preprocess categorical features

First of all, we need to understand what categories are important before applying one-hot encoding to categorical features.

We assume that a category's importance depends on its usage (number of entries) and its sold probability.

In [None]:
def get_category_sold_probability(df_source, column, value):
    """Return the share of rows with ``sold_fast == 1`` among rows where ``column == value``.

    ``column`` is an array aligned row-by-row with ``df_source``; it is used
    as a mask over the ``sold_fast`` values. Raises ``ZeroDivisionError`` when
    no row matches ``value``.
    """
    sold_flags = df_source['sold_fast'].values
    flags_in_category = sold_flags[column == value]
    sold_count = int((flags_in_category == 1).sum())
    return sold_count / len(flags_in_category)

In [None]:
# Magnitude is defined as abs(sold_probability - 0.5): the distance of the
# category's sold probability from 0.5.
def get_category_magnitude(df_source, column, value):
    """Return ``abs(p - 0.5)`` where ``p`` is the category's sold probability."""
    sold_probability = get_category_sold_probability(df_source, column, value)
    distance_from_half = sold_probability - 0.5
    return abs(distance_from_half)

In [None]:
# One row per (feature, category) pair that is used often enough:
# [feature name, category value, usage count, usage share as text, magnitude].
all_categories = []

# Categories seen fewer than this many times are not recorded.
min_usage = 50

# Columns that are not scanned for categories (includes the target 'sold_fast').
except_columns = ['date_created', 'delivery_available', 'payment_available',
                  'img_num', 'lat', 'long', 'price', 'product_id', 'sold_fast']

n_features = len(df2.columns)
for feature_idx, feature_name in enumerate(df2.columns):
    if feature_name in except_columns:
        print("\rSkipping", feature_name, "[", feature_idx + 1, "of", n_features, "]", " " * 100, end='')
        continue
    print("\rScanning", feature_name, "[", feature_idx + 1, "of", n_features, "]", " " * 100, end='')
    column = df2[feature_name].values
    categories, usages = np.unique(column, return_counts=True)
    # Distinct loop names: the inner loop no longer overwrites the outer index.
    for category, usage in zip(categories, usages):
        if usage < min_usage:
            continue
        # -1 in a slug column is the "property absent" filler, not a category.
        if "slug" in feature_name and category == -1:
            continue
        all_categories.append([
            feature_name,
            category,
            usage,
            "{:.3f}%".format(usage / len(column) * 100),
            get_category_magnitude(df2, column, category),
        ])

In [None]:
len(all_categories)

In [None]:
# Accumulates the (feature, category) pairs chosen for one-hot encoding.
selected_categories = set()

def select_top_categories(from_cat, n_select):
    """Add the first ``n_select`` (feature, category) pairs of ``from_cat`` to ``selected_categories``.

    ``from_cat`` must have ``feature`` and ``category`` columns; rows are taken
    in their current order, so sort the frame before calling.
    """
    leading_rows = slice(None, n_select)
    feature_names = from_cat['feature'].values[leading_rows]
    category_values = from_cat['category'].values[leading_rows]
    selected_categories.update(zip(feature_names, category_values))

In [None]:
# Turn the rows collected in `all_categories` into a table.
# NOTE(review): np.array() over rows that mix names, counts and floats
# presumably yields an array of strings, which would be why 'usage' and
# 'magnitude' are converted back to numbers below while 'category' stays
# text — confirm before changing how this frame is built.
cat_df = pd.DataFrame(
    np.array(all_categories),
    columns=['feature', 'category', 'usage', 'usage%', 'magnitude']
)
for col_name in ['usage', 'magnitude']:
    cat_df[col_name] = pd.to_numeric(cat_df[col_name])
cat_df.head()

Let's select the categories with the highest magnitude.

In [None]:
# Rank the categories by magnitude, largest first, and preview the top ten.
cat_df_by_magnitude = cat_df.sort_values(by=['magnitude'], ascending=False)
cat_df_by_magnitude.head(10)

In [None]:
select_top_categories(cat_df_by_magnitude, 100)

In [None]:
# Rank the categories by usage count, largest first, and preview the top ten.
cat_df_by_usage = cat_df.sort_values(by=['usage'], ascending=False)
cat_df_by_usage.head(10)

In [None]:
select_top_categories(cat_df_by_usage, 100)

In [None]:
selected_categories

# Encode Data

In [None]:
from sklearn.preprocessing import LabelEncoder

import time
import datetime

# Encode data to a numeric format
# Drop excess columns
# Apply one hot encoding
def encode_data(df_source):
    """Turn a preprocessed frame into an all-numeric feature frame.

    Steps, in order:

    * drop ``product_id`` and every column whose name contains ``slug``;
    * label-encode ``delivery_available``, ``payment_available``, ``city``,
      ``region`` and ``owner_id``;
    * convert ``date_created`` (``%Y-%m-%d`` text) to an integer timestamp via
      ``time.mktime``, which interprets the date in the machine's local
      timezone;
    * replace ``name_text`` and ``desc_text`` by their lengths;
    * add one 0/1 column named ``<feature>=<category>`` for every pair in the
      module-level ``selected_categories`` set, comparing the stringified
      source column with ``category``.

    The one-hot columns are added in sorted (feature, category) order. Plain
    iteration over the set would yield an order that can differ between
    interpreter sessions because of string hash randomisation, making the
    column layout non-reproducible.

    Parameters
    ----------
    df_source : pandas.DataFrame
        Output of ``preprocess_properties``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df_source`` is not modified.
    """
    lb = LabelEncoder()

    columns_to_drop = ['product_id'] + [name for name in df_source.columns if 'slug' in name]
    s = df_source.drop(columns=columns_to_drop)

    # Delivery/payment encoding: 0 is false (bad), 1 is true (good)
    s['delivery_available'] = lb.fit_transform(s['delivery_available'])
    s['payment_available'] = lb.fit_transform(s['payment_available'])
    # City/region encoding
    s['city'] = lb.fit_transform(s['city'])
    s['region'] = lb.fit_transform(s['region'])

    # Converting date to timestamp
    def convert_time(d):
        return int(time.mktime(datetime.datetime.strptime(d, "%Y-%m-%d").timetuple()))

    s['date_created'] = list(map(convert_time, s['date_created'].values))
    # Misc
    s['owner_id'] = lb.fit_transform(s['owner_id'])

    s['name_text'] = s['name_text'].apply(len)
    s['desc_text'] = s['desc_text'].apply(len)

    # Deterministic order; str() keys keep the sort total for mixed types.
    ordered_pairs = sorted(selected_categories, key=lambda pair: (str(pair[0]), str(pair[1])))

    one_hot = dict()
    cached_feature = None
    fcol = None
    for feature, category in ordered_pairs:
        # Sorting groups pairs by feature, so each source column is
        # stringified once instead of once per category.
        if fcol is None or feature != cached_feature:
            fcol = df_source[feature].apply(str).values
            cached_feature = feature
        ncol = np.zeros(len(fcol), dtype='int64')
        ncol[fcol == category] = 1
        one_hot['{}={}'.format(feature, category)] = ncol

    if not one_hot:
        return s

    # Attach all indicator columns in one step instead of inserting them one
    # at a time.
    one_hot_df = pd.DataFrame(one_hot, index=s.index)
    clashing = [name for name in one_hot if name in s.columns]
    if clashing:
        s = s.drop(columns=clashing)
    return pd.concat([s, one_hot_df], axis=1)

In [None]:
df_encoded = encode_data(df2)

In [None]:
df_encoded.head()

In [None]:
df_encoded.info()

# Preprocess data

In [None]:
# Split the encoded frame into the feature matrix and the 'sold_fast' target.
X = df_encoded.drop(columns=['sold_fast'])
y = df_encoded['sold_fast']

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fitting & Predicting

In [None]:
# To avoid kernel dying
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from xgboost import XGBClassifier
import xgboost as xgb

In [None]:
# from sklearn import metrics
# from matplotlib.pylab import rcParams
# rcParams['figure.figsize'] = 12, 4

# def modelfit(alg, Xt, yt, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
#     if useTrainCV:
#         xgb_param = alg.get_xgb_params()
#         xgtrain = xgb.DMatrix(Xt, label=yt)
#         cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
#             metrics='auc', early_stopping_rounds=early_stopping_rounds, verbose_eval=True)
#         alg.set_params(n_estimators=cvresult.shape[0])
    
#     #Fit the algorithm on the data
#     alg.fit(Xt, yt, eval_metric='auc')
        
#     #Predict training set:
#     dtrain_predictions = alg.predict(Xt)
#     dtrain_predprob = alg.predict_proba(Xt)[:,1]
        
#     #Print model report:
#     print("\nModel Report")
#     print("Accuracy : %.4g" % metrics.accuracy_score(yt, dtrain_predictions))
#     print("AUC Score (Train): %f" % metrics.roc_auc_score(yt, dtrain_predprob))
                    
#     feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
#     feat_imp.plot(kind='bar', title='Feature Importances')
#     plt.ylabel('Feature Importance Score')

In [None]:
# def do_gridsearch(static_params, search_params):
#     gsearch = GridSearchCV(
#         estimator = XGBClassifier(**static_params),
#         param_grid = search_params,
#         scoring='roc_auc',
#         n_jobs=4,
#         iid=False,
#         cv=5,
#         verbose=3
#     )
#     gsearch.fit(X, y)
#     return gsearch.best_params_, gsearch.best_score_

In [None]:
# xgb_params = {
#     'base_score': 0.5,
#     'booster': 'gbtree',
#     'objective': 'binary:logistic',
#     'colsample_bylevel': 1,
#     'gamma': 0.46,
#     'learning_rate': 0.1,
#     'n_estimators': 444,
#     'max_depth': 4,
#     'min_child_weight': 4,
#     'max_delta_step': 0,
#     'missing': None,
#     'reg_alpha': 0,
#     'reg_lambda': 1,
#     'scale_pos_weight': 1,
#     'seed': 27,
#     'silent': 1,
#     'subsample': 0.95,
#     'colsample_bytree': 0.8,
#     'nthread': 4,
#     'n_jobs': 4
# }

# search_params = {
#     'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
# }

# #bp, bs = do_gridsearch(xgb_params, search_params)
# #bp, bs

# xgbm = XGBClassifier(**xgb_params)
# modelfit(xgbm, X, y)

In [None]:
# from sklearn.model_selection import cross_val_score

# model = XGBClassifier(**xgb_params)

# cross_val_score(model, X, y, cv=5, scoring='roc_auc', verbose=3, n_jobs=4)

In [None]:
# from sklearn.model_selection import cross_val_score

# model = XGBClassifier()
# cross_val_score(model, X, y, cv=3, scoring='roc_auc', verbose=3, n_jobs=4)

In [None]:
# Hyper-parameters passed to XGBClassifier for both the cross-validation run
# and the final fit further down.
# NOTE(review): 'silent' may no longer be accepted by recent xgboost releases
# (replaced by 'verbosity') — confirm against the installed version.
xgb_params = {
    'objective': 'binary:logistic',
    'gamma': 0,
    'learning_rate': 0.01,
    'n_estimators': 5000, # Set best before real fit/predict!!!
    'max_depth': 6,
    'seed': 27,
    'silent': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.3,
    'n_jobs': 4
};

In [None]:
# 3-fold cross-validation on AUC; stops once the metric has not improved for
# 50 rounds, with 'n_estimators' as the upper bound on boosting rounds.
model = XGBClassifier(**xgb_params)
# Recorded from an earlier run (presumably CV AUC and the best round count — confirm):
#0.639196
#783 n_est
xgb.cv(
    model.get_xgb_params(),
    xgb.DMatrix(X, label=y),
    num_boost_round=xgb_params['n_estimators'],
    nfold=3,
    metrics='auc',
    early_stopping_rounds=50,
    verbose_eval=True)

# Test Data Prediction

In [None]:
# Stack the unlabeled test rows under the training rows so that both go
# through the same preprocessing/encoding pass.
df_test = pd.read_csv('test_nolabel.tsv', sep='\t').drop(columns='Unnamed: 0')
df_test["sold_fast"] = 0  # placeholder target for the unlabeled rows
df_merged = pd.concat([df, df_test], ignore_index=True)

In [None]:
# Encode train and test together, then cut the result back apart by position:
# the first len(df) rows are the training rows, the rest are the test rows.
n_train_rows = len(df)
df_merged_p = encode_data(preprocess_properties(df_merged))
df_mp = df_merged_p.iloc[:n_train_rows]
df_test_mp = df_merged_p.iloc[n_train_rows:]

In [None]:
# Training features/target and the test features; the test frame's
# 'sold_fast' column is dropped before prediction.
X_fit = df_mp.drop(columns=['sold_fast'])
y_fit = df_mp['sold_fast']
X_actual = df_test_mp.drop(columns=['sold_fast'])

In [None]:
# Fit the final model on all training rows and score the test rows.
# NOTE(review): no early stopping is used here, so the model trains for the
# full xgb_params['n_estimators'] rounds — confirm that value was set to the
# round count chosen from cross-validation before producing a submission.
model = XGBClassifier(**xgb_params)
model.fit(X_fit, y_fit)
# Column 1 of predict_proba is used as the score (presumably P(sold_fast == 1)).
y_actual = model.predict_proba(X_actual)[:, 1]
y_actual

In [None]:
# Write the submission: one row per test product with its predicted score.
submission_columns = {
    'product_id': df_test['product_id'].values,
    'score': y_actual,
}
df_final = pd.DataFrame(submission_columns)
df_final.to_csv('submission.csv', sep=',', index=False)