In [1]:
# plotting
import plotly.express as px

# data handling
import pandas as pd
import numpy as np
from collections import defaultdict

# pre process
from sklearn.preprocessing import StandardScaler, LabelEncoder

# models
from sklearn.linear_model import LinearRegression
import catboost

# cross validation 
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error


# Set maximum display columns to 100
pd.set_option('display.max_columns', 100)

# reproducibility: seed both the stdlib RNG and NumPy's global RNG.
# np.random.rand and DataFrame.sample (without random_state) draw from NumPy's
# global generator, which random.seed() alone does not affect.
import random
random.seed(1140)
np.random.seed(1140)

# progress bar
from tqdm import tqdm
tqdm.pandas() # allow progress apply with groupby

# hyper-parameter tuning
import optuna

In [None]:
# read the data
# Both CSV paths are relative, so the files must sit in the notebook's working directory.

train = pd.read_csv("train_data.csv")
test = pd.read_csv("test_data.csv")

In [None]:
# Order the training rows chronologically (ties broken by deviceId) and renumber
# them from 0. The cross-validation below splits by row position, and the
# CatBoost models are configured with has_time=True, so row order matters.
train = train.sort_values(by=["eventTimestamp", "deviceId"]).reset_index(drop=True)

In [None]:
# Preview the first rows of the (now time-sorted) training data
train.head()

### What do I see?
- Time stamp is important for correct cross validation
- Most features are categorical
- c1 & c3 are categorical, c2 & c4 might be numerical


In [None]:
# What sizes are we dealing with? (rows, columns) for train and test
train.shape, test.shape

### Let's start by looking at eventTimestamp: time is important (and dangerous if ignored)



In [None]:
# Is the whole test set strictly later than the whole train set?
latest_train_time = train.eventTimestamp.max()
earliest_test_time = test.eventTimestamp.min()

print(f"train max time: {latest_train_time}")
print(f"test min time: {earliest_test_time}")

print(
    "not a time series! train time points are ahead.."
    if latest_train_time > earliest_test_time
    else "time series! test time points are ahead.."
)


### It looks like the test data and train data are not split by time... but..
### Maybe each device is its own time series?

In [None]:
# Pick a single device and pull its rows from both sets to compare their time ranges
example_device = "74f9b473fad"

device_df_train = train.query("deviceId == @example_device")
device_df_test = test.query("deviceId == @example_device")

In [None]:
# Same check as above, restricted to the example device's rows.
device_latest_train_time = device_df_train.eventTimestamp.max()
device_earliest_test_time = device_df_test.eventTimestamp.min()

print(f"train max time: {device_latest_train_time}")
print(f"test min time: {device_earliest_test_time}")

print(
    "not a time series! train time points are ahead.."
    if device_latest_train_time > device_earliest_test_time
    else "time series! test time points are ahead.."
)


##  Aha! Got you. This device's data in the train set is prior to its data in the test set
## Let's see if that is the case for every deviceId

In [None]:
# Get the max train timestamp and the min test timestamp per device
# (.to_frame() keeps deviceId as the index so the two frames can be merged on it)
max_time_train = train.groupby("deviceId")["eventTimestamp"].max().to_frame()
min_time_test = test.groupby("deviceId")["eventTimestamp"].min().to_frame()

In [None]:
# For every device present in both sets (inner merge), flag whether its
# earliest test event is strictly later than its latest train event.
device_max_train_min_test_df = max_time_train.merge(
    min_time_test, on="deviceId", suffixes=("_train", "_test")
)
test_minus_train = (
    device_max_train_min_test_df.eventTimestamp_test
    - device_max_train_min_test_df.eventTimestamp_train
)
device_max_train_min_test_df["is_time_series"] = (test_minus_train > 0).to_numpy()
device_max_train_min_test_df.head(3)

In [None]:
# True only if every device seen in both sets has all of its test events after its train events
print(f"do all devices behave as time series: {device_max_train_min_test_df.is_time_series.all()}")

### Do we have unseen devices in the test set?

In [None]:
# Count distinct devices per set and how many test devices never occur in train.
train_device_ids = set(train["deviceId"].unique())
test_device_ids = set(test["deviceId"].unique())

total_devices_train = train["deviceId"].nunique()
total_devices_test = test["deviceId"].nunique()
new_devices = len(test_device_ids.difference(train_device_ids))

print(f"train device count: {total_devices_train}")
print(f"test device count: {total_devices_test}")
print(f"new device count in test: {new_devices}")

### This is a time series problem with respect to deviceId

- Decision making: should I use time series split (with respect to deviceId) or should I use GroupKFold?
- for cross-validation (hyperparameter tuning, model selection, feature selection) I will use time series split 
- That is for use of time based features.

* Note - some devices in the test set are new (never seen before)

### Let's look at the target variable

In [None]:
# Count missing values in the target column
print(f"nans in target: {train.winBid.isna().sum()}")

In [None]:
# Largest winning bid in the training data
train.winBid.max()

In [None]:
# Histogram of the target restricted to winBid < 5 (larger bids are left out of this plot)
train.query("winBid < 5").winBid.plot(kind="hist", bins=20, title="histogram: winBid < 5", figsize = (20,5))

### From this plot output we see:
- Most of the winning bids are below 0.5 while the most extreme bid won with 3405
- No nan values


In [None]:
# Mean winning bid per unitDisplayType
train.groupby("unitDisplayType")["winBid"].mean().plot(kind="bar",  figsize = (10,2))

### From this plot output we see:
- banner is the cheapest to win
- rewarded is the most expensive

In [None]:
# Mean winning bid per ad size, sorted ascending so the cheapest size is leftmost
train.groupby("size")["winBid"].mean().sort_values().plot(kind="bar",  figsize = (10,2))

### From this plot output we see:
- The Cheapest size is 320x50
- the most expensive size is: 480x320

### Nan values check.

In [None]:
# Total number of missing cells across all columns, per set
print(f"train nans count: {train.isna().sum().sum()}")
print(f"test nans count: {test.isna().sum().sum()}")

In [None]:
# Which train columns contain missing values, and how many
train_isna = train.isna().sum()
train_isna[train_isna > 0]

In [None]:
# Which test columns contain missing values, and how many
test_isna = test.isna().sum()
test_isna[test_isna > 0]

### I will use mode imputation for countryCode, connectionType nan values.
- Note: For this test I will only handle these nans, in a real project I will implement fillna for every column in case such nans will appear in the future

In [None]:
# basic (baseline) pre_processing
def default_pre_process(df):
    """Baseline pre-processing: impute, label-encode, then drop identifier columns.

    Steps, in order:
      1. Fill missing ``countryCode`` / ``connectionType`` values with the most
         frequent value of that column within ``df``.
      2. Label-encode every ``object`` column (except the dropped ones), each
         with its own ``LabelEncoder``.
      3. Drop ``eventTimestamp`` and ``deviceId``.

    ``df`` is modified in place (imputed / encoded columns are written back)
    before the reduced frame is returned, so pass a copy if the raw frame is
    still needed.

    Note: encoders are fitted on whatever frame is passed in, so the integer
    codes are only consistent within a single call.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``eventTimestamp``, ``deviceId``, ``countryCode`` and
        ``connectionType``.

    Returns
    -------
    pd.DataFrame
        The processed frame without ``eventTimestamp`` and ``deviceId``.
    """
    to_drop = ["eventTimestamp", "deviceId"]

    # Impute BEFORE encoding. Depending on the scikit-learn version, a
    # LabelEncoder either fails on NaN mixed with strings or encodes NaN as its
    # own class; either way a fillna performed afterwards has no effect.
    for fillna_col in ["countryCode", "connectionType"]:
        most_common = df[fillna_col].value_counts().idxmax()
        df[fillna_col] = df[fillna_col].fillna(most_common)

    to_label_encode = [c for c in df if df[c].dtype == "object" and c not in to_drop]
    for col in to_label_encode:
        # A fresh encoder per column: each column gets its own class mapping.
        df[col] = LabelEncoder().fit_transform(df[col])

    return df.drop(to_drop, axis = 1)
    
    

In [None]:
def default_fit_predict_score(X_train, y_train, X_val, y_val,parms=None):
    """Fit a plain ``LinearRegression`` and return its RMSE on the validation fold.

    Parameters
    ----------
    X_train, y_train : training features / target.
    X_val, y_val : validation features / target.
    parms : unused here; accepted so the signature matches the other fit
        functions that ``cross_validation`` can call.

    Returns
    -------
    float
        Root mean squared error of the predictions on ``X_val``.
    """
    model = LinearRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # RMSE = sqrt(MSE). The root is taken explicitly because the
    # ``squared=False`` argument of mean_squared_error is deprecated and
    # removed in newer scikit-learn releases.
    score = np.sqrt(mean_squared_error(y_val, y_pred))

    return score

In [None]:
def cross_validation(df, save_train_indexs = None, save_test_indexs=None,fit_func=default_fit_predict_score, 
                     pre_process_func = default_pre_process, feature_enigneering_func=None, to_drop=None,
                    parms = None):
    """Main cross-validation routine (5-fold ``TimeSeriesSplit`` over row order).

    For every fold the train / validation parts are copied, pre-processed
    separately, optionally feature-engineered (validation receives the
    engineered train fold via ``train_set``), stripped of ``to_drop`` columns
    and passed to ``fit_func``, which returns the fold's RMSE.

    Parameters
    ----------
    df : pd.DataFrame
        Data to train and evaluate on; must contain ``winBid`` (target) and
        ``has_won``. Rows are split by position, so ``df`` should already be
        sorted by time.
    save_train_indexs, save_test_indexs : list or None
        Kept for reproducibility bookkeeping. When ``save_train_indexs`` is
        None, fresh lists are created and filled with each fold's indices.
        When lists are passed they are returned untouched; the splits
        themselves are always regenerated by ``TimeSeriesSplit``.
    fit_func : callable
        ``fit_func(X_train, y_train, X_val, y_val, parms) -> score``.
    pre_process_func : callable
        ``pre_process_func(frame) -> frame``.
    feature_enigneering_func : callable or None
        Called as ``f(X_train)`` and ``f(X_val, is_train=False, train_set=X_train)``.
    to_drop : list of str or None
        Columns to drop right before fitting. ``"has_won"`` is always added.
        The caller's list is never modified.
    parms : dict or None
        Hyper-parameters forwarded to ``fit_func``.

    Returns
    -------
    score : float
        Mean RMSE over the folds.
    save_train_indexs, save_test_indexs : list
        The index lists described above.
    """
    # Work on a private copy: appending to the argument itself would leak
    # "has_won" into the caller's list (and into a shared default list).
    columns_to_drop = list(to_drop) if to_drop is not None else []
    if "has_won" not in columns_to_drop:
        columns_to_drop.append("has_won")

    X = df.drop("winBid", axis = 1)
    y = df["winBid"]

    N_FOLDS = 5
    tscv = TimeSeriesSplit(n_splits = N_FOLDS)

    if save_train_indexs is None:
        save_train_indexs = []
        save_test_indexs = []
        save_indexs = True
    else:
        save_indexs = False

    scores = []

    for train_index, test_index in tqdm(tscv.split(X), total=N_FOLDS):
        if save_indexs:
            save_train_indexs.append(train_index)
            save_test_indexs.append(test_index)

        # TimeSeriesSplit yields row POSITIONS, so select positionally; this is
        # also correct when df does not carry a fresh 0..n-1 index.
        X_train = X.iloc[train_index].copy()
        X_val = X.iloc[test_index].copy()

        y_train = y.iloc[train_index]
        y_val = y.iloc[test_index]

        X_train = pre_process_func(X_train)
        X_val = pre_process_func(X_val)

        if feature_enigneering_func is not None:
            X_train = feature_enigneering_func(X_train)
            X_val = feature_enigneering_func(X_val, is_train=False, train_set = X_train )

        X_train = X_train.drop(columns_to_drop, axis = 1)
        X_val = X_val.drop(columns_to_drop, axis = 1)

        fold_score = fit_func(X_train, y_train, X_val, y_val, parms)
        scores.append(fold_score)

    score = sum(scores) / len(scores)
    print(f"RMSE score: {round(score,2)} [$]")
    return score, save_train_indexs, save_test_indexs


In [None]:
# Baseline CV score with all defaults (label-encoded features + LinearRegression).
# The returned fold indices are kept and passed to the later cross_validation calls.
score, save_train_indexs, save_test_indexs= cross_validation(train.copy())

## Let's try to improve the initial score
### Let's look at our categorical features

In [None]:
# Cardinality of every object-typed (categorical) column in train.
cat_features = [name for name, kind in train.dtypes.items() if kind == "object"]
print("number of unique values\n")

for col in cat_features:
    n_unique = train[col].nunique()
    print(f"{col}: {n_unique}")


In [None]:
# Cardinality of every non-object column, leaving out the target (winBid) and has_won.
numerical_features = [
    name
    for name, kind in train.dtypes.items()
    if kind != "object" and name not in ["winBid", "has_won"]
]
for col in numerical_features:
    n_unique = train[col].nunique()
    print(f"{col}: {n_unique}")

## Plan:

### Categorical features decisions
- Do not use deviceId 
- use catboost (see modeling decision) 
- convert "size" to width, length and total (width * length)

### Numerical features decisions
- try treating c2 and c4 as categorical features
- create new feature: sentPrice - bidFloorPrice,  total bids made (per device)

### Time based features
- for those features, test will receive the last train value per device if it exists, and will be mean imputed otherwise
- has_won expanding mean per device 
- "bids_speed" = number of bids per device / (max timestamp - min timestamp)
- "bids_speed" * has_won_expanding_mean

### Modeling decision 
- use CatBoost Regressor as a model - over 50% categorical features (maybe even more..)
- use RMSE as CatBoost's loss function
- use has_time = True // time series support


In [None]:
def iteration_1_feature_engineering(df, is_train = True, train_set = None):
    """Add size, price and per-device history features to ``df`` (in place).

    Always created:
      width / length - left / right part of ``size`` ("WxL"), as integers
      total          - width * length (total ad area)
      bid_diff       - sentPrice - bidFloorPrice

    Created from the frame itself when ``is_train`` is True (rows are assumed
    to be in chronological order; cumulative values follow row order):
      total_bids                - number of earlier rows of the same device
      has_won_expanding_average - earlier wins of the device / (earlier rows + 1)
      time_alive                - eventTimestamp - first eventTimestamp of the device + 1
      bids_speed                - total_bids / time_alive
      wins_speed                - bids_speed * has_won_expanding_average

    When ``is_train`` is False these five columns are copied from
    ``train_set``: each device gets its last train value, and devices that are
    absent from ``train_set`` get the mean of those last values.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to extend; modified in place and returned.
    is_train : bool
        Whether to compute the history features from ``df`` itself.
    train_set : pd.DataFrame or None
        Already engineered train frame; required when ``is_train`` is False.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object with the new columns.

    Raises
    ------
    ValueError
        If ``is_train`` is False and ``train_set`` is None.
    """
    # Split "WxL" once and reuse both halves.
    size_parts = df["size"].str.split("x",expand=True)
    df["width"] = size_parts[0].astype("int")
    df["length"] = size_parts[1].astype("int")
    df["total"] = df["width"]  * df["length"]

    df["bid_diff"] = df.sentPrice - df.bidFloorPrice

    # One shared column order for both branches, so train and test frames end
    # up with identically ordered feature columns.
    history_cols = ["total_bids", "has_won_expanding_average", "time_alive", "bids_speed", "wins_speed"]

    if is_train:
        df_gb_deviceId = df.groupby("deviceId")

        # Number of earlier rows of the same device (0 for its first row).
        bids_before = df_gb_deviceId["has_won"].cumcount()

        # Avoid leakage: subtract the current row so only past wins are counted.
        wins_before = df_gb_deviceId["has_won"].cumsum() - df["has_won"]

        device_start_time = df_gb_deviceId["eventTimestamp"].min()

        df["total_bids"] = bids_before

        # + 1 in the denominator avoids division by zero on a device's first row.
        df["has_won_expanding_average"] = wins_before / (bids_before + 1)

        # + 1 avoids division by zero when computing the speeds below.
        df["time_alive"] = df["eventTimestamp"] - df["deviceId"].map(device_start_time) + 1

        df["bids_speed"] = df["total_bids"] /  df["time_alive"]

        df["wins_speed"] = df["bids_speed"] * df["has_won_expanding_average"]

    else:
        if train_set is None:
            raise ValueError("train_set is required when is_train is False")

        for col in history_cols:
            last_train_values = train_set.groupby("deviceId")[col].agg('last')
            default_value = last_train_values.mean()
            # defaultdict supplies the mean for devices never seen in train_set.
            col_default_dict = defaultdict(lambda: default_value, last_train_values.to_dict())

            df[col] = df["deviceId"].map(col_default_dict)
    return df

In [None]:
# not using label encoder - using catboost instead.
def iteration_1_pre_process(df):
    """Mode-impute ``countryCode`` and ``connectionType`` in place and return ``df``.

    No label encoding is done here; categorical columns stay as they are.
    The fill value is the most frequent non-missing value of the column
    within the frame that is passed in.
    """
    columns_to_impute = ("countryCode", "connectionType")
    for column in columns_to_impute:
        frequencies = df[column].value_counts()
        df[column] = df[column].fillna(frequencies.idxmax())

    return df
    
    

In [None]:
# using catboost - because of high categorical features fraction
def iteration_1_fit_predict_score(X_train, y_train, X_val, y_val, parameters = None):
    """Fit a CatBoost regressor on the train fold and return its validation RMSE.

    Every ``object``-typed column of ``X_train`` is passed to CatBoost as a
    categorical feature. Defaults: RMSE loss, ``has_time=True``,
    100 iterations, silent training.

    Parameters
    ----------
    X_train, y_train : training features / target.
    X_val, y_val : validation features / target.
    parameters : dict or None
        Extra CatBoost parameters; they override the defaults above.

    Returns
    -------
    float
        Root mean squared error of the predictions on ``X_val``.
    """
    cat_features =  [c for c in X_train if X_train[c].dtype == "object"]

    parms = {"cat_features" : cat_features,
            "loss_function" : "RMSE",
            "has_time": True,
             'iterations': 100,
             'verbose' : 0}

    if parameters is not None:
        parms.update(parameters)

    model = catboost.CatBoostRegressor(**parms)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    # RMSE = sqrt(MSE). The root is taken explicitly because the
    # ``squared=False`` argument of mean_squared_error is deprecated and
    # removed in newer scikit-learn releases.
    score = np.sqrt(mean_squared_error(y_val, y_pred))

    return score

In [None]:
def iteration_1_fit_predict(X_train, y_train, X_test, parameters = None):
    """Train a CatBoost regressor and predict ``X_test``.

    Object-typed columns of ``X_train`` are declared categorical. The default
    settings (RMSE loss, has_time, 100 iterations, silent) can be overridden
    through ``parameters``.

    Returns
    -------
    tuple
        ``(predictions for X_test, fitted model)``.
    """
    object_columns = []
    for column in X_train:
        if X_train[column].dtype == "object":
            object_columns.append(column)

    overrides = {} if parameters is None else parameters
    settings = {
        "cat_features": object_columns,
        "loss_function": "RMSE",
        "has_time": True,
        "iterations": 100,
        "verbose": 0,
        **overrides,
    }

    regressor = catboost.CatBoostRegressor(**settings)
    regressor.fit(X_train, y_train)

    predictions = regressor.predict(X_test)
    return predictions, regressor

In [None]:
# check iteration #1 score
# Same cross-validation as the baseline, but with the CatBoost fit function,
# the imputation-only pre-processing and the engineered features.
score, save_train_indexs, save_test_indexs= cross_validation(train.copy(), 
                                                             save_train_indexs, 
                                                             save_test_indexs,
                                                             fit_func=iteration_1_fit_predict_score,
                                                            pre_process_func=iteration_1_pre_process,
                                                            feature_enigneering_func=iteration_1_feature_engineering)

In [None]:
# Add a pure-noise column.
# Later, any column that is not more important than this noise column is removed.
# A dedicated seeded generator keeps the noise (and therefore the list of
# removed columns) reproducible across kernel restarts.
noise_rng = np.random.default_rng(1140)
train["random_column"] = noise_rng.random(train.shape[0])
test["random_column"] = noise_rng.random(test.shape[0])

y_train = train["winBid"]

# NOTE: both helpers modify the frame they receive and return that same
# object, so after this cell `train` / `test` also carry the imputed values
# and the engineered columns (X_train is train, X_test is test).
X_train = iteration_1_pre_process(train)
X_train = iteration_1_feature_engineering(X_train)

X_test = iteration_1_pre_process(test)
X_test = iteration_1_feature_engineering(X_test, is_train=False, train_set = X_train)

# Only the fitted model is needed here (for feature importances); the
# predictions are discarded. Pass the engineered test frame explicitly.
_, final_model = iteration_1_fit_predict(X_train.drop(["has_won", "winBid"], axis = 1), y_train, X_test)



In [None]:
# Collect CatBoost's feature importances into a frame, most important first.
feature_names = final_model.feature_names_
feature_importance = final_model.get_feature_importance()

feature_importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance}
).sort_values('Importance', ascending=False)

# Bar chart of the importances (Plotly)
fig = px.bar(
    feature_importance_df,
    x='Feature',
    y='Importance',
    title='CatBoost Feature Importance',
)
fig.show()

In [None]:
# Columns whose importance does not exceed the noise column's importance are
# marked for removal (this includes random_column itself).
is_noise_row = feature_importance_df["Feature"] == "random_column"
random_col_importance = feature_importance_df.loc[is_noise_row, "Importance"].values[0]

not_above_noise = feature_importance_df["Importance"] <= random_col_importance
weak_features = feature_importance_df.loc[not_above_noise, "Feature"].unique()

# deviceId is never dropped: the submission cell reads it from the reduced test frame.
to_drop = [name for name in weak_features if name != "deviceId"]

In [None]:
# get cv score without non important columns 
# (to_drop is applied inside cross_validation after feature engineering, right before fitting)
score, save_train_indexs, save_test_indexs= cross_validation(train.copy(), 
                                                             save_train_indexs, 
                                                             save_test_indexs,
                                                             fit_func=iteration_1_fit_predict_score,
                                                            pre_process_func=iteration_1_pre_process,
                                                            feature_enigneering_func=iteration_1_feature_engineering,
                                                            to_drop=to_drop)

In [None]:
# Down-sample the data (20%) to make Optuna hyperparameter tuning affordable.
# random_state pins the sample so tuning runs see the same rows after a kernel
# restart; the sample is re-sorted by time because cross_validation splits by
# row position.
train_optuna = (
    train.sample(frac = 0.2, random_state = 1140)
    .sort_values(by = ["eventTimestamp", "deviceId"])
    .reset_index(drop=True)
)

In [None]:
def objective(trial):
    """Optuna objective: cross-validated RMSE on the down-sampled training data.

    Samples ``iterations``, ``depth`` and ``learning_rate`` from the trial and
    evaluates them with the iteration-1 pipeline; the value returned is the
    score reported by ``cross_validation`` (to be minimised).
    """
    # Hyperparameters to tune (sampled in this fixed order)
    n_iterations = trial.suggest_int('iterations', 10, 200)
    tree_depth = trial.suggest_int('depth', 2, 10)
    step_size = trial.suggest_float('learning_rate', 0.01, 0.1)
    candidate = {
        'iterations': n_iterations,
        'depth': tree_depth,
        'learning_rate': step_size,
    }

    cv_result = cross_validation(
        train_optuna.copy(),
        save_train_indexs=None,
        save_test_indexs=None,
        fit_func=iteration_1_fit_predict_score,
        pre_process_func=iteration_1_pre_process,
        feature_enigneering_func=iteration_1_feature_engineering,
        to_drop=to_drop,
        parms=candidate,
    )

    # cross_validation returns (score, train indices, test indices)
    return cv_result[0]

# Create an Optuna study and optimize the objective function.
# TPE is Optuna's default sampler; passing it explicitly with a seed makes the
# sequence of suggested trials repeatable. The search is still bounded by wall
# time, so the number of completed trials can differ between runs.
study = optuna.create_study(direction='minimize',
                            sampler=optuna.samplers.TPESampler(seed=1140))
study.optimize(objective, timeout= 60*60) # 60 minutes 

# Print the best parameters and best score
print('Best parameters:', study.best_params)
print('Best score:', study.best_value)

In [None]:
# get cv score with new parameters
# Evaluated on the full training data (the tuning itself ran on the 20% sample).
score, _, _= cross_validation(train.copy(), 
                                 save_train_indexs, 
                                 save_test_indexs,
                                 fit_func=iteration_1_fit_predict_score,
                                pre_process_func=iteration_1_pre_process,
                                feature_enigneering_func=iteration_1_feature_engineering,
                                to_drop=to_drop,
                             parms = study.best_params)

In [None]:
# Submission: retrain on the full training data with the tuned parameters and predict the test set.
# Make sure the label columns are removed from the training features
# (this extends the notebook-level to_drop list in place).
for col in ["has_won", "winBid"]:
    if col not in to_drop:
        to_drop.append(col)
        
# Same drop list for the test frame, minus the two label columns
# NOTE(review): presumably the test file has no has_won / winBid columns - confirm
to_drop_test = [c for c in to_drop if c not in ["has_won", "winBid"]]
        
y_train = train["winBid"]

# Both helpers modify the frame they are given and return it, so train / test are updated too
X_train = iteration_1_pre_process(train)
X_test = iteration_1_pre_process(test)

# Test rows take their per-device history features from the engineered train frame
X_train = iteration_1_feature_engineering(X_train)
X_test =  iteration_1_feature_engineering(X_test, is_train=False, train_set = X_train )

X_train = X_train.drop(to_drop, axis = 1)
X_test = X_test.drop(to_drop_test, axis = 1)
# The right-hand side is evaluated first, so the model predicts on X_test
# before the winBid prediction column is attached to it
X_test["winBid"], final_model = iteration_1_fit_predict(X_train, y_train, X_test,
                                                       parameters = study.best_params)

In [None]:
# Sanity check: total number of missing cells left in the test frame (features + predictions)
X_test.isna().sum().sum()

In [None]:
# Write one row per test record (deviceId, predicted winBid) to submission.csv in the working directory
X_test[["deviceId", "winBid"]].to_csv("submission.csv", index=False)

In [None]:
# Display the submitted columns
X_test[["deviceId", "winBid"]]