In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics
from sklearn import ensemble
from sklearn import cluster

import optuna
import datetime
import time

In [None]:
# Time budget for the hyperparameter search: passed as `timeout=` to
# `study.optimize` in the optimization cell (Optuna interprets it in seconds).
TIMEOUT = 100

In [None]:
# Main training table (semicolon-separated); the preprocessing functions take it as `df`.
data = pd.read_csv("data/css_main_training.csv", sep=";")
# Fuel price table (semicolon-separated); `preproc_tree` merges it onto the routes via its "date" column.
data_fuel = pd.read_csv("data/fuel_prices.csv", sep=";")

In [None]:
def preproc(df, train=True, poly_f=True, random_state=None):
    """Build a standardized feature matrix and target vector from the raw table.

    Steps, in order: shuffle the rows, bucket every currency other than
    "PLN"/"EUR" into "OTHER", replace missing train/ferry line values with 0,
    drop the identifier, temperature, timestamp and country columns,
    optionally drop rows that still contain NaNs, one-hot / ordinal encode the
    categorical columns, standardize all features and optionally expand them
    with degree-2 polynomial features.

    Every encoder and the scaler are fitted on ``df`` itself on each call, so
    two different frames passed through this function are not guaranteed to
    share the same encoding or scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; must contain the columns referenced below, including the
        target column "expenses". The caller's frame is not modified.
    train : bool, default True
        When True, rows containing NaN after the column drops are removed.
    poly_f : bool, default True
        When True, apply ``PolynomialFeatures(degree=2)`` after scaling.
    random_state : int, numpy RandomState/Generator or None, default None
        Forwarded to ``DataFrame.sample`` for the shuffle. ``None`` keeps the
        previous, unseeded behaviour.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix laid out as [remaining columns | one-hot | ordinal]
        before scaling / polynomial expansion.
    y : numpy.ndarray
        Values of the "expenses" column, in the same (shuffled) row order.
    """
    # `sample` returns a new frame, so nothing below touches the caller's data.
    df = df.sample(frac=1, random_state=random_state)

    # Vectorized replacement for the former row-wise apply: keep PLN/EUR,
    # map everything else (including missing values) to "OTHER".
    currency = df["id_currency"]
    df["id_currency"] = currency.where(currency.isin(["PLN", "EUR"]), "OTHER")

    nan_to_zero = ["prim_train_line", "prim_ferry_line"]
    df[nan_to_zero] = df[nan_to_zero].fillna(0)

    to_drop = ["id_contract", "id_payer", "temperature", "route_start_datetime", "route_end_datetime"]
    country = ["route_start_country", "last_unload_country", "first_load_country", "route_end_country"]
    ordinal_cats = ["load_size_type", "contract_type"]
    one_hot_cats = ["id_currency", "direction"]

    df = df.drop(columns=to_drop + country)

    if train:
        df = df.dropna()

    enc_one_hot = preprocessing.OneHotEncoder()
    X_one_hot_cats = enc_one_hot.fit_transform(df[one_hot_cats].to_numpy()).toarray()
    print(f"ont hot: {X_one_hot_cats.shape[1]}")

    enc_ordinal = preprocessing.OrdinalEncoder()
    X_ordinal_cats = enc_ordinal.fit_transform(df[ordinal_cats].to_numpy())
    print(f"ordinal: {X_ordinal_cats.shape[1]}")

    y = df["expenses"].to_numpy()
    # Whatever is left after removing the target and the categoricals.
    X = df.drop(columns=["expenses"] + one_hot_cats + ordinal_cats).to_numpy()
    print(f"standard: {X.shape[1]}")
    X = np.concatenate((X, X_one_hot_cats, X_ordinal_cats), axis=1)

    # Note: the encoded categorical columns are standardized as well.
    X = preprocessing.StandardScaler().fit_transform(X)

    if poly_f:
        poly = preprocessing.PolynomialFeatures(degree=2)
        X = poly.fit_transform(X)

    print(X.shape)
    return X, y

In [None]:
def preproc_tree(df, red_dim, train=True, poly_f=False, fuel_prices=None, random_state=None):
    """Build an (unscaled) feature matrix and target vector for tree models.

    Compared with ``preproc`` this keeps the country and currency columns
    (one-hot encoded and then reduced with ``FeatureAgglomeration``), adds the
    route duration in seconds and joins the fuel price table on the route's
    start date. No scaling is applied.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table; must contain the columns referenced below, including the
        target column "expenses". The caller's frame is not modified.
    red_dim : int
        ``n_clusters`` used by ``FeatureAgglomeration`` for each column in
        ``reduce_one_hot_cats`` (each of those contributes ``red_dim`` features).
    train : bool, default True
        When True, rows containing NaN after the merge/drops are removed.
    poly_f : bool, default False
        When True, apply ``PolynomialFeatures(degree=2)`` at the end.
    fuel_prices : pandas.DataFrame or None, default None
        Table merged on its "date" column. ``None`` falls back to the
        notebook-level ``data_fuel`` (the previous behaviour).
    random_state : int, numpy RandomState/Generator or None, default None
        Forwarded to ``DataFrame.sample`` for the shuffle. ``None`` keeps the
        previous, unseeded behaviour.

    Returns
    -------
    X : numpy.ndarray
        Features laid out as [numeric columns | one-hot | ordinal | reduced one-hot...].
    y : numpy.ndarray
        Values of the "expenses" column, in the same (shuffled) row order.
    """
    if fuel_prices is None:
        fuel_prices = data_fuel

    df = df.copy()

    nan_to_zero = ["prim_train_line", "prim_ferry_line"]
    df[nan_to_zero] = df[nan_to_zero].fillna(0)

    # Duration is the difference of the parsed timestamps. The earlier
    # time.mktime round-trip interpreted naive timestamps in the kernel's
    # local timezone, so the feature depended on the machine's TZ/DST rules.
    df["duration"] = [
        (datetime.datetime.fromisoformat(end) - datetime.datetime.fromisoformat(start)).total_seconds()
        for start, end in zip(df["route_start_datetime"], df["route_end_datetime"])
    ]
    # Date part = first whitespace-separated token of the start timestamp.
    df["date"] = df["route_start_datetime"].str.split().str[0]

    # Default (inner) merge: rows whose date has no match in the fuel table are dropped.
    df = df.merge(fuel_prices, on="date")

    to_drop = ["id_contract", "id_payer", "temperature", "route_start_datetime", "route_end_datetime", "date"]
    ordinal_cats = ["load_size_type", "contract_type"]
    one_hot_cats = ["direction"]
    reduce_one_hot_cats = ["route_start_country", "last_unload_country", "first_load_country", "route_end_country", "id_currency"]

    df = df.drop(columns=to_drop)

    if train:
        df = df.dropna()

    df = df.sample(frac=1, random_state=random_state)

    y = df["expenses"].to_numpy()
    # Everything that is neither the target nor a categorical column.
    X = df.drop(columns=["expenses"] + one_hot_cats + ordinal_cats + reduce_one_hot_cats).to_numpy()

    enc_one_hot = preprocessing.OneHotEncoder()
    X_one_hot_cats = enc_one_hot.fit_transform(df[one_hot_cats].to_numpy()).toarray()
    X = np.concatenate((X, X_one_hot_cats), axis=1)

    enc_ordinal = preprocessing.OrdinalEncoder()
    X_ordinal_cats = enc_ordinal.fit_transform(df[ordinal_cats].to_numpy())
    X = np.concatenate((X, X_ordinal_cats), axis=1)

    # One-hot encode each of these columns separately, then merge its
    # indicator columns down to `red_dim` agglomerated features.
    for col_name in reduce_one_hot_cats:
        enc_one_hot = preprocessing.OneHotEncoder()
        X_anc = enc_one_hot.fit_transform(df[[col_name]].to_numpy()).toarray()
        red = cluster.FeatureAgglomeration(n_clusters=red_dim)
        X_anc = red.fit_transform(X_anc)
        X = np.concatenate((X, X_anc), axis=1)

    if poly_f:
        poly = preprocessing.PolynomialFeatures(degree=2)
        X = poly.fit_transform(X)

    print(X.shape)
    return X, y

In [None]:
#X, y = preproc_tree(data)


In [None]:
def validate(model, X, y, test_size=0.2):
    """Score `model` on a single random hold-out split.

    The data is split with ``train_test_split`` (``test_size`` is forwarded),
    the model is fitted on the training part and evaluated on the held-out
    part.

    Returns the *negated* root mean squared error, so larger is better.
    """
    split = model_selection.train_test_split(X, y, test_size=test_size)
    X_fit, X_holdout, y_fit, y_holdout = split

    model.fit(X_fit, y_fit)
    predictions = model.predict(X_holdout)

    rmse = metrics.mean_squared_error(y_holdout, predictions) ** (1/2)
    return -rmse

In [None]:
# Build the tree-model features once, with red_dim=5 (n_clusters for the
# FeatureAgglomeration step). `objective` reads these `X`/`y` as globals.
X, y = preproc_tree(data, 5)

In [None]:
def objective(trial):
    """Optuna objective: 5-fold CV score of a random forest for a suggested tree count.

    Suggests ``n_est`` (log-uniform integer in [100, 400]), evaluates a
    ``RandomForestRegressor`` with that many estimators on the notebook-level
    ``X``/``y`` and returns the mean of the ``neg_root_mean_squared_error``
    fold scores (higher is better).
    """
    n_est = trial.suggest_int("n_est", 100, 400, log=True)
    print(f"n_est value: {n_est}")

    forest = ensemble.RandomForestRegressor(n_estimators=n_est, n_jobs=-1, verbose=10)
    fold_scores = model_selection.cross_val_score(
        forest,
        X,
        y,
        cv=5,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
        verbose=10,
    )

    mean_score = np.mean(fold_scores)
    score_std = np.std(fold_scores)
    print(f"RESULT WITH {n_est=}: MEAN {mean_score}, STD {score_std}")
    return mean_score

    #return validate(reg, X, y)
    
    #min_samples_split = trial.suggest_float("min_samples_split", 0, 1)
    #min_samples_leaf = trial.suggest_float("min_samples_leaf", 0, 0.5)
    #min_weight_fraction_leaf = trial.suggest_float("min_weight_fraction_leaf", 0, 0.5)
    #max_features = trial.suggest_float("max_features", 0, 1)
    #max_samples = trial.suggest_float("max_samples", 0, 1)
    #if min_samples_split == 0:
    #    min_samples_split = int(2)
    #if min_samples_leaf == 0:
    #    min_samples_leaf = int(1)
    #reg = ensemble.RandomForestRegressor(min_samples_split=min_samples_split,
     #                                    min_samples_leaf=min_samples_leaf,
     #                                    min_weight_fraction_leaf=min_weight_fraction_leaf,
     #                                    max_features=max_features,
    #                                  max_samples=max_samples)
    #alpha = trial.suggest_float("alpha", 0, 1)

    #max_depth = trial.suggest_int("max_depth", 1, 10)
    #print(f"suggested max_depth: {max_depth}")
    #reg = ensemble.RandomForestRegressor(max_depth=max_depth)
    #result = validate(reg)
    #print(f"result with {max_depth=}:")
    #print(result)
    #return result
    #scores = model_selection.cross_val_score(reg, X, y, cv=5, scoring="neg_root_mean_squared_error")
    #return np.mean(scores)

In [None]:
def initial_guess(study):
    """Run a single trial with a fixed set of parameters, then restore the sampler.

    The study's sampler is temporarily wrapped in a ``PartialFixedSampler``
    that pins the parameters below, one trial of ``objective`` is run, and the
    original sampler is put back, even if the trial raises.

    NOTE(review): the pinned names ("min_samples_split", ...) are not
    suggested by the current ``objective`` (it only suggests "n_est"), so they
    presumably have no effect until those suggestions are re-enabled.
    """
    original_sampler = study.sampler
    params = {"min_samples_split": 0.,
              "min_samples_leaf": 0.,
              "min_weight_fraction_leaf": 0.,
              "max_features": 1.,
              "max_samples": 1.,
              }
    study.sampler = optuna.samplers.PartialFixedSampler(params, original_sampler)
    try:
        study.optimize(objective, n_trials=1)
    finally:
        # Never leave the study stuck with the fixed sampler after a failed trial.
        study.sampler = original_sampler

In [None]:
def study_visualize(study):
    """Show Optuna's standard diagnostic plots for `study` and print its best trial.

    The figures are rendered one after another via ``.show()``; afterwards
    the best trial's parameters and objective value are printed.
    """
    viz = optuna.visualization
    plotters = (
        viz.plot_optimization_history,
        viz.plot_intermediate_values,
        viz.plot_parallel_coordinate,
        viz.plot_contour,
        viz.plot_slice,
        viz.plot_param_importances,
    )
    for make_figure in plotters:
        make_figure(study).show()

    best_params = study.best_trial.params
    best_value = study.best_trial.value
    print(f"best params:\n{best_params}")
    print(f"best value {best_value}")

In [None]:
# "maximize" because `objective` returns the mean of negated-RMSE CV scores
# (closer to zero is better).
study = optuna.create_study(direction="maximize")

In [None]:
#initial_guess(study)

In [None]:
%%time
# Run trials of `objective` until the TIMEOUT budget is used up.
# NOTE(review): n_jobs=-1 is set here, in cross_val_score and in the forest
# itself, so three levels of parallelism are nested — confirm this does not
# oversubscribe the CPU.
study.optimize(objective, timeout=TIMEOUT, n_jobs=-1)

In [None]:
# Plot the optimization diagnostics and print the best trial found above.
study_visualize(study)

In [None]:
#reg = ensemble.RandomForestRegressor(verbose=10, n_jobs=-1, n_estimators=100)
#X, y = preproc_tree(data, 5, poly_f=False)
#print(validate(reg, X, y))

In [None]:
#200 est
#random forest after adding time and fuel
#cat reduce to 5
#time ?
-0.15119003329514882

In [None]:
#100 est
#random forest after adding time and fuel
#cat reduce to 8
#time 3.8
-0.1514203883840837

In [None]:
#100 est
#random forest after adding time and fuel
#cat reduce to 4
#time 3.5
-0.15207472815412973

In [None]:
#100 est
#random forest after adding time and fuel
#cat reduce to 5
#time 4.3
-0.14865615526799092

In [18]:
#100 est
#random forest after adding time and fuel
#cat reduce to 4
#time 3.5
-0.15207472815412973

-0.15207472815412973

In [19]:
#100 est
#random forest after adding time and fuel
#cat reduce to 5
#time 4.3
-0.14865615526799092

-0.14865615526799092

In [20]:
#random forest without hyperparameter optimization
0.15468864180454378

0.15468864180454378

In [21]:
#with more cats
#only interactions
0.28300849758869806

0.28300849758869806

In [22]:
#with more cats
0.2809768534229591

0.2809768534229591

In [23]:
#with some cats
0.28220298380787107

0.28220298380787107

In [24]:
#with poly 2
#only interactions
np.mean([-0.32750984, -0.32808752, -0.32783682, -0.32724463, -0.32590006])

-0.327315774

In [25]:
#with poly 2
np.mean([-0.32557581, -0.33074444, -0.32652596, -0.32451948, -0.32606595])

-0.326686328

In [26]:
#without poly
np.mean([-0.49644549, -0.49597342, -0.49757726, -0.49581476, -0.49521693])

-0.49620557200000004

[CV] START .....................................................................
building tree 1 of 1
[CV] END ............................... score: (test=-0.196) total time=   7.9s
[CV] START .....................................................................
building tree 1 of 1
[CV] END ............................... score: (test=-0.198) total time=   7.9s
[CV] START .....................................................................
building tree 1 of 4
building tree 2 of 4
building tree 3 of 4
building tree 4 of 4
[CV] END ............................... score: (test=-0.165) total time=  31.5s
[CV] START .....................................................................
building tree 1 of 3
building tree 2 of 3
building tree 3 of 3
[CV] END ............................... score: (test=-0.169) total time=  23.7s
[CV] START .....................................................................
building tree 1 of 1
[CV] END ............................... score: (test=-0.198)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    7.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    7.4s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    7.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  

[CV] START .....................................................................
building tree 1 of 1
[CV] END ............................... score: (test=-0.196) total time=   8.6s
[CV] START .....................................................................
building tree 1 of 1
[CV] END ............................... score: (test=-0.196) total time=   8.2s
[CV] START .....................................................................
building tree 1 of 4
building tree 2 of 4
building tree 3 of 4
building tree 4 of 4
[CV] END ............................... score: (test=-0.163) total time=  31.5s
[CV] START .....................................................................
building tree 1 of 4
building tree 2 of 4
building tree 3 of 4
building tree 4 of 4
[CV] END ............................... score: (test=-0.162) total time=   7.5s
[CV] START .....................................................................
building tree 1 of 3
building tree 2 of 3
building tree 3 of 

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    8.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    7.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  

[CV] START .....................................................................
building tree 1 of 1
[CV] END ............................... score: (test=-0.198) total time=   8.6s
[CV] START .....................................................................
building tree 1 of 1
[CV] END ............................... score: (test=-0.195) total time=   7.7s
[CV] START .....................................................................
building tree 1 of 1
[CV] END ............................... score: (test=-0.196) total time=   5.1s
[CV] START .....................................................................
building tree 1 of 4
building tree 2 of 4
building tree 3 of 4
building tree 4 of 4
[CV] END ............................... score: (test=-0.163) total time=  31.6s
[CV] START .....................................................................
building tree 1 of 3
building tree 2 of 3
building tree 3 of 3
[CV] END ............................... score: (test=-0.166)