In [1]:
import numpy as np
import pandas as pd
from sklearn import model_selection
from sklearn import linear_model
from sklearn import preprocessing
from sklearn import metrics
from sklearn import ensemble

import datetime
import time

In [2]:
# Raw inputs; every file is a semicolon-separated CSV.
data_train = pd.read_csv(
    "data/css_main_training.csv",
    sep=";",
)
data_test = pd.read_csv(
    "data/css_main_test.csv",
    sep=";",
)
# Fuel prices are joined onto the route data by `date` inside preproc().
data_fuel = pd.read_csv(
    "data/fuel_prices.csv",
    sep=";",
)

In [3]:
# Columns removed from the frame in preproc() before the feature matrix is built.
to_drop = [
    "id_contract",
    "id_payer",
    "temperature",
    "route_start_datetime",
    "route_end_datetime",
    "route_start_unix",
    "route_end_unix",
    "date",
]

# Categorical columns passed through OrdinalEncoder in preproc().
ordinal_cats = [
    "load_size_type",
    "contract_type",
]

# Categorical columns passed through OneHotEncoder in preproc().
one_hot_cats = [
    "direction",
    "route_start_country",
    "last_unload_country",
    "first_load_country",
    "route_end_country",
    "id_currency",
    "prim_train_line",
    "prim_ferry_line",
    "id_service_type",
]


In [4]:
def preproc(df, train=True, enc_one_hot=None, enc_ordinal=None):
    """Turn a raw route frame into a feature matrix ``X`` and target ``y``.

    Pipeline:
      1. Replace missing ``prim_train_line`` / ``prim_ferry_line`` with 0.
      2. Derive ``duration`` (seconds between route start and end) and
         ``date`` (first whitespace-separated token of the start timestamp).
      3. Join ``data_fuel`` on ``date``. The join uses pandas' default
         (inner), so rows without a matching fuel date are removed.
      4. Drop the columns listed in ``to_drop``.
      5. In training mode only: drop rows containing NaN and shuffle with a
         fixed seed.
      6. Append one-hot and ordinal encodings of the categorical columns
         after the remaining (non-categorical) columns.

    Args:
        df: Raw frame; it is copied, never modified. Must contain
            ``expenses``, both route datetime columns and every column named
            in ``one_hot_cats`` / ``ordinal_cats``.
        train: True to fit new encoders, False to reuse the supplied ones.
        enc_one_hot: Fitted one-hot encoder; required iff ``train`` is False.
        enc_ordinal: Fitted ordinal encoder; required iff ``train`` is False.

    Returns:
        ``(X, y, enc_one_hot, enc_ordinal)`` when ``train`` is True,
        otherwise ``(X, y)``.

    Raises:
        AssertionError: if encoders are passed in training mode or missing
            in inference mode.
    """
    if train:
        assert enc_one_hot is None and enc_ordinal is None
    else:
        assert enc_one_hot is not None and enc_ordinal is not None

    df = df.copy()

    # Missing line ids become 0 so the one-hot encoder sees a regular value.
    nan_to_zero = ["prim_train_line", "prim_ferry_line"]
    df[nan_to_zero] = df[nan_to_zero].fillna(0)

    # Parse each timestamp once and subtract the datetimes directly.
    # Going through time.mktime() would interpret the naive timestamps in the
    # machine's local timezone, so the duration of a route spanning a DST
    # switch would depend on where the notebook is executed.
    starts = [datetime.datetime.fromisoformat(s) for s in df["route_start_datetime"]]
    ends = [datetime.datetime.fromisoformat(s) for s in df["route_end_datetime"]]
    df["duration"] = pd.Series(
        [(end - start).total_seconds() for start, end in zip(starts, ends)],
        index=df.index,
        dtype="float64",
    )
    df["date"] = df["route_start_datetime"].str.split().str[0]

    df = df.merge(data_fuel, on="date")

    # The intermediate *_unix columns named in to_drop are no longer created
    # here, so only drop the listed columns that are actually present.
    df = df.drop(columns=[col for col in to_drop if col in df.columns])

    if train:
        # Same seed as the model so a full re-run reproduces the row order.
        df = df.dropna().sample(frac=1, random_state=18)

    y = df["expenses"].to_numpy()
    X_plain = df.drop(columns=["expenses"] + one_hot_cats + ordinal_cats).to_numpy()

    one_hot_values = df[one_hot_cats].to_numpy()
    if train:
        enc_one_hot = preprocessing.OneHotEncoder(handle_unknown="ignore")
        X_one_hot_cats = enc_one_hot.fit_transform(one_hot_values).toarray()
    else:
        X_one_hot_cats = enc_one_hot.transform(one_hot_values).toarray()

    ordinal_values = df[ordinal_cats].to_numpy()
    if train:
        enc_ordinal = preprocessing.OrdinalEncoder()
        X_ordinal_cats = enc_ordinal.fit_transform(ordinal_values)
    else:
        X_ordinal_cats = enc_ordinal.transform(ordinal_values)

    # Column layout: plain columns, then one-hot block, then ordinal block.
    X = np.concatenate((X_plain, X_one_hot_cats, X_ordinal_cats), axis=1)

    print(X.shape)
    if train:
        return X, y, enc_one_hot, enc_ordinal
    return X, y

In [5]:
def validate(model, X, y, test_size=0.2, random_state=18):
    """Fit ``model`` on a hold-out split and return the negated RMSE.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place on the training part of the split.
        X: Feature matrix.
        y: Target vector.
        test_size: Share of the rows held out for scoring.
        random_state: Seed for the split, so repeated calls score the model
            on the same hold-out rows. Pass ``None`` for a fresh random
            split on every call.

    Returns:
        ``-RMSE`` on the held-out rows (higher is better).
    """
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = metrics.mean_squared_error(y_test, y_pred) ** 0.5
    return -rmse

In [6]:
def make_submit(model, name):
    """Fit ``model`` on the full training set and write test predictions.

    Preprocesses ``data_train`` and ``data_test`` with ``preproc``, fits the
    model, prints the predictions and writes them, one value per line and
    without header or index, to ``result{name}.txt``.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; fitted
            in place.
        name: Suffix inserted into the output file name.

    Raises:
        AssertionError: if preprocessing changed the number of test rows, in
            which case the predictions would not line up with ``data_test``.
    """
    X_train, y_train, enc_one_hot, enc_ordinal = preproc(data_train, train=True)
    X_test, _ = preproc(data_test, train=False, enc_one_hot=enc_one_hot, enc_ordinal=enc_ordinal)

    # Fail before the expensive fit: the join inside preproc() can drop rows.
    assert len(X_test) == len(data_test.index), "preproc() changed the number of test rows"

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    assert len(y_pred) == len(data_test.index)
    print(y_pred)

    # The default separator is used: with a single column no separator is ever
    # written, and a newline separator would collide with the row terminator.
    pd.Series(y_pred).to_csv(f"result{name}.txt", header=False, index=False)

In [7]:
# Fit the encoders on the training frame, then reuse them for the test frame
# so both matrices share the same column layout.
X_train, y_train, enc_one_hot, enc_ordinal = preproc(df=data_train, train=True)
X_test, y_test = preproc(
    df=data_test,
    train=False,
    enc_one_hot=enc_one_hot,
    enc_ordinal=enc_ordinal,
)

(326465, 289)
(72452, 289)


In [8]:
model = ensemble.ExtraTreesRegressor(
    # Ensemble size and per-tree limits.
    n_estimators=330,
    max_depth=21,
    max_leaf_nodes=150,
    min_samples_leaf=100,
    # max_features=0.98,  (override kept disabled)
    # Execution settings: fixed seed, all cores, progress logging.
    random_state=18,
    n_jobs=-1,
    verbose=1,
)

In [9]:
# Hold-out score for the current hyperparameters (validate returns -RMSE).
validate(model=model, X=X_train, y=y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 330 out of 330 | elapsed: 11.5min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 330 out of 330 | elapsed:    0.2s finished


-0.17808402915550867

In [10]:
# Refit on the full training set and write the predictions to resultvar1.txt.
make_submit(model=model, name="var1")

(326465, 289)
(72452, 289)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  7.4min
[Parallel(n_jobs=-1)]: Done 330 out of 330 | elapsed: 14.0min finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done  18 tasks      | elapsed:    0.0s
[Parallel(n_jobs=16)]: Done 168 tasks      | elapsed:    0.0s


[7.04428474 4.99381426 7.23042244 ... 4.93177446 4.93177446 7.02306503]


[Parallel(n_jobs=16)]: Done 330 out of 330 | elapsed:    0.1s finished
