In [None]:
%env CLEARML_WEB_HOST=https://app.clear.ml
%env CLEARML_API_HOST=https://api.clear.ml
%env CLEARML_FILES_HOST=https://files.clear.ml
%env CLEARML_API_ACCESS_KEY=... # set the CLEARML_API_ACCESS_KEY
%env CLEARML_API_SECRET_KEY=... # set the CLEARML_API_ACCESS_KEY

In [43]:
import math
import pickle
import datetime
from datetime import datetime

import pandas as pd
import numpy as np

from sklearn.model_selection import GroupKFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer

import matplotlib.pyplot as plt

from clearml import Task

In [44]:
def smape_loss(y_true, y_pred):
    """Element-wise symmetric mean absolute percentage error, in percent.

    Computes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)`` for every
    element. Where both the true and the predicted value are zero the
    error is defined as 0 instead of the NaN produced by ``0 / 0``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values with broadcast-compatible shapes.

    Returns
    -------
    numpy.ndarray
        Per-element SMAPE values in the range [0, 200]; call ``.mean()`` on
        the result to obtain the aggregate score.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(y_true - y_pred)
    # Use magnitudes on both sides so a negative y_true cannot shrink (or
    # zero out) the denominator.
    scale = np.abs(y_true) + np.abs(y_pred)

    both_zero = scale == 0
    # Divide by a dummy 1 where the scale is zero to avoid the 0/0 warning;
    # those positions are overwritten with 0 below.
    ratio = abs_error / np.where(both_zero, 1.0, scale)
    return np.where(both_zero, 0.0, ratio) * 200


def train_and_validate(X, y, groups, n_splits=4, esitmator=lambda: Ridge(alpha=0.5)):
    """Cross-validate a log-target regressor using group-wise folds.

    For every ``GroupKFold`` split the features are scaled (``MinMaxScaler``
    followed by ``StandardScaler``, fitted on the training part of the fold),
    a fresh estimator is fitted on ``log(y)``, and the fold is scored with the
    mean SMAPE of the ``exp``-transformed predictions on the held-out part.
    Each fold score is printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally via ``.iloc``.
    y : array-like
        Targets aligned positionally with the rows of ``X``. Must be strictly
        positive because the model is trained on ``log(y)``.
    groups : array-like
        Group label of every row, forwarded to ``GroupKFold.split``.
    n_splits : int, default 4
        Number of folds.
    esitmator : callable
        Zero-argument factory returning a new, unfitted estimator. The
        (misspelt) parameter name is kept so keyword callers keep working.

    Returns
    -------
    scores : list of float
        Mean SMAPE per fold.
    models : list
        The estimator fitted on each fold, in fold order.
    preprocessor : sklearn.pipeline.Pipeline
        The scaling pipeline.

    Raises
    ------
    ValueError
        If ``y`` contains zero or negative values.

    Notes
    -----
    NOTE(review): the single ``preprocessor`` is refit inside every fold, so
    the returned object carries the scaling learnt on the *last* fold's
    training rows, while earlier models were fitted on data scaled with
    their own fold's statistics. Keep this in mind when reusing the returned
    pair for inference.
    """
    # Work on a plain array so that y[train_idx] is always positional; a
    # pandas Series with a non-default index would be indexed by label.
    y = np.asarray(y)
    if np.any(y <= 0):
        raise ValueError('y must be strictly positive because the model is fitted on log(y)')

    models = []
    scores = []
    preprocessor = make_pipeline(MinMaxScaler(), StandardScaler())
    kf = GroupKFold(n_splits=n_splits)

    for train_idx, val_idx in kf.split(X, y, groups):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Fit the scalers on the training rows only, then reuse them for the
        # validation rows of the same fold.
        X_train = preprocessor.fit_transform(X_train)
        X_val = preprocessor.transform(X_val)

        model = esitmator()
        model.fit(X_train, np.log(y_train))
        models.append(model)

        # Undo the log transform before scoring on the original scale.
        y_pred = np.exp(model.predict(X_val))
        score = smape_loss(y_val, y_pred).mean()
        scores.append(score)

        print(f'SMAPE Score: {score:.4f}')

    return scores, models, preprocessor


def predict_with_models(models, preprocessor, X_test, mode='average', scores=None):
    """Predict on ``X_test`` with the fold models, on the original target scale.

    The features are passed through ``preprocessor.transform`` and every
    model's output is mapped back with ``exp`` (the models predict the log of
    the target).

    Parameters
    ----------
    models : sequence
        Fitted estimators exposing ``predict``.
    preprocessor : object
        Fitted transformer exposing ``transform``.
    X_test : array-like
        Raw (unscaled) test features.
    mode : {'average', 'best'}, default 'average'
        ``'average'`` returns the mean of all models' predictions;
        ``'best'`` returns the predictions of the model with the lowest
        entry in ``scores``.
    scores : sequence of float, optional
        Validation score per model (lower is better). Required for
        ``mode='best'``.

    Returns
    -------
    numpy.ndarray
        Predictions for every row of ``X_test``.

    Raises
    ------
    ValueError
        If ``mode`` is unknown, ``models`` is empty, or ``mode='best'`` is
        requested without one score per model.
    """
    if mode not in ('average', 'best'):
        # Previously an unknown mode silently produced all-zero predictions.
        raise ValueError(f"Unknown mode {mode!r}; expected 'average' or 'best'")
    if len(models) == 0:
        raise ValueError('models must contain at least one fitted model')
    if mode == 'best' and (scores is None or len(scores) != len(models)):
        raise ValueError("mode='best' requires one score per model")

    X_test_processed = preprocessor.transform(X_test)

    if mode == 'average':
        fold_predictions = [np.exp(model.predict(X_test_processed)) for model in models]
        return np.mean(fold_predictions, axis=0)

    best_model = models[int(np.argmin(scores))]
    return np.exp(best_model.predict(X_test_processed))


def prepare_data(feature_engineer_fn, train_file_path='../data/train.csv', test_file_path='../data/test.csv'):
    """Load the train/test CSV files and turn them into model inputs.

    Both files are read with pandas and their ``date`` column is parsed to
    datetimes before ``feature_engineer_fn`` is applied to each frame.

    Parameters
    ----------
    feature_engineer_fn : callable
        Receives a raw frame (with a datetime ``date`` column) and returns
        the engineered feature frame.
    train_file_path, test_file_path : str
        Locations of the training and test CSV files.

    Returns
    -------
    tuple
        ``(train_features, test_features, y_train, year)`` where ``y_train``
        is the ``num_sold`` column of the training file as an array and
        ``year`` is the calendar year of every training row.
    """
    raw_train = pd.read_csv(train_file_path)
    raw_test = pd.read_csv(test_file_path)

    # Parse the date strings once so the feature function can use `.dt`.
    raw_train['date'] = pd.to_datetime(raw_train.date)
    raw_test['date'] = pd.to_datetime(raw_test.date)

    train_features = feature_engineer_fn(raw_train)
    test_features = feature_engineer_fn(raw_test)

    targets = raw_train['num_sold'].values
    years = raw_train.date.dt.year

    return train_features, test_features, targets, years

# Experiment 1

In [58]:
# Initialise the ClearML task for experiment 1 in project 'tps-jan22'.
task1 = Task.init(project_name='tps-jan22', task_name='experiment_1')

In [45]:
def engineer(df):
    """Build the experiment-1 features: weekday number plus one-hot flags.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``date`` column and ``country``, ``store`` and
        ``product`` columns.

    Returns
    -------
    pandas.DataFrame
        float32 frame with the columns ``wd`` (weekday number), one
        indicator per country, per store and per product, in that order.
    """
    columns = {'wd': df.date.dt.weekday}

    # One indicator column per known category value.
    columns.update({name: df.country == name for name in ('Finland', 'Norway', 'Sweden')})
    columns.update({name: df.store == name for name in ('KaggleMart', 'KaggleRama')})
    columns.update(
        {name: df['product'] == name for name in ('Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker')}
    )

    return pd.DataFrame(columns).astype(np.float32)


In [46]:
# Build the features, cross-validate, and report the mean fold score.
train_df, test_df, y_train, year = prepare_data(engineer)
scores, models, preprocessor = train_and_validate(train_df, y_train, year)
average_smape = np.mean(scores)
print(f'Average SMAPE: {average_smape:.4f}')

test_predictions = predict_with_models(models, preprocessor, test_df, scores=scores)

# The raw test frame is local to prepare_data() and the engineered features
# do not carry row_id, so read the ids from the test file (prepare_data's
# default path) instead of relying on a leftover `original_test_df` global.
test_row_ids = pd.read_csv('../data/test.csv', usecols=['row_id'])['row_id']

submission_df = pd.DataFrame({
    'row_id': test_row_ids,
    'num_sold': test_predictions.round()
})


submission_df.head()

SMAPE Score: 15.2215
SMAPE Score: 16.3875
SMAPE Score: 12.6469
SMAPE Score: 15.6064
Average SMAPE: 14.9656


Unnamed: 0,row_id,num_sold
0,26298,183.0
1,26299,322.0
2,26300,93.0
3,26301,319.0
4,26302,562.0


In [47]:
# Close the ClearML task created for experiment 1.
task1.close()

# Experiment 2

In [59]:
# Initialise the ClearML task for experiment 2 in project 'tps-jan22'.
task2 = Task.init(project_name='tps-jan22', task_name='experiment_2')

In [49]:
def engineer(df):
    """Build the experiment-2 features: experiment 1 plus yearly Fourier terms.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``date`` column and ``country``, ``store`` and
        ``product`` columns.

    Returns
    -------
    pandas.DataFrame
        float32 frame with ``wd`` (weekday number), one indicator per
        country, store and product, followed by ``sin1, cos1, ..., sin5,
        cos5`` computed from the day of the year.
    """
    features = {'wd': df.date.dt.weekday}

    one_hot_spec = (
        (df.country, ('Finland', 'Norway', 'Sweden')),
        (df.store, ('KaggleMart', 'KaggleRama')),
        (df['product'], ('Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker')),
    )
    for column, levels in one_hot_spec:
        for level in levels:
            features[level] = column == level

    # Position within the year expressed as an angle (period 365.25 days);
    # harmonics 1..5 of it give the seasonal sine/cosine pairs.
    year_angle = df.date.dt.dayofyear / 365.25 * 2 * np.pi
    for harmonic in range(1, 6):
        features[f'sin{harmonic}'] = np.sin(year_angle * harmonic)
        features[f'cos{harmonic}'] = np.cos(year_angle * harmonic)

    return pd.DataFrame(features).astype(np.float32)


In [50]:
# Build the features, cross-validate, and report the mean fold score.
train_df, test_df, y_train, year = prepare_data(engineer)
scores, models, preprocessor = train_and_validate(train_df, y_train, year)
average_smape = np.mean(scores)
print(f'Average SMAPE: {average_smape:.4f}')

test_predictions = predict_with_models(models, preprocessor, test_df, scores=scores)

# The raw test frame is local to prepare_data() and the engineered features
# do not carry row_id, so read the ids from the test file (prepare_data's
# default path) instead of relying on a leftover `original_test_df` global.
test_row_ids = pd.read_csv('../data/test.csv', usecols=['row_id'])['row_id']

submission_df = pd.DataFrame({
    'row_id': test_row_ids,
    'num_sold': test_predictions.round()
})


submission_df.head()

SMAPE Score: 13.4136
SMAPE Score: 15.4642
SMAPE Score: 11.8090
SMAPE Score: 13.8232
Average SMAPE: 13.6275


Unnamed: 0,row_id,num_sold
0,26298,227.0
1,26299,400.0
2,26300,116.0
3,26301,395.0
4,26302,697.0


In [51]:
# Close the ClearML task created for experiment 2.
task2.close()

# Experiment 3

In [60]:
# Initialise the ClearML task for experiment 3 in project 'tps-jan22'.
task3 = Task.init(project_name='tps-jan22', task_name='experiment_3')

In [53]:
def engineer(df):
    """Build the experiment-3 features: weekday flags, one-hots, Fourier terms.

    Unlike the earlier experiments the weekday is encoded as two indicators
    instead of a single number.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``date`` column and ``country``, ``store`` and
        ``product`` columns.

    Returns
    -------
    pandas.DataFrame
        float32 frame with ``wd4`` (weekday equals 4), ``wd56`` (weekday is
        5 or 6), one indicator per country, store and product, followed by
        ``sin1, cos1, ..., sin5, cos5`` computed from the day of the year.
    """
    weekday = df.date.dt.weekday
    out = pd.DataFrame({'wd4': weekday == 4, 'wd56': weekday >= 5})

    categories = {
        'country': ('Finland', 'Norway', 'Sweden'),
        'store': ('KaggleMart', 'KaggleRama'),
        'product': ('Kaggle Mug', 'Kaggle Hat', 'Kaggle Sticker'),
    }
    for source_column, levels in categories.items():
        for level in levels:
            out[level] = df[source_column] == level

    # Position within the year expressed as an angle (period 365.25 days).
    year_angle = df.date.dt.dayofyear / 365.25 * 2 * np.pi
    for harmonic in range(1, 6):
        out[f'sin{harmonic}'] = np.sin(year_angle * harmonic)
        out[f'cos{harmonic}'] = np.cos(year_angle * harmonic)

    return out.astype(np.float32)


In [54]:
# Build the features, cross-validate, and report the mean fold score.
train_df, test_df, y_train, year = prepare_data(engineer)
scores, models, preprocessor = train_and_validate(train_df, y_train, year)
average_smape = np.mean(scores)
print(f'Average SMAPE: {average_smape:.4f}')

test_predictions = predict_with_models(models, preprocessor, test_df, scores=scores)

# The raw test frame is local to prepare_data() and the engineered features
# do not carry row_id, so read the ids from the test file (prepare_data's
# default path) instead of relying on a leftover `original_test_df` global.
test_row_ids = pd.read_csv('../data/test.csv', usecols=['row_id'])['row_id']

submission_df = pd.DataFrame({
    'row_id': test_row_ids,
    'num_sold': test_predictions.round()
})


submission_df.head()

SMAPE Score: 12.7412
SMAPE Score: 14.8575
SMAPE Score: 11.1512
SMAPE Score: 13.1720
Average SMAPE: 12.9805


Unnamed: 0,row_id,num_sold
0,26298,229.0
1,26299,405.0
2,26300,117.0
3,26301,400.0
4,26302,706.0


In [55]:
# Close the ClearML task created for experiment 3.
task3.close()

In [56]:
# Write the current submission_df to 'submission.csv' without the index
# column. submission_df is rebuilt by every experiment, so when the notebook
# is run top to bottom this saves the experiment-3 predictions.
submission_df.to_csv('submission.csv', index=False)