In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Lasso, Ridge
import lightgbm as lgb

import joblib

# Flag set here but not referenced anywhere in the visible cells.
# NOTE(review): presumably a leftover toggle — confirm before removing.
train_ag = True

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment warnings that are
# relevant to the column assignments made in later cells.
warnings.filterwarnings("ignore")

In [None]:
import h2o
from h2o.automl import H2OAutoML

# Initialise the H2O session; the H2OFrame / H2OAutoML calls in later cells rely on it.
h2o.init()

In [3]:
# AutoML time budget in seconds (60 * 30 = 1800, i.e. 30 minutes);
# passed to H2OAutoML as max_runtime_secs.
RUNTIME = 60 * 30

In [None]:
from pathlib import Path
from datetime import datetime
import sys

# Project root: two directory levels above the current working directory.
# (assumes the notebook is launched from <root>/<dir>/<dir>/ — TODO confirm)
base_dir = Path.cwd().resolve().parents[1]

# Standard project folders hanging off the root.
data_dir, model_dir, notebooks_dir = (
    base_dir / folder for folder in ("data", "models", "notebooks")
)

# Make modules located under the project root importable from this notebook.
sys.path.append(str(base_dir))

In [4]:
# Load both splits and drop their 'id' column right away.
train = pd.read_csv(data_dir / "train.csv").drop(columns='id')
test = pd.read_csv(data_dir / "test.csv").drop(columns='id')

# Submission template; its 'Premium Amount' column is overwritten at the end.
sample = pd.read_csv(data_dir / 'sample_submission.csv')

In [5]:
def date(Df):
    """Expand the 'Policy Start Date' column into calendar features, in place.

    The column is parsed with ``pd.to_datetime``; the features listed below
    are appended to ``Df`` and the source column is removed.

    Added columns (in this order): Year, Day, Month, Month_name, Day_of_week,
    Week (ISO week number), Year_sin/Year_cos, Month_sin/Month_cos,
    Day_sin/Day_cos and Group.

    Parameters
    ----------
    Df : pandas.DataFrame
        Frame containing a 'Policy Start Date' column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Notes
    -----
    NOTE(review): Year is an integer, so the Year_sin / Year_cos angle is a
    whole multiple of 2*pi; those two columns are ~0 and ~1 for every row.
    NOTE(review): Day // 7 spans 0..4 while Month advances Group by 4, so the
    last days of a month share a Group id with the first days of the next one.
    """
    stamp = pd.to_datetime(Df['Policy Start Date'])
    parts = stamp.dt
    year, month, day = parts.year, parts.month, parts.day

    # Plain calendar components.
    Df['Year'] = year
    Df['Day'] = day
    Df['Month'] = month
    Df['Month_name'] = parts.month_name()
    Df['Day_of_week'] = parts.day_name()
    Df['Week'] = parts.isocalendar().week

    # Sine/cosine pairs; dict order fixes the resulting column order.
    two_pi = 2 * np.pi
    angles = {
        'Year': two_pi * year,
        'Month': two_pi * month / 12,
        'Day': two_pi * day / 31,
    }
    for prefix, angle in angles.items():
        Df[f'{prefix}_sin'] = np.sin(angle)
        Df[f'{prefix}_cos'] = np.cos(angle)

    # Coarse time bucket counted from 2020: 48 per year, 4 per month, plus Day // 7.
    Df['Group'] = (year - 2020) * 48 + month * 4 + day // 7

    # The raw date column is no longer needed once the features exist.
    del Df['Policy Start Date']

    return Df

In [6]:
# Expand the policy start date into calendar features for both splits.
train, test = date(train), date(test)

# Object-dtype columns of the training frame are treated as categorical.
cat_cols = [name for name, dtype in train.dtypes.items() if dtype == 'object']
# Candidate features: every column present in the test frame.
feature_cols = test.columns.tolist()

In [7]:
class CategoricalEncoder:
    """Adds frequency-encoded copies of categorical columns to a train/test pair.

    Both frames are stored by reference and modified in place by the
    encoding methods.
    """

    def __init__(self, train, test):
        self.train = train
        self.test = test

    def frequency_encode(self, cat_cols, feature_cols, drop_org=False):
        """Add a ``"<col>_freq"`` categorical column for every column in ``cat_cols``.

        Frequencies are counted on the training frame only and mapped onto
        both frames. Values that do not occur in train (including missing
        values, which ``value_counts`` does not count) become NaN.

        Parameters
        ----------
        cat_cols : list of str
            Columns to encode; each must exist in both frames.
        feature_cols : list of str
            Feature list, updated in place: each new column name is added
            (once) and, when ``drop_org`` is true, the original column name is
            removed if present.
        drop_org : bool, default False
            Remove the original column names from ``feature_cols``. The
            columns themselves stay in the frames.

        Returns
        -------
        tuple
            ``(train, test, new_cat_cols, feature_cols)`` where
            ``new_cat_cols`` lists the names of the added columns.
        """
        new_cat_cols = []
        for col in cat_cols:
            new_col_name = f"{col}_freq"
            freq_encoding = self.train[col].value_counts().to_dict()

            train_encoded = self.train[col].map(freq_encoding).astype('category')
            self.train[new_col_name] = train_encoded
            # Reuse the dtype learned on train so both frames share one
            # category set (and therefore the same category codes); inferring
            # it separately on test would only keep the counts present there.
            self.test[new_col_name] = (
                self.test[col].map(freq_encoding).astype(train_encoded.dtype)
            )

            new_cat_cols.append(new_col_name)
            # Guarded so that re-running the encoder neither duplicates
            # feature names nor fails on an already-removed original column.
            if new_col_name not in feature_cols:
                feature_cols.append(new_col_name)
            if drop_org and col in feature_cols:
                feature_cols.remove(col)

        return self.train, self.test, new_cat_cols, feature_cols

In [8]:
# Add a "<col>_freq" column for every categorical column; drop_org=True takes
# the original string columns out of feature_cols.
encoder = CategoricalEncoder(train, test)
train, test, cat_cols, feature_cols = encoder.frequency_encode(cat_cols, feature_cols, drop_org=True)

# Explicit .copy(): the assignments below must write to independent frames,
# not to the result of a column selection (pandas chained-assignment case).
train = train[feature_cols + ['Premium Amount']].copy()
test = test[feature_cols].copy()

# The target is modelled on the log1p scale; np.expm1 is applied when scoring
# and when writing the submission.
train['Premium Amount'] = np.log1p(train['Premium Amount'])

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts produced by a trusted run. The path is relative
# to the current working directory, unlike the data_dir-based reads.
non_log_oof, non_log_test = joblib.load('nonlog_feature/cat_non_loged.pkl')

# The stacked feature must be row-aligned with the frames it is attached to.
assert len(non_log_oof) == len(train), "non_log_oof length does not match train"
assert len(non_log_test) == len(test), "non_log_test length does not match test"

train['non_log_premium_amount'] = non_log_oof
test['non_log_premium_amount'] = non_log_test

In [None]:
# Preview the first rows of the prepared training frame.
train.head()

In [10]:
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error.

    Computed as the square root of ``mean_squared_log_error(y_true, y_pred)``.
    Callers in this notebook pass values already mapped back with ``np.expm1``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [None]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# KFold yields *positional* row indices. Collect them in a NumPy array rather
# than writing through label-based .loc, so the assignment stays correct for
# any index and the fold column has an integer dtype.
fold_ids = np.full(len(train), -1, dtype=int)
for i, (_, val_index) in enumerate(kf.split(train)):
    fold_ids[val_index] = i
assert (fold_ids >= 0).all(), "every row must belong to exactly one validation fold"
train['fold'] = fold_ids

# Upload both frames to the H2O cluster.
h_train = h2o.H2OFrame(train)
h_test = h2o.H2OFrame(test)

# Predictors: everything except the target and the fold assignment.
x = [col for col in h_train.columns if col not in ['Premium Amount', 'fold']]
y = 'Premium Amount'
fold_column = 'fold'

In [None]:
import joblib

# AutoML restricted to tree-based learners plus stacked ensembles, within the
# RUNTIME budget. Cross-validation predictions are kept so they can be pulled
# out as out-of-fold features afterwards.
# nfolds is deliberately not passed: the folds come from fold_column in
# aml.train(), which H2O documents as overriding the nfolds scheme.
# NOTE(review): some H2O versions are believed to reject a non-default nfolds
# together with fold_column — confirm against the installed version.
aml = H2OAutoML(
    max_runtime_secs=RUNTIME,
    include_algos=["GBM", "DRF", "XGBoost", "StackedEnsemble"],
    keep_cross_validation_predictions=True,
    seed=42,
    verbosity="info",
)
aml.train(x=x, y=y, training_frame=h_train, fold_column=fold_column)

# NOTE(review): this pickles the Python-side AutoML object. Whether the trained
# models can be restored from it once the H2O cluster is shut down is not
# established here — consider h2o.save_model on the models worth keeping.
joblib.dump(aml, "model.pkl")

In [None]:
leaderboard = aml.leaderboard.as_data_frame()
print(leaderboard)

model_ids = leaderboard['model_id'].tolist()

# Collect out-of-fold and test predictions in one pass so both frames always
# have the same columns in the same order (the Ridge blender below is fitted
# on `oofs` and applied to `preds`).
oof_columns = {}
test_columns = {}
for model_id in model_ids:
    model = h2o.get_model(model_id)
    if model is None:
        continue

    holdout = model.cross_validation_holdout_predictions()
    if holdout is None:
        # A model with no stored holdout predictions cannot contribute an
        # out-of-fold column, so it is left out of both frames.
        print(f"Skipping {model_id}: no cross-validation holdout predictions")
        continue

    oof_columns[model_id] = holdout.as_data_frame()['predict']
    test_columns[model_id] = model.predict(h_test).as_data_frame()['predict']

oofs = pd.DataFrame(oof_columns)
preds = pd.DataFrame(test_columns)

joblib.dump([oofs, preds], "h2o_automl.pkl")

In [None]:
models = list(oofs.columns)

# Target and predictions are on the log1p scale; score on the original scale.
# Arguments follow rmsle(y_true, y_pred): the target goes first.
actual_premium = np.expm1(train['Premium Amount'])
for model in models:
    print(f"{model}: {rmsle(actual_premium, np.expm1(oofs[model]))}")

In [None]:
# Linear blender over the base models' out-of-fold predictions (log1p scale).
ridge = Ridge(alpha=0.1)
ridge.fit(oofs, train['Premium Amount'])

# NOTE: the blender is scored on the rows it was fitted on, so this figure is
# an in-sample (optimistic) estimate.
oof_preds = ridge.predict(oofs)
# Arguments follow rmsle(y_true, y_pred): the target goes first.
print(rmsle(np.expm1(train['Premium Amount']), np.expm1(oof_preds)))

In [None]:
# Blend the per-model test predictions with the fitted Ridge model.
test_predictions = ridge.predict(preds)

# Map the blended prediction back with expm1 (the target was log1p-transformed)
# and write it into the submission template.
sample['Premium Amount'] = np.expm1(test_predictions)
sample.to_csv('submission.csv', index = False)