In [16]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
import joblib



import warnings
warnings.filterwarnings("ignore")

In [17]:
import os, sys
# Project root: two directory levels above the notebook's current working directory.
base_dir = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
# Input data and model directories, both located directly under the project root.
data_dir = os.path.join(base_dir, "data")
model_dir = os.path.join(base_dir, "models")
# Put the project root on the import path so modules located there can be imported.
sys.path.append(base_dir)

In [18]:
train_file = os.path.join(data_dir, "train.csv")
test_file = os.path.join(data_dir, "test.csv")
sample_file = os.path.join(data_dir, "sample_submission.csv")

# Drop the 'id' column while loading; no in-place mutation, so re-running the
# cell always starts from the files on disk.
train = pd.read_csv(train_file).drop(columns='id')
test = pd.read_csv(test_file).drop(columns='id')

sample = pd.read_csv(sample_file)

# Externally produced per-row values for train and test.
# NOTE(review): presumably out-of-fold / test predictions of a model fitted on
# the untransformed target -- confirm against the notebook that wrote the file.
# NOTE(security): joblib.load unpickles its input and can execute arbitrary
# code; only load artifacts you produced yourself.
# NOTE(review): this path is relative to the working directory, unlike the
# CSVs above which are resolved through data_dir.
#nonlog_fe , nonlog = joblib.load("cat_non_loged.pkl")
nonlog_fe, nonlog = joblib.load("nonlog.pkl")

# Fail loudly if the stacked values do not line up with the frames row for row.
assert len(nonlog_fe) == len(train), "nonlog train values do not match train rows"
assert len(nonlog) == len(test), "nonlog test values do not match test rows"

train['nonlog'] = nonlog_fe
test['nonlog'] = nonlog

In [19]:
def date(Df):
    """Expand 'Policy Start Date' into calendar features.

    The input frame is NOT modified: a new frame is returned that contains
    every original column except 'Policy Start Date', followed by the derived
    columns in this order:

    Year, Day, Month           -- numeric date parts
    Month_name, Day_of_week    -- English month / weekday names
    Week                       -- ISO calendar week number
    Year_sin, Year_cos         -- sin/cos of 2*pi*Year
    Month_sin, Month_cos       -- sin/cos of Month with period 12
    Day_sin, Day_cos           -- sin/cos of Day with period 31
    Group                      -- (Year - 2020) * 48 + Month * 4 + Day // 7

    Parameters
    ----------
    Df : pandas.DataFrame
        Frame with a 'Policy Start Date' column parseable by pd.to_datetime.

    Returns
    -------
    pandas.DataFrame
        New frame with the date column replaced by the features above.
    """
    stamp = pd.to_datetime(Df['Policy Start Date'])

    # drop() without inplace returns a new frame, so the caller's frame keeps
    # its original columns and the function can be called on it again.
    out = Df.drop('Policy Start Date', axis=1)

    out['Year'] = stamp.dt.year
    out['Day'] = stamp.dt.day
    out['Month'] = stamp.dt.month
    out['Month_name'] = stamp.dt.month_name()
    out['Day_of_week'] = stamp.dt.day_name()
    out['Week'] = stamp.dt.isocalendar().week

    # NOTE(review): Year is a whole number, so sin/cos of 2*pi*Year carry only
    # floating-point noise around 0 and 1. Kept so the feature set is
    # unchanged; consider removing these two columns.
    out['Year_sin'] = np.sin(2 * np.pi * out['Year'])
    out['Year_cos'] = np.cos(2 * np.pi * out['Year'])
    out['Month_sin'] = np.sin(2 * np.pi * out['Month'] / 12)
    out['Month_cos'] = np.cos(2 * np.pi * out['Month'] / 12)
    out['Day_sin'] = np.sin(2 * np.pi * out['Day'] / 31)
    out['Day_cos'] = np.cos(2 * np.pi * out['Day'] / 31)

    # Coarse time bucket built from year offset, month and 7-day slice of the month.
    out['Group'] = (out['Year'] - 2020) * 48 + out['Month'] * 4 + out['Day'] // 7

    return out

In [20]:
def fe(df):
    """Add a 'contract length' bucket derived from 'Insurance Duration'.

    Durations are binned into three integer classes:
    0 for values <= 1, 1 for values in (1, 3], 2 for values > 3.
    Missing durations are replaced by 99 first and therefore land in class 2.

    The column is added to ``df`` in place and the same frame is returned.
    """
    duration = df["Insurance Duration"].fillna(99)
    edges = [-float('inf'), 1, 3, float('inf')]
    bucket = pd.cut(duration, bins=edges, labels=[0, 1, 2])
    df['contract length'] = bucket.astype(int)
    return df
    

In [21]:
def feature_engineering(df):
    """Add a missing-value count and integer/string views of four numeric columns.

    Steps, applied to ``df`` in place (the same frame is returned):

    1. 'MissingValuesCount' -- number of missing cells per row, computed
       before any other column is added or rewritten.
    2. For each of 'Health Score', 'Credit Score', 'Previous Claims' and
       'Annual Income':
       * '<col> Integer' holds int(value) for present values and the
         original missing value otherwise;
       * the original column is overwritten with its string form, with
         missing values replaced by the literal 'None'.
    """
    df['MissingValuesCount'] = df.isna().sum(axis=1)

    duplicated_cols = ('Health Score', 'Credit Score', 'Previous Claims', 'Annual Income')
    for source_col in duplicated_cols:
        values = df[source_col]
        # Truncated copy first, while the source column is still numeric.
        df[source_col + ' Integer'] = values.apply(
            lambda v: int(v) if pd.notna(v) else v
        )
        df[source_col] = values.fillna('None').astype('string')

    return df

# Run both frames through the same preparation steps, in the same order:
# feature_engineering -> date -> fe.
train = fe(date(feature_engineering(train)))
test = fe(date(feature_engineering(test)))

# Columns still stored with object dtype are treated as categorical.
cat_cols = [name for name, dtype in train.dtypes.items() if dtype == 'object']
# Model inputs are exactly the test columns (train additionally carries the target).
feature_cols = test.columns.tolist()



In [22]:
class CategoricalEncoder:
    """Frequency-encode categorical columns of a train/test pair.

    The encoder keeps references to the two frames it is given and adds the
    encoded columns directly to them.
    """

    def __init__(self, train, test):
        self.train = train
        self.test = test

    def frequency_encode(self, cat_cols, feature_cols, drop_org=False):
        """Add a '<col>_freq' column for every column in ``cat_cols``.

        The frequency of a category is its number of occurrences in train
        and test together. Missing values are not counted and therefore map
        to NaN in the encoded column.

        Parameters
        ----------
        cat_cols : iterable of str
            Columns to encode; each must exist in both frames.
        feature_cols : list of str
            Current feature list. It is not modified; an updated copy is returned.
        drop_org : bool, default False
            When True, the source column is removed from the returned feature
            list (the column itself stays in the frames).

        Returns
        -------
        tuple
            ``(train, test, new_cat_cols, feature_cols)`` where ``new_cat_cols``
            lists the created '<col>_freq' names and ``feature_cols`` is the
            updated feature list.
        """
        # Work on a copy so the caller's list is never changed behind its back.
        feature_cols = list(feature_cols)
        new_cat_cols = []

        for col in cat_cols:
            new_col_name = f"{col}_freq"

            # Only the column being encoded is concatenated; copying the full
            # frames just to count one column is unnecessary.
            both = pd.concat([self.train[col], self.test[col]], axis=0, ignore_index=True)
            freq_encoding = both.value_counts().to_dict()

            self.train[new_col_name] = self.train[col].map(freq_encoding).astype('float')
            self.test[new_col_name] = self.test[col].map(freq_encoding).astype('float')

            new_cat_cols.append(new_col_name)
            # Guards keep the feature list free of duplicates and avoid a
            # ValueError when the source column was already removed
            # (e.g. when the cell is executed a second time).
            if new_col_name not in feature_cols:
                feature_cols.append(new_col_name)
            if drop_org and col in feature_cols:
                feature_cols.remove(col)

        return self.train, self.test, new_cat_cols, feature_cols

In [23]:
# Replace the categorical columns in the feature list by their frequency encodings.
encoder = CategoricalEncoder(train, test)
train, test, cat_cols, feature_cols = encoder.frequency_encode(
    cat_cols,
    feature_cols,
    drop_org=True,
)

# Keep only the model inputs, in feature_cols order; train also keeps the target.
train = train[[*feature_cols, 'Premium Amount']]
test = test[feature_cols]

# train = train.fillna(-111)
# test = test.fillna(-111)

In [24]:
# Preview the first rows of the prepared training frame (selected features plus the target).
train.head()

Unnamed: 0,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,nonlog,MissingValuesCount,...,Occupation_freq,Location_freq,Policy Type_freq,Customer Feedback_freq,Smoking Status_freq,Exercise Frequency_freq,Property Type_freq,Month_name_freq,Day_of_week_freq,Premium Amount
0,19.0,10049.0,1.0,22.59876067181393,2.0,17.0,372.0,5.0,1178.491281,0,...,470636.0,663201.0,669475.0,625952.0,996268.0,510693.0,667500.0,162307.0,284861.0,2869.0
1,39.0,31678.0,3.0,15.569730989408043,1.0,12.0,694.0,2.0,967.349082,1,...,,668067.0,665822.0,629122.0,1003732.0,498230.0,667500.0,164442.0,287191.0,1483.0
2,23.0,25602.0,3.0,47.17754928786464,1.0,14.0,,3.0,1105.930282,1,...,470636.0,668732.0,669475.0,614826.0,1003732.0,510693.0,667500.0,165556.0,284861.0,567.0
3,21.0,141855.0,2.0,10.938144158664583,1.0,0.0,367.0,1.0,1259.426494,1,...,,668067.0,664703.0,625952.0,1003732.0,491143.0,666022.0,164442.0,287424.0,765.0
4,21.0,39651.0,1.0,20.376093627736925,0.0,8.0,598.0,4.0,1311.820508,0,...,470636.0,668067.0,669475.0,625952.0,1003732.0,510693.0,667500.0,162307.0,287424.0,2022.0


In [25]:
# Separate the raw target from the model inputs.
y = train['Premium Amount']
X = train.drop(columns='Premium Amount')

# The models are fitted on log1p(target); predictions are mapped back with expm1.
y_log = np.log1p(y)

In [26]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` against ``y_true``.

    Both arguments are passed unchanged to sklearn's ``mean_squared_log_error``;
    the square root of its result is returned.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [27]:
def train_model():
    """Fit one CatBoost regressor per fold of a shuffled 5-fold split.

    Reads the module-level ``X`` (features) and ``y_log`` (log1p target).
    Each fold's model is fitted with the held-out fold as evaluation set and
    early stopping after 300 rounds. Held-out predictions are clipped at 0
    on the log scale and stored as out-of-fold predictions; the fold RMSLE
    is printed after mapping targets and predictions back with expm1.

    Returns
    -------
    tuple
        ``(models, oof)``: the list of fitted models (one per fold) and the
        out-of-fold predictions on the log1p scale, aligned with ``X``.
    """
    catboost_params = dict(
        iterations=3000,
        learning_rate=0.05,
        depth=6,
        eval_metric="RMSE",
        random_seed=42,
        verbose=200,
        task_type='GPU',
        l2_leaf_reg=0.7,
    )

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    oof = np.zeros(len(X))
    models = []

    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(X)):
        print(f"Fold {fold + 1}")

        X_fit, y_fit = X.iloc[fit_rows], y_log.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y_log.iloc[holdout_rows]

        model = CatBoostRegressor(**catboost_params)
        model.fit(
            X_fit,
            y_fit,
            eval_set=(X_holdout, y_holdout),
            early_stopping_rounds=300,
        )
        models.append(model)

        # Clip on the log scale so expm1 below never yields a negative premium.
        holdout_pred = np.maximum(0, model.predict(X_holdout))
        oof[holdout_rows] = holdout_pred

        fold_rmsle = rmsle(np.expm1(y_holdout), np.expm1(holdout_pred))
        print(f"Fold {fold + 1} RMSLE: {fold_rmsle}")

    return models, oof

In [28]:
# from axyom_utilities.training import train_model_cv
# from axyom_utilities.wrappers import CatBoostRegressorWrapper

# model_generator = lambda: CatBoostRegressorWrapper(
#             iterations=3000,
#             learning_rate=0.05,
#             depth=6,
#             eval_metric="RMSE",
#             random_seed=42,
#             verbose=200,
#             task_type='GPU',
#             l2_leaf_reg =  0.7,
#         )

# results = train_model_cv(model_generator, X, y_log, early_stopping_rounds=300, seed=42)

# models, oof = results['models'], results['oof_preds']

# Run the cross-validated training: `models` holds one fitted model per fold,
# `oof` the out-of-fold predictions on the log1p scale.
models,oof = train_model()

Fold 1
0:	learn: 1.0913273	test: 1.0920441	best: 1.0920441 (0)	total: 8.55ms	remaining: 25.6s
200:	learn: 1.0348360	test: 1.0356772	best: 1.0356772 (200)	total: 1.52s	remaining: 21.2s
400:	learn: 1.0325596	test: 1.0343266	best: 1.0343266 (400)	total: 3.04s	remaining: 19.7s
600:	learn: 1.0310873	test: 1.0339286	best: 1.0339286 (600)	total: 4.57s	remaining: 18.2s
800:	learn: 1.0297576	test: 1.0336781	best: 1.0336768 (799)	total: 6.11s	remaining: 16.8s
1000:	learn: 1.0285366	test: 1.0335869	best: 1.0335827 (993)	total: 7.67s	remaining: 15.3s
1200:	learn: 1.0273963	test: 1.0335613	best: 1.0335468 (1123)	total: 9.22s	remaining: 13.8s
1400:	learn: 1.0263099	test: 1.0335274	best: 1.0335214 (1396)	total: 10.8s	remaining: 12.3s
1600:	learn: 1.0252257	test: 1.0335259	best: 1.0335117 (1483)	total: 12.3s	remaining: 10.8s
bestTest = 1.033511661
bestIteration = 1483
Shrink model to first 1484 iterations.
Fold 1 RMSLE: 1.0335116316918465
Fold 2
0:	learn: 1.0915954	test: 1.0908269	best: 1.0908269 (0)	

In [29]:
# Overall out-of-fold RMSLE: the log-scale OOF predictions are mapped back
# with expm1 and compared against the raw target.
print(rmsle(y, np.expm1(oof)))

1.0334342776597698


In [30]:
# Average the fold models' test predictions on the original (non-log) scale;
# each model's prediction is mapped back with expm1 and clipped at 0 first.
n_models = len(models)
test_predictions = np.zeros(len(test))

for model in models:
    raw_scale_pred = np.expm1(model.predict(test))
    test_predictions += np.maximum(0, raw_scale_pred) / n_models


# Write the averaged predictions into the sample submission and save it.
sample['Premium Amount'] = test_predictions
sample.to_csv('submission.csv', index=False)
sample.head()

Unnamed: 0,id,Premium Amount
0,1200000,868.360835
1,1200001,905.245474
2,1200002,804.51898
3,1200003,842.603526
4,1200004,801.46253
