In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

train = pd.read_csv('/kaggle/input/playground-series-s4e12/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s4e12/test.csv')

train

/kaggle/input/playground-series-s4e12/sample_submission.csv
/kaggle/input/playground-series-s4e12/train.csv
/kaggle/input/playground-series-s4e12/test.csv


Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199995,1199995,36.0,Female,27316.0,Married,0.0,Master's,Unemployed,13.772907,Urban,...,,5.0,372.0,3.0,2023-05-03 15:21:39.257696,Poor,No,Daily,Apartment,1303.0
1199996,1199996,54.0,Male,35786.0,Divorced,,Master's,Self-Employed,11.483482,Rural,...,,10.0,597.0,4.0,2022-09-10 15:21:39.134960,Poor,No,Weekly,Apartment,821.0
1199997,1199997,19.0,Male,51884.0,Divorced,0.0,Master's,,14.724469,Suburban,...,0.0,19.0,,6.0,2021-05-25 15:21:39.106582,Good,No,Monthly,Condo,371.0
1199998,1199998,55.0,Male,,Single,1.0,PhD,,18.547381,Suburban,...,1.0,7.0,407.0,4.0,2021-09-19 15:21:39.190215,Poor,No,Daily,Apartment,596.0


In [2]:
train.dtypes

id                        int64
Age                     float64
Gender                   object
Annual Income           float64
Marital Status           object
Number of Dependents    float64
Education Level          object
Occupation               object
Health Score            float64
Location                 object
Policy Type              object
Previous Claims         float64
Vehicle Age             float64
Credit Score            float64
Insurance Duration      float64
Policy Start Date        object
Customer Feedback        object
Smoking Status           object
Exercise Frequency       object
Property Type            object
Premium Amount          float64
dtype: object

In [3]:
from sklearn.model_selection import train_test_split

X = train.drop(['Premium Amount', 'id'], axis=1)
y = train['Premium Amount']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size=.3)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, FunctionTransformer

numerical_columns = X_train.select_dtypes(exclude=['object']).columns.tolist()
categorical_columns = X_train.select_dtypes(include=['object']).columns

numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('convert_to_float32', FunctionTransformer(lambda x: x.astype(np.float32)))
])

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ordinal', OrdinalEncoder(dtype=np.int32, handle_unknown='use_encoded_value', unknown_value=-1))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_columns),
        ('cat', categorical_pipeline, categorical_columns)
    ]
)

X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.fit_transform(X_test)
test_preprocessed = preprocessor.transform(test)

In [5]:
from sklearn import ensemble
from sklearn.metrics import mean_squared_log_error

hgbr = ensemble.HistGradientBoostingRegressor()
model = hgbr.fit(X_train_preprocessed, y_train)

predictions = model.predict(X_test_preprocessed)

print('RMSLE: ', np.sqrt(mean_squared_log_error(y_test, predictions)))

RMSLE:  1.1406322557189277


In [6]:
prediction = pd.DataFrame()
prediction['id'] = test.id

predictions = model.predict(test_preprocessed)

prediction['Premium Amount'] = predictions
prediction.to_csv('PS_S4E12.csv', index=False)

prediction

Unnamed: 0,id,Premium Amount
0,1200000,1256.802206
1,1200001,1120.774837
2,1200002,1075.106878
3,1200003,1093.475876
4,1200004,1042.167031
...,...,...
799995,1999995,1207.712934
799996,1999996,1418.341910
799997,1999997,1111.143341
799998,1999998,1134.023726


## Public Score: 

In [7]:
from xgboost import XGBRegressor

xgbr = XGBRegressor(n_estimators=1000, learning_rate=.1, max_depth=3, verbosity=0)
xgbr.fit(X_train_preprocessed, y_train, early_stopping_rounds=5, eval_set=[(X_test_preprocessed, y_test)], verbose=0)

print('Score: ', xgbr.score(X_train_preprocessed, y_train))

predictions = xgbr.predict(X_test_preprocessed)

print('RMSLE: ', np.sqrt(mean_squared_log_error(y_test, predictions)))



Score:  0.03630852679430918
RMSLE:  1.1474701050564247


In [8]:
prediction = pd.DataFrame()
prediction['id'] = test.id

predictions = xgbr.predict(test_preprocessed)

prediction['Premium Amount'] = predictions
prediction.to_csv('PS_S4E12.csv', index=False)

prediction

Unnamed: 0,id,Premium Amount
0,1200000,1384.075439
1,1200001,1093.506104
2,1200002,1056.324585
3,1200003,1063.421509
4,1200004,1035.237427
...,...,...
799995,1999995,1237.221313
799996,1999996,1202.550049
799997,1999997,1132.023804
799998,1999998,1149.421021
