In [17]:
import pandas as pd
import numpy as np


In [18]:
df_train = pd.read_csv('train.csv')

In [19]:
df_train.dropna(inplace=True)

In [20]:
df_train.isnull().sum()

id                      0
Age                     0
Gender                  0
Annual Income           0
Marital Status          0
Number of Dependents    0
Education Level         0
Occupation              0
Health Score            0
Location                0
Policy Type             0
Previous Claims         0
Vehicle Age             0
Credit Score            0
Insurance Duration      0
Policy Start Date       0
Customer Feedback       0
Smoking Status          0
Exercise Frequency      0
Property Type           0
Premium Amount          0
dtype: int64

In [21]:
df_train.shape

(384004, 21)

In [22]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
objs = ['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location', 'Policy Type', 'Policy Start Date', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type']
df_train[objs] = df_train[objs].apply(label_encoder.fit_transform)


In [23]:
X = df_train.drop(['id', 'Premium Amount'], axis=1)
y = df_train[['Premium Amount']]

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [50]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [51]:
from sklearn.decomposition import PCA

pca = PCA(n_components=12)
pca.fit(X_train)


In [52]:
print(pca.explained_variance_ratio_)

[0.06352878 0.05369595 0.0533335  0.053252   0.05315464 0.05300885
 0.05287936 0.05283368 0.05273712 0.0526276  0.0525958  0.05251564]


In [53]:
X_train_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)
X_train_pca.shape


(268802, 12)

## Lasso Regression

In [54]:
def calRMSLE(y_test, y_pred):
    from sklearn.metrics import mean_squared_error

    y_test = np.log1p(y_test)
    y_pred = np.clip(y_pred, 0, None)
    y_pred = np.log1p(y_pred)

    mse = mean_squared_error(y_test, y_pred)
    rmsle = np.sqrt(mse)

    return rmsle




In [55]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error

alphas = np.linspace(0.001, 100, 1000)
lasso = LassoCV(cv = 10, alphas=alphas, random_state=42)
lasso.fit(X_train_pca, y_train)
y_pred = lasso.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
rmsle = calRMSLE(y_test, y_pred)
print("mse", mse)
print("rmsle", rmsle)




  y = column_or_1d(y, warn=True)


mse 752539.5892077509
rmsle 1.1676781470354731


## Random Forest

In [48]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(max_depth=10, n_estimators=400, max_features=0.5)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
rsm = calRMSLE(y_test, y_pred)

print('mse', mse)
print('rsm', rsm)

  return fit_method(estimator, *args, **kwargs)


mse 717631.260604343
rsm 1.1422765682112308


## Adaboost

In [57]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor

base_estimator = DecisionTreeRegressor(max_depth=3)

# Initialize AdaBoostRegressor with the base estimator
ada_boost = AdaBoostRegressor(estimator=base_estimator, n_estimators=250, learning_rate=0.6, random_state=42)

# Fit the model
ada_boost.fit(X_train, y_train)

# Predict on the test set
y_pred = ada_boost.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
rse = calRMSLE(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"RSE: {rse}")


  y = column_or_1d(y, warn=True)


Mean Squared Error: 853916.007858458
RSE: 1.2828832759967987
