# Import required Python libraries

In [9]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
import joblib
import os
from pprint import pprint

# Load the dataset files

In [2]:
# Directory that holds the competition CSV files. The "/content" prefix looks
# like a Google Colab working directory (assumption — confirm); change this one
# constant when running the notebook somewhere else.
DATA_DIR = "/content"

# Input files, all resolved relative to DATA_DIR.
train_path = f"{DATA_DIR}/train.csv"
test_path = f"{DATA_DIR}/test.csv"
submission_path = f"{DATA_DIR}/sample_submission.csv"

In [11]:
train_df = pd.read_csv(train_path)

In [12]:
test_df = pd.read_csv(test_path)

In [13]:
sample_submission = pd.read_csv(submission_path)

## Quick look at the data

In [14]:
print("Train shape:", train_df.shape)

Train shape: (1200000, 21)


In [15]:
print("Test shape:", test_df.shape)

Test shape: (800000, 20)


In [16]:
print("Sample submission shape:", sample_submission.shape)

Sample submission shape: (800000, 2)


In [17]:
train_df.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,...,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type,Premium Amount
0,0,19.0,Female,10049.0,Married,1.0,Bachelor's,Self-Employed,22.598761,Urban,...,2.0,17.0,372.0,5.0,2023-12-23 15:21:39.134960,Poor,No,Weekly,House,2869.0
1,1,39.0,Female,31678.0,Divorced,3.0,Master's,,15.569731,Rural,...,1.0,12.0,694.0,2.0,2023-06-12 15:21:39.111551,Average,Yes,Monthly,House,1483.0
2,2,23.0,Male,25602.0,Divorced,3.0,High School,Self-Employed,47.177549,Suburban,...,1.0,14.0,,3.0,2023-09-30 15:21:39.221386,Good,Yes,Weekly,House,567.0
3,3,21.0,Male,141855.0,Married,2.0,Bachelor's,,10.938144,Rural,...,1.0,0.0,367.0,1.0,2024-06-12 15:21:39.226954,Poor,Yes,Daily,Apartment,765.0
4,4,21.0,Male,39651.0,Single,1.0,Bachelor's,Self-Employed,20.376094,Rural,...,0.0,8.0,598.0,4.0,2021-12-01 15:21:39.252145,Poor,Yes,Weekly,House,2022.0


In [18]:
test_df.head()

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [19]:
sample_submission.head()

Unnamed: 0,id,Premium Amount
0,1200000,1102.545
1,1200001,1102.545
2,1200002,1102.545
3,1200003,1102.545
4,1200004,1102.545


In [20]:
train_df.columns.tolist()

['id',
 'Age',
 'Gender',
 'Annual Income',
 'Marital Status',
 'Number of Dependents',
 'Education Level',
 'Occupation',
 'Health Score',
 'Location',
 'Policy Type',
 'Previous Claims',
 'Vehicle Age',
 'Credit Score',
 'Insurance Duration',
 'Policy Start Date',
 'Customer Feedback',
 'Smoking Status',
 'Exercise Frequency',
 'Property Type',
 'Premium Amount']

In [21]:
test_df.columns.tolist()

['id',
 'Age',
 'Gender',
 'Annual Income',
 'Marital Status',
 'Number of Dependents',
 'Education Level',
 'Occupation',
 'Health Score',
 'Location',
 'Policy Type',
 'Previous Claims',
 'Vehicle Age',
 'Credit Score',
 'Insurance Duration',
 'Policy Start Date',
 'Customer Feedback',
 'Smoking Status',
 'Exercise Frequency',
 'Property Type']

In [22]:
# Row identifier column; it is excluded from the model features and reused
# as the key column of the submission file.
ID_COL = "id"
# Regression target; present in the training frame but not in the test frame.
TARGET_COL = "Premium Amount"

In [23]:
# Every column except the row identifier and the target is a model input.
non_feature_cols = (ID_COL, TARGET_COL)
feature_cols = [name for name in train_df.columns if name not in non_feature_cols]

In [24]:
print("Number of features:", len(feature_cols))

Number of features: 19


In [25]:
print("Feature columns:", feature_cols)

Feature columns: ['Age', 'Gender', 'Annual Income', 'Marital Status', 'Number of Dependents', 'Education Level', 'Occupation', 'Health Score', 'Location', 'Policy Type', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration', 'Policy Start Date', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type']


## Check for missing or null values

In [26]:
train_df.isnull().sum()

Unnamed: 0,0
id,0
Age,18705
Gender,0
Annual Income,44949
Marital Status,18529
Number of Dependents,109672
Education Level,0
Occupation,358075
Health Score,74076
Location,0


In [27]:
test_df.isnull().sum()

Unnamed: 0,0
id,0
Age,12489
Gender,0
Annual Income,29860
Marital Status,12336
Number of Dependents,73130
Education Level,0
Occupation,239125
Health Score,49449
Location,0


In [28]:
print("Train data types:\n", train_df.dtypes)

Train data types:
 id                        int64
Age                     float64
Gender                   object
Annual Income           float64
Marital Status           object
Number of Dependents    float64
Education Level          object
Occupation               object
Health Score            float64
Location                 object
Policy Type              object
Previous Claims         float64
Vehicle Age             float64
Credit Score            float64
Insurance Duration      float64
Policy Start Date        object
Customer Feedback        object
Smoking Status           object
Exercise Frequency       object
Property Type            object
Premium Amount          float64
dtype: object


### Function to extract date features

In [29]:
def add_date_features(df, date_col="Policy Start Date"):
    """Return a copy of ``df`` with ``date_col`` replaced by calendar features.

    The column is parsed with ``pd.to_datetime(..., errors="coerce")``, so
    values that cannot be parsed become ``NaT``. Three columns are appended:

    * ``Policy_Start_Year``
    * ``Policy_Start_Month``
    * ``Policy_Start_DayOfWeek`` (pandas convention: Monday=0 ... Sunday=6)

    and the original ``date_col`` is dropped.

    Missing or unparseable dates are kept as ``NaN`` in the derived columns
    instead of being filled with 0. A 0 would be indistinguishable from a real
    Monday in the day-of-week column and would be an out-of-range year/month
    that distorts scaling; leaving ``NaN`` lets a downstream imputer treat
    these rows as genuinely missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    date_col : str, default "Policy Start Date"
        Name of the column holding the date values.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without ``date_col`` and with the three derived
        ``float64`` columns appended.

    Raises
    ------
    KeyError
        If ``date_col`` is not a column of ``df``.
    """
    df = df.copy()
    parsed = pd.to_datetime(df[date_col], errors="coerce")

    derived = {
        "Policy_Start_Year": parsed.dt.year,
        "Policy_Start_Month": parsed.dt.month,
        "Policy_Start_DayOfWeek": parsed.dt.dayofweek,
    }
    for name, values in derived.items():
        # Force float64 so the dtype is the same whether or not NaT values
        # are present (the .dt accessors return integers when nothing is missing).
        df[name] = values.astype("float64")

    return df.drop(columns=[date_col])

### Apply to train & test

In [30]:
train_df = add_date_features(train_df)

In [31]:
test_df = add_date_features(test_df)

In [33]:
# Recompute the feature list: the raw date column has been replaced by the
# derived Policy_Start_* columns, so the earlier list is stale.
excluded_cols = {ID_COL, TARGET_COL}
feature_cols = [col for col in train_df.columns if col not in excluded_cols]

# "number" selects every numeric dtype rather than only int64/float64, so a
# numeric column stored with another width (e.g. int32) is not silently left
# out here and then treated as categorical.
numeric_cols = train_df[feature_cols].select_dtypes(include="number").columns.tolist()
print("Numeric columns:", numeric_cols)

Numeric columns: ['Age', 'Annual Income', 'Number of Dependents', 'Health Score', 'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration', 'Policy_Start_Year', 'Policy_Start_Month', 'Policy_Start_DayOfWeek']


In [34]:
# Any feature that was not classified as numeric is handled as categorical.
categorical_cols = list(filter(lambda name: name not in numeric_cols, feature_cols))
print("Categorical columns:", categorical_cols)

Categorical columns: ['Gender', 'Marital Status', 'Education Level', 'Occupation', 'Location', 'Policy Type', 'Customer Feedback', 'Smoking Status', 'Exercise Frequency', 'Property Type']


# Data Preprocessing Pipeline

In [35]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
import joblib

In [37]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

## Categorical transformer

In [40]:
# Categorical branch: fill gaps with the most frequent value, then map each
# category to an integer code. Categories not seen during fit encode as -1.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [41]:
# Route each column group to its own sub-pipeline; the output places the
# numeric block first, followed by the categorical block.
column_branches = [
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

## Save pipeline for reuse

In [42]:
# NOTE(review): `preprocessor` has only been constructed at this point; no
# fit()/fit_transform() call appears before this cell. The file written here
# therefore holds the pipeline definition without any learned imputation,
# scaling or encoding state, and a reloaded copy must be fitted before it can
# transform data.
joblib.dump(preprocessor, "preprocessor_pipeline.joblib")
print("Preprocessing pipeline saved as preprocessor_pipeline.joblib")

Preprocessing pipeline saved as preprocessor_pipeline.joblib


# Applying Data Preprocessing

In [43]:
X_train = train_df[feature_cols]
y_train = train_df[TARGET_COL]

In [44]:
X_test = test_df[feature_cols]

## Fit the preprocessor

In [45]:
# Learn imputation values, scaling statistics and category codes from the
# training features only, and transform them in the same pass.
X_train_processed = preprocessor.fit_transform(X_train)
print("Processed train shape:", X_train_processed.shape)

# Persist the preprocessor now that it is fitted, so a reloaded copy can call
# transform() directly. This overwrites the copy that was written to the same
# file before fitting.
joblib.dump(preprocessor, "preprocessor_pipeline.joblib")

Processed train shape: (1200000, 21)


In [46]:
X_test_processed = preprocessor.transform(X_test)
print("Processed test shape:", X_test_processed.shape)

Processed test shape: (800000, 21)


# Train the model & predict on the test set

In [47]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [52]:
# Random-forest hyper-parameters, gathered in one place for easy tuning.
forest_params = {
    "n_estimators": 50,   # number of trees
    "max_depth": 10,      # maximum depth of the trees
    "random_state": 42,   # fixed seed so repeated runs build the same forest
    "n_jobs": -1,         # -1 asks scikit-learn to use all available processors
}
model = RandomForestRegressor(**forest_params)

In [53]:
model.fit(X_train_processed, y_train)

In [54]:
test_preds = model.predict(X_test_processed)

# Create submission file

In [56]:
# Pair each test-row identifier with its predicted target value, using the
# same two column names as the sample submission.
submission_columns = {
    ID_COL: test_df[ID_COL],
    TARGET_COL: test_preds,
}
submission_df = pd.DataFrame(submission_columns)

In [57]:
submission_df.to_csv("submission.csv", index=False)
print("Submission file saved as submission.csv")

Submission file saved as submission.csv
