In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2, f_regression
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [2]:
# Read the training and test tables from CSV files in the working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Preview the first five rows of the test set.
test.head(n=5)

Unnamed: 0,id,Age,Gender,Annual Income,Marital Status,Number of Dependents,Education Level,Occupation,Health Score,Location,Policy Type,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Policy Start Date,Customer Feedback,Smoking Status,Exercise Frequency,Property Type
0,1200000,28.0,Female,2310.0,,4.0,Bachelor's,Self-Employed,7.657981,Rural,Basic,,19.0,,1.0,2023-06-04 15:21:39.245086,Poor,Yes,Weekly,House
1,1200001,31.0,Female,126031.0,Married,2.0,Master's,Self-Employed,13.381379,Suburban,Premium,,14.0,372.0,8.0,2024-04-22 15:21:39.224915,Good,Yes,Rarely,Apartment
2,1200002,47.0,Female,17092.0,Divorced,0.0,PhD,Unemployed,24.354527,Urban,Comprehensive,,16.0,819.0,9.0,2023-04-05 15:21:39.134960,Average,Yes,Monthly,Condo
3,1200003,28.0,Female,30424.0,Divorced,3.0,PhD,Self-Employed,5.136225,Suburban,Comprehensive,1.0,3.0,770.0,5.0,2023-10-25 15:21:39.134960,Poor,Yes,Daily,House
4,1200004,24.0,Male,10863.0,Divorced,2.0,High School,Unemployed,11.844155,Suburban,Premium,,14.0,755.0,7.0,2021-11-26 15:21:39.259788,Average,No,Weekly,House


In [4]:
# Replace the raw policy start timestamp with separate year and month
# features, applying the same in-place steps to both frames.
for df in (train, test):
    # Values that cannot be parsed become NaT instead of raising.
    start_dates = pd.to_datetime(df['Policy Start Date'], errors='coerce')
    df['Policy_Start_Year'] = start_dates.dt.year
    df['Policy_Start_Month'] = start_dates.dt.month
    # The original timestamp column is not used after this point.
    df.drop(columns='Policy Start Date', inplace=True)

In [5]:
# Target column the model is fit against.
Y_train = train['Premium Amount']

# Training features: every column except the target and the row identifier.
X_train = train.drop(columns=['Premium Amount', 'id'])

# The test frame is used as-is here (same object as `test`, not a copy).
X_test = test


In [6]:
# Feature columns passed to the model, in the order they are presented to it.
column_names = [
    'Age', 'Annual Income', 'Number of Dependents', 'Health Score',
    'Previous Claims', 'Vehicle Age', 'Credit Score', 'Insurance Duration',
    'Marital Status', 'Occupation', 'Customer Feedback', 'Education Level',
    'Location', 'Policy Type', 'Gender', 'Smoking Status',
    'Exercise Frequency', 'Property Type',
    'Policy_Start_Year', 'Policy_Start_Month',
]

# X_train and X_test are already DataFrames; keep only the feature columns
# (this is what removes 'id' from X_test). Plain indexing raises KeyError on
# a missing or misspelled name, whereas pd.DataFrame(df, columns=...) would
# silently create an all-NaN column that the imputers later hide.
X_train = X_train[column_names]
X_test = X_test[column_names]


In [7]:
# Columns routed to the numeric sub-pipeline.
numerical_features = [
    'Age',
    'Annual Income',
    'Number of Dependents',
    'Health Score',
    'Previous Claims',
    'Vehicle Age',
    'Credit Score',
    'Insurance Duration',
    'Policy_Start_Year',
    'Policy_Start_Month',
]

# Columns routed to the categorical sub-pipeline.
categorical_features = [
    'Marital Status',
    'Education Level',
    'Occupation',
    'Location',
    'Policy Type',
    'Customer Feedback',
    'Gender',
    'Smoking Status',
    'Exercise Frequency',
    'Property Type',
]

# Numeric columns: fill missing values with the column mean (SimpleImputer's
# default strategy, spelled out here), then rescale with MinMaxScaler.
numerical_pipeline = Pipeline(steps=[
    ('impute_num', SimpleImputer(strategy='mean')),
    ('scale', MinMaxScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_pipeline = Pipeline(steps=[
    ('impute_cat', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])


In [8]:

# Apply the numeric and categorical sub-pipelines to their respective column
# lists; any column not named in either list is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
)

# End-to-end model: preprocess -> keep the 30 best-scoring features -> k-NN.
#
# The target is 'Premium Amount', so this is set up as a regression problem:
# features are scored with f_regression and predictions come from
# KNeighborsRegressor. The previous chi2 + KNeighborsClassifier combination
# is intended for classification and treats every distinct target value as
# its own class.
# NOTE(review): assumes 'Premium Amount' is a continuous quantity - confirm
# against the data before relying on this.
# The step names are left as they were so existing references such as
# pipe.named_steps['classifier'] keep working.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(score_func=f_regression, k=30)),
    ('classifier', KNeighborsRegressor(n_neighbors=3)),
])

In [9]:
# Fit on a random subset of rows to keep k-NN tractable.
#
# Rows are sampled once from X_train and the labels are then looked up by the
# sampled index, so every feature row stays paired with its own target.
# Sampling X_train and Y_train independently draws two unrelated sets of rows
# and pairs features with the wrong labels at fit time.
# The sample size is capped at the number of available rows, and the seed is
# fixed so Restart & Run All reproduces the same subset.
X_train_small = X_train.sample(n=min(250000, len(X_train)), random_state=42)
Y_train_small = Y_train.loc[X_train_small.index]

In [10]:
# Fit the whole pipeline on the sampled training subset.
pipe.fit(X=X_train_small, y=Y_train_small)

In [11]:

# Run the fitted pipeline on the test features to get one prediction per row.
Y_pred = pipe.predict(X=X_test)

In [12]:
# Pair each test id with its prediction; the ids are read from `test`, and
# the predictions are attached positionally.
submission = test[['id']].assign(**{'Premium Amount': Y_pred})

# Write the two-column file without the DataFrame index.
submission.to_csv('submission.csv', index=False)