In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the training and test tables from CSV files in the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
# Column names of the training frame.
train.columns

In [None]:
# First rows of the training frame.
train.head()

## Data preprocessing

In [None]:
# Number of rows per target value.
train["Response"].value_counts()

In [None]:
# Number of missing values per column.
train.isna().sum()

Note: 
+ no null
+ label: Response 
+ id: no use
+ Gender: binary 
+ Age: -> change to category
+ Driving_License: binary  
+ Region_Code: change to category
+ Previously_Insured: binary
+ Vehicle_Damage: binary
+ Vehicle_Age: category
+ Annual_Premium: numerical
+ Policy_Sales_Channel: -> change to category
+ Vintage: -> change to category

Important: 
+ this is an imbalanced dataset

## EDA & Feature Selection

### Binary cols

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt

In [None]:
# Rows grouped by the target.
# NOTE(review): group_res is not referenced by any later visible cell.
group_res = train.groupby('Response')

In [None]:
# Response counts within each Previously_Insured value.
train.groupby('Previously_Insured')["Response"].value_counts()

In [None]:
# Response counts within each Gender value.
train.groupby('Gender')["Response"].value_counts()

In [None]:
# Response counts within each Driving_License value.
train.groupby('Driving_License')["Response"].value_counts()

In [None]:
# Response counts within each Vehicle_Age value.
train.groupby('Vehicle_Age')["Response"].value_counts()

In [None]:
# Response counts within each Vehicle_Damage value.
train.groupby('Vehicle_Damage')["Response"].value_counts()

Note:
+ people with no previous insurance have a higher chance of responding
+ males have a slightly higher response rate
+ people without a driving license are not likely to respond
+ the older the vehicle (higher Vehicle_Age), the more likely people are to respond
+ people with vehicle damage are likely to respond

### Numerical cols

Age, Region_Code, Annual_Premium, Policy_Sales_Channel, Vintage


In [None]:
def plot_all_cols(data,cols):
    """Draw per-value count bars for responders and non-responders.

    One figure row is produced per entry of ``cols``: the left panel shows the
    value counts of that column for rows with ``Response == 1``, the right
    panel (red bars) the counts for rows with ``Response == 0``. Each column
    name is printed as it is processed.

    Parameters
    ----------
    data : DataFrame containing a ``Response`` column and every column in ``cols``.
    cols : sequence of column names to plot.
    """
    responded = data["Response"] == 1
    not_responded = data["Response"] == 0
    n_rows = len(cols)
    plt.figure(figsize=(20, 5 * n_rows))
    for row, col in enumerate(cols):
        print(col)
        # Left panel first (default colour), then right panel in red.
        panels = ((responded, {}), (not_responded, {"color": 'r'}))
        for offset, (mask, bar_kwargs) in enumerate(panels, start=1):
            plt.subplot(n_rows, 2, row * 2 + offset)
            levels, freq = np.unique(data[col].loc[mask], return_counts=True)
            plt.bar(levels, freq, **bar_kwargs)
    plt.tight_layout()
        

In [None]:
# Side-by-side count plots (responders vs non-responders) for three columns.
plot_all_cols(train,["Age","Region_Code","Policy_Sales_Channel"])

In [7]:
def cal_pct_diff(data,col):
    """Return the responder / non-responder count ratio for each value of ``col``.

    Parameters
    ----------
    data : DataFrame with a ``Response`` column (1 = responded, 0 = did not)
        and the column named by ``col``.
    col : name of the column whose distinct values are analysed.

    Returns
    -------
    dict
        Maps every value of ``col`` that has at least one responder to
        ``count(Response == 1) / count(Response == 0)`` for that value, in
        ascending value order. A value with responders but no non-responders
        maps to ``inf``. Values with no responders are not included.
    """
    id_res = (data["Response"] == 1)
    id_no_res = (data["Response"] == 0)
    value_res,count_res = np.unique(data[col].loc[id_res],return_counts=True)
    value_no_res,count_no_res = np.unique(data[col].loc[id_no_res],return_counts=True)
    # Pair the counts by value, not by array position: the two classes do not
    # necessarily contain the same set of values, and positional pairing would
    # then divide by the count of a different value (or run off the end).
    no_res_by_value = dict(zip(value_no_res,count_no_res))
    res = {}
    for value,n_res in zip(value_res,count_res):
        n_no_res = no_res_by_value.get(value,0)
        res[value] = n_res/n_no_res if n_no_res else float('inf')
    return res

In [4]:
# Overall baseline: number of responders divided by number of non-responders
# (a ratio between the two classes, not a share of all rows).
pct_res = (train["Response"] == 1).sum()/(train["Response"] == 0).sum()
pct_res

0.14024733444542023

In [None]:
# Responder/non-responder ratio for each Vintage value.
dict_pct = cal_pct_diff(train,"Vintage")
keys = np.array(list(dict_pct.keys()))
values = np.array(list(dict_pct.values()))
# plt.figure(figsize=(20,5))
plt.bar(keys,values)
# Red reference line at the overall ratio, drawn once.
plt.plot([10,300],[pct_res,pct_res],'r')

In [None]:
def draw_plot_range_pct(range_pct,keys,values,pct_res=pct_res):
    """Plot the sorted ratio profile together with candidate cut levels.

    The bars use ``keys`` as x positions and ``values`` sorted in ascending
    order as heights; the sort is applied to ``values`` only, so a bar is not
    necessarily drawn above the key it was computed for. A red horizontal line
    marks ``pct_res`` and one green horizontal line is drawn per entry of
    ``range_pct``. All lines span from ``min(keys)`` to ``max(keys)``.

    Parameters
    ----------
    range_pct : iterable of levels to draw as green lines.
    keys : x positions of the bars.
    values : bar heights (sorted before plotting).
    pct_res : level of the red reference line; the default is bound to the
        module-level ``pct_res`` at the time this function is defined.
    """
    plt.bar(keys, sorted(values))

    def _hline(level, fmt):
        # Horizontal line across the full key range at the given height.
        plt.plot([min(keys), max(keys)], [level, level], fmt)

    _hline(pct_res, 'r')
    for level in range_pct:
        _hline(level, 'g')

In [None]:
# Candidate ratio thresholds for grouping Vintage values, overlaid on the sorted ratio plot.
range_pct = [0.08,0.1,0.12,0.17,0.22,0.28,0.3,0.33,0.37]
draw_plot_range_pct(range_pct,keys,values)

In [3]:
def create_groups(df, column_name,keys,values, range_pct):
    """Bucket the values of a column by a per-value score and store the bucket label.

    Each key is assigned to ``Group{i}``, where ``i`` is the index of the
    half-open interval ``(lower, upper]`` that contains its score. The
    intervals are delimited by ``-inf``, the thresholds in ascending order,
    and ``+inf``. The label is written to a new column
    ``f"{column_name}Group"``; rows whose value is not among the grouped keys
    get ``'Unknown'``.

    Parameters
    ----------
    df : DataFrame to annotate. It is modified in place and also returned.
    column_name : name of the column whose values are looked up in ``keys``.
    keys : sequence of distinct column values.
    values : sequence of scores, one per key, in the same order as ``keys``.
    range_pct : iterable of numeric thresholds (list, tuple or array); sorted
        internally, so the order given does not matter.

    Returns
    -------
    The same ``df`` object with the new group column added.

    Raises
    ------
    ValueError
        If ``keys`` and ``values`` differ in length.
    """
    # Coerce so that plain lists work and boolean-mask indexing is valid.
    keys = np.asarray(keys)
    values = np.asarray(values, dtype=float)
    if len(keys) != len(values):
        raise ValueError("keys and values must have the same length")

    # Build the edges from a real list: concatenating a list with an ndarray
    # would broadcast-add instead of appending, and a tuple would raise.
    bins = [float('-inf')] + sorted(float(t) for t in range_pct) + [float('inf')]

    dict_cutten_groups = {}
    total_grouped = 0
    for i in range(1, len(bins)):
        idx = (values > bins[i-1]) & (values <= bins[i])
        total_grouped += int(idx.sum())
        for value in keys[idx]:
            dict_cutten_groups[value] = f'Group{i-1}'

    # Scores that fall in no interval (NaN or -inf) leave their key ungrouped.
    if total_grouped != len(keys):
        print(f"Warning: {len(keys) - total_grouped} keys are missing from groups")

    new_column_name = f'{column_name}Group'
    df[new_column_name] = df[column_name].map(dict_cutten_groups).fillna('Unknown')

    return df

In [None]:
# Adds the VintageGroup column to train in place; the returned frame is this cell's output.
create_groups(train,"Vintage",keys,values,range_pct)

In [None]:
# Number of rows per Vintage group.
train["VintageGroup"].value_counts()

In [6]:
# Responder/non-responder ratio for each Region_Code value.
# NOTE(review): this cell's recorded execution count (6) is lower than that of
# the cell defining cal_pct_diff (7), which is consistent with the NameError
# recorded below; run the notebook in document order.
dict_pct = cal_pct_diff(train,"Region_Code")
keys = np.array(list(dict_pct.keys()))
values = np.array(list(dict_pct.values()))
# Ratios below 0.9 are drawn at their true height; the rest are drawn as red bars of height 1.
idx = values < 0.9
plt.bar(keys[idx],values[idx])
plt.bar(keys[~idx],1,color='r')
plt.plot([0,52],[pct_res,pct_res],'r')

NameError: name 'cal_pct_diff' is not defined

In [None]:
# Copy of the ratios with every entry not selected by idx set to 0.9, used for plotting only.
values_tmp = values.copy() 
values_tmp[~idx] = 0.9

In [None]:
# Candidate ratio thresholds for grouping Region_Code values.
range_pct = [0.04,0.07,0.11,0.2,0.26,0.42,0.6,1]

In [None]:
draw_plot_range_pct(range_pct=range_pct,keys=keys,values=values_tmp)

In [None]:
range_pct += [3]

In [5]:
create_groups(train,"Region_Code",keys,values,range_pct)

NameError: name 'keys' is not defined

In [None]:
train["Region_Code"][train["Region_CodeGroup"] == "Unknown"]

In [None]:
train["Policy_Sales_Channel"].value_counts()

In [None]:
dict_pct = cal_pct_diff(train,"Policy_Sales_Channel")
keys = np.array(list(dict_pct.keys()))
values = np.log1p(np.array(list(dict_pct.values())))
idx = values < 0.4
# plt.bar(keys,values)
plt.bar(keys[idx],values[idx])
# plt.bar(keys[~idx],1,color='r')
plt.plot([min(keys),max(keys)],[pct_res,pct_res],'r')

In [None]:
values_tmp = values.copy() 
values_tmp[~idx] = 0.4

In [None]:
range_pct = [0.05,0.1,0.2,0.25,0.39]

In [None]:
draw_plot_range_pct(range_pct=range_pct,keys=keys,values=values_tmp)

In [None]:
idx2 = values > 0.4
# plt.bar(keys,values)
plt.bar(keys[idx2],values[idx2])
# plt.bar(keys[~idx],1,color='r')
plt.plot([min(keys),max(keys)],[pct_res,pct_res],'r')

In [None]:
range_pct_2 = [1.1,2,2.7,3.3,4,4.9,6.8]
draw_plot_range_pct(range_pct=range_pct_2,keys=keys,values=values)

In [None]:
# Low-range and high-range thresholds combined into one list.
final_range_pct = range_pct + range_pct_2

In [None]:
# Adds the Policy_Sales_ChannelGroup column to train; values holds the log1p-transformed ratios here.
create_groups(train,"Policy_Sales_Channel",keys,values,final_range_pct)

In [None]:
# Number of rows per sales-channel group.
train["Policy_Sales_ChannelGroup"].value_counts()

In [None]:
# Responder/non-responder ratio for each Age value, with the overall ratio as a red line.
dict_pct = cal_pct_diff(train,"Age")
keys = np.array(list(dict_pct.keys()))
values = np.array(list(dict_pct.values()))
# plt.figure(figsize=(20,5))
plt.bar(keys,values)
plt.plot([20,84],[pct_res,pct_res],'r')

In [None]:
# Candidate ratio thresholds for grouping ages.
range_pct = [0.04,0.07,0.12,0.16,0.205,0.235,0.265,0.287]

In [None]:
draw_plot_range_pct(range_pct=range_pct,keys=keys,values=values)

In [None]:
# Adds the AgeGroup column to train.
create_groups(train,"Age",keys,values,range_pct)

In [None]:
# Number of rows per age group.
train["AgeGroup"].value_counts()

In [None]:
# log1p-transformed premium, stored as a new column.
train["Annual_log"] = train["Annual_Premium"].apply(np.log1p)

In [None]:
train["Annual_log"].describe()

In [None]:
# Cut the log premium into 12 labelled intervals; edges are in log1p units and each interval is closed on the right.
bins = [float('-inf'), 8.0, 8.5,9.0,9.5,10,10.5,11,11.5,12,12.5,13,float('inf')]
labels = ['-8','8-8.5','8.5-9','9-9.5','9.5-10','10-10.5','10.5-11','11-11.5','11.5-12','12-12.5','12.5-13','13+']
train.loc[:, "AnnualGroup"]= pd.cut(train["Annual_log"],bins=bins,labels=labels,right=True, include_lowest=True)

In [None]:
# Row counts per (AnnualGroup, Response) pair, pivoted so each Response value is a column.
# NOTE(review): the variable is named grouped_age but holds AnnualGroup counts.
grouped_age= train.groupby(['AnnualGroup', 'Response']).size().unstack(fill_value=0)
grouped_age.plot(kind='bar', stacked=False, color=['red', 'green'])

In [None]:
# Responder/non-responder ratio for each AnnualGroup label, with the overall ratio as a red line.
dict_pct = cal_pct_diff(train,"AnnualGroup")
keys = np.array(list(dict_pct.keys()))
values = np.array(list(dict_pct.values()))
# plt.figure(figsize=(20,5))
plt.bar(keys,values)
plt.plot([0,len(keys)-1],[pct_res,pct_res],'r')

In [None]:
# Number of rows per premium group.
train["AnnualGroup"].value_counts()

In [None]:
# Display the training frame with all engineered columns.
train

## Feature engineering

In [None]:
# Feature columns kept for modelling, and the target column (as a one-element list).
columns = ["Gender","Driving_License","Previously_Insured","Vehicle_Age","Vehicle_Damage","Region_CodeGroup","Policy_Sales_ChannelGroup","VintageGroup","AgeGroup","AnnualGroup","Annual_Premium"]
label = ["Response"]

In [None]:
df_train = train[columns+label]
X_train = df_train.drop(columns=label)
# label is a list, so y_train is a one-column DataFrame rather than a Series.
y_train = df_train[label]

In [None]:
# Column roles used by the encoding and scaling steps.
binary_cols = ["Gender","Driving_License","Previously_Insured","Vehicle_Damage"]
multi_cols = ["Vehicle_Age","Region_CodeGroup","Policy_Sales_ChannelGroup","VintageGroup","AgeGroup","AnnualGroup"]
num_cols = ["Annual_Premium"]

In [None]:
# One hot encoding
# Multi-level columns keep every level; binary columns drop their first level.
X_train = pd.get_dummies(X_train, columns=multi_cols)
X_train = pd.get_dummies(X_train, columns=binary_cols, drop_first=True)

In [None]:
X_train.columns

In [None]:
# Standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])

In [None]:
concat_df_train = pd.concat([X_train,y_train],axis=1) 

In [None]:
corr_matrix = concat_df_train.corr()

In [None]:
# Keep the columns whose absolute correlation with Response exceeds the threshold.
corr_threshold = 0.03
high_corr_cols = corr_matrix[abs(corr_matrix['Response']) > corr_threshold].index
sorted_corr_matrix = concat_df_train[high_corr_cols].corr()
sns.heatmap(sorted_corr_matrix, annot=True,cmap='coolwarm')
# Last expression of the cell so the ranked correlations are actually displayed
# (an expression in the middle of a cell is evaluated and thrown away).
sorted_corr_matrix['Response'].sort_values(ascending=False)


## Choosing Model

### Split val-train

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split stratified on the target.
# NOTE(review): X_train and y_train are rebound to the training part, so
# re-running this cell splits the already-split data again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42,stratify=y_train)

In [None]:
# Class counts in the training and validation parts.
y_train.value_counts(), y_val.value_counts()

### RandomForest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, f1_score

# warm_start=True makes each fit() call keep the existing trees and train only
# the newly requested ones. The counter is bumped before every fit, so the
# first chunk trains 110 trees and each later chunk adds 10 trees that see only
# that chunk.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, warm_start=True)
chunk_size = 1000000  # Adjust based on your available memory
for i in range(0, len(X_train), chunk_size):
    X_chunk = X_train[i:i+chunk_size]
    y_chunk = y_train[i:i+chunk_size].values.reshape(-1)
    rf_classifier.n_estimators += 10  # Grow 10 new trees each iteration
    rf_classifier.fit(X_chunk, y_chunk)

# Hard labels are what F1 needs; ROC AUC needs a continuous score, so use the
# predicted probability of the positive class (located via classes_).
y_pred = rf_classifier.predict(X_val)
positive_col = list(rf_classifier.classes_).index(1)
y_score = rf_classifier.predict_proba(X_val)[:, positive_col]

# Evaluate the model
auc = roc_auc_score(y_val, y_score)
f1 = f1_score(y_val, y_pred)
print(f'AUC: {auc:.4f}')
print(f'F1 Score: {f1:.4f}')



In [None]:
import joblib
# Persist the fitted forest to a file in the working directory.
joblib.dump(rf_classifier, 'rf_classifier_model.joblib')

### Logistic Regression

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score, f1_score
# Logistic regression trained incrementally with SGD, one pass over the data in batches.
clf = SGDClassifier(loss='log_loss', random_state=42)

batch_size = 1000000
n_rows = X_train.shape[0]
# Ceiling division so a partial final batch is included in the reported total.
n_batches = -(-n_rows // batch_size)
# partial_fit needs the full label set up front; compute it once.
classes = np.unique(y_train)
for i in range(0, n_rows, batch_size):
    print(f'Training batch {i // batch_size + 1}/{n_batches}')
    X_batch = X_train[i:i + batch_size]
    y_batch = y_train[i:i + batch_size].values.reshape(-1)
    clf.partial_fit(X_batch, y_batch, classes=classes)

# Predict on the validation set
# Hard labels for F1; signed distance to the decision boundary as the
# continuous score for ROC AUC.
y_pred = clf.predict(X_val)
y_score = clf.decision_function(X_val)

# Evaluate the model
auc = roc_auc_score(y_val, y_score)
f1 = f1_score(y_val, y_pred)
print(f'AUC: {auc:.4f}')
print(f'F1 Score: {f1:.4f}')

In [None]:
# Shape of the training target after the split.
y_train.shape