### Importing the required dependencies

In [1]:
# Standard library
import os
import warnings
from pathlib import Path

# Ignoring warnings
warnings.filterwarnings('ignore')

# Importing other required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Importing required ML libraries
from sklearn.model_selection import train_test_split,KFold,GridSearchCV

# Importing the required regressors
from catboost import CatBoostRegressor
from xgboost import XGBRegressor,XGBClassifier
from sklearn.ensemble import RandomForestRegressor

# Importing the score maker
from sklearn.metrics import make_scorer

# Checking all the columns 
pd.set_option('display.max_columns',None)

### Loading the data

In [2]:
# Directory holding the competition CSV files.
# It can be overridden with the PLAYGROUND_S4E12_DIR environment variable, so the notebook
# runs on other machines without editing this cell; the default is the original location.
DATA_DIR=Path(os.environ.get('PLAYGROUND_S4E12_DIR',r'C:\Users\DELL\playground-series-s4e12'))

# Filepaths for train and test dataset
train_dataset=DATA_DIR/'train.csv'
test_dataset=DATA_DIR/'test.csv'

In [3]:
# Reading the required training & testing data into a pandas dataframe
df_train=pd.read_csv(train_dataset)
df_test=pd.read_csv(test_dataset)

### Metadata Analysis


In [None]:
# Checking the basic data available
df_train.info()

In [None]:
# Checking the shape
df_train.shape

In [None]:
# Checking the null values
(100*df_train.isnull().sum()/len(df_train))

In [None]:
# Checking the first five rows
df_train.head()

In [8]:
# Extracting the Month & Year, then dropping the Policy Start Date column
def date_time_handler(df:pd.DataFrame):
  """
  Replace the 'Policy Start Date' column with 'Month' and 'Year' columns.

  The date column is parsed with pd.to_datetime; the extracted month and year are stored
  with object dtype so that they are later treated as categorical values rather than numbers.

  Parameters:
    df: dataframe containing a 'Policy Start Date' column.

  Returns:
    A new dataframe without 'Policy Start Date' and with 'Month' and 'Year' appended.
    The dataframe passed in is left untouched, so the cell calling this function
    never alters a frame that an earlier cell has already displayed.
  """
  policy_start=pd.to_datetime(df['Policy Start Date'])

  return (
    df.drop(columns=['Policy Start Date'])
      .assign(Month=policy_start.dt.month.astype('object'),
              Year=policy_start.dt.year.astype('object'))
  )

In [9]:
# Applying the date time handler function on train and test dataframe
df_train=date_time_handler(df_train)
df_test=date_time_handler(df_test)

In [10]:
# Making a function which will extract the columns which have missing values below certain threshold
def missing_value_col_below_threshold(df:pd.DataFrame,threshold_value:float)->list:
  """
  List the columns whose percentage of missing values is above 0 and at most threshold_value.

  Parameters:
    df: dataframe to inspect.
    threshold_value: upper bound (inclusive) of the missing value percentage; int or float.

  Returns:
    A list of (column name, missing percentage, column dtype) tuples sorted by
    missing percentage in descending order. Columns without missing values are excluded.

  Raises:
    TypeError: if threshold_value is not a number or df is not a pandas dataframe.
  """
  # Fractional thresholds such as 12.5 are legitimate percentages, so floats are accepted as well
  if not isinstance(threshold_value,(int,float)):
    raise TypeError(f"The given value for threshold is {threshold_value}, which is not a number")

  if not isinstance(df,pd.DataFrame):
    raise TypeError('The given value of the dataframe is not a pandas dataframe')

  missing_percentage=(100*df.isnull().sum()/len(df))

  # A percentage is never negative, so 0<pct keeps exactly the columns that do have missing values
  below_threshold_missing_value=[
    (each_col,each_val,df[each_col].dtype)
    for each_col,each_val in missing_percentage.items()
    if 0<each_val<=threshold_value
  ]

  return sorted(below_threshold_missing_value,key=lambda x:x[1],reverse=True)


In [None]:
# Checking if train & test data set have the same set of columns with missing values (<= 50 %)
# Sets are compared instead of positional arrays: the helper orders the columns by missing percentage,
# which can differ between the two frames, and arrays of different length cannot be compared elementwise

missing_train={each_col for each_col,_,_ in missing_value_col_below_threshold(df_train,50)}
missing_test={each_col for each_col,_,_ in missing_value_col_below_threshold(df_test,50)}

print(missing_train==missing_test)
print(f'Columns with missing values in only one of the two datasets: {sorted(missing_train^missing_test)}')

In [None]:
# Also checking if the missing value percentages in these columns are close to each other
# The percentages are indexed by column name so that the subtraction is aligned per column,
# whatever order the helper returned them in; NaN marks a column reported for only one dataset

missing_train_values=pd.Series({each_col:each_val for each_col,each_val,_ in missing_value_col_below_threshold(df_train,50)},dtype='float64')
missing_test_values=pd.Series({each_col:each_val for each_col,each_val,_ in missing_value_col_below_threshold(df_test,50)},dtype='float64')

print(missing_train_values-missing_test_values)

In [None]:
# Checking the columns in train data set which have missing values below or equal to 15 %
missing_value_col_below_threshold(df_train,15)

In [14]:
# Making a function to extract the values from the missing_value_col_below_threshold function
# Replacing the missing values in respective columns based upon their data types

def replacing_missing_values(df:pd.DataFrame,missing_value_tuple:list[tuple]):
  """
  Impute the listed columns of df: median for float64 columns, mode for object columns.

  Parameters:
    df: dataframe to impute; its columns are overwritten in place.
    missing_value_tuple: list of (column name, missing percentage, dtype) tuples, as
      returned by missing_value_col_below_threshold. An empty list is allowed.

  Returns:
    The same dataframe object, with the missing values of the listed columns filled.
    Columns of any other dtype are left as they are.
  """
  # Iterating over the tuples directly (instead of unpacking zip(*...)) keeps an empty list valid
  for col,_,data_type in missing_value_tuple:
    if data_type=='float64':
      df[col]=df[col].fillna(df[col].median())
    elif data_type=='object':
      most_frequent=df[col].mode()
      # mode() is empty when the column holds no value at all; there is nothing to impute with then
      if not most_frequent.empty:
        df[col]=df[col].fillna(most_frequent[0])

  return df


In [15]:
# Replacing the missing values in the train dataframe by the median or mode if the missing value percentage is less than or equal to 15
col_to_impute=missing_value_col_below_threshold(df_train,15)
df_train=replacing_missing_values(df_train,col_to_impute)

In [None]:
# Checking the missing value columns
df_train.info()

In [None]:
# Checking the values in the Previous Claims & Occupation columns
print(df_train['Previous Claims'].unique())
print(df_train.Occupation.unique())

In [18]:
# Returning the columns which are having higher percentage of missing values
col_with_high_missing_values=missing_value_col_below_threshold(df_train,100)

In [19]:
# Function to convert the Occupation column into a numerical variable so that XGBoost classifier will be able to impute missing values
def occupation_converter(occu:str)->int:
  """
  Encode an occupation label as an integer code.

  "Self-Employed" -> 0, "Employed" -> 1, "Unemployed" -> 2.
  Anything else, including a missing value (NaN / None), is encoded as -1.
  """
  occupation_codes={"Self-Employed":0,"Employed":1,"Unemployed":2}
  return occupation_codes.get(occu,-1) # -1 marks both missing and unexpected values

In [20]:
# Formulating a function to impute the missing values for columns with high percentage of missing values
def preprocessing_high_missing_value_col(df:pd.DataFrame,list_of_tuples:list[tuple])->pd.DataFrame:
  """
  Put a -1 placeholder in place of the missing values of 'Previous Claims' and 'Occupation'.

  Parameters:
    df: dataframe to prepare; the two columns are overwritten in place.
    list_of_tuples: list of (column name, missing percentage, dtype) tuples naming the
      columns that still have missing values. An empty list is allowed.

  Returns:
    The same dataframe object. 'Previous Claims' (if listed) has its missing values filled
    with -1 and is cast to int64; 'Occupation' (if listed) is passed value by value through
    occupation_converter. Any other listed column is left untouched.
  """
  # Iterating over the tuples directly (instead of unpacking zip(*...)) keeps an empty list valid
  for each_col,_,_ in list_of_tuples:
    if each_col=="Previous Claims":
      # Filling missing values with a placeholder of -1 & changing the datatype for XGBClassifier application
      df[each_col]=df[each_col].fillna(-1).astype("int64")
    elif each_col=="Occupation":
      df[each_col]=df[each_col].apply(occupation_converter)

  return df

In [21]:
# Making the train dataset ready for the final missing value imputation
df_train=preprocessing_high_missing_value_col(df_train,col_with_high_missing_values)

### Data preparation for the missing value imputation using XGBoost

In [22]:
# Creating a helper function for the conversion of the object columns in to the category columns

def object_to_category(df:pd.DataFrame)->pd.DataFrame:
  """
  Return a copy of df in which every object dtype column is cast to the category dtype.
  Columns of any other dtype keep their dtype.
  """
  object_cols=df.select_dtypes(include='object').columns
  target_dtypes=dict.fromkeys(object_cols,'category')
  return df.astype(target_dtypes)

In [23]:
# Making the suitable datasets
df_prev_claim=df_train.drop(columns=['Occupation','Premium Amount','id'],axis=1)
df_occu=df_train.drop(columns=['Previous Claims','Premium Amount','id'],axis=1)

In [None]:
# Checking the first five rows
df_occu.head()

In [25]:
# Extracting the train and test data from occupation columns
df_occu_train=df_occu[df_occu['Occupation']!=-1]
df_occu_test=df_occu[df_occu['Occupation']==-1]

In [26]:
# Extracting the label
y_occu=df_occu_train.pop('Occupation')
X_occu=df_occu_train.copy()

In [27]:
# XGBoost needs all columns to be of numeric type but to handle this with some advanced functionalities, I needed to convert the object columns into categorical columns
X_occu=object_to_category(X_occu)

In [28]:
# Using XGB Classifier
xgb_occu=XGBClassifier(objective='multi:softmax',
                       num_class=y_occu.nunique(),
                       eval_metric='merror',
                       max_depth=5,
                       learning_rate=0.01,
                       n_estimators=100,
                       tree_method='hist',
                       enable_categorical=True)

In [None]:
# Fitting the xgb classifier object on the y_occu & X_occu
xgb_occu.fit(X_occu,y_occu)

In [30]:
# Dropping the occupation column from the df_occu_test
# df_occu_test is a filtered slice of df_occu, so the column is dropped on a new frame
# instead of in place (an in-place drop on a slice is a chained-assignment write)
df_occu_test=df_occu_test.drop(columns=['Occupation'])

In [31]:
# Before predicting on the df_occu_test, the object datatype columns are converted into category datatype
df_occu_test=object_to_category(df_occu_test)

# Casting to the categorical dtypes of X_occu (the frame the classifier was fitted on), so that every
# level gets the same integer code at prediction time as at fit time. Categories inferred separately
# on each subset differ as soon as a level is absent from one of them; levels unseen during fitting become NaN
for cat_col in X_occu.select_dtypes(include='category').columns:
  df_occu_test[cat_col]=df_occu_test[cat_col].astype(X_occu[cat_col].dtype)

In [32]:
# Predicting the occupation column from the df_occu_test
y_occu_test=xgb_occu.predict(df_occu_test)

In [33]:
# Now adding the y_occu_test directly into the df_train
df_train.loc[df_train['Occupation']==-1,'Occupation']=y_occu_test

In [None]:
# Checking the first 5 rows of df_prev_claim
df_prev_claim.head()

In [35]:
# Extracting the train and the test from previous claims column
df_prev_claim_train=df_prev_claim[df_prev_claim['Previous Claims']!=-1]
df_prev_claim_test=df_prev_claim[df_prev_claim['Previous Claims']==-1]

In [36]:
# Extracting the previous claims labels
y_prev_claim=df_prev_claim_train.pop('Previous Claims')
X_prev_claim=df_prev_claim_train.copy()

In [37]:
# XGBoost needs all columns to be of numeric type but to handle this with some advanced functionalities, I needed to convert the object columns into categorical columns
X_prev_claim=object_to_category(X_prev_claim)

In [38]:
# Using XGB Classifier
xgb_prev_claim=XGBClassifier(objective='multi:softmax',
                             num_class=y_prev_claim.nunique(),
                             eval_metric='merror',
                             max_depth=5,
                             learning_rate=0.01,
                             n_estimators=100,
                             tree_method='hist',
                             enable_categorical=True)

In [None]:
# Fitting XGB Classifier object on X_prev_claim & y_prev_claim
xgb_prev_claim.fit(X_prev_claim,y_prev_claim)

In [40]:
# Dropping the unnecessary column of previous claims from the df_prev_claim_test
# df_prev_claim_test is a filtered slice of df_prev_claim, so the column is dropped on a new frame
# instead of in place (an in-place drop on a slice is a chained-assignment write)
df_prev_claim_test=df_prev_claim_test.drop(columns=['Previous Claims'])

In [41]:
# Changing the datatype of the object columns to the category for using the XGB Classifier
df_prev_claim_test=object_to_category(df_prev_claim_test)

# Casting to the categorical dtypes of X_prev_claim (the frame the classifier was fitted on), so that every
# level gets the same integer code at prediction time as at fit time. Categories inferred separately
# on each subset differ as soon as a level is absent from one of them; levels unseen during fitting become NaN
for cat_col in X_prev_claim.select_dtypes(include='category').columns:
  df_prev_claim_test[cat_col]=df_prev_claim_test[cat_col].astype(X_prev_claim[cat_col].dtype)

In [42]:
# Predicting the y_prev_claims_test
y_prev_claims_test=xgb_prev_claim.predict(df_prev_claim_test)

In [43]:
# Now adding the y_prev_claims_test to df_train
df_train.loc[df_train['Previous Claims']==-1,'Previous Claims']=y_prev_claims_test

In [None]:
# Checking the null values 
df_train.isnull().sum()

In [None]:
# Checking the unique elements in previous claims and occupation
print(df_train['Occupation'].unique())
print(df_train['Previous Claims'].unique())

In [46]:
# The Occupation column is fully imputed now, so the integer codes produced by occupation_converter
# are translated back into their labels (code 0, 1, 2 -> position in the tuple below)
occupation_mapping=dict(enumerate(("Self-Employed","Employed","Unemployed")))

df_train['Occupation']=df_train['Occupation'].apply(occupation_mapping.get)

### Starting EDA as the missing value imputation has been completed 

In [None]:
# Checking the info of the df_train
df_train.info()

In [48]:
# Dropping the id column from the df_train
df_train.drop(columns=['id'],axis=1,inplace=True)

In [49]:
# Extracting the numeric columns for plotting the box plots
# Only numeric columns with at least 10 distinct values are kept (the others behave like categories)
num_col=set(df_train.select_dtypes(include=['float64','int64']).columns)
high_unique_values={col for col,unique_value in df_train.nunique().items() if unique_value>=10}

# Sorted into a list: the iteration order of a set of strings changes from one kernel start to the next,
# which would shuffle the panels of the plots below between runs
rqd_num_col=sorted(num_col&high_unique_values)

#### Boxplot of numeric features

In [None]:
# Laying the box plots out on a grid
n_cols=2 # I always want to see only two columns
n_rows=(len(rqd_num_col)+1)//n_cols # enough rows to hold every column, two per row
fig,axes=plt.subplots(nrows=n_rows,ncols=n_cols,figsize=(20,20))
axes=axes.flatten()

# Drawing one box plot per numeric column
for col_no,each_col in enumerate(rqd_num_col):
  current_ax=axes[col_no]
  sns.boxplot(data=df_train,x=each_col,ax=current_ax)
  current_ax.set_title(f'Box-plot for {each_col}')

# Removing the grid cells which did not receive a plot
for spare_ax in axes[col_no+1:]:
  fig.delaxes(ax=spare_ax)

# Adjusting the visualization of the graphs
plt.tight_layout()
plt.show()


In [None]:
# Plotting the histograms of the numeric features to understand their distribution
n_cols=2
n_rows=(len(rqd_num_col)+1)//n_cols # enough rows to hold every column, two per row
fig,axes=plt.subplots(nrows=n_rows,ncols=n_cols,figsize=(20,20))
axes=axes.flatten()

# Drawing one histogram per numeric column
for col_no,each_col in enumerate(rqd_num_col):
  ax=axes[col_no]
  ax.hist(df_train[each_col])
  ax.set_title(f'Histogram for {each_col}')
  ax.set_xlabel(f'{each_col}')
  ax.set_ylabel('Frequency')

# Removing the grid cells which did not receive a plot
for spare_ax in axes[col_no+1:]:
  fig.delaxes(ax=spare_ax)

# Making the adjustments for plotting the histograms effectively
plt.tight_layout()
plt.show()

In [52]:
# Extracting the columns which are object
object_columns=list(df_train.select_dtypes(include='object').columns)

In [None]:
# Plotting the count plots of the object columns
n_cols=2
n_rows=(len(object_columns)+1)//n_cols # enough rows to hold every column, two per row
fig,axes=plt.subplots(nrows=n_rows,ncols=n_cols,figsize=(20,20))
axes=axes.flatten()

# Drawing one count plot per object column
for col_no,each_col in enumerate(object_columns):
  current_ax=axes[col_no]
  sns.countplot(data=df_train,x=each_col,ax=current_ax)
  current_ax.set_title(f'Count plot for {each_col}')

# Removing the grid cells which did not receive a plot
for spare_ax in axes[col_no+1:]:
  fig.delaxes(ax=spare_ax)

# Arranging the layout as per the need
plt.tight_layout()
plt.show()

In [None]:
# Checking the basic info about df_train once again
df_train.info()

In [55]:
# Making a dummy dataframe from the object columns
df_dummies=pd.get_dummies(data=df_train,columns=object_columns,drop_first=True)

# Changing the boolean columns into 1/0 for correlation matrix creation
bool_col=df_dummies.select_dtypes(include='bool').columns
df_dummies[bool_col]=df_dummies[bool_col].astype(int)

# Making the premium amount as the first column for easier visualization & inference creation
dummy_columns=['Premium Amount']+[col for col in df_dummies.columns if col!='Premium Amount']
df_dummies=df_dummies[dummy_columns]

In [56]:
# Creating a correlation matrix
corr_mat=df_dummies.corr()

In [None]:
# Create a heat map
plt.figure(figsize=(30,30))
sns.heatmap(data=corr_mat,cmap='coolwarm',annot=True,fmt='.2f')
plt.show()

In [None]:
# I decided to check the basic info of the all columns in the df_dummies --> I decided to drop the dummy columns i.e.columns with datatype = int 32
df_dummies.info()

In [59]:
# Getting all the columns from the df_dummies which were not created by the one hot encoding
# The dummy columns are identified by name (bool_col, captured before the cast to int) rather than
# by the int32 dtype: the width produced by astype(int) depends on the platform and NumPy version,
# so on a setup where it yields int64 a dtype filter would not exclude any dummy column
col_int_32=df_dummies[bool_col]
col_not_int_32=[col for col in df_dummies.columns if col not in col_int_32]

In [60]:
# Making a correlation matrix of the columns which are not obtained by making dummies
corr_mat_reduced=df_dummies[col_not_int_32].corr()

In [None]:
# Plotting the simplified heatmap 
plt.figure(figsize=(10,5))
sns.heatmap(data=corr_mat_reduced,cmap='coolwarm',annot=True,fmt='.3f')
plt.show()

### Starting the training phase of the ML algorithm - By Using K Fold CV & Grid Search

In [62]:
# Extracting the independent (X) & dependent (y) variables
X_train=df_train[[col for col in df_train.columns if col!='Premium Amount']]
y_train=df_train['Premium Amount']

In [63]:
# Changing the object columns to category type for using them in the CatBoost algorithm
X_train=object_to_category(X_train)

In [64]:
# Making a manual function to calculate Root mean squared logarithmic error ( RMSLE )

def rmsle(y_actual,y_predicted):

  """
  Root Mean Squared Logarithmic Error (RMSLE) between the actual and the predicted values.

  Both inputs go through log1p, their difference is squared and averaged,
  and the square root of that mean is returned.
  """
  log_gap=np.log1p(y_actual)-np.log1p(y_predicted)
  mean_squared_log_gap=np.mean(log_gap**2)
  return np.sqrt(mean_squared_log_gap)

In [65]:
# Creating a custom scorer for calculating the RMSLE
rmsle_scorer=make_scorer(rmsle,greater_is_better=False)

In [66]:
# Setting up K-Fold cross validation
kf=KFold(n_splits=3,shuffle=True,random_state=42)

In [67]:
# Defining hyperparameter grids for all regression algorithms

rf_params_grid={
  'n_estimators':[50,100],
  'max_depth':[3,5],
  'min_samples_split':[3,5]
}


xgb_params_grid={
  'n_estimators':[100,200],
  'max_depth':[3,5],
  'learning_rate':[0.01,0.1]
}


cat_params_grid={
  'iterations':[50,75],
  'depth':[2,5],
  'learning_rate':[0.05,0.1]
}


In [68]:
# Making a list of the categorical features
cat_features_list=[col for col in X_train.select_dtypes(include='category').columns ]

In [69]:
# Extracting the numerical columns
num_features_list=[col for col in X_train.columns if col not in cat_features_list]

In [70]:
# Making a dataframe of the dummies from the categorical columns
X_train_dummies=pd.get_dummies(data=X_train,columns=cat_features_list,drop_first=True)

In [71]:
# Now checking the columns from the X_train_dummies which are introduced due to one hot encoding & changing their datatype from boolean to integer
final_dummy_columns=[col for col in X_train_dummies.columns if col not in num_features_list]
X_train_dummies[final_dummy_columns]=X_train_dummies[final_dummy_columns].astype(int)

In [None]:
# Starting the model training by using the Random Forest Regressor
# The one hot encoded frame (X_train_dummies) is used here, unlike the XGBoost / CatBoost cells
# which are fitted on X_train with category columns
rf_model=RandomForestRegressor(random_state=42)
rf_model_grid_search=GridSearchCV(estimator=rf_model,param_grid=rf_params_grid,cv=kf,scoring=rmsle_scorer,verbose=0)
rf_model_grid_search.fit(X=X_train_dummies,y=y_train)

# Checking the params which provide the best score and also checking the best scores for the random forest
# NOTE: rmsle_scorer is built with greater_is_better=False, so best_score_ is the NEGATED RMSLE
# (a value closer to 0 is better); flip the sign to read it as an error
print(f'The params which provide the best score for the Random Forest are {rf_model_grid_search.best_params_}')
print(f'The best scores obtained are {rf_model_grid_search.best_score_}')

In [None]:
# Starting the model training by using the XGBoost Regressor
# X_train keeps its category columns; enable_categorical=True lets XGBoost consume them directly
xgb_model=XGBRegressor(random_state=42,tree_method='hist',enable_categorical=True)
xgb_model_grid_search=GridSearchCV(estimator=xgb_model,param_grid=xgb_params_grid,cv=kf,scoring=rmsle_scorer,verbose=0)
xgb_model_grid_search.fit(X=X_train,y=y_train)

# Checking the params which provide the best score & also checking the best scores
# NOTE: rmsle_scorer is built with greater_is_better=False, so best_score_ is the NEGATED RMSLE
# (a value closer to 0 is better); flip the sign to read it as an error
print(f' The params which provide the best score for the XGBoost are {xgb_model_grid_search.best_params_}')
print(f' The best scores obtained are {xgb_model_grid_search.best_score_}') 

In [None]:
# Starting by using CAT Boost Regressor --> As CATBoost does not need any prior preprocessing for categorical features
# verbose=0 silences the per-iteration training log, which would otherwise be printed for every
# fit of the grid search (each parameter combination x each fold) and bury the results below
cat_model=CatBoostRegressor(random_state=42,cat_features=cat_features_list,verbose=0)
cat_model_grid_search=GridSearchCV(estimator=cat_model,param_grid=cat_params_grid,cv=kf,scoring=rmsle_scorer,verbose=0)
cat_model_grid_search.fit(X=X_train,y=y_train)

# Checking the params which provide the best scores & also checking the best scores
# NOTE: rmsle_scorer is built with greater_is_better=False, so best_score_ is the NEGATED RMSLE
# (a value closer to 0 is better); flip the sign to read it as an error
print(f'The best params for CatBoost are: {cat_model_grid_search.best_params_}')
print(f'The best score for CatBoost are: {cat_model_grid_search.best_score_}')
