In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

import missingno
from IPython.display import display

# Feature Engineering
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import category_encoders as ce

# Model Selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV,StratifiedKFold,train_test_split, cross_validate, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
import lightgbm as lgb
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn.metrics import roc_curve, roc_auc_score

# Imbalance Dataset
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss
from imblearn.combine import SMOTEENN

# Ignore Warning
import sys
import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")


from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler 




In [2]:
df= pd.read_csv(r"C:\Users\Adib Fardan\Downloads\data_travel_insurance.csv")
df.head()


Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Gender,Duration,Destination,Net Sales,Commision (in value),Age,Claim
0,C2B,Airlines,Online,Annual Silver Plan,F,365,SINGAPORE,216.0,54.0,57,No
1,EPX,Travel Agency,Online,Cancellation Plan,,4,MALAYSIA,10.0,0.0,33,No
2,JZI,Airlines,Online,Basic Plan,M,19,INDIA,22.0,7.7,26,No
3,EPX,Travel Agency,Online,2 way Comprehensive Plan,,20,UNITED STATES,112.0,0.0,59,No
4,C2B,Airlines,Online,Bronze Plan,M,8,SINGAPORE,16.0,4.0,28,No


In [3]:
df.shape

(44328, 11)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44328 entries, 0 to 44327
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Agency                44328 non-null  object 
 1   Agency Type           44328 non-null  object 
 2   Distribution Channel  44328 non-null  object 
 3   Product Name          44328 non-null  object 
 4   Gender                12681 non-null  object 
 5   Duration              44328 non-null  int64  
 6   Destination           44328 non-null  object 
 7   Net Sales             44328 non-null  float64
 8   Commision (in value)  44328 non-null  float64
 9   Age                   44328 non-null  int64  
 10  Claim                 44328 non-null  object 
dtypes: float64(2), int64(2), object(7)
memory usage: 3.7+ MB


# DATA CLEANING 
### First, we need to check the data, is there any duplicate, null/nan values or unique wording, if it so,  We need to clean this data first.

In [5]:
df.describe(include='all')

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Gender,Duration,Destination,Net Sales,Commision (in value),Age,Claim
count,44328,44328,44328,44328,12681,44328.0,44328,44328.0,44328.0,44328.0,44328
unique,16,2,2,26,2,,138,,,,2
top,EPX,Travel Agency,Online,Cancellation Plan,M,,SINGAPORE,,,,No
freq,24656,32113,43572,12979,6504,,9267,,,,43651
mean,,,,,,49.424292,,40.550948,9.707692,39.9256,
std,,,,,,109.153961,,48.66197,19.625637,13.954926,
min,,,,,,-1.0,,-357.5,0.0,0.0,
25%,,,,,,9.0,,18.0,0.0,35.0,
50%,,,,,,22.0,,26.5,0.0,36.0,
75%,,,,,,53.0,,48.0,11.55,43.0,


Agency, Product Name and Destination have many unique values, so we will use binary endcoding
Agency Type, Distribution Channel, Gender have 2 unique values, so we will use one hot endcoding

We will apply scaller to numeric feature, such as Duration, Net Sales, Commision and Age

In [6]:
df.rename(columns={'Commision (in value)':'Commision' }, inplace=True)
df

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Gender,Duration,Destination,Net Sales,Commision,Age,Claim
0,C2B,Airlines,Online,Annual Silver Plan,F,365,SINGAPORE,216.0,54.00,57,No
1,EPX,Travel Agency,Online,Cancellation Plan,,4,MALAYSIA,10.0,0.00,33,No
2,JZI,Airlines,Online,Basic Plan,M,19,INDIA,22.0,7.70,26,No
3,EPX,Travel Agency,Online,2 way Comprehensive Plan,,20,UNITED STATES,112.0,0.00,59,No
4,C2B,Airlines,Online,Bronze Plan,M,8,SINGAPORE,16.0,4.00,28,No
...,...,...,...,...,...,...,...,...,...,...,...
44323,EPX,Travel Agency,Online,2 way Comprehensive Plan,,14,CHINA,30.0,0.00,36,Yes
44324,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,,17,AUSTRALIA,79.2,47.52,47,No
44325,TST,Travel Agency,Offline,Travel Cruise Protect,M,64,THAILAND,30.0,10.50,50,No
44326,EPX,Travel Agency,Online,2 way Comprehensive Plan,,51,CHINA,36.0,0.00,36,No


### Find the duplicate

In [7]:
df.drop_duplicates()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Gender,Duration,Destination,Net Sales,Commision,Age,Claim
0,C2B,Airlines,Online,Annual Silver Plan,F,365,SINGAPORE,216.0,54.00,57,No
1,EPX,Travel Agency,Online,Cancellation Plan,,4,MALAYSIA,10.0,0.00,33,No
2,JZI,Airlines,Online,Basic Plan,M,19,INDIA,22.0,7.70,26,No
3,EPX,Travel Agency,Online,2 way Comprehensive Plan,,20,UNITED STATES,112.0,0.00,59,No
4,C2B,Airlines,Online,Bronze Plan,M,8,SINGAPORE,16.0,4.00,28,No
...,...,...,...,...,...,...,...,...,...,...,...
44323,EPX,Travel Agency,Online,2 way Comprehensive Plan,,14,CHINA,30.0,0.00,36,Yes
44324,CWT,Travel Agency,Online,Rental Vehicle Excess Insurance,,17,AUSTRALIA,79.2,47.52,47,No
44325,TST,Travel Agency,Offline,Travel Cruise Protect,M,64,THAILAND,30.0,10.50,50,No
44326,EPX,Travel Agency,Online,2 way Comprehensive Plan,,51,CHINA,36.0,0.00,36,No


In [8]:
df.duplicated().sum()

4667

### Cleaning the duplicate

In [9]:
df.drop_duplicates(inplace=True)

In [10]:
df.duplicated().sum()

0

### Find NaN/Null

In [11]:
df.isna()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Gender,Duration,Destination,Net Sales,Commision,Age,Claim
0,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,True,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,True,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
44323,False,False,False,False,True,False,False,False,False,False,False
44324,False,False,False,False,True,False,False,False,False,False,False
44325,False,False,False,False,False,False,False,False,False,False,False
44326,False,False,False,False,True,False,False,False,False,False,False


In [12]:
df.isna().sum()

Agency                      0
Agency Type                 0
Distribution Channel        0
Product Name                0
Gender                  27667
Duration                    0
Destination                 0
Net Sales                   0
Commision                   0
Age                         0
Claim                       0
dtype: int64

### Cleaning NaN/Null

In [13]:
df.drop('Gender', axis=1, inplace=True) 

In [14]:
df.isna().sum()

Agency                  0
Agency Type             0
Distribution Channel    0
Product Name            0
Duration                0
Destination             0
Net Sales               0
Commision               0
Age                     0
Claim                   0
dtype: int64

In [15]:
df.shape

(39661, 10)

# DATA PREP

In [16]:
from sklearn.model_selection import train_test_split 

# preprocessing 
from sklearn.preprocessing import OneHotEncoder 
from category_encoders import OrdinalEncoder, BinaryEncoder  
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler 
from sklearn.compose import ColumnTransformer 

# modeling 
from sklearn.tree import DecisionTreeRegressor 

# metric 
from sklearn.metrics import mean_squared_error 

# SPLITTING DATA

In [17]:
X = df.drop(columns='Claim')
y = df['Claim'].map({'Yes': 1, 'No': 0})

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [19]:
# X_train, X_test, y_train, y_test = train_test_split(
#     X,
#     y,
#     test_size=0.2,
#     random_state=10
# )

In [20]:
unique_values_summary = {
    'Column': [],
    'Total Unique Values': [],
    'Unique Values': []
}
pd.set_option('display.max_colwidth', None)

for column in df.columns:
    unique_vals = df[column].unique()
    unique_values_summary['Column'].append(column)
    unique_values_summary['Unique Values'].append(unique_vals)
    unique_values_summary['Total Unique Values'].append(len(unique_vals))

display(pd.DataFrame(unique_values_summary))

pd.reset_option('display.max_colwidth')

Unnamed: 0,Column,Total Unique Values,Unique Values
0,Agency,16,"[C2B, EPX, JZI, CWT, LWC, ART, CSR, SSI, RAB, KML, TST, TTW, JWT, ADM, CCR, CBH]"
1,Agency Type,2,"[Airlines, Travel Agency]"
2,Distribution Channel,2,"[Online, Offline]"
3,Product Name,26,"[Annual Silver Plan, Cancellation Plan, Basic Plan, 2 way Comprehensive Plan, Bronze Plan, 1 way Comprehensive Plan, Rental Vehicle Excess Insurance, Single Trip Travel Protect Gold, Silver Plan, Value Plan, 24 Protect, Annual Travel Protect Gold, Comprehensive Plan, Ticket Protector, Travel Cruise Protect, Single Trip Travel Protect Silver, Individual Comprehensive Plan, Gold Plan, Annual Gold Plan, Child Comprehensive Plan, Premier Plan, Annual Travel Protect Silver, Single Trip Travel Protect Platinum, Annual Travel Protect Platinum, Spouse or Parents Comprehensive Plan, Travel Cruise Protect Family]"
4,Duration,437,"[365, 4, 19, 20, 8, 2, 25, 90, 24, 43, 103, 376, 6, 16, 27, 34, 3, 14, 12, 31, 5, 7, 56, 29, 30, 58, 9, 38, 42, 11, 13, 40, 79, 370, 366, 26, 83, 55, 18, 100, 1, 35, 36, 75, 23, 73, 32, 232, 37, 10, 4736, 15, 17, 99, 173, 74, 82, 66, 156, 39, 95, 98, 111, 430, 53, 77, 87, 70, 151, 45, 54, 114, 118, 62, 28, 112, 326, 22, 59, 44, 21, 97, 63, 60, 51, 180, 160, 368, 65, 93, 71, 57, 0, 76, 69, 110, 146, 116, 129, 84, ...]"
5,Destination,138,"[SINGAPORE, MALAYSIA, INDIA, UNITED STATES, KOREA, REPUBLIC OF, THAILAND, GERMANY, JAPAN, INDONESIA, VIET NAM, AUSTRALIA, FINLAND, UNITED KINGDOM, SRI LANKA, SPAIN, HONG KONG, MACAO, CHINA, UNITED ARAB EMIRATES, IRAN, ISLAMIC REPUBLIC OF, TAIWAN, PROVINCE OF CHINA, POLAND, CANADA, OMAN, PHILIPPINES, GREECE, BELGIUM, TURKEY, BRUNEI DARUSSALAM, DENMARK, SWITZERLAND, NETHERLANDS, SWEDEN, MYANMAR, KENYA, CZECH REPUBLIC, FRANCE, RUSSIAN FEDERATION, PAKISTAN, ARGENTINA, TANZANIA, UNITED REPUBLIC OF, SERBIA, ITALY, CROATIA, NEW ZEALAND, PERU, MONGOLIA, CAMBODIA, QATAR, NORWAY, LUXEMBOURG, MALTA, LAO PEOPLE'S DEMOCRATIC REPUBLIC, ISRAEL, SAUDI ARABIA, AUSTRIA, PORTUGAL, NEPAL, UKRAINE, ESTONIA, ICELAND, BRAZIL, MEXICO, CAYMAN ISLANDS, PANAMA, BANGLADESH, TURKMENISTAN, BAHRAIN, KAZAKHSTAN, TUNISIA, IRELAND, ETHIOPIA, NORTHERN MARIANA ISLANDS, MALDIVES, SOUTH AFRICA, VENEZUELA, COSTA RICA, JORDAN, MALI, CYPRUS, MAURITIUS, LEBANON, KUWAIT, AZERBAIJAN, HUNGARY, BHUTAN, BELARUS, MOROCCO, ECUADOR, UZBEKISTAN, CHILE, FIJI, PAPUA NEW GUINEA, ANGOLA, FRENCH POLYNESIA, NIGERIA, MACEDONIA, THE FORMER YUGOSLAV REPUBLIC OF, NAMIBIA, GEORGIA, COLOMBIA, ...]"
6,Net Sales,1006,"[216.0, 10.0, 22.0, 112.0, 16.0, 50.0, 78.0, 80.0, 40.0, 26.0, 252.85, 47.0, 25.0, 23.0, 20.0, 30.0, 42.0, 56.0, 19.8, 14.0, 29.7, 36.0, 31.0, 58.0, 9.9, 15.0, 21.0, 29.0, 32.0, 125.0, 48.0, 0.0, 21.04, 33.0, 204.6, 34.0, 30.55, 35.0, 49.5, 18.0, 98.0, 77.0, 69.3, 64.0, 0.32, 227.7, 48.5, -216.75, 24.0, 161.0, 99.0, 59.4, 45.0, 37.0, 44.0, 19.0, 75.0, 39.0, 17.55, -59.4, 51.0, -29.7, 80.25, 63.0, 12.0, 72.0, 53.0, 38.0, 17.0, 54.0, 69.0, 115.0, 76.3, 19.5, 29.5, 62.0, 11.0, 3.73, 79.0, 138.6, 22.5, 59.0, 5.59, 33.5, 53.25, 62.25, 93.0, -12.0, 1.03, 27.0, 56.5, 39.6, 28.0, 41.0, 97.0, 128.0, 43.0, 66.0, 26.5, 0.19, ...]"
7,Commision,915,"[54.0, 0.0, 7.7, 4.0, 9.1, 63.21, 10.5, 11.88, 17.82, 23.4, 7.5, 5.94, 43.75, 23.76, 7.36, 132.99, 9.57, 7.64, 12.25, 29.7, 6.3, 5.63, 41.58, 5.25, 0.09, 6.0, 136.62, 31.53, 54.19, 11.55, 10.15, 40.25, 59.4, 35.64, 15.75, 12.95, 28.5, 9.75, 4.39, 20.06, 18.0, 4.25, 19.07, 4.88, 7.38, 15.5, 1.05, 83.16, 14.7, 15.4, 1.57, 8.38, 13.31, 15.56, 0.29, 8.0, 36.73, 13.65, 20.3, 5.0, 10.0, 6.63, 0.05, 24.15, 29.05, 8.88, 14.0, 3.38, 15.88, 0.45, 10.89, 47.52, 0.37, 27.25, 18.13, 15.6, 70.2, 12.4, 72.94, 8.63, 4.63, 6.75, 11.25, 0.96, 53.46, 77.22, 1.13, 4.94, 1.73, 10.38, 58.45, 23.5, 6.94, 6.13, 41.42, 17.5, 13.16, 0.14, 3.23, 15.93, ...]"
8,Age,89,"[57, 33, 26, 59, 28, 36, 22, 39, 27, 48, 43, 51, 65, 47, 35, 54, 23, 46, 58, 42, 31, 29, 37, 68, 8, 32, 61, 49, 56, 25, 50, 34, 38, 44, 118, 45, 53, 20, 30, 41, 69, 52, 40, 70, 76, 74, 66, 63, 55, 21, 60, 75, 24, 62, 73, 64, 71, 5, 14, 72, 84, 81, 16, 19, 67, 83, 13, 79, 18, 9, 80, 17, 82, 78, 15, 77, 87, 88, 11, 12, 86, 7, 10, 2, 3, 1, 4, 85, 0]"
9,Claim,2,"[No, Yes]"


In [21]:
df['Claim'].value_counts()


Claim
No     38986
Yes      675
Name: count, dtype: int64

# MODELING

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 39661 entries, 0 to 44327
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Agency                39661 non-null  object 
 1   Agency Type           39661 non-null  object 
 2   Distribution Channel  39661 non-null  object 
 3   Product Name          39661 non-null  object 
 4   Duration              39661 non-null  int64  
 5   Destination           39661 non-null  object 
 6   Net Sales             39661 non-null  float64
 7   Commision             39661 non-null  float64
 8   Age                   39661 non-null  int64  
 9   Claim                 39661 non-null  object 
dtypes: float64(2), int64(2), object(6)
memory usage: 3.3+ MB


In [23]:
transformer = ColumnTransformer([
    ('onehot', OneHotEncoder(), ['Agency Type', 'Distribution Channel',]), 
    ('binary', BinaryEncoder(), ['Agency','Product Name', 'Destination']),
    ('robust', RobustScaler(), ['Net Sales', 'Duration','Commision','Age'])
], remainder='passthrough') 

In [24]:
transformer.fit(X_train)

# transform to X_train & X_test
X_train_prep = transformer.transform(X_train)
X_test_prep = transformer.transform(X_test)

In [25]:
X_train_prep_df = pd.DataFrame(X_train_prep)
X_train_prep_df.head() 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,-0.290323,-0.5,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.193548,3.282609,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.980645,-0.326087,3.0,0.909091
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.16129,0.478261,0.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.935484,1.5,3.0,-0.454545


In [26]:
col_onehot = list(transformer.transformers_[0][1].get_feature_names_out())
col_onehot

['Agency Type_Airlines',
 'Agency Type_Travel Agency',
 'Distribution Channel_Offline',
 'Distribution Channel_Online']

In [27]:
col_binary = list(transformer.transformers_[1][1].get_feature_names_out())
col_binary

['Agency_0',
 'Agency_1',
 'Agency_2',
 'Agency_3',
 'Agency_4',
 'Product Name_0',
 'Product Name_1',
 'Product Name_2',
 'Product Name_3',
 'Product Name_4',
 'Destination_0',
 'Destination_1',
 'Destination_2',
 'Destination_3',
 'Destination_4',
 'Destination_5',
 'Destination_6',
 'Destination_7']

In [28]:
col_robust = list(transformer.transformers_[2][2])
col_robust

['Net Sales', 'Duration', 'Commision', 'Age']

In [29]:
all_features = col_onehot + col_binary + col_robust
all_features 

['Agency Type_Airlines',
 'Agency Type_Travel Agency',
 'Distribution Channel_Offline',
 'Distribution Channel_Online',
 'Agency_0',
 'Agency_1',
 'Agency_2',
 'Agency_3',
 'Agency_4',
 'Product Name_0',
 'Product Name_1',
 'Product Name_2',
 'Product Name_3',
 'Product Name_4',
 'Destination_0',
 'Destination_1',
 'Destination_2',
 'Destination_3',
 'Destination_4',
 'Destination_5',
 'Destination_6',
 'Destination_7',
 'Net Sales',
 'Duration',
 'Commision',
 'Age']

In [30]:
print(len(X_train_prep_df.columns))
print(len(all_features)) 

26
26


In [31]:
# X_train
X_train_prep_df.columns = all_features
X_train_prep_df.head() 

Unnamed: 0,Agency Type_Airlines,Agency Type_Travel Agency,Distribution Channel_Offline,Distribution Channel_Online,Agency_0,Agency_1,Agency_2,Agency_3,Agency_4,Product Name_0,...,Destination_2,Destination_3,Destination_4,Destination_5,Destination_6,Destination_7,Net Sales,Duration,Commision,Age
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,-0.290323,-0.5,0.0,0.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.193548,3.282609,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.980645,-0.326087,3.0,0.909091
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.16129,0.478261,0.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.935484,1.5,3.0,-0.454545


In [32]:
X_test_prep_df = pd.DataFrame(data=X_test_prep, columns=all_features)
X_test_prep_df.head()

Unnamed: 0,Agency Type_Airlines,Agency Type_Travel Agency,Distribution Channel_Offline,Distribution Channel_Online,Agency_0,Agency_1,Agency_2,Agency_3,Agency_4,Product Name_0,...,Destination_2,Destination_3,Destination_4,Destination_5,Destination_6,Destination_7,Net Sales,Duration,Commision,Age
0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,-0.225806,-0.391304,0.648148,1.0
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,-1.935484,-0.043478,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.022581,-0.326087,1.5,-0.818182
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.3,0.956522,3.5,0.181818
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,-0.032258,-0.195652,0.0,0.0


In [33]:
display (X_train_prep_df)

Unnamed: 0,Agency Type_Airlines,Agency Type_Travel Agency,Distribution Channel_Offline,Distribution Channel_Online,Agency_0,Agency_1,Agency_2,Agency_3,Agency_4,Product Name_0,...,Destination_2,Destination_3,Destination_4,Destination_5,Destination_6,Destination_7,Net Sales,Duration,Commision,Age
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,-0.290323,-0.500000,0.000000,0.000000
1,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.193548,3.282609,0.000000,0.000000
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.980645,-0.326087,3.000000,0.909091
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.161290,0.478261,0.000000,0.000000
4,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,-0.935484,1.500000,3.000000,-0.454545
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31723,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.193548,1.521739,1.031145,-0.181818
31724,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,2.896774,0.521739,6.000000,-0.818182
31725,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,2.677419,0.456522,0.000000,0.363636
31726,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,-0.225806,1.304348,0.648148,2.000000


In [34]:
from sklearn.preprocessing import LabelEncoder
# le = LabelEncoder()
# y_train = le.fit_transform(y_train)

In [35]:
display (y_train)

7222     0
36472    0
34186    0
24775    0
31270    0
        ..
36662    0
23607    1
14327    0
1637     0
7008     0
Name: Claim, Length: 31728, dtype: int64

In [36]:

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
# define model 
model = GradientBoostingClassifier()
model.fit(X_train_prep_df, y_train)

# y_pred = model.predict(X_test)
# predict
y_pred = model.predict(X_test_prep_df)
y_pred

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [37]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder


# Preprocess 'y_test' 
y_test = y_test.replace({'Yes': 1, 'No': 0})  # Replace categorical labels with numbers

# Initialize and fit

In [38]:
print(y_test.shape)
print(y_pred.shape)

(7933,)
(7933,)


# MODEL & EVALUATION

In [39]:
logreg = LogisticRegression()
knn = KNeighborsClassifier()
dt = DecisionTreeClassifier()
rf = RandomForestClassifier()
xgb = XGBClassifier()
lgbm = lgb.LGBMClassifier()
svc= SVC(probability=True)


# MODEL BENCHMARKING: K-FOLD

In [40]:

models = [logreg, knn, dt, rf, xgb, lgbm]
model_names = ['Logistic Regression', 'KNN', 'Decision Tree', 'Random Forest', 'XGBoost', 'LightGBM']

best_recall = []

skfold = StratifiedKFold(n_splits=5)

for model, name in zip(models, model_names):
    estimator = Pipeline([
        ('preprocess', transformer),
        ('model', model)
    ])

    model_cv = cross_val_score(estimator, X_train, y_train, cv=skfold, scoring='recall')

    best_recall.append(model_cv.max())

    print(f"{name} - Recall Scores: {model_cv}, Best Recall: {model_cv.max():.4f}")

results_df = pd.DataFrame({
    'model': model_names,
    'best recall': best_recall
}).set_index('model').sort_values(by='best recall', ascending=False)

results_df


Logistic Regression - Recall Scores: [0. 0. 0. 0. 0.], Best Recall: 0.0000
KNN - Recall Scores: [nan nan nan nan nan], Best Recall: nan
Decision Tree - Recall Scores: [0.08333333 0.0462963  0.03703704 0.06481481 0.09259259], Best Recall: 0.0926
Random Forest - Recall Scores: [0.01851852 0.00925926 0.         0.00925926 0.00925926], Best Recall: 0.0185
XGBoost - Recall Scores: [0.         0.         0.         0.         0.00925926], Best Recall: 0.0093
[LightGBM] [Info] Number of positive: 432, number of negative: 24950
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001424 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 864
[LightGBM] [Info] Number of data points in the train set: 25382, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.017020 -> initscore=-4.056204
[LightGBM] [Info] Start training from scor

Unnamed: 0_level_0,best recall
model,Unnamed: 1_level_1
Decision Tree,0.092593
Random Forest,0.018519
XGBoost,0.009259
LightGBM,0.009259
Logistic Regression,0.0
KNN,


In [41]:
from sklearn.svm import SVC
from xgboost import XGBClassifier

models = [logreg,dt,rf,svc,knn,xgb]

# A dictionary containing metrics used
metrics = {'balanced_accuracy': 'balanced_accuracy',
        'precision': 'precision',
        'recall': 'recall',
        'f1': 'f1'}

accuracy_arr = []
precision_arr = []
recall_arr = []
score_f1_arr = []

for i in models:
    skfold=StratifiedKFold(n_splits=5)
    estimator=Pipeline([
        ('preprocess',transformer),
        ('model',i)])
    model_cv=cross_validate(estimator,X_train,y_train,cv=skfold,scoring=metrics)
    accuracy_arr.append(model_cv['test_balanced_accuracy'].mean())
    precision_arr.append(model_cv['test_precision'].mean())
    recall_arr.append(model_cv['test_recall'].mean())
    score_f1_arr.append(model_cv['test_f1'].mean())
    
pd.DataFrame({'models':['Logistic Regression', 'Decision Tree', 'Random Forest', 'svc', 'KNN', 'XGBoost'],
            'accuracy':accuracy_arr,
            'precision':precision_arr,
            'recall':recall_arr,
            'f1 score':score_f1_arr}).set_index('models').sort_values(by='recall',ascending=False)

Unnamed: 0_level_0,accuracy,precision,recall,f1 score
models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.521129,0.053051,0.061111,0.056711
Random Forest,0.505087,0.094093,0.012963,0.022519
XGBoost,0.500653,0.028571,0.001852,0.003478
Logistic Regression,0.499952,0.0,0.0,0.0
svc,0.5,0.0,0.0,0.0
KNN,,,,


# MODEL BENCHMARKING TEST DATA

In [42]:
# models = [logreg,dt,rf,svc,knn,xgb]
models= [logreg,dt,rf,svc,xgb]

accuracy_arr = []
precision_arr = []
recall_arr = []
score_f1_arr = []

def y_pred_func(i):
    estimator=Pipeline([
        ('preprocess',transformer),
        ('models',i)])
    X_train,X_test
    
    estimator.fit(X_train,y_train)
    return(estimator,estimator.predict(X_test),X_test)

for i,j in zip(models, ['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVC', 'KNN', 'XGBoost']):
    estimator, y_pred, X_test = y_pred_func(i)
    accuracy_arr.append(accuracy_score(y_test, y_pred))
    precision_arr.append(precision_score(y_test, y_pred))
    recall_arr.append(recall_score(y_test, y_pred))
    score_f1_arr.append(f1_score(y_test, y_pred))

pd.DataFrame({'models':['Logistic Regeression', 'Decision Tree','Random Forest','svc','XGBoost'],
            'accuracy':accuracy_arr,
            'precision':precision_arr,
            'recall':recall_arr,
            'f1 score':score_f1_arr}).set_index('models').sort_values(by='recall',ascending=False)
    
# # pd.DataFrame({'models':['Logistic Regression', 'Decision Tree', 'Random Forest', 'SVC', 'KNN', 'XGBoost'],
#             'accuracy':accuracy_arr,
#             'precision':precision_arr,
#             'recall':recall_arr,
#             'f1 score':score_f1_arr}).set_index('models').sort_values(by='recall',ascending=False)

Unnamed: 0_level_0,accuracy,precision,recall,f1 score
models,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Decision Tree,0.961679,0.048128,0.066667,0.055901
Random Forest,0.978823,0.076923,0.022222,0.034483
XGBoost,0.982604,0.2,0.007407,0.014286
Logistic Regeression,0.982982,0.0,0.0,0.0
svc,0.982982,0.0,0.0,0.0


In [43]:
print(len(X_test))
print(len(X_train))
print(len(y_test))
print(len(y_train))

7933
31728
7933
31728


# IMBALANCE

In [44]:
classifiers = [
    ('Logistic Regression', LogisticRegression(random_state=17)),
    ('Decision Tree', DecisionTreeClassifier(random_state=17)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=17)),
    ('XGBoost', XGBClassifier(eval_metric='logloss', random_state=17)),
    
]

resamplers = [
    ('None', None),
    ('RandomOverSampler', RandomOverSampler(random_state=17)),
    ('RandomUnderSampler', RandomUnderSampler(random_state=17)),
    ('SMOTE', SMOTE(random_state=17)),
    ('NearMiss', NearMiss()),
    ('SMOTEENN', SMOTEENN(random_state=17))
]

cv_results = []
test_results = []

metrics = {
    'accuracy': 'accuracy',
    'precision': 'precision',
    'recall': 'recall',
    'f1': 'f1'
}
for resample_name, resample in resamplers:
    for clf_name, clf in classifiers:
        print(f"Processing {clf_name} with {resample_name} ...")

        steps = [('preprocessor', transformer)]
        if resample:
            steps.append(('resampler', resample))
        steps.append(('classifier', clf))

        pipeline = Pipeline(steps=steps)

        try:
            # Perform cross-validation for detailed metrics
            cv_scores = cross_validate(
                pipeline, X_train, y_train, cv=5, scoring=metrics, return_train_score=False
            )

            cv_results.append({
                'Model': clf_name,
                'Resampler': resample_name,
                'CV Accuracy': cv_scores['test_accuracy'].mean(),
                'CV Precision': cv_scores['test_precision'].mean(),
                'CV Recall': cv_scores['test_recall'].mean(),
                'CV F1 Score': cv_scores['test_f1'].mean()
            })

            pipeline.fit(X_train, y_train)

            y_pred = pipeline.predict(X_test)

            test_f1 = f1_score(y_test, y_pred)
            test_accuracy = accuracy_score(y_test, y_pred)
            test_precision = precision_score(y_test, y_pred)
            test_recall = recall_score(y_test, y_pred)

            test_results.append({
                'Model': clf_name,
                'Resampler': resample_name,
                'Test Accuracy': test_accuracy,
                'Test Precision': test_precision,
                'Test Recall': test_recall,
                'Test F1 Score': test_f1
            })

        except Exception as e:
            print(f"Error processing {clf_name} with {resample_name}: {e}")
cv_results_df = pd.DataFrame(cv_results).sort_values(by='CV Recall', ascending=False)
test_results_df = pd.DataFrame(test_results).sort_values(by='Test Recall', ascending=False)

print("Cross-Validation Results:")
print(cv_results_df)
print("\nTest Results:")
print(test_results_df)

Processing Logistic Regression with None ...
Processing Decision Tree with None ...
Processing Gradient Boosting with None ...
Processing XGBoost with None ...
Processing Logistic Regression with RandomOverSampler ...
Processing Decision Tree with RandomOverSampler ...
Processing Gradient Boosting with RandomOverSampler ...
Processing XGBoost with RandomOverSampler ...
Processing Logistic Regression with RandomUnderSampler ...
Processing Decision Tree with RandomUnderSampler ...
Processing Gradient Boosting with RandomUnderSampler ...
Processing XGBoost with RandomUnderSampler ...
Processing Logistic Regression with SMOTE ...
Error processing Logistic Regression with SMOTE: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (mo

In [45]:
pd.DataFrame(cv_results).sort_values('CV Recall', ascending=False)


Unnamed: 0,Model,Resampler,CV Accuracy,CV Precision,CV Recall,CV F1 Score
10,Gradient Boosting,RandomUnderSampler,0.757437,0.049774,0.72963,0.093179
8,Logistic Regression,RandomUnderSampler,0.775434,0.05284,0.712963,0.098332
11,XGBoost,RandomUnderSampler,0.717631,0.041761,0.709259,0.078874
4,Logistic Regression,RandomOverSampler,0.791319,0.055435,0.7,0.102723
9,Decision Tree,RandomUnderSampler,0.658503,0.033572,0.685185,0.064002
6,Gradient Boosting,RandomOverSampler,0.80503,0.056695,0.668519,0.104521
7,XGBoost,RandomOverSampler,0.927351,0.076623,0.296296,0.121717
1,Decision Tree,,0.965425,0.055239,0.064815,0.059507
5,Decision Tree,RandomOverSampler,0.968387,0.063927,0.062963,0.06335
3,XGBoost,,0.982476,0.028571,0.001852,0.003478


In [46]:
pd.DataFrame(test_results).sort_values(by='Test Recall',ascending=False)

Unnamed: 0,Model,Resampler,Test Accuracy,Test Precision,Test Recall,Test F1 Score
10,Gradient Boosting,RandomUnderSampler,0.73856,0.046327,0.733333,0.087148
11,XGBoost,RandomUnderSampler,0.702004,0.040412,0.725926,0.076563
4,Logistic Regression,RandomOverSampler,0.783941,0.053198,0.696296,0.098843
6,Gradient Boosting,RandomOverSampler,0.795664,0.056153,0.696296,0.103925
8,Logistic Regression,RandomUnderSampler,0.765284,0.048615,0.688889,0.09082
9,Decision Tree,RandomUnderSampler,0.62032,0.026654,0.6,0.05104
7,XGBoost,RandomOverSampler,0.918568,0.072027,0.318519,0.117486
1,Decision Tree,,0.961175,0.04712,0.066667,0.055215
5,Decision Tree,RandomOverSampler,0.967226,0.062937,0.066667,0.064748
3,XGBoost,,0.982604,0.2,0.007407,0.014286


## from this table we can see the best model that we can use is Gradient Boosting with resampler eaither under or over sampling

# HYPERPARAMETER TUNING

In [47]:
lgbm = GradientBoostingClassifier()
ros = RandomOverSampler(random_state=42)

estimator=Pipeline([
    ('oversampling',ros),
    ('preprocess',transformer),
    ('model',lgbm)
])

In [58]:
param_grid_simple = {
    'model__n_estimators': [50, 100, 150],
    'model__learning_rate': [0.01, 0.1, 0.2],
    'model__max_depth': [3, 5, 7],
    'model__min_samples_split': [2, 10]}


In [56]:
grid = GridSearchCV(estimator, n_jobs=-1, param_grid=param_grid_simple, scoring='roc_auc', cv=StratifiedKFold(n_splits=10, shuffle=True, random_state=42))

In [57]:
y_train.value_counts()

Claim
0    31188
1      540
Name: count, dtype: int64

In [None]:
grid.fit(X_train,y_train)
print(grid.best_score_)
print(grid.best_params_)

In [None]:
best_model = grid.best_estimator_
best_model.fit(X_train, y_train)

In [None]:
estimator.fit(X_train, y_train)

In [None]:


y_pred_default = estimator.predict(X_test)
y_pred_proba_default = estimator.predict_proba(X_test)
y_pred_tuned = best_model.predict(X_test)
y_pred_proba_tuned = best_model.predict_proba(X_test)

roc_auc_default = roc_auc_score(y_test, y_pred_proba_default[:,1])
roc_auc_tuned = roc_auc_score(y_test, y_pred_proba_tuned[:,1])

print('ROC AUC Score Default GBM : ', roc_auc_default)
print('ROC AUC Score Tuned GBM : ', roc_auc_tuned)

ROC AUC Score Default GBM :  0.8042724571614637
ROC AUC Score Tuned GBM :  0.8195535023810493


### Setelah kita melakukan tuning, terdapat peningkatan score ROC AUC GBM  dari 0.80 menjadi 0.81. Biarpun peningkatannya kecil namun, score ini masuk dalam kategori bagus mengingat nilainya diatas 0.70

In [None]:
report_default = classification_report(y_test, y_pred_default)
report_tuned = classification_report(y_test, y_pred_tuned)

print('Classification Report Default GBM : \n', report_default)
print('Classification Report Tuned GBM : \n', report_tuned)

Classification Report Default GBM : 
               precision    recall  f1-score   support

           0       0.99      0.82      0.90      7775
           1       0.07      0.65      0.13       158

    accuracy                           0.82      7933
   macro avg       0.53      0.74      0.51      7933
weighted avg       0.97      0.82      0.88      7933

Classification Report Tuned GBM : 
               precision    recall  f1-score   support

           0       0.99      0.83      0.91      7775
           1       0.08      0.67      0.14       158

    accuracy                           0.83      7933
   macro avg       0.53      0.75      0.52      7933
weighted avg       0.97      0.83      0.89      7933



In [None]:
coef1 = pd.Series(best_model['model'].feature_importances_, transformer.get_feature_names()).sort_values(ascending = False).head(10)
coef1.plot(kind='barh', title='Feature Importances')
plt.show()

AttributeError: 'ColumnTransformer' object has no attribute 'get_feature_names'

In [54]:
cm = confusion_matrix(y_test, y_pred)
cm

array([[5471, 2327],
       [  37,   98]], dtype=int64)

In [None]:
print('Classification Report Tuned LGBM : \n', report_tuned)

Classification Report Tuned LGBM : 
               precision    recall  f1-score   support

           0       0.99      0.83      0.91      7775
           1       0.08      0.67      0.14       158

    accuracy                           0.83      7933
   macro avg       0.53      0.75      0.52      7933
weighted avg       0.97      0.83      0.89      7933



# EDA

In [None]:
for i in df.select_dtypes('O'):
     print(f'{i} punya : {df[i].nunique()} unique values')

Agency punya : 16 unique values
Agency Type punya : 2 unique values
Distribution Channel punya : 2 unique values
Product Name punya : 26 unique values
Destination punya : 138 unique values
Claim punya : 2 unique values


In [None]:
df['Agency Type'].nunique()
df.value_counts()

Agency  Agency Type    Distribution Channel  Product Name                          Duration  Destination        Net Sales  Commision  Age  Claim
JWT     Airlines       Online                Value Plan                            20        INDIA              31.0       12.40      118  No       3
                                                                                   9         INDIA              31.0       12.40      118  No       3
CCR     Travel Agency  Offline               Comprehensive Plan                    4         MALAYSIA           29.0       9.57       118  No       3
RAB     Airlines       Online                Value Plan                            6         BRUNEI DARUSSALAM  15.0       6.00       30   No       3
JWT     Airlines       Online                Value Plan                            19        INDIA              39.0       15.60      118  No       2
                                                                                                         

In [None]:
df['Product Name'].mode()

0    Basic Plan
Name: Product Name, dtype: object

In [None]:
df['Product Name'].value_counts()

Product Name
Basic Plan                              3653
Bronze Plan                             2718
Silver Plan                             1504
Value Plan                              1465
Annual Silver Plan                       897
Travel Cruise Protect                    340
Comprehensive Plan                       244
Gold Plan                                236
24 Protect                               169
Annual Gold Plan                         138
Single Trip Travel Protect Gold          134
Single Trip Travel Protect Silver        116
Premier Plan                             107
Annual Travel Protect Gold                63
Annual Travel Protect Silver              59
Single Trip Travel Protect Platinum       47
Individual Comprehensive Plan             46
Annual Travel Protect Platinum            34
Spouse or Parents Comprehensive Plan      13
Child Comprehensive Plan                   7
Rental Vehicle Excess Insurance            3
Travel Cruise Protect Family              

In [None]:
df.head()

Unnamed: 0,Agency,Agency Type,Distribution Channel,Product Name,Gender,Duration,Destination,Net Sales,Commision,Age,Claim
0,C2B,Airlines,Online,Annual Silver Plan,F,365,SINGAPORE,216.0,54.0,57,No
2,JZI,Airlines,Online,Basic Plan,M,19,INDIA,22.0,7.7,26,No
4,C2B,Airlines,Online,Bronze Plan,M,8,SINGAPORE,16.0,4.0,28,No
8,JZI,Airlines,Online,Basic Plan,M,4,THAILAND,22.0,7.7,39,No
11,C2B,Airlines,Online,Annual Silver Plan,M,365,SINGAPORE,216.0,54.0,27,No
