In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve

In [None]:
# Load the training CSV; the path is relative to the notebook's working directory.
df=pd.read_csv('travel_insurance/train.csv')

In [None]:
# Display the loaded training frame for a quick visual check.
df

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove the row identifier so it is not fed to the models as a feature.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the column is already gone
# no longer raises KeyError.
df = df.drop(columns=['Customer Id'], errors='ignore')

In [None]:
# Integer-encode the categorical columns (target included): every distinct
# value in a column is replaced by its pandas category code.
cat_column = [
    'Employment Type',
    'GraduateOrNot',
    'FrequentFlyer',
    'EverTravelledAbroad',
    'TravelInsurance',
]
for column_name in cat_column:
    df[column_name] = df[column_name].astype('category').cat.codes

In [None]:
# Pairwise correlation matrix of all columns.
a = df.corr()
# Correlation of every column with the target, in ascending order.
a.loc[:, 'TravelInsurance'].sort_values()

In [None]:
# Separate the target column from the feature columns.
y = df['TravelInsurance']
x = df.drop(columns='TravelInsurance')

# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=54,
)


GridSearch 

In [None]:
# Grid search over several classifier families.
#
# For every (estimator, grid) pair a 3-fold stratified GridSearchCV is fitted
# on the training split, the refitted best estimator is scored on the held-out
# test split, and all scores are collected into `cv_results` and drawn as a
# heatmap. `cv_result` and `best_estimators` are reused by later cells.

classifier = [DecisionTreeClassifier(),
              SVC(),
              RandomForestClassifier(),
              LogisticRegression(),
              KNeighborsClassifier(),
              SGDClassifier()
              ]

dt_param_grid = {"min_samples_split": range(10, 100, 10),
                 "max_depth": range(1, 10)}

svc_param_grid = {"kernel": ["rbf", 'poly'],
                  "gamma": [0.001, 0.1, 1],
                  "C": [0.1, 1, 10],
                  'degree': [2, 3]}

rf_param_grid = {
    # max_features may not exceed the number of input columns, so the upper
    # candidate is capped at the actual feature count.
    "max_features": sorted({1, min(10, X_train.shape[1])}),
    "min_samples_split": [2, 10],
    "min_samples_leaf": [1, 10],
    "bootstrap": [False],
    "n_estimators": [100],
    # was 'antropy' (typo): every candidate using it failed to fit
    "criterion": ["gini", "entropy"],
}

# The default lbfgs solver does not support the l1 penalty, so l1 gets its own
# sub-grid with a solver that does; the remaining penalties keep the default.
# NOTE(review): recent scikit-learn releases spell "no penalty" as None rather
# than "none" -- confirm against the installed version.
logreg_c_values = np.logspace(-2, 2)
logreg_param_grid = [
    {"C": logreg_c_values, "penalty": ["l1"], "solver": ["liblinear"]},
    {"C": logreg_c_values, "penalty": ["l2", "none"]},
]

# linspace truncated to int yields repeated values; deduplicate so the same
# n_neighbors is not cross-validated several times.
knn_param_grid = {"n_neighbors": sorted(set(np.linspace(2, 20, dtype=int).tolist())),
                  "metric": ["manhattan"],
                  "leaf_size": [1, 5, 10]}

# NOTE(review): newer scikit-learn renamed loss "log" to "log_loss" and
# penalty "none" to None -- confirm against the installed version.
sgdc_param_grid = {
    "loss": ["hinge", "log"],
    "alpha": [0.001, 0.01, 0.1],
    "penalty": ["l2", "l1", "none"]}

# Same order as `classifier`; the two lists are consumed pairwise below.
classifier_param = [dt_param_grid,
                    svc_param_grid,
                    rf_param_grid,
                    logreg_param_grid,
                    knn_param_grid,
                    sgdc_param_grid
                    ]

cv_result = []            # best mean cross-validated accuracy per model
best_estimators = []      # refitted best estimator per model
mean_squared_errors = []  # the lists below are all measured on the test split
recall_scores = []
precision_scores = []
f1_scores = []
roc_auc = []

for model, param_grid in zip(classifier, classifier_param):
    print("---------------------------------------------------------------------------")
    clf = GridSearchCV(model,
                       param_grid=param_grid,
                       cv=StratifiedKFold(n_splits=3),
                       scoring="accuracy",
                       n_jobs=-1, verbose=2)
    clf.fit(X_train, y_train)

    # Predict once and reuse the result for every test-set metric.
    y_test_pred = clf.predict(X_test)

    cv_result.append(clf.best_score_)
    mean_squared_errors.append(mean_squared_error(y_test, y_test_pred))
    # ROC AUC is computed from hard class predictions, not from scores.
    roc_auc.append(roc_auc_score(y_test, y_test_pred))
    recall_scores.append(recall_score(y_test, y_test_pred, average='weighted'))
    precision_scores.append(precision_score(y_test, y_test_pred, average='weighted'))
    f1_scores.append(f1_score(y_test, y_test_pred, average='weighted'))
    best_estimators.append(clf.best_estimator_)

    print("Model: {}".format(model))
    print("Accuracy: %{}".format(round(cv_result[-1]*100, 2)))
    print("MSE: {}".format(mean_squared_errors[-1]))
    print('roc score:{}'.format(roc_auc[-1]))
    print("Recall: {}".format(recall_scores[-1]))
    print("Precision: {}".format(precision_scores[-1]))
    print("F1-Score: {}".format(f1_scores[-1]))
    print("Best Estimator: {}".format(clf.best_estimator_))

print("---------------------------------------------------------------------------")
sns.set_style("darkgrid")

# Row labels are derived from the estimators themselves so they cannot get out
# of step with the `classifier` list.
model_names = [type(model).__name__ for model in classifier]
cv_results = pd.DataFrame({"Accuracy": cv_result,
                           "MSE": mean_squared_errors,
                           "Recall": recall_scores,
                           "Precision": precision_scores,
                           "F1-Score": f1_scores,
                           'roc': roc_auc},
                          index=pd.Index(model_names, name="Models"))

f, ax = plt.subplots(figsize=(14, 10))

sns.heatmap(cv_results, annot=True, cmap="Blues", fmt='.3f',
            ax=ax, linewidths=5, cbar=False,
            annot_kws={"size": 18})

plt.xticks(size=18)
plt.yticks(size=18, rotation=0)
plt.ylabel("Models")
plt.title("Grid Search Results", size=16)
plt.show()
    

In [None]:
# Horizontal bar chart of the best cross-validated accuracy of each model.
sns.set_style("darkgrid")
cv_results = pd.DataFrame({"Cross Validation Means": cv_result,
                           "Models": ["DecisionTreeClassifier", "SVC",
                                      "RandomForestClassifier",
                                      "LogisticRegression",
                                      "KNeighborsClassifier",
                                      "SGDClassifier"
                                      ]})

plt.figure(figsize=(10, 6))
# x/y are passed by keyword: recent seaborn releases no longer accept them
# positionally, while keywords work on old and new versions alike.
sns.barplot(x="Cross Validation Means", y="Models",
            data=cv_results, palette="Set1")
plt.xlabel("Mean Accuracy",
           size=12)
plt.yticks(size=14)
plt.title("Cross Validation Scores",
          size=12)
plt.show()

Scaling

In [None]:
# Feature scaling. Each scaler is fitted on the training split only and the
# learned parameters are then applied to the test split, so no test-set
# statistics leak into the transform.

# Min-max scaling, kept under its own names so it is not overwritten below.
scale_min_max = MinMaxScaler()
x_train_minmax = scale_min_max.fit_transform(X_train)
x_test_minmax = scale_min_max.transform(X_test)

# Standardisation; `x_train_scale` / `x_test_scale` are what later cells use.
scale_standard = StandardScaler()
x_train_scale = scale_standard.fit_transform(X_train)
# transform(), not fit_transform(): re-fitting on the test split would scale
# it with different statistics than the training data.
x_test_scale = scale_standard.transform(X_test)

In [None]:
# Show the best estimator found by the grid search for each model.
best_estimators

RandomForestClassifier

In [None]:
# Random forest trained on the standardised features and evaluated on the
# standardised test split.
RFmodel = RandomForestClassifier(
    min_samples_leaf=10,
    max_features=1,
    bootstrap=False,
)
RFmodel.fit(x_train_scale, y_train)

y_pred_rf = RFmodel.predict(x_test_scale)

rf_report = classification_report(y_test, y_pred_rf)
print(rf_report)

In [None]:
# Learning curve for the random forest: 5-fold cross-validated RMSE at 40
# training-set sizes between 1% and 100% of the training split.
train_sizes, train_scores, valid_scores = learning_curve(
    RFmodel, X_train, y_train,
    train_sizes=np.linspace(0.01, 1.0, 40),
    cv=5,
    scoring="neg_root_mean_squared_error",
)

# The scorer returns negated RMSE; flip the sign and average over the folds.
train_errors = -train_scores.mean(axis=1)
valid_errors = -valid_scores.mean(axis=1)

plt.plot(train_sizes, train_errors, "r-+", linewidth=2, label="train")
plt.plot(train_sizes, valid_errors, "b-", linewidth=3, label="valid")
plt.xlabel("Training set size")
plt.ylabel("RMSE")
plt.title("Learning curve: RandomForestClassifier")
# The label= arguments above are only shown once a legend is drawn.
plt.legend()
plt.show()

DecisionTreeClassifier

In [None]:
# Shallow decision tree fitted on the unscaled training features
# (fit() returns the estimator itself, so construction and fitting are chained).
DTmodel = DecisionTreeClassifier(min_samples_split=4, max_depth=3).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = DTmodel.predict(X_test)
dt_report = classification_report(y_test, y_pred)
print(dt_report)

In [None]:
# Same kind of shallow decision tree, this time fitted on the scaled features
# (fit() returns the estimator itself, so construction and fitting are chained).
DTmodel_scale = DecisionTreeClassifier(min_samples_split=8, max_depth=3).fit(
    x_train_scale, y_train
)

# Evaluate on the scaled held-out split.
y_pred_scale = DTmodel_scale.predict(x_test_scale)
dt_scale_report = classification_report(y_test, y_pred_scale)
print(dt_scale_report)

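Scaling was not effective. (This is expected for a decision tree: it splits on feature thresholds, so rescaling the inputs does not change which splits are available.)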

In [None]:
# Learning curve for the decision tree: 5-fold cross-validated RMSE at 40
# training-set sizes between 1% and 100% of the training split.
train_sizes, train_scores, valid_scores = learning_curve(
    DTmodel, X_train, y_train,
    train_sizes=np.linspace(0.01, 1.0, 40),
    cv=5,
    scoring="neg_root_mean_squared_error",
)

# The scorer returns negated RMSE; flip the sign and average over the folds.
train_errors = -train_scores.mean(axis=1)
valid_errors = -valid_scores.mean(axis=1)

plt.plot(train_sizes, train_errors, "r-+", linewidth=2, label="train")
plt.plot(train_sizes, valid_errors, "b-", linewidth=3, label="valid")
plt.xlabel("Training set size")
plt.ylabel("RMSE")
plt.title("Learning curve: DecisionTreeClassifier")
# The label= arguments above are only shown once a legend is drawn.
plt.legend()
plt.show()

SVC

In [None]:
# Support-vector classifier with probability estimates enabled, fitted on the
# unscaled training features (fit() returns the estimator itself).
svc = SVC(probability=True, gamma=0.1, C=1).fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred_svc = svc.predict(X_test)
svc_report = classification_report(y_test, y_pred_svc)
print(svc_report)

Voting classifier (soft voting over weak classifiers)

In [None]:
# Soft-voting ensemble of a logistic regression and an SVC. Each member and
# the ensemble are fitted in turn and their test accuracy is printed.
svc = SVC(probability=True, gamma=0.1, C=1)
lr = LogisticRegression(C=0.01)
votting_clf = VotingClassifier(
    estimators=[('lr', lr), ('svc', svc)],
    voting='soft',
)

for clf in [svc, lr, votting_clf]:
    # fit() returns the estimator, so fitting and predicting can be chained.
    y_pred_vot = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred_vot))

BaggingClassifier

In [None]:
# Bagging ensemble: 500 shallow trees, each fitted on a bootstrap sample of
# 100 training rows, using all available cores.
bag_base_tree = DecisionTreeClassifier(min_samples_split=10, max_depth=3)
bag_clf = BaggingClassifier(
    bag_base_tree,
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
)
bag_clf.fit(X_train, y_train)

# Test-set accuracy, shown as the cell output.
y_pred_bag = bag_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_bag)

AdaBoostClassifier

In [None]:
# AdaBoost over 200 shallow decision trees.
# NOTE(review): algorithm="SAMME.R" may not be accepted by recent
# scikit-learn releases -- confirm against the installed version.
ada_base_tree = DecisionTreeClassifier(min_samples_split=10, max_depth=3)
ada_clf = AdaBoostClassifier(
    ada_base_tree,
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5,
)

# The fitted estimator is the cell output.
ada_clf.fit(X_train, y_train)

In [None]:
# Test-set accuracy of the AdaBoost ensemble, shown as the cell output.
y_pred_ada = ada_clf.predict(X_test)
accuracy_score(y_true=y_test, y_pred=y_pred_ada)

Predictions on the test set

In [None]:
# Load the test CSV from the same folder as the training file.
df_test=pd.read_csv('travel_insurance/test.csv')

In [None]:
# Display the loaded test frame for a quick visual check.
df_test

In [None]:
# Integer-encode the categorical feature columns of the test frame.
# NOTE(review): the codes are derived from the values present in the test file
# alone; they match the training encoding only if every column holds the same
# set of distinct values in both files -- verify.
cat_column = [
    'Employment Type',
    'GraduateOrNot',
    'FrequentFlyer',
    'EverTravelledAbroad',
]
for column_name in cat_column:
    df_test[column_name] = df_test[column_name].astype('category').cat.codes

df_test

In [None]:
# Feature-only copy of the test frame; `df_test` keeps the identifier column
# for the output file.
df_test1 = df_test.drop(columns='Customer Id')
df_test1

In [None]:
# Decision-tree predictions for the test features.
y_pred_dt= DTmodel.predict(df_test1)
y_pred_dt

In [None]:
# Voting-ensemble predictions for the test features; these are the ones
# written to the output file below.
y_pred_v= votting_clf.predict(df_test1)
y_pred_v

In [None]:
# Put the ensemble predictions next to the test rows. Column 0 holds the raw
# predictions and 'prediction' is a named copy of it; column 0 is dropped by a
# later cell.
pred_df = pd.DataFrame({0: y_pred_v, 'prediction': y_pred_v})

# Column-wise concatenation aligns the two frames on their row index.
result = pd.concat([df_test, pred_df], axis=1)
result

In [None]:
# Drop every listed feature column and the unnamed raw-prediction column 0
# from the result frame.
drop_column = [
    'Employment Type',
    'Age',
    'GraduateOrNot',
    'ChronicDiseases',
    'AnnualIncome',
    'FamilyMembers',
    'FrequentFlyer',
    'EverTravelledAbroad',
    0,
]
result = result.drop(columns=drop_column)
result

In [None]:
# Write the result frame to disk without the row index.
result.to_csv('output.csv',index=False)
# Read the file back so the cell output shows what was written.
pd.read_csv('output.csv')