# Import Libraries

In [None]:
import pandas as pd
import numpy as np
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
print("Setup Complete")

### The first thing we should do is look at our dataset and answer some questions

In [None]:
insurance_train_filepath = "/kaggle/input/playground-series-s4e7/train.csv"

# Load the training CSV into insurance_data_train, using the "id" column as the row index
insurance_data_train = pd.read_csv(insurance_train_filepath, index_col="id")

In [None]:
insurance_test_filepath = "/kaggle/input/playground-series-s4e7/test.csv"

# Load the test CSV into insurance_data_test. No index_col is passed here, so any
# "id" column stays an ordinary column (unlike the training frame).
insurance_data_test = pd.read_csv(insurance_test_filepath)

### A good way to start a project is to look at the dataset and its shape

In [None]:
# Preview the first rows of the training frame
insurance_data_train.head()

In [None]:
# (rows, columns) of the training frame
insurance_data_train.shape

In [None]:
# Column names, dtypes and non-null counts of the training frame
insurance_data_train.info()

### It seems that there are some features which are categorical

### Checking the missing values in this dataset 

In [None]:
# Count the null entries in every column of the training frame and display the counts
missing_values = insurance_data_train.isna().sum()
missing_values

### Good! There are no missing values

# Exploratory Data Analysis (EDA)

In [None]:
# List the column labels of the training frame
insurance_data_train.columns

In [None]:
# Box plot of Annual_Premium. The box is drawn along the x-axis; a box plot has no
# count axis, so no 'Frequency' y-label is set (it would mislabel an empty axis).
plt.figure(figsize=(7, 4))
sns.boxplot(x=insurance_data_train['Annual_Premium'])
plt.title('Distribution of Annual_Premium')
plt.xlabel('Annual_Premium')
plt.show()

In [None]:
# Histogram of customer Age in 10 bins, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(insurance_data_train['Age'], kde=False, bins=10, ax=ax)
ax.set_title('Distribution of Age')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Histogram of Region_Code in 12 bins, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
region_codes = insurance_data_train['Region_Code']
sns.histplot(region_codes, kde=False, bins=12, ax=ax)
ax.set(title='Distribution of Region_Code', xlabel='Region_Code', ylabel='Frequency')
plt.show()

In [None]:
# Histogram of the Vehicle_Age values, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
vehicle_ages = insurance_data_train['Vehicle_Age']
sns.histplot(vehicle_ages, kde=False, bins=10, ax=ax)
ax.set(title='Distribution of Vehicle_Age', xlabel='Vehicle_Age', ylabel='Frequency')
plt.show()

In [None]:
# Pie chart of the target's class balance.
# - The figure size is passed to plt.subplots so it applies to the figure that is
#   actually drawn (a separate plt.figure call would only create an empty extra figure).
# - Slice labels are taken from the value_counts index, so each label always matches
#   its slice regardless of which class is more frequent.
response_data = insurance_data_train['Response'].value_counts()
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(response_data, labels=response_data.index, autopct='%1.1f%%')
ax.set_title('Checking Imbalance in Training Data Or Response')
plt.show()

### It seems that the data is imbalanced 

# Feature Engineering

In [None]:
# Preview the training frame before encoding the categorical columns
insurance_data_train.head()

### Handle Categorical Data

### For Training

In [None]:
# One-hot encode Vehicle_Age in the training frame: cast the column to a categorical
# dtype, then replace it with one indicator column per category.
# Note: this rebinds insurance_data_train and removes Vehicle_Age, so the cell can
# only be run once per freshly loaded frame.
train_vehicle_age = insurance_data_train['Vehicle_Age'].astype('category')
insurance_data_train['Vehicle_Age'] = train_vehicle_age
insurance_data_train = pd.get_dummies(data=insurance_data_train, columns=['Vehicle_Age'])

### For Testing 

In [None]:
# One-hot encode Vehicle_Age in the test frame, mirroring the training-frame encoding:
# cast to a categorical dtype, then replace the column with indicator columns.
test_vehicle_age = insurance_data_test['Vehicle_Age'].astype('category')
insurance_data_test['Vehicle_Age'] = test_vehicle_age
insurance_data_test = pd.get_dummies(data=insurance_data_test, columns=['Vehicle_Age'])

In [None]:
def vehicle_damage(Vehicle_Damage):
    """Encode a Vehicle_Damage value as an integer flag.

    Returns 1 when the value equals the string 'Yes' and 0 for anything else.
    """
    return 1 if Vehicle_Damage == 'Yes' else 0

In [None]:
# Add a 0/1 Vehicle_Damages column to the training frame, then drop the text column
train_damage_flags = insurance_data_train['Vehicle_Damage'].apply(vehicle_damage)
insurance_data_train['Vehicle_Damages'] = train_damage_flags
insurance_data_train.drop(columns=['Vehicle_Damage'], inplace=True)

In [None]:
# Add a 0/1 Vehicle_Damages column to the test frame, then drop the text column
test_damage_flags = insurance_data_test['Vehicle_Damage'].apply(vehicle_damage)
insurance_data_test['Vehicle_Damages'] = test_damage_flags
insurance_data_test.drop(columns=['Vehicle_Damage'], inplace=True)

In [None]:
# Preview the training frame after encoding Vehicle_Age and Vehicle_Damage
insurance_data_train.head()

In [None]:
# One-hot encode Gender in both frames. drop_first=True drops the first category's
# indicator so only the remaining indicator column(s) are kept.
test_gender = insurance_data_test['Gender'].astype('category')
insurance_data_test['Gender'] = test_gender
insurance_data_test = pd.get_dummies(data=insurance_data_test, columns=['Gender'], drop_first=True)

train_gender = insurance_data_train['Gender'].astype('category')
insurance_data_train['Gender'] = train_gender
insurance_data_train = pd.get_dummies(data=insurance_data_train, columns=['Gender'], drop_first=True)

### Split the dataset

In [None]:
# Model input columns, in the order they are fed to the scaler and the models.
# The same list selects features from both frames so their column order always agrees.
FEATURE_COLUMNS = [
    'Age',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Annual_Premium',
    'Policy_Sales_Channel',
    'Vintage',
    'Vehicle_Damages',
    'Vehicle_Age_1-2 Year',
    'Vehicle_Age_< 1 Year',
    'Vehicle_Age_> 2 Years',
    'Gender_Male',
]

# Training features and target
X_train = insurance_data_train[FEATURE_COLUMNS]
y_train = insurance_data_train['Response']

# Test features (no target column is selected for the test frame)
X_test = insurance_data_test[FEATURE_COLUMNS]

### Using SMOTE to Handle Imbalanced Data

In [None]:
import imblearn
from imblearn.over_sampling import SMOTE

# Oversample the training data with SMOTE to counter the class imbalance.
# A fixed random_state makes the synthetic samples (and everything trained on
# them) reproducible across kernel restarts.
# NOTE(review): resampling happens before the train/validation split further down,
# so the validation split also contains synthetic rows; treat validation scores
# as optimistic.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

### Scaling the dataset

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
# Single StandardScaler instance shared by the scaling cells below
# (MinMaxScaler is imported but not used in the cells shown here).
scaler = StandardScaler()

In [None]:
# Fit the scaler on the SMOTE-resampled training features and standardise them
X_train_scaled = scaler.fit_transform(X_train_smote)
#data_scaled = scaler.fit_transform(insurance_data_train)

In [None]:
# Scale the selected test features with the scaler already fitted on the training
# features. Calling fit_transform on the whole test frame would re-fit the shared
# scaler on test data (including the id column) and discard the training statistics.
test_dataset = scaler.transform(X_test)

# Apply a neural network model

### Split the dataset

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the resampled, scaled training data for validation.
# NOTE: this rebinds X_train / X_test / y_train. From here on X_test is the
# validation split, not the competition test features selected earlier.
X_train, X_test, y_train, y_test = train_test_split(X_train_scaled, y_train_smote, 
                                                    test_size=0.3, random_state=42)

In [None]:
# Sanity check: shapes of the training and validation features and targets
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)


In [None]:
from sklearn import metrics
from sklearn.metrics import *
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.activations import linear, relu, sigmoid
import logging
# Only let ERROR-level (and above) messages through the "tensorflow" logger,
# and set AutoGraph verbosity to 0.
logging.getLogger("tensorflow").setLevel(logging.ERROR)
tf.autograph.set_verbosity(0)
print("Setup Completed!")

In [None]:
# Fully connected binary classifier: three sigmoid hidden layers (35, 25, 15 units)
# followed by a single sigmoid output unit.
# The input width is read from the training matrix instead of being hard-coded, so
# the model stays valid if the feature list changes.
model = Sequential(
    [
        tf.keras.Input(shape=(X_train.shape[1],)),  # one input per feature column
        Dense(units=35, activation='sigmoid'),  # layer-1
        Dense(units=25, activation='sigmoid'),  # layer-2
        Dense(units=15, activation='sigmoid'),  # layer-3
        Dense(units=1, activation='sigmoid'),   # output layer
    ],
    name="nn_model",
)


In [None]:
# Print the model's layer-by-layer summary
model.summary()

In [None]:
# Unpack the four Dense layers; this raises if the model does not have exactly four layers
[layer1, layer2, layer3, layer4] = model.layers

In [None]:
#### Examine Weights shapes
# Pull the (kernel, bias) pair out of every layer, keeping the W1..W4 / b1..b4 names
(W1, b1), (W2, b2), (W3, b3), (W4, b4) = (
    dense_layer.get_weights() for dense_layer in (layer1, layer2, layer3, layer4)
)

# Report the shape of each kernel and bias, one line per layer
layer_params = ((W1, b1), (W2, b2), (W3, b3), (W4, b4))
for layer_number, (kernel, bias) in enumerate(layer_params, start=1):
    print(f"W{layer_number} shape = {kernel.shape}, b{layer_number} shape = {bias.shape}")

In [None]:
# Compile with binary cross-entropy and Adam (learning rate 0.001).
# The metric is an explicit BinaryAccuracy: the capitalised string 'Accuracy' resolves
# to Keras' exact-match Accuracy metric, which compares the raw sigmoid outputs with the
# 0/1 labels for equality and therefore reports a meaningless score. Naming the metric
# 'Accuracy' keeps the history keys ('Accuracy', 'val_Accuracy') unchanged.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='Accuracy')],
)

# Train for 50 epochs in batches of 96, evaluating on the held-out split after each epoch
history = model.fit(
    X_train,
    y_train,
    batch_size=96,
    epochs=50,
    validation_data=(X_test, y_test),
)

In [None]:
# Per-epoch metrics recorded by model.fit
acc = history.history['Accuracy']
val_acc = history.history['val_Accuracy']

loss = history.history['loss']
val_loss = history.history['val_loss']

# Derive the x-axis from the number of recorded epochs rather than a hard-coded 50,
# so the plot still works if the epoch count changes or training stops early.
epochs_range = range(len(acc))

plt.figure(figsize=(8, 8))

# Left panel: accuracy curves
plt.subplot(1, 2, 1)
plt.plot(epochs_range, acc, label='Training Accuracy')
plt.plot(epochs_range, val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.title('Training and Validation Accuracy')

# Right panel: loss curves
plt.subplot(1, 2, 2)
plt.plot(epochs_range, loss, label='Training Loss')
plt.plot(epochs_range, val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.title('Training and Validation Loss')
plt.show()

In [None]:
# Rebuild X_test from the competition test frame (X_test was rebound to the
# validation split by the earlier train_test_split call).
nn_test_feature_names = [
    'Age',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Annual_Premium',
    'Policy_Sales_Channel',
    'Vintage',
    'Vehicle_Damages',
    'Vehicle_Age_1-2 Year',
    'Vehicle_Age_< 1 Year',
    'Vehicle_Age_> 2 Years',
    'Gender_Male',
]
X_test = insurance_data_test[nn_test_feature_names]

In [None]:
# Scale the test features with statistics learned from the training features only.
# Re-fitting on X_test (fit_transform) would standardise the test set with its own
# mean/variance, which differs from the scaling the network was trained on.
# A dedicated scaler is fitted on X_train_smote here so the result does not depend
# on what the shared `scaler` object was last fitted on.
train_fitted_scaler = StandardScaler().fit(X_train_smote)
X_test_scaled = train_fitted_scaler.transform(X_test)

In [None]:
# Shape of the scaled test matrix
X_test_scaled.shape

In [None]:
# Run the network on the scaled test features; the output layer is a single
# sigmoid unit, so each row yields one value.
prediction = model.predict(X_test_scaled)

In [None]:
# Inspect the raw network predictions
prediction

In [None]:
# Build the neural-network submission from the sample submission template.
submit = pd.read_csv('/kaggle/input/playground-series-s4e7/sample_submission.csv')
# model.predict returns an (n, 1) array; flatten it so the column receives a 1-D vector
submit['Response'] = prediction.ravel()
# Write to the absolute working directory, matching the XGBoost submission path.
# The previous relative './kaggle/working/...' pointed at a sub-directory of the
# current directory that is never created, so the write would fail.
submit[['id', 'Response']].to_csv('/kaggle/working/neural_network.csv', index=False)

In [None]:
"""def my_tf_round(X, decimals = 3):
    multiplier = tf.constant(10**decimals, dtype=X.dtype)
    return tf.math.round(X * multiplier) / multiplier"""

# Applying XGBoost 

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import warnings 
# Suppress all Python warnings from here on (this also hides deprecation notices)
warnings.filterwarnings("ignore")
# Seed shared by the train/validation split and the XGBoost model below
RANDOM_STATE = 55
print("Setup Completed!")

In [None]:
from sklearn.model_selection import train_test_split
# Re-split the resampled, scaled training data (70/30) with the XGBoost seed.
# NOTE: X_test is again rebound to the validation split here.
X_train, X_test, y_train, y_test = train_test_split(X_train_scaled, y_train_smote, 
                                                    test_size=0.3, random_state=RANDOM_STATE)

In [None]:
# Row position that separates the fitting portion from the early-stopping eval portion
n = int(len(X_train)*0.8) ## Let's use 80% to train and 20% to eval

In [None]:
# First n rows are used for fitting, the remaining rows for early-stopping evaluation
X_train_fit, y_train_fit = X_train[:n], y_train[:n]
X_train_eval, y_train_eval = X_train[n:], y_train[n:]


In [None]:
"""num_parallel_tree = 20"""

In [None]:
# Hyper-parameters for the gradient-boosted classifier, collected in one place.
# Training monitors AUC on the eval set and stops after 50 rounds without improvement.
xgb_params = dict(
    learning_rate=0.045,
    max_depth=40,
    subsample=0.85,
    colsample_bytree=0.4704779253343011,
    n_estimators=960,
    reg_lambda=29.23,
    min_child_weight=30,
    gamma=0,
    max_delta_step=390,
    tree_method='gpu_hist',
    eval_metric='auc',
    early_stopping_rounds=50,
    random_state=RANDOM_STATE,
)
xgb_model = XGBClassifier(**xgb_params)

# Fit on the first 80% of the training split; the remaining 20% drives early stopping
xgb_model.fit(X_train_fit, y_train_fit, eval_set=[(X_train_eval, y_train_eval)])

In [None]:
# Accuracy on the training split and on the held-out validation split
train_accuracy = accuracy_score(xgb_model.predict(X_train), y_train)
test_accuracy = accuracy_score(xgb_model.predict(X_test), y_test)
print(f"Metrics train:\n\tAccuracy score: {train_accuracy:.4f}\nMetrics test:\n\tAccuracy score: {test_accuracy:.4f}")

In [None]:
# Rebuild X_test from the competition test frame (X_test currently holds the
# validation split produced by train_test_split).
xgb_test_feature_names = [
    'Age', 'Driving_License', 'Region_Code', 'Previously_Insured',
    'Annual_Premium', 'Policy_Sales_Channel', 'Vintage', 'Vehicle_Damages',
    'Vehicle_Age_1-2 Year', 'Vehicle_Age_< 1 Year', 'Vehicle_Age_> 2 Years',
    'Gender_Male',
]
X_test = insurance_data_test[xgb_test_feature_names]

In [None]:
# Scale the test features with statistics learned from the training features only.
# Re-fitting on X_test (fit_transform) would standardise the test set with its own
# mean/variance, which differs from the scaling the XGBoost model was trained on.
# A dedicated scaler is fitted on X_train_smote here so the result does not depend
# on what the shared `scaler` object was last fitted on.
train_fitted_scaler = StandardScaler().fit(X_train_smote)
X_test_scaled = train_fitted_scaler.transform(X_test)

In [None]:
# Take column 1 of predict_proba (the second class) and reshape it to a column vector
prediction = xgb_model.predict_proba(X_test_scaled)[:,1].reshape(-1,1)

In [None]:
# Fill the sample submission template with the XGBoost predictions and save it
sample_submission_path = '/kaggle/input/playground-series-s4e7/sample_submission.csv'
submit = pd.read_csv(sample_submission_path)
submit.Response = prediction

submission_columns = ['id', 'Response']
submit[submission_columns].to_csv('/kaggle/working/xgboost.csv', index=False)

In [None]:
# Inspect the XGBoost predictions
prediction