In [75]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,LabelEncoder
import ipywidgets

In [2]:
trainData = pd.read_csv("./train.csv")
testData = pd.read_csv("./test.csv")

In [3]:
uniqueValues = {col: trainData[col].nunique() for col in trainData.columns}
uniqueValues

{'id': 165034,
 'CustomerId': 23221,
 'Surname': 2797,
 'CreditScore': 457,
 'Geography': 3,
 'Gender': 2,
 'Age': 71,
 'Tenure': 11,
 'Balance': 30075,
 'NumOfProducts': 4,
 'HasCrCard': 2,
 'IsActiveMember': 2,
 'EstimatedSalary': 55298,
 'Exited': 2}

In [4]:
trainData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165034 entries, 0 to 165033
Data columns (total 14 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   id               165034 non-null  int64  
 1   CustomerId       165034 non-null  int64  
 2   Surname          165034 non-null  object 
 3   CreditScore      165034 non-null  int64  
 4   Geography        165034 non-null  object 
 5   Gender           165034 non-null  object 
 6   Age              165034 non-null  float64
 7   Tenure           165034 non-null  int64  
 8   Balance          165034 non-null  float64
 9   NumOfProducts    165034 non-null  int64  
 10  HasCrCard        165034 non-null  float64
 11  IsActiveMember   165034 non-null  float64
 12  EstimatedSalary  165034 non-null  float64
 13  Exited           165034 non-null  int64  
dtypes: float64(5), int64(6), object(3)
memory usage: 17.6+ MB


In [5]:
trainData.CreditScore.max() , trainData.CreditScore.min()

(850, 350)

# About the data

The CustomerId column has too many unique values and if of no use\
The Surname has new values in the test dataset that are not present in the train dataset do drop it\
We can add a feature GoodCreditScore if credit score is greater than 670\
Geography , Gender NumOfProducts can be oneHot Encoded\
The last column Exited is the target column

In [6]:
X_train = trainData.iloc[:,:-1] 
y_train = trainData.iloc[:,-1]
X_test = pd.read_csv('./test.csv')

In [7]:
# Lable encoding Surname
X_train.drop(["Surname","id","CustomerId"],inplace=True,axis=1)
X_test.drop(["Surname","id","CustomerId"],inplace=True,axis=1)

In [8]:
column_name = 'NumOfProducts'  # Replace with the name of your column
unique_values = trainData[column_name].unique()
print(f"Unique values in the '{column_name}' column:")
print(unique_values)

Unique values in the 'NumOfProducts' column:
[2 1 3 4]


In [9]:
# One hot encoding
catCols = ['Geography', 'Gender', 'NumOfProducts']

oneHot = OneHotEncoder()
X_train_encoded = oneHot.fit_transform(X_train[catCols])
X_test_encoded = oneHot.transform(X_test[catCols])

# Get the new column names from the one-hot encoder
newCols = oneHot.get_feature_names_out(catCols)

# Add the one-hot encoded columns to the existing DataFrames
X_train = pd.concat([X_train, pd.DataFrame(X_train_encoded.toarray(), columns=newCols, index=X_train.index)], axis=1)
X_test = pd.concat([X_test, pd.DataFrame(X_test_encoded.toarray(), columns=newCols, index=X_test.index)], axis=1)

# Drop the original categorical columns
X_train.drop(catCols, inplace=True, axis=1)
X_test.drop(catCols, inplace=True, axis=1)

In [10]:
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test),columns=X_test.columns)

In [11]:
# model = keras.Sequential([
#     layers.Dense(16,activation='relu'),
#     layers.Dropout(0.25),
#     layers.Dense(8,activation='relu'),
#     layers.Dense(4,activation='relu'),
#     layers.Dense(1,activation='sigmoid')
# ])
model = keras.Sequential([
    layers.Dense(32,activation='relu'),
    layers.Dropout(0.25),
    layers.Dense(16,activation='relu'),
    layers.Dropout(0.25),
    layers.Dense(4,activation='relu'),
    layers.Dense(1,activation='sigmoid')
])

In [12]:
from keras.metrics import AUC
model.compile( optimizer='adam',loss='binary_crossentropy',metrics=["accuracy",AUC()])

In [13]:
x,x_v,y,y_v = train_test_split(X_train_scaled,y_train) 

In [14]:
x_v.shape

(41259, 16)

In [None]:
history = model.fit(x,y,epochs=40,validation_data=(x_v,y_v))

In [None]:
y_train.dtypes

In [None]:
history_dict2 = history_dict.copy()

In [None]:
history_dict = history.history
history_dict.keys()

In [None]:
accuracy = history_dict['accuracy']
val_accuracy = history_dict['val_accuracy']
auc = history_dict['auc_3']
val_auc = history_dict['val_auc_3']
epochs = range(1,len(accuracy)+1)
plt.plot(epochs, auc, "bo", label="Training auc")
plt.plot(epochs, val_auc, "b", label="Validation auc") 
plt.legend()
plt.show()

In [None]:
plt.plot(epochs, accuracy, "ro", label="Training accuracy")
plt.plot(epochs, val_accuracy, "r", label="Validation accuracy") 
plt.legend()
plt.show()

In [None]:
predictions = model.predict(X_test_scaled)

In [None]:
predictions = predictions.reshape(-1)
results_df = pd.DataFrame(data={'id':testData.iloc[:,0], 'Exited': predictions})
results_df.head()

In [None]:
results_df.to_csv('nnSubmission.csv',index=False)

# Implementing Catboost classifier

In [25]:
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostClassifier

In [84]:
X = trainData.iloc[:,:-1] 
y = trainData.iloc[:,-1]
X_test = pd.read_csv('./test.csv')
X.drop(["Surname","id","CustomerId"],inplace=True,axis=1)
X_test.drop(["Surname","id","CustomerId"],inplace=True,axis=1)

In [60]:
X_train.dtypes

CreditScore                 int64
Geography          string[python]
Gender             string[python]
Age                       float64
Tenure                      int64
Balance                   float64
NumOfProducts               int64
HasCrCard                 float64
IsActiveMember            float64
EstimatedSalary           float64
dtype: object

In [85]:
# Using apply(str) method
# Convert float columns to string
X_train['HasCrCard'] = X_train['HasCrCard'].astype(str)
X_train['IsActiveMember'] = X_train['IsActiveMember'].astype(str)
X_train['Geography'] = X_train['Geography'].astype(str)
X_train['Gender'] = X_train['Gender'].astype(str)


# Repeat for X_val and X_test if necessary
X_val['HasCrCard'] = X_val['HasCrCard'].astype(str)
X_val['IsActiveMember'] = X_val['IsActiveMember'].astype(str)
X_val['Geography'] = X_val['Geography'].astype(str)
X_val['Gender'] = X_val['Gender'].astype(str)


X_test['HasCrCard'] = X_test['HasCrCard'].astype(str)
X_test['IsActiveMember'] = X_test['IsActiveMember'].astype(str)
X_test['Geography'] = X_test['Geography'].astype(str)
X_test['Gender'] = X_test['Gender'].astype(str)

X['HasCrCard'] = X['HasCrCard'].astype(str)
X['IsActiveMember'] = X['IsActiveMember'].astype(str)
X['Geography'] = X['Geography'].astype(str)
X['Gender'] = X['Gender'].astype(str)


In [72]:
%%time

catCols = ['Tenure','NumOfProducts','HasCrCard','IsActiveMember','Geography','Gender']
catColsIndices = [X.columns.get_loc(col) for col in catCols]
catColsIndices

params = {
    'eval_metric' : 'AUC' ,
    'verbose' : 200, 
    'cat_features' : catColsIndices,
    'early_stopping_rounds' : 50,
    'task_type' : 'GPU'
}



CPU times: total: 0 ns
Wall time: 0 ns


In [66]:
from catboost import Pool

trainPool = Pool(X_train,label=y_train,cat_features=  catColsIndices)
valPool = Pool(X_val,label = y_val,cat_features = catColsIndices) 
testPool = Pool(X_test,cat_features=catColsIndices) 

In [76]:
%%time 
cbcInit = CatBoostClassifier(iterations=5000,**params)
cbcInit.fit(trainPool, eval_set=valPool, verbose=2000, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.024282


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8733233	best: 0.8733233 (0)	total: 80ms	remaining: 6m 40s
bestTest = 0.888651669
bestIteration = 1134
Shrink model to first 1135 iterations.
CPU times: total: 33.7 s
Wall time: 56.3 s


<catboost.core.CatBoostClassifier at 0x28fb3e56750>

In [126]:
predictions = cbcInit.predict_proba(X_test)

In [127]:
predictions = predictions[:,1]

In [128]:
predictions = predictions.reshape(-1)
results_df = pd.DataFrame(data={'id':testData.iloc[:,0], 'Exited': predictions})
results_df.head()
results_df.to_csv('nnSubmission.csv',index=False)