# 1) Importing the libraries needed to explore our dataset

In [None]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # the plotting API (plt.plot, plt.title, ...) lives in pyplot, not the top-level package
import seaborn as sns

In [None]:
# Load and read the training data into F, then display it
F = pd.read_csv('/content/training_set.csv') # Train Dataset
F

In [None]:
# Load and read the test data into G, then display it
G = pd.read_csv('/content/test_set.csv') # Test Dataset
G

# 2) Basic Data Exploration


**Let's see what data types we have in our dataset**

In [None]:
# Print the DataFrame summary (DataFrame.info) to inspect the column data types
F.info()  # Training dataset

In [None]:
G.info() # Same summary for the test dataset

**Columns X56 and X57 are of int type, so we will convert them to float type.**

In [None]:
# Cast the two integer-typed columns to float in both the train and test frames
F = F.astype({column: 'float' for column in ('X56', 'X57')})
G = G.astype({column: 'float' for column in ('X56', 'X57')})

In [None]:
# Re-print the training summary to confirm the dtype conversion
F.info()

In [None]:
# Re-print the test summary to confirm the dtype conversion
G.info()

**To know the shape of dataset**

In [None]:
# Report the (rows, columns) shape of each dataset
print(f"Train Shape {F.shape}")
print(f"Test Shape {G.shape}")  # Test data has no ground truth (i.e. no output column)

**Check for null values in the dataset**

In [None]:
# Count the missing values in each column of the training set
F.isnull().sum()


In [None]:
# Count the missing values in each column of the test set
G.isnull().sum()

**There are no null values in our dataset. We will also delete the `Unnamed: 0` column from both the training and test datasets, as it is irrelevant to the given data.**

In [None]:
# Delete the 'Unnamed: 0' column from the training set.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
F = F.drop(columns=['Unnamed: 0'], errors='ignore')
F

In [None]:
# Delete the 'Unnamed: 0' column from the test set.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column is already gone.
G = G.drop(columns=['Unnamed: 0'], errors='ignore')
G

**Let's use the `describe` method, which helps us see how the numerical values are spread.**

In [None]:
# Summary statistics of the training columns
F.describe()

In [None]:
# Summary statistics of the test columns
G.describe()

# 3) Preprocessing

**From the summary we can see that the mean values of features X55, X56, and X57 are high compared to the other features, so we normalize these three columns.**

In [None]:
# Min-max normalize X55, X56 and X57 into new "Scaled*" columns of the training set
from sklearn.preprocessing import MinMaxScaler

# Fit on the training columns first, then apply the fitted scaler to them
scaler = MinMaxScaler().fit(F[["X55", "X56", "X57"]])
F[["ScaledX55", "ScaledX56", "ScaledX57"]] = scaler.transform(F[["X55", "X56", "X57"]])
F

In [None]:
# Scale the test set with the scaler that was already fitted on the training data.
# Calling fit_transform here would re-fit the scaler on the test set's own
# min/max, so the train and test features would end up on different scales.
G[["ScaledX55", "ScaledX56", "ScaledX57"]] = scaler.transform(G[["X55", "X56", "X57"]])
G

**Removing the original (unscaled) versions of the scaled features**

In [None]:
# Training frame without the raw columns that now have scaled counterparts
df = F.drop(columns=["X55", "X56", "X57"])
df

In [None]:
# Test frame without the raw columns that now have scaled counterparts
Test_Dataset = G.drop(columns=["X55", "X56", "X57"])
Test_Dataset

# 4) Training of the Model

**Assigning Train and Validation Dataset**

In [None]:
# Separate the feature matrix (every column except 'Y') from the target column
X = df.drop(columns=['Y'])
Y = df['Y']

In [None]:
# Display the feature matrix
X

In [None]:
# Display the target column
Y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    Y,
    train_size=0.80,
    random_state=42,
)

In [None]:
import tensorflow as tf

# Set TensorFlow's global random seed
tf.random.set_seed(42)

# Fully connected network: three ReLU hidden layers followed by a single
# sigmoid unit, so the output is one value in (0, 1) per row.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    loss=tf.keras.losses.binary_crossentropy,
    # `learning_rate` is the supported argument name; the old `lr` alias is
    # deprecated and rejected by newer Keras releases.
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    # The metric names below become the keys of `history.history`.
    metrics=[
        tf.keras.metrics.BinaryAccuracy(name='accuracy'),
        tf.keras.metrics.Precision(name='precision'),
        tf.keras.metrics.Recall(name='recall')
    ]
)

# Train on the training split only; per-epoch metrics are kept in `history`
history = model.fit(X_train, y_train, epochs=200)

Visualizing model performance

In [None]:
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Global figure defaults: wide canvas, top and right axis spines hidden
rcParams.update({
    'figure.figsize': (18, 8),
    'axes.spines.top': False,
    'axes.spines.right': False,
})

In [None]:
# x-axis: one point per recorded epoch. Derived from the history itself so the
# plot keeps working if the `epochs` value passed to model.fit() changes.
epoch_axis = np.arange(1, len(history.history['loss']) + 1)

# Draw every tracked training metric on the same axes
for metric_key, metric_label in [
    ('loss', 'Loss'),
    ('accuracy', 'Accuracy'),
    ('precision', 'Precision'),
    ('recall', 'Recall'),
]:
    plt.plot(epoch_axis, history.history[metric_key], label=metric_label)

plt.title('Evaluation metrics', size=20)
plt.xlabel('Epoch', size=14)
plt.legend();

In [None]:
# Model outputs for the validation features (final layer is a sigmoid)
predictions = model.predict(X_val)
predictions

In [None]:
from sklearn.metrics import auc, roc_auc_score, roc_curve


def plot_roc_curve(fpr, tpr):
    """Plot the ROC curve (tpr against fpr) together with the dashed diagonal from (0, 0) to (1, 1)."""
    plt.plot(fpr, tpr, color='orange', label='ROC')
    plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.show()


# ROC points on the validation split, and the area under them computed two ways:
# manually from (fpr, tpr) and with sklearn's dedicated function
fpr, tpr, thresholds = roc_curve(y_val, predictions)
roc_auc = auc(fpr, tpr)
print("ROC_AUC Score : ", roc_auc)
print("Function for ROC_AUC Score : ", roc_auc_score(y_val, predictions))

# Pick the threshold at which (tpr - fpr) is largest
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
print("Threshold value is:", optimal_threshold)

plot_roc_curve(fpr, tpr)

In [None]:
# Turn the model outputs into 0/1 labels: 1 when the output exceeds 0.22.
# NOTE(review): 0.22 is hard-coded; the ROC cell computes `optimal_threshold`
# from the data - confirm the two are meant to agree.
prediction_classes = (np.ravel(predictions) > 0.22).astype(int).tolist()
prediction_classes

Model evaluation on validation data

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns the predicted labels
print(confusion_matrix(y_true=y_val, y_pred=prediction_classes))

[[451  22]
 [ 20 289]]


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Score the thresholded predictions against the validation labels, two decimals each
for metric_name, metric_fn in (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
):
    print(f'{metric_name}: {metric_fn(y_val, prediction_classes):.2f}')

Accuracy: 0.95
Precision: 0.93
Recall: 0.94


Prediction on Unseen Dataset

In [None]:
# Model outputs for the unlabeled test set; this rebinds `predictions`,
# replacing the validation outputs computed earlier
predictions = model.predict(Test_Dataset)
predictions

In [None]:
# Turn the test-set outputs into 0/1 labels with the same hard-coded 0.22 cut-off
# that was applied to the validation outputs
prediction_classes = (np.ravel(predictions) > 0.22).astype(int).tolist()
prediction_classes

# Classification Algorithms

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

# Display name -> unfitted classifier, each built with its default arguments
models = {
    'Logistic Regression': LogisticRegression(),
    'Support Vector Machines': LinearSVC(),
    'Decision Trees': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Naive Bayes': GaussianNB(),
    'K-Nearest Neighbor': KNeighborsClassifier(),
}

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Per-model validation scores, keyed by the model's display name
accuracy, precision, recall = {}, {}, {}

for key, classifier in models.items():

    # Fit the classifier on the training split
    classifier.fit(X_train, y_train)

    # Predict labels for the validation split
    predictions = classifier.predict(X_val)

    # sklearn metrics take (y_true, y_pred). Accuracy is symmetric, but passing
    # the arguments the other way round swaps precision and recall.
    accuracy[key] = accuracy_score(y_val, predictions)
    precision[key] = precision_score(y_val, predictions)
    recall[key] = recall_score(y_val, predictions)

In [None]:
import pandas as pd

# One row per classifier (in `models` order), one column per metric
df_model = pd.DataFrame(
    {
        'Accuracy': list(accuracy.values()),
        'Precision': list(precision.values()),
        'Recall': list(recall.values()),
    },
    index=list(models.keys()),
)
df_model

Unnamed: 0,Accuracy,Precision,Recall
Logistic Regression,0.920716,0.889968,0.907591
Support Vector Machines,0.920716,0.886731,0.910299
Decision Trees,0.911765,0.899676,0.879747
Random Forest,0.952685,0.912621,0.965753
Naive Bayes,0.818414,0.954693,0.6974
K-Nearest Neighbor,0.911765,0.889968,0.887097
