In [None]:
from pandas import read_csv, DataFrame, crosstab
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Perceptron
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler, SMOTE, SVMSMOTE, BorderlineSMOTE, ADASYN
import joblib
import json
from os.path import exists

In [None]:
# Load the Titanic passenger data from the attached Kaggle dataset
df = read_csv(filepath_or_buffer="/kaggle/input/titanic-dataset-2/Titanic Data Set.csv")

In [None]:
# Preview the first rows; a bare last expression uses the notebook's rich table display
df.head()

In [None]:
# Preview the last rows; a bare last expression uses the notebook's rich table display
df.tail()

In [None]:
# Count fully duplicated rows in the dataset.
# Series.sum() is vectorised, unlike the builtin sum() which iterates the flags one by one.
print(df.duplicated().sum())

In [None]:
# DataFrame.info() prints its report itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
df.info()

In [None]:
# Tabulate the number of missing values per column
na_counts = (
    df.isna()
    .sum()
    .rename_axis("Column Name")
    .reset_index(name="NA Counts")
)
print(na_counts)

In [None]:
# Summary statistics of the dataset, shown as the cell output
df.describe()

In [None]:
# Show the distinct values found in each of the selected columns
for column in ("Survived", "Pclass", "Sex", "SibSp", "Parch", "Cabin", "Embarked"):
    distinct_values = df[column].unique()
    print(f"{column}:{distinct_values}", end="\n\n")

In [None]:
# Print the absolute and the relative frequency of every value in the selected columns
for column in ["Survived","Pclass","Sex","SibSp","Parch","Cabin","Embarked"]:
    counts = df[column].value_counts()
    shares = df[column].value_counts(normalize=True)
    # Separate the two tables with a newline; previously the second table's header
    # was glued onto the footer of the first one.
    print(f"{counts}\n{shares}\n")

In [None]:
# Impute missing ages with the mean age of the dataset
mean_Age = df['Age'].mean()
df = df.fillna({'Age': mean_Age})

In [None]:
# Cabin is missing for many passengers, so reduce it to a flag:
# is_Cabin is 1 when a cabin is recorded and 0 when Cabin is NaN
df['is_Cabin'] = df['Cabin'].notna().astype(int)
df = df.drop(columns='Cabin')

In [None]:
# Impute missing ports of embarkation with the most frequent port
mode_Embarked = df['Embarked'].mode()
df = df.fillna({'Embarked': mode_Embarked[0]})

In [None]:
# Re-check the number of missing values per column after the clean-up
na_counts = (
    df.isna()
    .sum()
    .rename_axis("Column Name")
    .reset_index(name="NA Counts")
)
print(na_counts)

In [None]:
# Show the distinct values found in each of the selected columns after the clean-up
for column in ("Survived", "Pclass", "Sex", "SibSp", "Parch", "is_Cabin", "Embarked"):
    distinct_values = df[column].unique()
    print(f"{column}:{distinct_values}", end="\n\n")

In [None]:
# Print the absolute and the relative frequency of every value in the selected columns
for column in ["Survived","Pclass","Sex","SibSp","Parch","is_Cabin","Embarked"]:
    counts = df[column].value_counts()
    shares = df[column].value_counts(normalize=True)
    # Separate the two tables with a newline; previously the second table's header
    # was glued onto the footer of the first one.
    print(f"{counts}\n{shares}\n")

<h1>Exploratory Data Analysis</h1>

<p>Women, children and the elderly are known to have been given priority for the lifeboats, so let us first check survival against the two corresponding parameters: gender and age. Then let us analyze survival in terms of Pclass to check whether there was also a preference to save 1st class passengers.</p>

In [None]:
# Cross-tabulate survival against gender and draw the counts as an annotated heatmap
survived_by_sex = crosstab(df['Survived'], df['Sex'])
ax = sns.heatmap(survived_by_sex, annot=True, cmap='Blues')

# Label the axes and title the figure
ax.set_xlabel('Gender')
ax.set_ylabel('Survived')
ax.set_title('Heatmap - Survived vs Gender')
plt.show()

In [None]:
 # Age distribution of non-survivors and survivors as a violin plot
violin_options = {"x": "Survived", "y": "Age", "hue": "Survived", "palette": "viridis"}
ax = sns.violinplot(data=df, **violin_options)

# Label the axes, title the figure and tilt the tick labels
ax.set_xlabel("Survived")
ax.set_ylabel("Age")
ax.set_title("Violin Plot of Survived vs Age")
plt.xticks(rotation=45)
plt.show()

In [None]:
 # Age distribution by survival outcome, split by gender, as a violin plot
violin_options = {"x": "Survived", "y": "Age", "hue": "Sex", "palette": "viridis"}
ax = sns.violinplot(data=df, **violin_options)

# Label the axes, title the figure and tilt the tick labels
ax.set_xlabel("Survived")
ax.set_ylabel("Age")
ax.set_title("Violin Plot of Survived vs Age and Gender")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Cross-tabulate survival against passenger class and draw the counts as an annotated heatmap
survived_by_class = crosstab(df['Survived'], df['Pclass'])
ax = sns.heatmap(survived_by_class, annot=True, cmap='Blues')

# Label the axes and title the figure
ax.set_xlabel('Passenger Class')
ax.set_ylabel('Survived')
ax.set_title('Heatmap - Survived vs Passenger Class')
plt.show()

In [None]:
# Cross-tabulate survival against the is_Cabin flag and draw the counts as an annotated heatmap
survived_by_cabin = crosstab(df['Survived'], df['is_Cabin'])
ax = sns.heatmap(survived_by_cabin, annot=True, cmap='Blues')

# Label the axes and title the figure
ax.set_xlabel('Cabin Passenger')
ax.set_ylabel('Survived')
ax.set_title('Heatmap - Survived vs Cabin Passenger')
plt.show()

<p>We see a clear indication that female passengers were more likely to survive. Children and the elderly were also more likely to survive, especially among males. Passengers in 1st Class were more likely to survive than passengers in 2nd Class and 3rd Class. Passengers with a cabin on record were also more likely to survive than passengers without one.</p>

<h1>Splitting the Training and Testing Data Set</h1>

In [None]:
# Features are all columns except the identifiers (PassengerId, Name, Ticket) and the target Survived
y = df["Survived"]
X = df.drop(columns=["PassengerId", "Name", "Ticket", "Survived"])

# Hold out 30% of the rows for testing (test_size=0.3, fixed seed for reproducibility).
# NOTE(review): the split size was reportedly chosen by trial and error - confirm it was
# not tuned against the test-set scores.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=4,
)

In [None]:
# Report the (rows, columns) shape of the training and testing feature sets
print("Training Data Shape", X_train.shape, sep="\n")
print("Testing Data Shape", X_test.shape, sep="\n")

In [None]:
# Columns grouped by the encoding they receive; everything else is passed through unchanged
ordinal_cols = ['Sex']
onehot_cols = ['Embarked']
numerical_cols = [
    col for col in X.columns
    if col not in ordinal_cols and col not in onehot_cols
]

# Ordinal-encode Sex, one-hot-encode Embarked and pass the numerical columns through
preprocessor = ColumnTransformer([
    ('ordinal', OrdinalEncoder(), ordinal_cols),
    ('onehot', OneHotEncoder(), onehot_cols),
    ('num', 'passthrough', numerical_cols),
])

In [None]:
# Fit the preprocessor on the modelling features. "Survived" is not assigned to any
# transformer, so the ColumnTransformer drops it from the encoded output.
encoded_features = preprocessor.fit_transform(df.drop(["PassengerId","Name","Ticket"],axis=1))

# Take the column labels from the fitted transformer instead of a hand-written list, so
# every label matches the column that produced it. The previous list labelled the third
# one-hot column "Survived" and had to overwrite it and rebuild Embarked_S afterwards.
# Names come back as "<transformer>__<feature>"; keep only the feature part.
feature_names = [name.split("__", 1)[-1] for name in preprocessor.get_feature_names_out()]
transformed_df = DataFrame(encoded_features, columns=feature_names)

# Attach the target by position so the assignment does not depend on index alignment
transformed_df["Survived"] = df["Survived"].to_numpy(dtype=float)

# Correlation of every encoded feature with Survived, strongest (by absolute value) first;
# row 0 is Survived itself and is skipped.
corr = transformed_df.corr()
norm = Normalize(vmin=-0.54, vmax=0.54)
plt.figure(figsize=(10, 7.5))
sns.heatmap(corr["Survived"].to_frame().sort_values(by="Survived", key=lambda x: x.abs(), ascending=False).iloc[1:11,:],
            annot=True, cmap = "coolwarm",norm = norm)
plt.show()

<p>The above heatmap shows the correlation of each feature with the target variable Survived, which we use as an indication of feature importance. By this analysis, gender has the highest importance, followed by passenger class, whether the passenger has a cabin ticket, and fare. The other features have relatively lower importance because of their low correlation with the target variable.</p>

<h1> KNN </h1>

In [None]:
# Reuse a previously tuned KNN pipeline when one is available, otherwise tune and save one.
# Datasets attached under /kaggle/input are read-only, so newly trained artefacts are
# written to /kaggle/working; both locations are checked when looking for a cached model.
knn_artefacts = [
    ('/kaggle/input/knn-model/scikitlearn/knn/1/knn_model.joblib', '/kaggle/input/result-json-2/knn_results.json'),
    ('/kaggle/working/knn_model.joblib', '/kaggle/working/knn_results.json'),
]
knn_save_model_path, knn_save_results_path = knn_artefacts[-1]
knn_cached = next((pair for pair in knn_artefacts if exists(pair[0])), None)

if knn_cached is not None:
    knn_model_path, knn_results_path = knn_cached
    print("Loading from file")
    # NOTE: joblib.load unpickles arbitrary objects - only load model files you trust
    knn_loaded = joblib.load(knn_model_path)
    y_pred_knn = knn_loaded.predict(X_test)
    y_pred_proba_knn = knn_loaded.predict_proba(X_test)
    if exists(knn_results_path):
        with open(knn_results_path, 'r') as json_file:
            results = json.load(json_file)
        optimal_params = results['optimal_params']
        optimal_accuracy = results['optimal_accuracy']
        print(f'Best parameters: {optimal_params}')
        print(f'Best cross-validation accuracy: {optimal_accuracy:.5f}')
    else:
        # The model can still be evaluated without the stored tuning summary
        print(f'No saved tuning results found at {knn_results_path}')
else:
    # Define the pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),  # Apply scaling to all columns after preprocessing
        ('knn', KNeighborsClassifier())
    ])

    # Define the parameter grid for GridSearchCV
    # NOTE(review): knn__p presumably only affects the 'minkowski' metric, so the other
    # metrics may be fitted twice with identical results - confirm before trimming the grid
    param_grid = {
        'knn__n_neighbors': [3, 5, 7, 9, 11],
        'knn__weights': ['uniform', 'distance'],
        'knn__metric': ['euclidean', 'manhattan', 'minkowski', 'canberra', 'braycurtis', 'chebyshev', 'cosine'],
        'knn__p': [1.5, 2.5]
    }

    # Define the GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs = -1)

    # Perform GridSearchCV to find the best parameters and fit the model
    grid_search.fit(X_train, y_train)

    # Keep the pipeline refitted with the best parameter combination
    optimal_estimator = grid_search.best_estimator_

    y_pred_knn = optimal_estimator.predict(X_test)
    y_pred_proba_knn = optimal_estimator.predict_proba(X_test)

    # Print the best parameters and cross-validation accuracy
    optimal_params = grid_search.best_params_
    optimal_accuracy = float(grid_search.best_score_)
    print(f'Best parameters: {optimal_params}')
    print(f'Best cross-validation accuracy: {optimal_accuracy:.5f}')

    # Store the tuning summary in the writable working directory
    results = {'optimal_params': optimal_params, 'optimal_accuracy': optimal_accuracy}
    with open(knn_save_results_path, 'w') as json_file:
        json.dump(results, json_file)

    # Save the fitted pipeline next to it
    joblib.dump(optimal_estimator, knn_save_model_path)
    print("Model trained and saved to disk.")


<h2> Model Evaluation </h2>

In [None]:
# Accuracy of the KNN predictions on the held-out test set
accuracy_knn = accuracy_score(y_test, y_pred_knn)
# ".5f" prints five decimal places; the former "5f" only set a minimum field width of 5
print(f"The accuracy of KNN model is {accuracy_knn:.5f}")

In [None]:
# Confusion matrix of the true test labels against the KNN predictions
print(confusion_matrix(y_test, y_pred_knn))

In [None]:
# Text classification report for the KNN predictions on the test set
print(classification_report(y_test,y_pred_knn))

In [None]:
# F1 score of the KNN predictions on the held-out test set
f1_knn = f1_score(y_test,y_pred_knn)
# ".5f" prints five decimal places; the former "5f" only set a minimum field width of 5
print(f"The f1 score of KNN model is {f1_knn:.5f}")

In [None]:
# ROC-AUC from the second column of the KNN predict_proba output
roc_auc_knn = roc_auc_score(y_test, y_pred_proba_knn[:,1])
# ".5f" prints five decimal places; the former "5f" only set a minimum field width of 5
print(f"The ROC-AUC score of KNN model is {roc_auc_knn:.5f}")

In [None]:
# ROC curve of the KNN model, annotated with its AUC
fpr_knn, tpr_knn, _ = roc_curve(y_test, y_pred_proba_knn[:, 1])
knn_curve_label = 'KNN (AUC = {:.5f})'.format(roc_auc_knn)
plt.plot(fpr_knn, tpr_knn, label=knn_curve_label)
plt.title('ROC Curve for KNN')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.text(0.6, 0.2, f'AUC = {roc_auc_knn:.5f}', color='navy', fontsize=12)
plt.show()

<h1>Perceptron</h1>

In [None]:
# Reuse a previously tuned Perceptron pipeline when one is available, otherwise tune and save one.
# Datasets attached under /kaggle/input are read-only, so newly trained artefacts are
# written to /kaggle/working; both locations are checked when looking for a cached model.
ppp_artefacts = [
    ('/kaggle/input/perceptron/scikitlearn/perceptron/1/ppp_model.joblib', '/kaggle/input/result-json-2/ppp_results.json'),
    ('/kaggle/working/ppp_model.joblib', '/kaggle/working/ppp_results.json'),
]
ppp_save_model_path, ppp_save_results_path = ppp_artefacts[-1]
ppp_cached = next((pair for pair in ppp_artefacts if exists(pair[0])), None)

if ppp_cached is not None:
    ppp_model_path, ppp_results_path = ppp_cached
    print("Loading from file")
    # NOTE: joblib.load unpickles arbitrary objects - only load model files you trust
    ppp_loaded = joblib.load(ppp_model_path)
    y_pred_ppp = ppp_loaded.predict(X_test)
    # Despite the name these are decision_function scores, not probabilities
    y_pred_proba_ppp = ppp_loaded.decision_function(X_test)
    if exists(ppp_results_path):
        with open(ppp_results_path, 'r') as json_file:
            results = json.load(json_file)
        optimal_params = results['optimal_params']
        optimal_accuracy = results['optimal_accuracy']
        print(f'Best parameters: {optimal_params}')
        print(f'Best cross-validation accuracy: {optimal_accuracy:.5f}')
    else:
        # The model can still be evaluated without the stored tuning summary
        print(f'No saved tuning results found at {ppp_results_path}')
else:
    # Define the pipeline
    pipeline = ImbPipeline(steps=[
        ('preprocessor', preprocessor),
        ('scaler', StandardScaler()),  # Apply scaling to all columns after preprocessing
        ('smote', BorderlineSMOTE(random_state=21)),
        ('perceptron', Perceptron(random_state = 4))
    ])

    # Define the parameter grid for GridSearchCV
    param_grid = {
        'perceptron__max_iter': [50, 100],          # Maximum number of iterations
        'perceptron__eta0': [0.1, 0.01, 0.001, 0.0001],           # Initial learning rate
    }

    # Define the GridSearchCV
    grid_search = GridSearchCV(pipeline, param_grid, cv=5)

    # Perform GridSearchCV to find the best parameters and fit the model
    grid_search.fit(X_train, y_train)

    # Keep the pipeline refitted with the best parameter combination
    optimal_estimator = grid_search.best_estimator_

    y_pred_ppp = optimal_estimator.predict(X_test)
    # Despite the name these are decision_function scores, not probabilities
    y_pred_proba_ppp = optimal_estimator.decision_function(X_test)

    # Print the best parameters and cross-validation accuracy
    optimal_params = grid_search.best_params_
    optimal_accuracy = float(grid_search.best_score_)
    print(f'Best parameters: {optimal_params}')
    print(f'Best cross-validation accuracy: {optimal_accuracy:.5f}')

    # Store the tuning summary in the writable working directory
    results = {'optimal_params': optimal_params, 'optimal_accuracy': optimal_accuracy}
    with open(ppp_save_results_path, 'w') as json_file:
        json.dump(results, json_file)

    # Save the fitted pipeline next to it
    joblib.dump(optimal_estimator, ppp_save_model_path)
    print("Model trained and saved to disk.")


<h1> Model Evaluation </h1>

In [None]:
# Accuracy of the Perceptron predictions on the held-out test set
accuracy_ppp = accuracy_score(y_test, y_pred_ppp)
# ".5f" prints five decimal places; the former "5f" only set a minimum field width of 5
print(f"The accuracy of Perceptron model is {accuracy_ppp:.5f}")

In [None]:
# Confusion matrix of the true test labels against the Perceptron predictions
print(confusion_matrix(y_test, y_pred_ppp))

In [None]:
# Text classification report for the Perceptron predictions on the test set
print(classification_report(y_test,y_pred_ppp))

In [None]:
# F1 score of the Perceptron predictions on the held-out test set
f1_ppp = f1_score(y_test,y_pred_ppp)
# ".5f" prints five decimal places; the former "5f" only set a minimum field width of 5
print(f"The f1 score of Perceptron model is {f1_ppp:.5f}")

In [None]:
# ROC-AUC computed from the Perceptron decision_function scores
roc_auc_ppp = roc_auc_score(y_test, y_pred_proba_ppp)
# ".5f" prints five decimal places; the former "5f" only set a minimum field width of 5
print(f"The ROC-AUC score of Perceptron model is {roc_auc_ppp:.5f}")

In [None]:
# ROC curve of the Perceptron model, annotated with its AUC
fpr_ppp, tpr_ppp, _ = roc_curve(y_test, y_pred_proba_ppp)
ppp_curve_label = 'Perceptron (AUC = {:.5f})'.format(roc_auc_ppp)
plt.plot(fpr_ppp, tpr_ppp, label=ppp_curve_label)
plt.title('ROC Curve for Perceptron Model')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.text(0.6, 0.2, f'AUC = {roc_auc_ppp:.5f}', color='navy', fontsize=12)
plt.show()

<h1> Custom Perceptron </h1>

In [None]:
class Perceptron1:
    """Perceptron classifier for binary labels 0 and 1.

    Parameters
    ----------
    eta : float
        Learning rate (between 0.0 and 1.0).
    n_iter : int
        Number of passes over the training dataset.
    random_state : int
        Random number generator seed for the random weight initialization.

    Attributes (set by ``fit``)
    ---------------------------
    w_ : 1-D array of length ``1 + n_features``; ``w_[0]`` is the bias term.
    errors_ : list with the number of weight updates made in each pass.
    """

    def __init__(self, eta=0.01, n_iter=50, random_state=1):
        self.eta = eta
        self.n_iter = n_iter
        self.random_state = random_state

    def fit(self, X, y):
        """Learn the weights from training data.

        X is a 2-D array-like of shape (n_samples, n_features) and y a 1-D
        array-like of 0/1 targets with one entry per row of X.
        Returns self. Raises ValueError when X and y differ in length.
        """
        # Coerce to arrays first: iterating a DataFrame yields column labels, not rows,
        # and a Series with a custom index must still be paired with X by position.
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            # zip() below would otherwise silently ignore the surplus samples
            raise ValueError(
                f"X and y have inconsistent lengths: {len(X)} != {len(y)}"
            )

        rgen = np.random.RandomState(self.random_state)
        self.w_ = rgen.normal(loc=0.0, scale=0.01, size=1 + X.shape[1])
        self.errors_ = []

        for _ in range(self.n_iter):
            errors = 0
            for xi, target in zip(X, y):
                # update is zero when the sample is already classified correctly
                update = self.eta * (target - self.predict(xi))
                self.w_[1:] += update * xi
                self.w_[0] += update
                errors += int(update != 0.0)
            self.errors_.append(errors)
        return self

    def net_input(self, X):
        """Return the weighted sum of the inputs plus the bias term."""
        return np.dot(np.asarray(X), self.w_[1:]) + self.w_[0]

    def predict(self, X):
        """Return the class label (1 when the net input is >= 0, else 0)."""
        return np.where(self.net_input(X) >= 0.0, 1, 0)

    def predict_proba(self, X):
        """Return the sigmoid of the net input as a score in (0, 1).

        Note that this is one value per sample, not a two-column
        probability matrix.
        """
        return self.sigmoid(self.net_input(X))

    def sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z))."""
        return 1.0 / (1.0 + np.exp(-z))

In [None]:
# Preprocess, standardise every column, then classify with the custom perceptron
pipeline1 = ImbPipeline([
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('perceptron', Perceptron1(eta=0.001, n_iter=100, random_state=10)),
])

In [None]:
# Fit the custom perceptron pipeline on the training split
pipeline1.fit(X_train, y_train)

<h1> Model Evaluation </h1>

In [None]:
# Class labels predicted by the custom perceptron pipeline for the test split
y_pred_ppp1 = pipeline1.predict(X_test)

In [None]:
# Accuracy of the custom perceptron predictions on the held-out test set
accuracy_ppp1 = accuracy_score(y_test, y_pred_ppp1)
# ".5f" prints five decimal places; the former "5f" only set a minimum field width of 5
print(f"The accuracy of Custom Perceptron model is {accuracy_ppp1:.5f}")

In [None]:
# Confusion matrix of the true test labels against the custom perceptron predictions
print(confusion_matrix(y_test, y_pred_ppp1))

In [None]:
# Text classification report for the custom perceptron predictions on the test set
print(classification_report(y_test,y_pred_ppp1))

In [None]:
# F1 score of the custom perceptron predictions on the held-out test set
f1_ppp1 = f1_score(y_test,y_pred_ppp1)
# ".5f" prints five decimal places; the former "5f" only set a minimum field width of 5
print(f"The f1 score of Custom Perceptron model is {f1_ppp1:.5f}")

In [None]:
# Scores from the custom perceptron for the test split: Perceptron1.predict_proba returns
# the sigmoid of the net input, one value per row rather than a two-column matrix
y_pred_proba_ppp1 = pipeline1.predict_proba(X_test)

In [None]:
# ROC-AUC computed from the custom perceptron scores
roc_auc_ppp1 = roc_auc_score(y_test, y_pred_proba_ppp1)
# ".5f" prints five decimal places; the former "5f" only set a minimum field width of 5
print(f"The ROC-AUC score of Custom Perceptron model is {roc_auc_ppp1:.5f}")

In [None]:
# ROC curve of the custom perceptron model, annotated with its AUC
fpr_ppp1, tpr_ppp1, _ = roc_curve(y_test, y_pred_proba_ppp1)
ppp1_curve_label = 'Custom Perceptron (AUC = {:.5f})'.format(roc_auc_ppp1)
plt.plot(fpr_ppp1, tpr_ppp1, label=ppp1_curve_label)
plt.title('ROC Curve for Custom Perceptron Model')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.text(0.6, 0.2, f'AUC = {roc_auc_ppp1:.5f}', color='navy', fontsize=12)
plt.show()

<h1>Comparison of Models</h1>

In [None]:
# One column per model, one row per evaluation metric
metric_names = ["Accuracy", "F1 Score", "ROC AUC Score"]
scores_by_model = {
    "KNN": [accuracy_knn, f1_knn, roc_auc_knn],
    "Perceptron": [accuracy_ppp, f1_ppp, roc_auc_ppp],
    "Custom Perceptron": [accuracy_ppp1, f1_ppp1, roc_auc_ppp1],
}
metrics = DataFrame(scores_by_model, index=metric_names).rename_axis('Metrics')
display(metrics)



In [None]:
# Overlay the ROC curves of the three models in one figure
roc_curves = [
    (fpr_knn, tpr_knn, 'KNN (AUC = {:.5f})'.format(roc_auc_knn), "red"),
    (fpr_ppp, tpr_ppp, 'Perceptron (AUC = {:.5f})'.format(roc_auc_ppp), "green"),
    (fpr_ppp1, tpr_ppp1, 'Custom Perceptron (AUC = {:.5f})'.format(roc_auc_ppp1), "blue"),
]
for curve_fpr, curve_tpr, curve_label, curve_color in roc_curves:
    plt.plot(curve_fpr, curve_tpr, label=curve_label, color=curve_color, alpha=0.5)
plt.title('Comparision of ROC Curve for KNN, Perceptron and Custom Perceptron')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()