In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
from collections import Counter

# **1-Load Data**

In [None]:
train_df=pd.read_csv('/kaggle/input/titanic/train.csv')
test_df=pd.read_csv('/kaggle/input/titanic/test.csv')

# **2-Explore Data**

In [None]:
train_df.head(10)

In [None]:
train_df.tail()

# **Variable Description**


1.  PassengerId: unique id number to each passenger,
2. Survived: whether the passenger survived (1) or died (0),
3. Pclass: passenger class
4. Name: name
5. Sex: gender of passenger
6. Age: age of passenger
7. SibSp: number of siblings/spouses
8. Parch: number of parents/children
9. Ticket: ticket number
10. Fare: amount of money spent on ticket
11. Cabin: cabin category
12. Embarked: port where passenger embarked (C = Cherbourg, Q = Queenstown, S = Southampton)

train_df.describe()

In [None]:
train_df.info()
print('-'*50)
test_df.info()

int64:[PassengerId,Parch,Survived,Pclass,SibSp]

float64:[Age,Fare]

object:[Name,Sex,Cabin,Embarked,Ticket]

In [None]:
train_df.columns

In [None]:
train_df.shape

In [None]:
null_values=train_df.isnull().sum()
null_values[null_values>0]

In [None]:
sns.heatmap(train_df.isnull())

# **Variable Description** 

Categorical Variable: Survived, Sex, Pclass, Embarked, Cabin, Name, Ticket, SibSp and Parch

Numerical Variable: Fare, Age and PassengerId

In [None]:
def bar_plot(variable):
    """Draw a bar chart of value frequencies for one column and print the counts.

    Args:
        variable: Name of the column to look up in the notebook-level ``train_df``.
    """
    # Frequency of each distinct value, as returned by value_counts()
    counts = train_df[variable].value_counts()
    categories = counts.index

    plt.figure(figsize=(10, 5))
    plt.bar(categories, counts, color='skyblue', edgecolor='black', alpha=0.8)

    # One tick per category, labelled with the category value itself
    plt.xticks(categories, categories.values, fontsize=12, rotation=45, color='darkblue')
    plt.ylabel("Frequency", fontsize=14, color='darkblue')
    plt.title(f"Distribution of {variable}", fontsize=16, color='darkred')
    plt.grid(axis='y', linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()

    # Text version of the same counts, printed below the figure
    print(f"{variable}: \n{counts}")


In [None]:
categorical_col=["Survived","Sex","Pclass","Embarked","SibSp", "Parch"]
for col in categorical_col:
    bar_plot(col)

In [None]:
cat_col2=["Cabin", "Name", "Ticket"]
for col in cat_col2:
    print("{} \n".format(train_df[col].value_counts()))

In [None]:
def plot_hist(variable):
    """Draw a 30-bin histogram of one column of the notebook-level ``train_df``.

    Args:
        variable: Name of the column to plot; also used as the x-axis label
            and in the figure title.
    """
    # Both axis labels share the same text styling
    axis_label_style = {"fontsize": 14, "color": 'darkblue'}

    plt.figure(figsize=(10, 5))
    plt.hist(train_df[variable], bins=30, color='skyblue', edgecolor='black', alpha=0.7)
    plt.xlabel(variable, **axis_label_style)
    plt.ylabel("Frequency", **axis_label_style)
    plt.title(f"{variable} Distribution", fontsize=16, color='darkred')
    plt.grid(axis='y', linestyle='--', alpha=0.6)
    plt.tight_layout()
    plt.show()

In [None]:
numeric_col=["Fare", "Age","PassengerId"]
for col in numeric_col:
    plot_hist(col)

In [None]:
# Pclass vs Survived
train_df[["Pclass","Survived"]].groupby(["Pclass"],as_index=False).mean().sort_values(by="Survived",ascending = False)

In [None]:
#Sex vs Survived
train_df[["Sex","Survived"]].groupby(["Sex"],as_index=False).mean().sort_values(by="Survived",ascending = False)

In [None]:
#SibSp vs Survived
train_df[["SibSp","Survived"]].groupby(["SibSp"],as_index=False).mean().sort_values(by="Survived",ascending = False)

In [None]:
# Parch vs Survived
train_df[["Parch","Survived"]].groupby(["Parch"], as_index = False).mean().sort_values(by="Survived",ascending = False)

In [None]:
def detect_outliers(df, features):
    """Return the index labels of rows that are outliers in more than two features.

    For every column in ``features`` a row is flagged when its value lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``. A row is returned only when
    it is flagged for more than two of the given columns.

    Args:
        df: DataFrame holding the columns to inspect.
        features: Iterable of column names in ``df`` (numeric columns).

    Returns:
        List of index labels of ``df`` flagged in more than two columns.
    """
    outlier_indices = []

    for c in features:
        # Quartiles are computed over the non-missing values only. With plain
        # np.percentile a single NaN in the column makes both quartiles NaN,
        # every comparison below False, and the column never flags anything.
        Q1 = np.nanpercentile(df[c], 25)
        Q3 = np.nanpercentile(df[c], 75)
        IQR = Q3 - Q1
        # Distance beyond the quartiles at which a value counts as an outlier
        outlier_step = IQR * 1.5
        # Rows with a missing value compare False on both sides and are never flagged
        outlier_list_col = df[(df[c] < Q1 - outlier_step) | (df[c] > Q3 + outlier_step)].index
        outlier_indices.extend(outlier_list_col)

    # Number of columns in which each row was flagged
    outlier_counts = Counter(outlier_indices)
    # Keep rows flagged in more than two columns
    multiple_outliers = [i for i, v in outlier_counts.items() if v > 2]

    return multiple_outliers


In [None]:
train_df.loc[detect_outliers(train_df,["Age","SibSp","Parch","Fare"])]

In [None]:
#drop outliers 
train_df = train_df.drop(detect_outliers(train_df,["Age","SibSp","Parch","Fare"]),axis = 0).reset_index(drop = True)

In [None]:
train_df.info()

In [None]:
train_df.isnull().sum()

In [None]:
train_dflen=len(train_df)
train_df=pd.concat([train_df,test_df],axis=0).reset_index(drop = True)

In [None]:
train_df.head()

In [None]:
train_df.isnull().sum()

In [None]:
train_df['Embarked'].value_counts()

In [None]:
# Fill missing embarkation ports with "S" (presumably the most frequent port
# in the value_counts above - confirm). The result is assigned back because
# fillna(inplace=True) on a column selection is chained assignment: it is
# deprecated and does not update train_df under pandas Copy-on-Write.
train_df["Embarked"] = train_df["Embarked"].fillna("S")

In [None]:
train_df['Fare'].value_counts()

In [None]:
droped_Columns=['PassengerId','Name','Ticket','Cabin']
train_df=train_df.drop(droped_Columns,axis=1)

In [None]:
train_df.columns

In [None]:
grouped_age=train_df.groupby('Survived')['Age'].mean()
grouped_age

In [None]:
# Mean age of passengers with Survived == 1
survied_check=grouped_age[1]
# Fill only the survivors' missing ages with that mean. Filling the whole
# column here would leave nothing for the non-survivor mean to fill in the
# next cell. The remaining rows are handled there.
# NOTE(review): imputing a feature from the target leaks label information
# into the model inputs - confirm this is intended.
survivor_rows = train_df['Survived'] == 1
train_df.loc[survivor_rows, 'Age'] = train_df.loc[survivor_rows, 'Age'].fillna(survied_check)

In [None]:
# Mean age of passengers with Survived == 0
unsurvied_check=grouped_age[0]
# Fill the non-survivors' missing ages with that mean, assigning through .loc
# rather than chained fillna(inplace=True), which does not update train_df
# under pandas Copy-on-Write.
non_survivor_rows = train_df['Survived'] == 0
train_df.loc[non_survivor_rows, 'Age'] = train_df.loc[non_survivor_rows, 'Age'].fillna(unsurvied_check)
# Rows without a Survived label (the appended test passengers) have no
# per-outcome mean to use; any age still missing falls back to the mean of
# the ages present at this point.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())

In [None]:
train_df.head(n=10)

In [None]:
# The Fare column is right-skewed, so missing values are filled with the median.
median_fare = train_df["Fare"].median()
# Assign the result back: fillna(inplace=True) on a column selection is
# chained assignment and does not update train_df under pandas Copy-on-Write.
train_df["Fare"] = train_df["Fare"].fillna(median_fare)

In [None]:
train_df.isnull().sum()

In [None]:
plt.figure(figsize=(8, 6))
sns.countplot(data=train_df, x='Sex', hue='Survived')
plt.title('Survival Rate by Sex')
plt.xlabel('Sex')
plt.ylabel('Count')
plt.show()


In [None]:
#correlation between numeric variables
# Select only numeric columns
numeric_df = train_df.select_dtypes(include=[float, int])
# Calculate the correlation
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
plt.figure(figsize=(8, 6))
sns.countplot(data=train_df, x='Pclass', hue='Survived')
plt.title('Survival Rate by Pclass')
plt.xlabel('Pclass')
plt.ylabel('Count')
plt.show()


In [None]:
plt.figure(figsize=(8, 6))
sns.histplot(train_df['Age'], bins=20, kde=True, color='blue')
plt.title('Age Distribution')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()


In [None]:
plt.figure(figsize=(8, 6))
sns.countplot(data=train_df, x='Embarked', hue='Survived')
plt.title('Survival Rate by Embarked')
plt.xlabel('Embarked')
plt.ylabel('Count')
plt.show()


In [None]:
from sklearn.preprocessing import LabelEncoder
# A single LabelEncoder is reused for both columns. Each fit_transform call
# refits it, so after this cell `label_encoder` only holds the classes learned
# from 'Embarked'; the 'Sex' mapping is no longer recoverable from it.
label_encoder = LabelEncoder()
# Replace the string categories with the encoder's integer codes.
train_df['Sex'] = label_encoder.fit_transform(train_df['Sex'])
train_df['Embarked'] = label_encoder.fit_transform(train_df['Embarked'])

In [None]:
train_df.head()

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from sklearn.cluster import KMeans
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import IsolationForest
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import Matern
from sklearn.gaussian_process.kernels import RBF
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.mixture import GaussianMixture
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import CategoricalNB
from sklearn.naive_bayes import ComplementNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# Rows from position train_dflen onward are the test passengers appended to
# train_df by the earlier concat. Build the frame in one expression: calling
# drop(inplace=True) on a slice of train_df raises SettingWithCopyWarning.
test = train_df[train_dflen:].drop(columns=["Survived"])

In [None]:
test.head()

In [None]:
from sklearn.model_selection import train_test_split

# The first train_dflen rows of the combined frame are the labelled passengers
train = train_df.iloc[:train_dflen]

# Hold out 33% of the labelled rows for evaluation; the seed keeps the split fixed
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns="Survived"),
    train["Survived"],
    test_size=0.33,
    random_state=42,
)

# Report the size of every split, one "name length" pair per line
print(
    *(
        f"{name} {len(part)}"
        for name, part in (
            ("X_train", X_train),
            ("X_test", X_test),
            ("y_train", y_train),
            ("y_test", y_test),
            ("test", test),
        )
    ),
    sep="\n",
)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Candidate classifiers, each with its default hyperparameters.
# IsolationForest is deliberately not benchmarked here: it is an outlier
# detector whose predict() returns -1/+1, so "accuracy" against the 0/1
# Survived labels would be a meaningless number.
models = [
    LogisticRegression(),
    RidgeClassifier(),
    PassiveAggressiveClassifier(),
    Perceptron(),
    SGDClassifier(),
    SVC(),
    NuSVC(),
    LinearSVC(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    AdaBoostClassifier(),
    BaggingClassifier(),
    GaussianNB(),
    MultinomialNB(),
    ComplementNB(),
    BernoulliNB(),
    CategoricalNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    GaussianProcessClassifier(),
    MLPClassifier(),
    XGBClassifier(),
    LGBMClassifier(),
    CatBoostClassifier(silent=True)
]

for model in models:
    model_name = model.__class__.__name__
    # Best effort: a model that fails to fit or predict is reported and
    # skipped so the remaining candidates are still evaluated.
    try:
        # Hold-out accuracy on X_test after fitting on X_train
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        # Mean 5-fold cross-validated accuracy on the training split
        cross_val = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()
        print(f"{model_name} Accuracy: {accuracy:.4f} | Cross-Val Score: {cross_val:.4f}")
    except Exception as e:
        print(f"{model_name} could not be trained: {e}")

In [None]:
import optuna

# Objective function for Optuna optimization
def objective(trial, model_name):
    """Build the model selected by ``model_name`` from sampled hyperparameters and score it.

    Args:
        trial: Optuna trial used to sample hyperparameters through ``suggest_*``.
        model_name: Display name selecting the estimator; must match one of the
            branches below.

    Returns:
        Accuracy on the notebook-level hold-out split ``X_test`` / ``y_test``
        after fitting on ``X_train`` / ``y_train``.

    Raises:
        ValueError: If ``model_name`` matches none of the supported names.
    """
    # Clustering models cannot be scored with .score(); see the end of the function.
    is_clustering = False

    if model_name == 'Logistic Regression':
        C = trial.suggest_float('C', 1e-5, 1e5, log=True)
        model = LogisticRegression(C=C, max_iter=200)

    elif model_name == 'K-Nearest Neighbors':
        n_neighbors = trial.suggest_int('n_neighbors', 1, 20)
        model = KNeighborsClassifier(n_neighbors=n_neighbors)

    elif model_name == 'Decision Tree':
        max_depth = trial.suggest_int('max_depth', 1, 20)
        model = DecisionTreeClassifier(max_depth=max_depth)

    elif model_name == 'Random Forest':
        n_estimators = trial.suggest_int('n_estimators', 10, 200)
        max_depth = trial.suggest_int('max_depth', 1, 20)
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)

    elif model_name == 'SVC':
        C = trial.suggest_float('C', 1e-5, 1e5, log=True)
        gamma = trial.suggest_float('gamma', 1e-5, 1e1, log=True)
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly'])
        model = SVC(C=C, gamma=gamma, kernel=kernel)

    elif model_name == 'Naive Bayes':
        model = GaussianNB()

    elif model_name == 'Gradient Boosting':
        n_estimators = trial.suggest_int('n_estimators', 10, 200)
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e0, log=True)
        max_depth = trial.suggest_int('max_depth', 1, 10)
        model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

    elif model_name == 'Hist Gradient Boosting':
        max_iter = trial.suggest_int('max_iter', 100, 1000)
        max_depth = trial.suggest_int('max_depth', 1, 10)
        model = HistGradientBoostingClassifier(max_iter=max_iter, max_depth=max_depth)

    elif model_name == 'AdaBoost':
        n_estimators = trial.suggest_int('n_estimators', 10, 200)
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e0, log=True)
        model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)

    elif model_name == 'Bagging':
        n_estimators = trial.suggest_int('n_estimators', 10, 200)
        max_samples = trial.suggest_float('max_samples', 0.1, 1.0)
        model = BaggingClassifier(n_estimators=n_estimators, max_samples=max_samples)

    elif model_name == 'Voting':
        model = VotingClassifier(estimators=[
            ('lr', LogisticRegression(max_iter=200)),
            ('rf', RandomForestClassifier(n_estimators=100)),
            ('svc', SVC(C=1.0, kernel='linear'))
        ])

    elif model_name == 'Ridge Classifier':
        alpha = trial.suggest_float('alpha', 1e-5, 1e5, log=True)
        model = RidgeClassifier(alpha=alpha)

    elif model_name == 'Perceptron':
        alpha = trial.suggest_float('alpha', 1e-5, 1e-1, log=True)
        max_iter = trial.suggest_int('max_iter', 50, 1000)
        model = Perceptron(alpha=alpha, max_iter=max_iter)

    elif model_name == 'MLP Classifier':
        # Optuna expects categorical choices to be None/bool/int/float/str, so
        # the layer layouts are sampled by a string key and mapped to tuples.
        layer_layouts = {'50': (50,), '100': (100,), '100-100': (100, 100)}
        layout_key = trial.suggest_categorical('hidden_layer_sizes', list(layer_layouts))
        learning_rate_init = trial.suggest_float('learning_rate_init', 1e-5, 1e-1, log=True)
        model = MLPClassifier(hidden_layer_sizes=layer_layouts[layout_key], learning_rate_init=learning_rate_init)

    elif model_name == 'Quadratic Discriminant Analysis':
        model = QuadraticDiscriminantAnalysis()

    elif model_name == 'Linear Discriminant Analysis':
        model = LinearDiscriminantAnalysis()

    elif model_name == 'Calibrated Classifier CV':
        model = CalibratedClassifierCV(LogisticRegression())

    elif model_name == 'Gaussian Process':
        # Use the sampled kernel; the choice was previously ignored and RBF
        # was always used.
        kernel_name = trial.suggest_categorical('kernel', ['RBF', 'Matern'])
        kernel = RBF() if kernel_name == 'RBF' else Matern()
        model = GaussianProcessClassifier(kernel=kernel)

    elif model_name == 'KMeans':
        n_clusters = trial.suggest_int('n_clusters', 2, 5)
        model = KMeans(n_clusters=n_clusters)
        is_clustering = True

    elif model_name == 'Gaussian Mixture':
        n_components = trial.suggest_int('n_components', 2, 5)
        model = GaussianMixture(n_components=n_components)
        is_clustering = True

    elif model_name == 'XGBoost':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e0, log=True)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        model = XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

    elif model_name == 'LightGBM':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e0, log=True)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        model = LGBMClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth)

    elif model_name == 'CatBoost':
        iterations = trial.suggest_int('iterations', 50, 200)
        depth = trial.suggest_int('depth', 3, 10)
        learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e0, log=True)
        model = CatBoostClassifier(iterations=iterations, depth=depth, learning_rate=learning_rate, silent=True)

    else:
        # Fail with a clear message instead of an UnboundLocalError on `model`
        raise ValueError(f"Unknown model name: {model_name!r}")

    # Fit the model and return the accuracy score
    model.fit(X_train, y_train)
    if is_clustering:
        # For clustering models .score() is not a classification accuracy, so
        # clusters are turned into class predictions first.
        return _majority_label_accuracy(model)
    accuracy = model.score(X_test, y_test)
    return accuracy


def _majority_label_accuracy(model):
    """Score a fitted clustering model as a classifier on the hold-out split.

    Each cluster is assigned the most frequent ``y_train`` label among the
    training rows it contains; hold-out rows are predicted as the label of
    their cluster, and the fraction matching ``y_test`` is returned.
    """
    train_clusters = pd.Series(model.predict(X_train), index=y_train.index)
    cluster_to_label = y_train.groupby(train_clusters).agg(lambda labels: labels.mode().iloc[0])
    # A cluster that received no training rows falls back to the overall majority label
    fallback_label = y_train.mode().iloc[0]
    predicted = pd.Series(model.predict(X_test)).map(cluster_to_label).fillna(fallback_label)
    return float((predicted.to_numpy() == y_test.to_numpy()).mean())

# Display names understood by objective(), in the order they are tuned
model_names = [
    'Logistic Regression',
    'K-Nearest Neighbors',
    'Decision Tree',
    'Random Forest',
    'SVC',
    'Naive Bayes',
    'Gradient Boosting',
    'Hist Gradient Boosting',
    'AdaBoost',
    'Bagging',
    'Voting',
    'Ridge Classifier',
    'Perceptron',
    'MLP Classifier',
    'Quadratic Discriminant Analysis',
    'Linear Discriminant Analysis',
    'Calibrated Classifier CV',
    'Gaussian Process',
    'KMeans',
    'Gaussian Mixture',
    'XGBoost',
    'LightGBM',
    'CatBoost',
]

# Run a separate 50-trial maximisation study per model and report its best trial
for model_name in model_names:
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, model_name), n_trials=50)
    print(
        f"Best parameters for {model_name}: {study.best_params}",
        f"Best accuracy for {model_name}: {study.best_value:.4f}",
        "-" * 50,
        sep="\n",
    )
