In [None]:
import pandas as pd
import numpy  as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, GridSearchCV

In [None]:
# Load the Kaggle Titanic train and test splits into DataFrames
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv', '/kaggle/input/titanic/test.csv')
)

In [None]:
# Column names available in the training frame
train_df.columns

In [None]:
# Summary statistics of the numeric columns, rounded to 3 decimals
train_df.describe().round(3)

In [None]:
train_df.describe(include=['O']) # Summary of the object-dtype (string) columns

In [None]:
# First rows of the test frame
test_df.head()

In [None]:
# Column dtypes and non-null counts of the training frame
train_df.info()

In [None]:
#EDA
# Mean survival by passenger class
train_df.groupby(['Pclass'], as_index=False)['Survived'].mean()

In [None]:
# Mean survival by sex
train_df.groupby(['Sex'], as_index=False)['Survived'].mean()

In [None]:
# Mean survival by number of siblings / spouses aboard the Titanic	
train_df.groupby(['SibSp'], as_index=False)['Survived'].mean()

In [None]:
# Mean survival by number of parents / children aboard the Titanic
train_df.groupby(['Parch'], as_index=False)['Survived'].mean()

In [None]:
# Family size = siblings/spouses + parents/children + the passenger themself
for frame in (train_df, test_df):
    # each frame derives the feature from its own SibSp / Parch columns
    frame['Family_Size'] = frame['SibSp'] + frame['Parch'] + 1

In [None]:
# Numeric summary of the training frame, now including Family_Size
train_df.describe()

In [None]:
# Same summary for the test frame
test_df.describe()

In [None]:
# Mean survival by family size
train_df.groupby(['Family_Size'], as_index=False)['Survived'].mean()

In [None]:
# Collapse the raw family sizes into four coarse groups
family_map = {
    1: 'Alone',
    **dict.fromkeys((2, 3, 4), 'Small'),
    **dict.fromkeys((5, 6), 'Medium'),
    **dict.fromkeys((7, 8, 9, 10, 11), 'Large'),
}
# Sizes missing from the mapping come out as NaN (Series.map behaviour)
train_df['Family_Size_Grp'] = train_df['Family_Size'].map(family_map)
test_df['Family_Size_Grp'] = test_df['Family_Size'].map(family_map)

In [None]:
# Mean survival by family Groups
train_df.groupby(['Family_Size_Grp'], as_index=False)['Survived'].mean().round(3)

In [None]:
# Mean survival by Port of Embarkation	C = Cherbourg, Q = Queenstown, S = Southampton
train_df.groupby(['Embarked'], as_index=False)['Survived'].mean().round(3)

In [None]:
#Building Graphs
# Disburition of age by survival
sns.displot(train_df, x='Age', col='Survived', binwidth=10, height=5) 

In [None]:
sns.displot(train_df, x='Age', height=5)

In [None]:
#By age groups of 8 bins
train_df['Age_Cut'] = pd.qcut(train_df['Age'], 8) #splits a age variable into quantile-based bins(intervals that each contain the apprx same number of data points)
test_df['Age_Cut'] = pd.qcut(test_df['Age'], 8)

In [None]:
train_df.groupby(['Age_Cut'], as_index=False)['Survived'].mean().round(3)

In [None]:
# Reclassifying age from 0 to 7
train_df.loc[train_df['Age'] <=16, 'Age']=  0
train_df.loc[(train_df['Age'] >16) & (train_df['Age'] <=20.125), 'Age']=  1
train_df.loc[(train_df['Age'] >20.125) & (train_df['Age'] <=24.0), 'Age']=  2
train_df.loc[(train_df['Age'] >24) & (train_df['Age'] <=28), 'Age']=  3
train_df.loc[(train_df['Age'] >28) & (train_df['Age'] <=32.312), 'Age']=  4
train_df.loc[(train_df['Age'] >32.312) & (train_df['Age'] <=38), 'Age']=  5
train_df.loc[(train_df['Age'] >38) & (train_df['Age'] <=47), 'Age']=  6
train_df.loc[(train_df['Age'] >47) & (train_df['Age'] <=80), 'Age']=  7
train_df.loc[train_df['Age'] >80, 'Age']



In [None]:
train_df.head(20)

In [None]:
test_df.loc[test_df['Age'] <=16, 'Age']=  0
test_df.loc[(test_df['Age'] >16) & (test_df['Age'] <=20.125), 'Age']=  1
test_df.loc[(test_df['Age'] >20.125) & (test_df['Age'] <=24.0), 'Age']=  2
test_df.loc[(test_df['Age'] >24) & (test_df['Age'] <=28), 'Age']=  3
test_df.loc[(test_df['Age'] >28) & (test_df['Age'] <=32.312), 'Age']=  4
test_df.loc[(test_df['Age'] >32.312) & (test_df['Age'] <=38), 'Age']=  5
test_df.loc[(test_df['Age'] >38) & (test_df['Age'] <=47), 'Age']=  6
test_df.loc[(test_df['Age'] >47) & (test_df['Age'] <=80), 'Age']=  7
test_df.loc[test_df['Age'] >80, 'Age']

In [None]:
test_df.head(20)

In [None]:
# Testing set (unused alternative: the same age binning expressed with pd.cut)
#bins = [0, 16, 20.125, 24.0, 28, 32.312, 38, 47, 80]
#labels = [0, 1, 2, 3, 4, 5, 6, 7]
#test_df['Age'] = pd.cut(test_df['Age'], bins=bins, labels=labels, right=True, include_lowest=True)
#test_df['Age'] = test_df['Age'].astype(int) # Converts back to integer

In [None]:
# Disburition of Passenger fare by survival
sns.displot(train_df, x='Fare', col='Survived', binwidth=80, height=5) 

In [None]:
#By age groups of 8 bins
train_df['Fare_Cut'] = pd.qcut(train_df['Fare'], 6) #splits a age variable into quantile-based bins(intervals that each contain the apprx same number of data points)
test_df['Fare_Cut'] = pd.qcut(test_df['Fare'], 6)

In [None]:
train_df.groupby(['Fare_Cut'], as_index=False)['Survived'].mean().round(3)

In [None]:
# Reclassifying Fare from 0 to 5
train_df.loc[train_df['Fare'] <=7.775, 'Fare']=  0
train_df.loc[(train_df['Fare'] >7.775) & (train_df['Fare'] <=8.662), 'Fare']=  1
train_df.loc[(train_df['Fare'] >8.662) & (train_df['Fare'] <=14.454), 'Fare']=  2
train_df.loc[(train_df['Fare'] >14.454) & (train_df['Fare'] <=26), 'Fare']=  3
train_df.loc[(train_df['Fare'] >26) & (train_df['Fare'] <=52.369), 'Fare']=  4
train_df.loc[(train_df['Fare'] >52.369) & (train_df['Fare'] <=512.329), 'Fare']=  5
train_df.loc[train_df['Fare'] >512.329, 'Fare']


test_df.loc[test_df['Fare'] <=7.775, 'Fare']=  0
test_df.loc[(test_df['Fare'] >7.775) & (test_df['Fare'] <=8.662), 'Fare']=  1
test_df.loc[(test_df['Fare'] >8.662) & (test_df['Fare'] <=14.454), 'Fare']=  2
test_df.loc[(test_df['Fare'] >14.454) & (test_df['Fare'] <=26), 'Fare']=  3
test_df.loc[(test_df['Fare'] >26) & (test_df['Fare'] <=52.369), 'Fare']=  4
test_df.loc[(test_df['Fare'] >52.369) & (test_df['Fare'] <=512.329), 'Fare']=  5
test_df.loc[test_df['Fare'] >512.329, 'Fare']


In [None]:
train_df.head()

In [None]:
# Etracting titles from the names
train_df['Name']

In [None]:
train_df['Title'] = train_df['Name'].str.split(pat=",", expand=True)[1].str.split(pat=".", expand=True)[0].apply(lambda x: x.strip()) # lambda fn Removes any leading or trailing whitespace
test_df['Title'] = test_df['Name'].str.split(pat=",", expand=True)[1].str.split(pat=".", expand=True)[0].apply(lambda x: x.strip())

In [None]:
train_df.groupby(['Title'], as_index=False)['Survived'].mean().round(3)

In [None]:
train_df.groupby(['Title']).size()

In [None]:
# Grouping by titles 
#-> Military -- Capt,col, major
#-> Noble -- Jonhkheer, the countless, Don, Lady, Sir
#-> Unmarried Female --  mlle, ms, mme

In [None]:
train_df['Title'] = train_df['Title'].replace({
    'Capt': 'Military',
    'Col': 'Military',
    'Major': 'Military',
    'Jonkheer' : 'Noble',
    'the Countess': 'Noble',
    'Don': 'Noble',
    'Lady': 'Noble',
    'Sir': 'Noble',
    'Mlle': 'Noble',
    'Ms': 'Noble',
    'Mme': 'Noble'
    
})

test_df['Title'] = test_df['Title'].replace({
    'Capt': 'Military',
    'Col': 'Military',
    'Major': 'Military',
    'Jonkheer' : 'Noble',
    'the Countess': 'Noble',
    'Don': 'Noble',
    'Lady': 'Noble',
    'Sir': 'Noble',
    'Mlle': 'Noble',
    'Ms': 'Noble',
    'Mme': 'Noble'
    
})

In [None]:
#Grouping by title
train_df.groupby(['Title'], as_index=False)['Survived'].agg(['count', 'mean']).round(3)

In [None]:
#Checking the name length (someone important may have a longer name)
train_df['Name_Length'] = train_df['Name'].apply(lambda x: len(x))
test_df['Name_Length'] = test_df['Name'].apply(lambda x: len(x))

In [None]:
#kde plot
g = sns.kdeplot(train_df['Name_Length'][(train_df['Survived'] == 0) & (train_df['Name_Length'].notnull())], color='Red', fill=True)
g = sns.kdeplot(train_df['Name_Length'][(train_df['Survived'] == 1) & (train_df['Name_Length'].notnull())], ax=g, color='Blue', fill=True)
g.set_xlabel('Name_Length')
g.set_ylabel('Frequency')
g = g.legend(['Not Survived', 'Survived'])

In [None]:
train_df['Name_LengthGB'] = pd.qcut(train_df['Name_Length'], 8) #splits a age variable into quantile-based bins(intervals that each contain the apprx same number of data points)
test_df['Name_LengthGB'] = pd.qcut(test_df['Name_Length'], 8)

In [None]:
train_df.groupby(['Name_LengthGB'], as_index=False)['Survived'].mean().round(3)

In [None]:
# Reclassifying age from 0 to 7
train_df.loc[train_df['Name_Length'] <=18, 'Name_Size']=  0
train_df.loc[(train_df['Name_Length'] >18) & (train_df['Name_Length'] <=20), 'Name_Size']=  1
train_df.loc[(train_df['Name_Length'] >20) & (train_df['Name_Length'] <=23), 'Name_Size']=  2
train_df.loc[(train_df['Name_Length'] >23) & (train_df['Name_Length'] <=25), 'Name_Size']=  3
train_df.loc[(train_df['Name_Length'] >25) & (train_df['Name_Length'] <=27.25), 'Name_Size']=  4
train_df.loc[(train_df['Name_Length'] >27.5) & (train_df['Name_Length'] <=30), 'Name_Size']=  5
train_df.loc[(train_df['Name_Length'] >30) & (train_df['Name_Length'] <=38), 'Name_Size']=  6
train_df.loc[(train_df['Name_Length'] >38) & (train_df['Name_Length'] <=82), 'Name_Size']=  7
train_df.loc[train_df['Name_Length'] >82, 'Name_Size']

test_df.loc[test_df['Name_Length'] <=18, 'Name_Size']=  0
test_df.loc[(test_df['Name_Length'] >18) & (test_df['Name_Length'] <=20), 'Name_Size']=  1
test_df.loc[(test_df['Name_Length'] >20) & (test_df['Name_Length'] <=23), 'Name_Size']=  2
test_df.loc[(test_df['Name_Length'] >23) & (test_df['Name_Length'] <=25), 'Name_Size']=  3
test_df.loc[(test_df['Name_Length'] >25) & (test_df['Name_Length'] <=27.25), 'Name_Size']=  4
test_df.loc[(test_df['Name_Length'] >27.5) & (test_df['Name_Length'] <=30), 'Name_Size']=  5
test_df.loc[(test_df['Name_Length'] >30) & (test_df['Name_Length'] <=38), 'Name_Size']=  6
test_df.loc[(test_df['Name_Length'] >38) & (test_df['Name_Length'] <=82), 'Name_Size']=  7
test_df.loc[test_df['Name_Length'] >82, 'Name_Size']

In [None]:
train_df.head()

In [None]:
#Ticket var
train_df['Ticket']

In [None]:
# spliting up the ticket string by whitespace and selects the last part
train_df['TicketNumber'] = train_df['Ticket'].apply(lambda x: pd.Series({'Ticket': x.split()[-1]})) 
test_df['TicketNumber'] = test_df['Ticket'].apply(lambda x: pd.Series({'Ticket': x.split()[-1]})) 

In [None]:
# Grouping by 
train_df.groupby(['TicketNumber'], as_index=False)['Survived'].agg(['count', 'mean']).sort_values('count', ascending = False)

In [None]:
# Identfying how many passengers share the same ticket number (fam members could be sharing same ticket numbers)
train_df.groupby('TicketNumber')['TicketNumber'].transform('count')

In [None]:
train_df['TicketNumberCounts'] = train_df.groupby('TicketNumber')['TicketNumber'].transform('count')
test_df['TicketNumberCounts'] = test_df.groupby('TicketNumber')['TicketNumber'].transform('count')

In [None]:
train_df.groupby(['TicketNumberCounts'], as_index=False)['Survived'].agg(['mean', 'count']).sort_values('count', ascending = False)

In [None]:
train_df['Ticket']

In [None]:
#Analyzing the first section of tickets
train_df['Ticket'].str.split(pat=" ", expand=True) # split on spaces

In [None]:
#It creates a new array based on whether the second part of the 'Ticket' string (after splitting by space) exists or not.
train_df['TicketLocation'] = np.where(train_df['Ticket'].str.split(pat=" ", expand=True)[1].notna(), train_df['Ticket'].str.split(pat=" ", expand=True)[0].apply(lambda x:x.strip()), 'Blank')
test_df['TicketLocation'] = np.where(test_df['Ticket'].str.split(pat=" ", expand=True)[1].notna(), test_df['Ticket'].str.split(pat=" ", expand=True)[0].apply(lambda x:x.strip()), 'Blank')

In [None]:
#def extract_ticket_prefix(ticket):
#    parts = ticket.split()
#    if len(parts) > 1:
#        return parts[0].strip()
#    else:
#        return 'Blank'

#train_df['TicketPrefix'] = train_df['Ticket'].apply(extract_ticket_prefix)


In [None]:
train_df['TicketLocation'].value_counts()

In [None]:
# Classifying the above ticket locations
train_df['TicketLocation'] = train_df['TicketLocation'].replace({
    'SOTON/O.Q.':'SOTON/OQ',
    'C.A.':'CA',
    'CA.':'CA',
    'SC/PARIS':'SC/Paris',
    'S.C./PARIS':'SC/Paris',
    'A/4.':'A/4',
    'A/5.':'A/5',
    'A.5.':'A/5',
    'A./5.':'A/5',
    'W./C.':'W/C',    
})

test_df['TicketLocation'] = test_df['TicketLocation'].replace({
    'SOTON/O.Q.':'SOTON/OQ',
    'C.A.':'CA',
    'CA.':'CA',
    'SC/PARIS':'SC/Paris',
    'S.C./PARIS':'SC/Paris',
    'A/4.':'A/4',
    'A/5.':'A/5',
    'A.5.':'A/5',
    'A./5.':'A/5',
    'W./C.':'W/C',    
})

In [None]:
train_df.groupby(['TicketLocation'], as_index=False)['Survived'].agg(['count', 'mean'])

In [None]:
#Cabin analysis
train_df['Cabin'].unique

In [None]:
#Filling missing values in the Cabin column & then extracting the first letter of each cabin entry, which typically represents the deck on the Titanic.
train_df['Cabin'] = train_df['Cabin'].fillna('U') # U for unassigned
train_df['Cabin'] = pd.Series([i[0] if not pd.isnull(i) else 'x' for i in train_df['Cabin']])

test_df['Cabin'] = test_df['Cabin'].fillna('U')
test_df['Cabin'] = pd.Series([i[0] if not pd.isnull(i) else 'x' for i in test_df['Cabin']])

In [None]:
train_df.groupby(['Cabin'], as_index=False)['Survived'].agg(['count', 'mean'])

In [None]:
#assigns a binary value depending on whether a cabin is known or not.
train_df['Cabin_Assigned'] = train_df['Cabin'].apply(lambda x: 0 if x in  ['U'] else 1)
test_df['Cabin_Assigned'] = test_df['Cabin'].apply(lambda x: 0 if x in  ['U'] else 1)

In [None]:
train_df.groupby(['Cabin_Assigned'], as_index=False)['Survived'].agg(['count', 'mean'])

In [None]:
train_df.shape

In [None]:
test_df.shape

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
# Imputing the remaining gaps: 'Age' in both frames and 'Fare' in the test frame.
#
# * Plain assignment replaces `frame[col].fillna(..., inplace=True)`: calling an
#   inplace method on a column selection is chained assignment, which pandas warns
#   about and which does not update the frame under Copy-on-Write.
# * 'Age' and 'Fare' already hold ordinal bin codes at this point, so the most
#   frequent code is used; a mean would produce a fractional code that belongs to no bin.
# * Fill values come from the training frame only, so the test frame is not imputed
#   from its own statistics.
age_fill = train_df['Age'].mode().iloc[0]
fare_fill = train_df['Fare'].mode().iloc[0]

train_df['Age'] = train_df['Age'].fillna(age_fill)
test_df['Age'] = test_df['Age'].fillna(age_fill)
test_df['Fare'] = test_df['Fare'].fillna(fare_fill)

In [None]:
# Encoding categorical vars
ohe = OneHotEncoder(sparse_output = False)
ode = OrdinalEncoder
SI = SimpleImputer(strategy = 'most_frequent')

In [None]:
train_df.columns

In [None]:
# Cat vars
ode_col = ['Family_Size_Grp']#Ordinal
ohe_col = ['Sex', 'Embarked']#Nominal


In [None]:
# Spilliting outcome (Y) and independent (x)
X = train_df.drop(['Survived'], axis=1)
y = train_df['Survived']
X_test = test_df.drop(['Age_Cut', 'Fare_Cut'], axis=1)


In [None]:
# Splitting train and test
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=835)

In [None]:
# Creating pipelines for imputation & encoding
# Ordinal pipeline, two steps:
#   1. fill missing categories with the most frequent one;
#   2. encode the categories as integers, mapping categories unseen during fit to -1.
# The category order is spelled out because OrdinalEncoder otherwise sorts the labels
# alphabetically (Alone, Large, Medium, Small), which is not the size order.
# NOTE: the single inner list assumes this pipeline receives exactly one column
# (Family_Size_Grp); add one list per column if more are routed here.
family_size_order = ['Alone', 'Small', 'Medium', 'Large']

ordinal_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ord', OrdinalEncoder(
        categories=[family_size_order],
        handle_unknown='use_encoded_value',
        unknown_value=-1,
    )),
])

In [None]:
# One-hot pipeline for nominal categorical features (no inherent order):
#   1. fill missing values with the most common category;
#   2. expand each category into its own binary column; categories unseen during
#      fit are ignored rather than raising.
nominal_imputer = SimpleImputer(strategy='most_frequent')
nominal_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

ohe_pipeline = Pipeline(
    steps=[
        ('impute', nominal_imputer),
        ('one-hot', nominal_encoder),
    ]
)

In [None]:
# Numeric or already-processed columns that are forwarded untouched
passthrough_cols = [
    'Pclass',
    'TicketNumberCounts',
    'Cabin_Assigned',
    'Name_Size',
    'Fare',
]

# Routes each group of columns through its preprocessing step
col_tran = ColumnTransformer(
    transformers=[
        ('impute', SI, ['Age']),                           # most-frequent imputation of 'Age'
        ('ord_pipeline', ordinal_pipeline, ode_col),       # impute + ordinal-encode
        ('ohe_pipeline', ohe_pipeline, ohe_col),           # impute + one-hot-encode
        ('passthrough', 'passthrough', passthrough_cols),  # forwarded as-is
    ],
    remainder='drop',  # every column not listed above is discarded
    n_jobs=-1,         # run the transformers in parallel on all available cores
)


In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heat map
correlation_matrix = train_df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(8, 6))  # adjust the figure size as needed
sns.heatmap(data=correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)

# Running Models 
## Model 1- Random Forest Classifier

In [None]:
# Import and create an instance of the RandomForestClassifier with default parameters
rfc = RandomForestClassifier()

# Define the hyperparameter grid to search over during model tuning
param_grid = {
    'n_estimators' : [100, 150, 200],    # Number of trees in the forest
    'min_samples_split': [5,10,15],      # Minimum number of samples required to split an internal node
    'max_depth':[8,9,10,15,20],          # Maximum depth of the tree (controls overfitting)
    'min_samples_leaf': [1,2,4],         # Minimum number of samples required to be at a leaf node
    'criterion': ['gini', 'entropy'],    # The function used to measure the quality of a split: Gini impurity or Information Gain (entropy)
}


In [None]:
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [None]:
pipefinalrfc = make_pipeline(col_tran, CV_rfc)
pipefinalrfc.fit(X_train, y_train)

In [None]:
print(CV_rfc.best_params_)
print(CV_rfc.best_score_)

## Model 2- Decision Tree

In [None]:
# Import and create an instance of the DecisionTreeClassifier
dtc = DecisionTreeClassifier()

# Define the grid of hyperparameters to search during model tuning
param_grid = {
    # Minimum number of samples required to split an internal node
    'min_samples_split': [5, 10, 15],
    # The maximum depth of the tree to control overfitting
    'max_depth': [10, 20, 30],
    # Minimum number of samples required to be at a leaf node
    'min_samples_leaf': [1, 2, 4],
    # The function to measure the quality of a split
    'criterion': ['gini', 'entropy'],
}


In [None]:
CV_dtc = GridSearchCV(estimator=dtc, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [None]:
pipefinaldtc = make_pipeline(col_tran, CV_dtc)
pipefinaldtc.fit(X_train, y_train)

In [None]:
print(CV_dtc.best_params_)
print(CV_dtc.best_score_)

***These are the best-performing hyperparameters that were found during the search:***

criterion: 'entropy' — The tree splits were evaluated using information gain (as opposed to 'gini').

max_depth: 20 — The maximum depth of the tree was limited to 20 levels, which helps prevent overfitting.

min_samples_leaf: 4 — A leaf node must have at least 4 samples, again reducing the chance of overfitting.

min_samples_split: 15 — A node must have at least 15 samples to be split, promoting more robust splits.

***Best Score: 0.81188811188811***
This is the best mean cross-validation score (likely accuracy) achieved using the above parameters. 
It means that across all the validation folds, the model correctly predicted the outcome about 81.19% of the time using those hyperparameters.

## Model 3- K-Nearest Neighbour (KNN)

In [None]:
knn = KNeighborsClassifier()  # Define the KNN classifier class (but missing parentheses — should be instantiated)
param_grid = {  # Define a dictionary of hyperparameters to be used in a grid search
    'n_neighbors': [3, 5, 7, 9, 11],  # Number of neighbors to consider for classification
    'weights': ['uniform', 'distance'],  # Weight function: 'uniform' = all points equal, 'distance' = closer points have more influence
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm to compute nearest neighbors
    'p': [1, 2],  # Power parameter for the Minkowski metric: 1 = Manhattan, 2 = Euclidean
}


In [None]:
CV_knn = GridSearchCV(estimator=knn, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [None]:
pipefinalknn = make_pipeline(col_tran, CV_knn)
pipefinalknn.fit(X_train, y_train)

In [None]:
print(CV_knn.best_params_)
print(CV_knn.best_score_)

***Best Score: 0.814626***
This is the best mean cross-validation score (likely accuracy) achieved using the best parameters printed above. 
It means that across all the validation folds, the model correctly predicted the outcome about 81.46% of the time using those hyperparameters.

## Model 4- Support Vector Machine

In [None]:
svc = SVC()  # Instantiate a Support Vector Classifier (SVC) from scikit-learn

param_grid = {  # Define a dictionary of hyperparameters to tune during model selection
    'C': [100, 10, 1.0, 0.001],  # Regularization parameter: controls trade-off between smooth decision boundary and classifying training points correctly
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid']  # Specifies the kernel type to be used in the algorithm
}


In [None]:
CV_svc = GridSearchCV(estimator=svc, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [None]:
pipefinalsvc = make_pipeline(col_tran, CV_svc)
pipefinalsvc.fit(X_train, y_train)

In [None]:
print(CV_svc.best_params_)
print(CV_svc.best_score_)

## Model 5- Logistic Regression

In [None]:
lr = LogisticRegression()  # Instantiate a Logistic Regression model from scikit-learn
param_grid = {  
    'C': [100, 10, 1.0, 0.001],  # Inverse of regularization strength; smaller values specify stronger regularization
}

In [None]:
CV_lr = GridSearchCV(estimator=lr, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [None]:
pipefinallr = make_pipeline(col_tran, CV_lr)
pipefinallr.fit(X_train, y_train)

In [None]:
print(CV_lr.best_params_)
print(CV_lr.best_score_)

## Model 6- Naive Bayes

In [None]:
gnb = GaussianNB()  # Instantiate a Gaussian Naive Bayes classifier

param_grid = {
    'var_smoothing': [0.000000001, 0.00000001],  # Smoothing parameter to account for numerical stability
}

In [None]:
CV_gnb = GridSearchCV(estimator=gnb, param_grid=param_grid, cv=StratifiedKFold(n_splits=5))

In [None]:
pipefinalgnb = make_pipeline(col_tran, CV_gnb)
pipefinalgnb.fit(X_train, y_train)

In [None]:
print(CV_gnb.best_params_)
print(CV_gnb.best_score_)

## Making predictions

In [None]:
# The train/validation split sets aside X_valid / y_valid, but nothing evaluated them.
# Report each fitted pipeline's score on that held-out split before predicting, so the
# models can be compared on rows that took no part in fitting or tuning.
fitted_pipelines = {
    'rfc': pipefinalrfc,  # random forest
    'dtc': pipefinaldtc,  # decision tree
    'knn': pipefinalknn,  # nearest neighbour
    'svc': pipefinalsvc,  # support vector machine
    'lr': pipefinallr,    # logistic regression
    'gnb': pipefinalgnb,  # naive Bayes
}
for model_tag, fitted in fitted_pipelines.items():
    print(f'{model_tag}: validation score = {fitted.score(X_valid, y_valid):.4f}')

# Predictions for the Kaggle test set, one array per model
y_pred_rfc = pipefinalrfc.predict(X_test)  # random forest
y_pred_dtc = pipefinaldtc.predict(X_test)  # decision tree
y_pred_knn = pipefinalknn.predict(X_test)  # nearest neighbour
y_pred_svc = pipefinalsvc.predict(X_test)  # support vector machine
y_pred_lr = pipefinallr.predict(X_test)    # logistic regression
y_pred_gnb = pipefinalgnb.predict(X_test)  # naive Bayes

In [None]:
# Building a submission frame (PassengerId + predicted Survived) for each of the six models
passenger_ids = test_df['PassengerId']

(
    submission_rfc,
    submission_dtc,
    submission_knn,
    submission_svc,
    submission_lr,
    submission_gnb,
) = (
    pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predicted})
    for predicted in (y_pred_rfc, y_pred_dtc, y_pred_knn, y_pred_svc, y_pred_lr, y_pred_gnb)
)


In [None]:
# Writing one submission CSV per model, without the index column
submissions_by_tag = (
    ('rfc', submission_rfc),
    ('dtc', submission_dtc),
    ('knn', submission_knn),
    ('svc', submission_svc),
    ('lr', submission_lr),
    ('gnb', submission_gnb),
)
for model_tag, submission in submissions_by_tag:
    submission.to_csv(f'/kaggle/working/submission_18_04_{model_tag}.csv', index=False)