In [7]:
import kagglehub
from sklearn.impute import SimpleImputer
import pandas as pd
import os
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [8]:
# Download the latest version of the dataset; `path` is whatever location
# kagglehub.dataset_download returns.
path = kagglehub.dataset_download("saddasdaasda/httpswww-kaggle-comctitanicdata")

# If a directory came back, narrow `path` down to a single CSV file inside it.
if os.path.isdir(path):
    # os.listdir yields entries in arbitrary order, so sort the candidates to
    # make the choice of file reproducible across runs and machines.
    files = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
    if not files:
        raise FileNotFoundError("No CSV file found in the dataset directory.")
    path = os.path.join(path, files[0])

# Load the dataset into a pandas DataFrame and preview the first rows.
df = pd.read_csv(path)
print(df.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [9]:
# Replace missing Age values with the column mean.
# NOTE(review): the imputer is fitted on the whole frame, before the
# train/test split performed later in the notebook, so the mean also reflects
# rows that end up in the test set. Consider fitting it on the training split.
imputer = SimpleImputer(strategy='mean')
df['Age'] = imputer.fit_transform(df[['Age']])

# Remove the PassengerId, Name and Ticket columns. errors='ignore' (and no
# inplace mutation) keeps this cell re-runnable once the columns are gone.
df = df.drop(columns=['PassengerId', 'Name', 'Ticket'], errors='ignore')

# Label-encode Embarked and Sex as integer category codes (cat.codes assigns
# -1 to missing values). The dtype guard skips columns that are already
# numeric, so a second run does not re-encode -- and thereby shift -- the
# codes produced by the first run.
for column in ('Embarked', 'Sex'):
    if not pd.api.types.is_numeric_dtype(df[column]):
        df[column] = df[column].astype('category').cat.codes

# Cabin: treat missing values as their own 'Unknown' category, then encode
# with the same re-run guard.
if not pd.api.types.is_numeric_dtype(df['Cabin']):
    df['Cabin'] = df['Cabin'].fillna('Unknown').astype('category').cat.codes

print(df.head())

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  Cabin  Embarked
0         0       3    1  22.0      1      0   7.2500    147         2
1         1       1    0  38.0      1      0  71.2833     81         0
2         1       3    0  26.0      0      0   7.9250    147         2
3         1       1    0  35.0      1      0  53.1000     55         2
4         0       3    1  35.0      0      0   8.0500    147         2


In [10]:
# Separate the target column from the predictors, then hold out 20% of the
# rows for testing (fixed random_state so the partition is repeatable).
y = df['Survived']
X = df.drop(columns=['Survived'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preview the first rows of each partition, in the same order as before.
previews = (
    ('Trainning Data', X_train),
    ('Testing Data', X_test),
    ('Trainning Results Data', y_train),
    ('Testing Results Data', y_test),
)
for label, part in previews:
    print(label, part.head())

Trainning Data      Pclass  Sex   Age  SibSp  Parch     Fare  Cabin  Embarked
331       1    1  45.5      0      0  28.5000     56         2
733       2    1  23.0      0      0  13.0000    147         2
382       3    1  32.0      0      0   7.9250    147         2
704       3    1  26.0      1      0   7.8542    147         2
813       3    0   6.0      4      2  31.2750    147         2
Testing Data      Pclass  Sex        Age  SibSp  Parch     Fare  Cabin  Embarked
709       3    1  29.699118      1      1  15.2458    147         0
439       2    1  31.000000      0      0  10.5000    147         2
840       3    1  20.000000      0      0   7.9250    147         2
720       2    0   6.000000      0      1  33.0000    147         2
39        3    0  14.000000      1      0  11.2417    147         0
Trainning Results Data 331    0
733    0
382    0
704    0
813    0
Name: Survived, dtype: int64
Testing Results Data 709    1
439    0
840    0
720    1
39     1
Name: Survived, dtype: 

In [11]:
# Seed shared by the estimators so repeated runs produce the same fits; it
# uses the same value as the random_state passed to train_test_split.
RANDOM_STATE = 42

# Candidate models to compare, keyed by the name used in the printed report.
models = {
    'RandomForest': RandomForestClassifier(random_state=RANDOM_STATE),
    'LogisticRegression': LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
}

# Hyperparameter grids, keyed like `models`. The 'model__' prefix addresses
# the pipeline step named 'model' that wraps each estimator during tuning.
param_grids = {
    'RandomForest': {
        'model__n_estimators': [50, 100, 200],
        'model__max_depth': [None, 10, 20],
        'model__min_samples_split': [2, 5, 10]
    },
    'LogisticRegression': {
        'model__C': [0.1, 1, 10],
        'model__solver': ['lbfgs', 'liblinear']
    }
}

In [12]:
# For every candidate: grid-search its hyperparameters with 5-fold CV on the
# training split, then score the best estimator on the held-out test split.
for model_name in models:
    print(f"Tuning hyperparameters for {model_name}...")
    model = models[model_name]

    # Single-step pipeline; the step name must match the 'model__' grid prefix.
    pipeline = Pipeline(steps=[('model', model)])
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grids[model_name],
        scoring='accuracy',
        cv=5,
        n_jobs=-1,  # use all available cores
    )
    grid_search.fit(X_train, y_train)

    # Evaluate the winning configuration on data not seen during the search.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"{model_name} Best Parameters: {grid_search.best_params_}")
    print(f"{model_name} Accuracy after tuning: {accuracy}")

Tuning hyperparameters for RandomForest...
RandomForest Best Parameters: {'model__max_depth': None, 'model__min_samples_split': 10, 'model__n_estimators': 50}
RandomForest Accuracy after tuning: 0.8212290502793296
Tuning hyperparameters for LogisticRegression...
LogisticRegression Best Parameters: {'model__C': 0.1, 'model__solver': 'liblinear'}
LogisticRegression Accuracy after tuning: 0.7821229050279329
