In [1]:
# import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler,OrdinalEncoder,FunctionTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [2]:
# load te data as a Dataframe
df = pd.read_csv('./data/titanic_data.csv')

#print the first five rows
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# ceck the info

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
#check for duplicates
df.duplicated().sum()

0

In [5]:
#check for missing values
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [6]:
# Drop columns we won't use
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])

# define features & target
X = df.drop('Survived', axis=1)
y = df['Survived']

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
numeric_features = ['Age', 'Fare']
categorical_features = ['Sex', 'Embarked', 'Pclass']

num_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='mean')),
('scaler', MinMaxScaler())
])

cat_transformer = Pipeline(steps= [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(drop='first', sparse=False))
    ])


preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, numeric_features),
    ('cat', cat_transformer,categorical_features)
])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

param_grid = {
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [0.001, 0.01, 0.1, 1, 10],
    'classifier__solver': ['liblinear', 'saga'],
    'classifier__max_iter': [500, 1000],
    'classifier__n_jobs': [-1]
}

grid_search = GridSearchCV(estimator = model, param_grid = param_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)
print(f' Best Parameters: {grid_search.best_params_}')
print(f'Best Score: {grid_search.best_score_}')

y_pred = grid_search.predict(X_test)

print(f'Accuracy score: {accuracy_score(y_test,y_pred)}')
print(f' Classification report: {classification_report(y_test,y_pred)}')

 Best Parameters: {'classifier__C': 1, 'classifier__max_iter': 500, 'classifier__n_jobs': -1, 'classifier__penalty': 'l2', 'classifier__solver': 'liblinear'}
Best Score: 0.7935191568994385
Accuracy score: 0.7932960893854749
 Classification report:               precision    recall  f1-score   support

           0       0.80      0.86      0.83       105
           1       0.78      0.70      0.74        74

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179





In [13]:
y_pred_train = grid_search.predict(X_train)
y_test_pred = grid_search.predict(X_test)

train_acc = accuracy_score(y_train,y_pred_train)
test_acc = accuracy_score(y_test, y_test_pred)

print(f'Training Accuracy: {train_acc}')
print(f'Test Accuracy: {test_acc}')

Training Accuracy: 0.7921348314606742
Test Accuracy: 0.7932960893854749
