In [1]:
# import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
# ignore warnings   
import warnings
warnings.filterwarnings('ignore')

# load the dataset into a pandas DataFrame
df_titanic = pd.read_csv('tested.csv')

# perform EDA and data cleaning

# work on a copy so the frame as loaded from disk stays untouched
df_titanic_modified = df_titanic.copy()

# 5% of the row count; only printed for reference, it is not used further in this cell
thershhold = len(df_titanic) * 0.05
# the value goes inside the f-string; passing `{value}` as a second print argument
# would build a one-element set and print it with braces
print(f"the thershhold for drop nan: {thershhold}")

# dealing with nan in the Fare column: fill with the column mean (rounded to 2 decimals)
fare_mean = df_titanic_modified['Fare'].mean().round(2)
print(fare_mean)
df_titanic_modified['Fare'] = df_titanic_modified['Fare'].fillna(fare_mean)
print(f"the fare nan count: {df_titanic_modified['Fare'].isna().sum()}")

# dealing with nan in the Age column: mean and mode are printed for comparison only,
# the median is the value actually used for the fill
Age_mean = df_titanic_modified['Age'].mean()
Age_median = df_titanic_modified['Age'].median()
Age_mode = df_titanic_modified['Age'].mode().values[0]  # mode() returns a Series, so we need to get the value

print(f"Mean: {Age_mean}, Median: {Age_median}, Mode: {Age_mode}")
df_titanic_modified['Age'] = df_titanic_modified['Age'].fillna(Age_median)

# Cabin is dropped entirely rather than imputed
df_titanic_modified = df_titanic_modified.drop(columns=['Cabin'])
print(df_titanic_modified.isna().sum())

# encode categorical variables
df_titanic_modified['Embarked'] = df_titanic_modified['Embarked'].astype('category')

df_titanic_modified['Sex'] = df_titanic_modified['Sex'].map({'male': 0, 'female': 1})

# one-hot encode Embarked into integer 0/1 indicator columns
df_titanic_modified = pd.get_dummies(df_titanic_modified, columns=['Embarked'], dtype='int')

print(df_titanic_modified[['Survived', 'Pclass', 'Sex', 'Embarked_C', 'Embarked_Q', 'Embarked_S']].head())
# original columns: PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked

# data preprocessing and feature engineering
# family size = siblings/spouses + parents/children + the passenger themselves
df_titanic_modified['FamilySize'] = df_titanic_modified['SibSp'] + df_titanic_modified['Parch'] + 1

# pull the honorific out of Name: the letters between a space and a literal dot.
# Raw string so `\.` reaches the regex engine without an invalid-escape SyntaxWarning.
df_titanic_modified['Title'] = df_titanic_modified['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

def map_title(title):
    """Convert an honorific string into an integer code.

    'Mr' -> 1, 'Miss' -> 2, 'Mrs' -> 3, 'Master' -> 4.
    Any other value (including missing values) maps to 5.
    """
    # checked in order with `==`; the first matching label decides the code
    coded_titles = (('Mr', 1), ('Miss', 2), ('Mrs', 3), ('Master', 4))
    for label, code in coded_titles:
        if title == label:
            return code
    # fallback bucket for every title not listed above
    return 5

# replace the extracted honorific strings with their integer codes
df_titanic_modified['Title'] = df_titanic_modified['Title'].apply(map_title)


# survival rate per title code
print(df_titanic_modified[['Title', 'Survived']].groupby(['Title'], as_index=False).mean())

# SibSp and Parch are already folded into FamilySize, Name into Title;
# Ticket and PassengerId are dropped as well
df_titanic_modified = df_titanic_modified.drop(['Ticket', 'PassengerId', 'SibSp', 'Parch', 'Name'], axis=1)
# info() writes its report itself and returns None, so it is called directly
# (wrapping it in print() would append a stray "None" line)
df_titanic_modified.info()

# separate the features from the target
X = df_titanic_modified.drop('Survived', axis=1)
y = df_titanic_modified['Survived']
# NOTE: no one-off train/test split is made here; the cross-validation loop below
# builds its own X_train / X_test / y_train / y_test for every fold

# use StratifiedKFold to ensure proper stratification of the folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# define the models
logreg = LogisticRegression(max_iter=1000)  # increase max_iter to address convergence warning
rfc = RandomForestClassifier(n_estimators=100, random_state=42)

# define the evaluation metrics (also fixes the order of the cross-fold summary below)
metrics = ['accuracy', 'f1', 'precision', 'recall']

# define the scoring functions (scikit-learn scorer names matching the macro averages used below)
scoring = {
    'accuracy': 'accuracy',
    'f1': 'f1_macro',
    'precision': 'precision_macro',
    'recall': 'recall_macro'
}

# per-model list of per-fold metric dicts, averaged after the loop
cv_results = {type(estimator).__name__: [] for estimator in (logreg, rfc)}
n_folds = skf.get_n_splits()

# perform cross-validation and evaluate the models
for fold_number, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # standardize the features: fit on the training fold only, then apply to the test fold
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # oversample the minority class using SMOTE (training fold only; the test fold is left as is)
    smote = SMOTE(random_state=42)
    X_train, y_train = smote.fit_resample(X_train, y_train)

    # train and evaluate the models
    for model in [logreg, rfc]:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # calculate the evaluation metrics
        metrics_values = {
            'accuracy': accuracy_score(y_test, y_pred),
            'f1': f1_score(y_test, y_pred, average='macro'),
            'precision': precision_score(y_test, y_pred, average='macro'),
            'recall': recall_score(y_test, y_pred, average='macro')
        }
        cv_results[model.__class__.__name__].append(metrics_values)

        # print the evaluation metrics
        print(f'Fold: {fold_number}/{n_folds}')
        print(f'Model: {model.__class__.__name__}')
        print(f'Metrics: {metrics_values}')
        print('---')

# summarize: average every metric over the folds, one line per model
for model_name, fold_metrics in cv_results.items():
    mean_metrics = {
        metric: float(np.mean([fold[metric] for fold in fold_metrics]))
        for metric in metrics
    }
    print(f'Model: {model_name}')
    print(f'Mean metrics over {n_folds} folds: {mean_metrics}')
    print('---')

  df_titanic_modified['Title'] = df_titanic_modified['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)


the thershhold for drop nan:  {20.900000000000002}
35.63
the fare nan count:  {np.int64(0)}
Mean: 30.272590361445783, Median: 27.0, Mode: 21.0
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64
   Survived  Pclass  Sex  Embarked_C  Embarked_Q  Embarked_S
0         0       3    0           0           1           0
1         1       3    1           0           0           1
2         0       2    0           0           1           0
3         0       3    0           0           0           1
4         1       3    1           0           0           1
   Title  Survived
0      1  0.000000
1      2  1.000000
2      3  1.000000
3      4  0.000000
4      5  0.285714
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------