# 0. Setup and Data Loading

In [271]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


# Suppress every warning for the rest of the session so cell output stays compact.
# NOTE(review): this blanket filter also hides deprecation and solver warnings;
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [272]:
# Load Data
# `op` selects which set of paths is used for the input CSVs and for the
# submission file written at the end of the notebook.
op = 0 # 0: Local, 1: Kaggle

if not op: # Local
    train_path = './data/train.csv'
    test_path = './data/test.csv'
    sub_path = './data/gender_submission.csv'
    save_path = './data/submission.csv'
else:  # Kaggle
    train_path = '/kaggle/input/titanic/train.csv'
    test_path = '/kaggle/input/titanic/test.csv'
    sub_path = '/kaggle/input/titanic/gender_submission.csv'
    save_path = '/kaggle/working/submission.csv'

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_sub = pd.read_csv(sub_path)

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


# 1. Data Cleaning

In [273]:
# Remove the free-text columns that are not used as features
unused_cols = ['Name', 'Ticket']
df_train = df_train.drop(columns=unused_cols)
df_test = df_test.drop(columns=unused_cols)

# Count missing values per column, restricted to the columns the test frame has
shared_cols = df_test.columns
train_nan_counts = df_train[shared_cols].isnull().sum()
test_nan_counts = df_test[shared_cols].isnull().sum()
print(train_nan_counts)
print("=====================")
print(test_nan_counts)

PassengerId      0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Sex              0
Age             86
SibSp            0
Parch            0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [274]:
# NaN list:  Age, Cabin, Fare, Embarked

# Age
def age_to_group(age):
    """Map an age to an integer bucket code.

    Buckets are closed on the right: code 0 for age <= 10, 1 for age <= 20,
    and so on up to 5 for age <= 60. Ages above 60 get code 6, and a missing
    age (anything ``pd.isna`` reports as NA) gets code 7.
    """
    if pd.isna(age):
        return 7

    upper_bounds = (10, 20, 30, 40, 50, 60)
    for group, bound in enumerate(upper_bounds):
        if age <= bound:
            return group
    # Older than the last bound.
    return len(upper_bounds)

# Age: bucket into integer group codes; missing ages get their own code
# from age_to_group instead of being imputed.
df_train['AgeGroup'] = df_train['Age'].apply(age_to_group)
df_test['AgeGroup'] = df_test['Age'].apply(age_to_group)

# Fare: fill missing test fares with the mean test fare of the same Pclass.
df_test['Fare'] = df_test['Fare'].fillna(df_test.groupby('Pclass')['Fare'].transform('mean'))

# Embarked: fill missing ports with the most frequent training port.
# If left as NaN, the later get_dummies(..., drop_first=True) call emits
# all-False dummies for those rows, which silently makes them identical to the
# dropped baseline category. Using an existing category also keeps the train
# and test dummy columns the same.
embarked_mode = df_train['Embarked'].mode().iloc[0]
df_train['Embarked'] = df_train['Embarked'].fillna(embarked_mode)
df_test['Embarked'] = df_test['Embarked'].fillna(embarked_mode)

df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,AgeGroup
0,1,0,3,male,22.0,1,0,7.25,,S,2
1,2,1,1,female,38.0,1,0,71.2833,C85,C,3
2,3,1,3,female,26.0,0,0,7.925,,S,2
3,4,1,1,female,35.0,1,0,53.1,C123,S,3
4,5,0,3,male,35.0,0,0,8.05,,S,3


# 2. Feature Engineering

In [275]:
# Encoding
# One-hot encode the categorical columns. The category list of each column is
# taken from the training data and applied to both frames as a shared
# categorical dtype, so train and test always get the same dummy columns and
# the same dropped baseline. Encoding each frame independently only lines up
# when both frames happen to contain exactly the same category values.
onehot_cols = ['Pclass', 'Sex', 'Embarked']
for col in onehot_cols:
    shared_dtype = pd.CategoricalDtype(sorted(df_train[col].dropna().unique()))
    df_train[col] = df_train[col].astype(shared_dtype)
    # Values not seen in training become NaN here and encode as all-False.
    df_test[col] = df_test[col].astype(shared_dtype)

df_train = pd.get_dummies(df_train, columns=onehot_cols, drop_first=True)
df_test = pd.get_dummies(df_test, columns=onehot_cols, drop_first=True)
df_train.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Cabin,AgeGroup,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
0,1,0,22.0,1,0,7.25,,2,False,True,True,False,True
1,2,1,38.0,1,0,71.2833,C85,3,False,False,False,False,False
2,3,1,26.0,0,0,7.925,,2,False,True,False,False,True
3,4,1,35.0,1,0,53.1,C123,3,False,False,False,False,True
4,5,0,35.0,0,0,8.05,,3,False,True,True,False,True


In [276]:
# Add Feature
# FamilySize is the sum of the SibSp and Parch columns, added to both frames.
for frame in (df_train, df_test):
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch'])

print(df_train.columns)
df_train.head(10)

Index(['PassengerId', 'Survived', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin',
       'AgeGroup', 'Pclass_2', 'Pclass_3', 'Sex_male', 'Embarked_Q',
       'Embarked_S', 'FamilySize'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Cabin,AgeGroup,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S,FamilySize
0,1,0,22.0,1,0,7.25,,2,False,True,True,False,True,1
1,2,1,38.0,1,0,71.2833,C85,3,False,False,False,False,False,1
2,3,1,26.0,0,0,7.925,,2,False,True,False,False,True,0
3,4,1,35.0,1,0,53.1,C123,3,False,False,False,False,True,1
4,5,0,35.0,0,0,8.05,,3,False,True,True,False,True,0
5,6,0,,0,0,8.4583,,7,False,True,True,True,False,0
6,7,0,54.0,0,0,51.8625,E46,5,False,False,True,False,True,0
7,8,0,2.0,3,1,21.075,,0,False,True,True,False,True,4
8,9,1,27.0,0,2,11.1333,,2,False,True,False,False,True,2
9,10,1,14.0,1,0,30.0708,,1,True,False,False,False,False,1


In [277]:
# Select Feature
# Drop the identifier, the raw columns already replaced by AgeGroup and
# FamilySize, and the Cabin column.
drop_cols = ['PassengerId', 'Age', 'SibSp', 'Parch', 'Cabin']
df_train = df_train.drop(columns=drop_cols)
df_test = df_test.drop(columns=drop_cols)
df_train.head()

Unnamed: 0,Survived,Fare,AgeGroup,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S,FamilySize
0,0,7.25,2,False,True,True,False,True,1
1,1,71.2833,3,False,False,False,False,False,1
2,1,7.925,2,False,True,False,False,True,0
3,1,53.1,3,False,False,False,False,True,1
4,0,8.05,3,False,True,True,False,True,0


In [278]:
# Feature Normalize
# Standardize the numeric columns. The scaler is fitted on the training frame
# only and then reused, unchanged, for the test frame.
norm_cols = ['Fare', 'FamilySize', 'AgeGroup']
ss = StandardScaler()
df_train[norm_cols] = ss.fit_transform(df_train[norm_cols])
df_test[norm_cols] = ss.transform(df_test[norm_cols])
df_train.head()

Unnamed: 0,Survived,Fare,AgeGroup,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S,FamilySize
0,0,-0.502445,-0.593821,False,True,True,False,True,0.05916
1,1,0.786845,-0.146573,False,False,False,False,False,0.05916
2,1,-0.488854,-0.593821,False,True,False,False,True,-0.560975
3,1,0.42073,-0.146573,False,False,False,False,True,0.05916
4,0,-0.486337,-0.146573,False,True,True,False,True,-0.560975


In [279]:
# Check Feature importance
# Estimate the mutual information between every feature and the target.
# mutual_info_classif is a randomized estimator, so a fixed random_state is
# passed to make the printed scores reproducible between runs.
mi_features = df_train.drop(columns='Survived')
importances = mutual_info_classif(mi_features, df_train['Survived'], random_state=0)
for col, imp in zip(mi_features.columns, importances):
    print(f'{col:<20}: {imp:.4f}')

Fare                : 0.1286
AgeGroup            : 0.0000
Pclass_2            : 0.0173
Pclass_3            : 0.0467
Sex_male            : 0.1483
Embarked_Q          : 0.0045
Embarked_S          : 0.0090
FamilySize          : 0.0587


# 3. Model Tuning

In [291]:
# Split input and target data
# The feature frames mix boolean dummy columns with float columns; a plain
# to_numpy() on such a frame returns an object-dtype array, so a float dtype
# is requested explicitly to hand the model a proper numeric matrix.
X = df_train.drop(columns='Survived').to_numpy(dtype=float)
y = df_train['Survived'].to_numpy()
X_pred = df_test.to_numpy(dtype=float)
print(X.shape, y.shape, X_pred.shape)

# Hold out 30% of the labelled rows for validation; the seed fixes the split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=0)

(891, 8) (891,) (418, 8)


In [None]:
# L2-regularized logistic regression; hyperparameters are collected in one place.
logreg_params = dict(
    solver='lbfgs',
    max_iter=1000,
    penalty='l2',
    C=0.1,
)
model = LogisticRegression(**logreg_params)
model.fit(X_train, y_train)

# Mean accuracy on the training split, then on the validation split.
for features_part, target_part in ((X_train, y_train), (X_val, y_val)):
    print(model.score(features_part, target_part))

0.7881219903691814
0.7873134328358209


# 4. Submission

In [309]:
# Refit on every labelled row, then predict the test rows.
y_pred = model.fit(X, y).predict(X_pred)

# Predictions are assigned positionally.
# NOTE(review): this assumes df_sub lists passengers in the same order as the
# test frame; confirm against the CSV files.
df_sub['Survived'] = y_pred
df_sub.to_csv(save_path, index=False)