# 0.

In [390]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from lightgbm import LGBMClassifier, early_stopping, log_evaluation


warnings.filterwarnings('ignore')

In [329]:
# Load Data
op = 0 # 0: Local, 1: Kaggle

if not op: # Local
    train_path = './data/train.csv'
    test_path = './data/test.csv'
    sub_path = './data/gender_submission.csv'
    save_path = './data/submission.csv'
else:  # Kaggle
    train_path = '/kaggle/input/titanic/train.csv'
    test_path = '/kaggle/input/titanic/test.csv'
    sub_path = '/kaggle/input/titanic/gender_submission.csv'    
    save_path = '/kaggle/working/submission.csv'

df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
df_sub = pd.read_csv(sub_path)

print(df_train.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
None


# 1.

In [330]:
# Remove unnecessary columns
df_train = df_train.drop(columns=['Name', 'Ticket'])
df_test = df_test.drop(columns=['Name', 'Ticket'])

# Check NaN
print(df_train[df_test.columns].isnull().sum())
print("=====================")
print(df_test[df_test.columns].isnull().sum())

PassengerId      0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Fare             0
Cabin          687
Embarked         2
dtype: int64
PassengerId      0
Pclass           0
Sex              0
Age             86
SibSp            0
Parch            0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [331]:
# NaN list:  Age, Cabin, Fare, Embarked

# Age
def age_to_group(age):
    if pd.isna(age):
        return 7
    elif age <= 10:
        return 0
    elif age <= 20:
        return 1
    elif age <= 30:
        return 2
    elif age <= 40:
        return 3
    elif age <= 50:
        return 4
    elif age <= 60:
        return 5
    else:
        return 6

df_train['AgeGroup'] = df_train['Age'].apply(age_to_group)
df_test['AgeGroup'] = df_test['Age'].apply(age_to_group)

# Fare
df_test['Fare'] = df_test['Fare'].fillna(df_test.groupby('Pclass')['Fare'].transform('mean'))

# Embarked
df_train['Embarked'] = df_train['Embarked'].fillna('N')

# 2. Feature Engineering

In [332]:
# Encoding
encoder = LabelEncoder()

def data_encoding(df):
    df['Sex'] = encoder.fit_transform(df['Sex'])
    df['Embarked'] = encoder.fit_transform(df['Embarked'])
    return df

df_train = data_encoding(df_train)
df_test = data_encoding(df_test)

In [333]:
df_train['FamilySize'] = (df_train['SibSp'] + df_train['Parch']) **2
df_test['FamilySize'] = (df_test['SibSp'] + df_test['Parch']) **2

df_train['IsAlone'] = (df_train['FamilySize'] == 0).astype(int)
df_test['IsAlone'] = (df_test['FamilySize'] == 0).astype(int)

df_train['ClassByFare'] = (df_train['Fare'] / df_train['Pclass'])
df_test['ClassByFare'] = (df_test['Fare'] / df_test['Pclass'])

df_train['SexByEmbarked'] = (df_train['Sex'] + 1) * (df_train['Embarked'] + 1)
df_test['SexByEmbarked'] = (df_test['Sex'] + 1) * (df_test['Embarked'] + 1)

df_train['SexByIsAlone'] = (df_train['Sex'] + 1) * (df_train['IsAlone'] + 1)
df_test['SexByIsAlone'] = (df_test['Sex'] + 1) * (df_test['IsAlone'] + 1)

In [334]:
# Select Feature
feature_cols = ['Pclass', 'Sex', 'FamilySize', 'IsAlone', 'ClassByFare', 'Fare', 'SexByEmbarked', 'SexByIsAlone']
df_train[feature_cols].head(10)

# Check Feature importance
importances = mutual_info_classif(df_train[feature_cols], df_train['Survived'])
for col, imp in zip(feature_cols, importances):
    print(f'{col:<20}: {imp:.4f}')

Pclass              : 0.0843
Sex                 : 0.1428
FamilySize          : 0.0203
IsAlone             : 0.0136
ClassByFare         : 0.1481
Fare                : 0.1369
SexByEmbarked       : 0.1413
SexByIsAlone        : 0.1076


# 5. Model Tuning - LGBM

In [340]:
# Split input and target data
X = df_train[feature_cols].to_numpy()
y = df_train['Survived'].to_numpy()
X_pred = df_test[feature_cols].to_numpy()

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

In [385]:
from sklearn.model_selection import GridSearchCV

lgbm = LGBMClassifier(
    n_jobs=-1,
    max_depth=5,
    learning_rate=0.01,
    n_estimators=300,
    verbose = -1
)

lgbm.fit(X_train, y_train)
print(lgbm.score(X_train, y_train))
print(lgbm.score(X_val, y_val))

0.8455056179775281
0.8156424581005587


In [400]:
import optuna

def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
    }

    model = LGBMClassifier(**params)
    score = cross_val_score(model, X, y, cv=5, scoring='accuracy').mean()
    return score

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=30)

[I 2025-04-06 01:59:37,433] A new study created in memory with name: no-name-b9c81ac8-34a4-4051-b5c9-936451c25a6f
[I 2025-04-06 01:59:39,544] Trial 0 finished with value: 0.8092398468394955 and parameters: {'learning_rate': 0.05882792871828515, 'n_estimators': 316, 'max_depth': 7, 'num_leaves': 62, 'min_child_samples': 18, 'subsample': 0.687970928123946, 'colsample_bytree': 0.6048445232853208, 'reg_alpha': 0.8999542808972761, 'reg_lambda': 0.6696781440492665}. Best is trial 0 with value: 0.8092398468394955.
[I 2025-04-06 01:59:41,361] Trial 1 finished with value: 0.812591802146758 and parameters: {'learning_rate': 0.055217709892021224, 'n_estimators': 686, 'max_depth': 4, 'num_leaves': 60, 'min_child_samples': 82, 'subsample': 0.8978818803669101, 'colsample_bytree': 0.7145664856186515, 'reg_alpha': 0.11488146510133246, 'reg_lambda': 0.28150901883984303}. Best is trial 1 with value: 0.812591802146758.
[I 2025-04-06 01:59:43,516] Trial 2 finished with value: 0.8024919967359235 and parame

In [401]:
best_model = LGBMClassifier(**study.best_params)
best_model.fit(X, y)

print(best_model.score(X, y))
print(best_model.score(X_val, y_val))

0.8731762065095399
0.8770949720670391


# 6. Submission

In [402]:
y_pred = best_model.predict(X_pred)
df_sub['Survived'] = y_pred
df_sub.to_csv(save_path, index=False)