In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

## Solving Titanic Problem via Random Forest 

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

warnings.filterwarnings("ignore")

In [3]:
print("null values per column in train data")
print(train.isnull().sum())

null values per column in train data
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [4]:
survival_by_gender = train.groupby("Sex")['Survived'].mean()
print("Survival rate by Gender: ")
print(survival_by_gender)
print("\n")

Survival rate by Gender: 
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64




In [5]:
train.drop('Cabin', axis=1, inplace=True)
train['Age'].fillna(train['Age'].median(), inplace=True)
train['Embarked'].fillna(train['Embarked'].mode()[0], inplace=True)

### 데이터 전처리와 기능 분석

In [13]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

def prep(df):
    df['Family'] = df['Parch'] + df['SibSp'] + 1
    
    # 이름 추출 
    df['Title'] = df['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
    df['Title'] = df['Title'].replace(['Lady', 'Countess', 'Capt', 'Col', 'Don', 
                                       'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')

    # Embarked 결측치 최빈값으로 대체 
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

    # Fare에서 결측치 중앙값으로 대체
    df['Fare'].fillna(df['Fare'].median(), inplace=True)

    df['Very Low'] = (df['Fare'] <= 4)
    df['Low'] = (df['Fare'] > 4) & (df['Fare'] <= 15)
    df['Moderate'] = (df['Fare'] > 15) & (df['Fare'] <= 25)
    df['Medium'] = (df['Fare'] > 25) & (df['Fare'] <= 50)
    df['High'] = (df['Fare'] > 50) & (df['Fare'] <= 100)
    df['Very High'] = (df['Fare'] > 100) & (df['Fare'] <= 250)
    df['Luxury'] = (df['Fare'] > 250)

    # Cabin 결측치 N으로 대체
    df['Cabin'] = df['Cabin'].fillna('N').map(lambda x: x[0])
    
    # age 결측치 KNNImputer 사용해 대체
    age_imputer = KNNImputer(n_neighbors=5)
    df['Age'] = age_imputer.fit_transform(df[['Age']])
    
    df['Baby'] = (df['Age'] <= 5)
    df['Child'] = (df['Age'] > 5) & (df['Age'] <= 14)
    df['Teenager'] = (df['Age'] > 14) & (df['Age'] <= 18)
    df['Adult'] = (df['Age'] > 18) & (df['Age'] <= 30)
    df['OldAdult'] = (df['Age'] > 30) & (df['Age'] <= 60)
    df['Old'] = (df['Age'] > 60)
    df['Alone'] = df['Family'] == 0
    
    # 원핫 인코딩
    df = pd.get_dummies(df, columns=['Sex', 'Embarked', 'Title', 'Cabin'], drop_first=True)
    for l in ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'T']:
        col = f'Cabin_{l}'
        if col not in df.columns:
            df[col] = pd.Series([False]*df.shape[0])
            df[col].fillna(False, inplace=True)
    
    cols = ['Age', 'Fare', 'Family', 'Parch', 'SibSp', 'Alone', 'Pclass', 'Very Low', 'Low', 'Moderate', 'Medium',
       'High', 'Very High', 'Luxury', 'Baby', 'Child', 'Teenager',
       'Adult', 'OldAdult', 'Old', 'Sex_male',
       'Embarked_Q', 'Embarked_S', 'Title_Miss', 'Title_Mr', 'Title_Mrs',
       'Title_Rare', 'Cabin_A', 'Cabin_B', 'Cabin_C', 'Cabin_D', 'Cabin_E', 'Cabin_F',
       'Cabin_G', 'Cabin_T']
    # Cabin_N은 유효치로 고려하지 않음
    df2 = df[cols].copy()
    
    # 수치 데이터를 표준화
    numeric_features = ['Age', 'Fare', 'Parch', 'SibSp']
    scaler = StandardScaler()
    df2[numeric_features] = scaler.fit_transform(df2[numeric_features])
    
    return df2

### 모델링

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score

train = pd.read_csv('train.csv')
train_processed = prep(train)

X = train_processed
y = train['Survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

results = {'Classifier': [], 'Test Accuracy': []}

for clf_name, clf in classifiers.items():
    clf.fit(X_train, y_train) 
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results['Classifier'].append(clf_name)
    results['Test Accuracy'].append(accuracy)

results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Classifier,Test Accuracy
0,Logistic Regression,0.810056
1,SVM,0.815642
2,KNN,0.804469
3,Random Forest,0.787709
4,Gradient Boosting,0.804469


### 분석

In [17]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': range(8, 13),
    'min_samples_split': range(6, 11),
    'min_samples_leaf': range(1, 4),
    'max_features': ['sqrt', 'log2', None],
    'random_state': [0],
    'max_leaf_nodes': range(80, 101, 2),
    'class_weight': [{0: 1, 1: w} for w in np.linspace(1.3, 1.7, 5)],
    'ccp_alpha': np.linspace(4e-05, 6e-05, 5)
}

dt_classifier = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

print("Best Parameters found:")
print(grid_search.best_params_)
print("Best Accuracy Score:")
print(grid_search.best_score_)

y_pred = grid_search.best_estimator_.predict(X_test)
    
accuracy = accuracy_score(y_test, y_pred)
accuracy

Fitting 5 folds for each of 247500 candidates, totalling 1237500 fits
Best Parameters found:
{'ccp_alpha': 4e-05, 'class_weight': {0: 1, 1: 1.4}, 'criterion': 'gini', 'max_depth': 9, 'max_features': 'sqrt', 'max_leaf_nodes': 80, 'min_samples_leaf': 1, 'min_samples_split': 8, 'random_state': 0, 'splitter': 'random'}
Best Accuracy Score:
0.8328868314783808


0.7877094972067039

In [15]:
base_params = {
    'ccp_alpha': 4e-05,
    'class_weight': {0: 1, 1: 1.5},
    'criterion': 'gini',
    'max_depth': 9,
    'max_features': 'sqrt',
    'max_leaf_nodes': 80,
    'min_samples_leaf': 1,
    'min_samples_split': 8,
    'random_state': 0,
    'splitter': 'random'
}

### RandomForestClassifier

In [20]:
from sklearn.ensemble import RandomForestClassifier

param_grid_rf = {
    'n_estimators': range(60, 361, 50),
    'max_features': ['sqrt', 'log2', None]
}

rf_model = RandomForestClassifier(
    criterion=base_params['criterion'],
    max_depth=base_params['max_depth'],
    min_samples_split=base_params['min_samples_split'],
    min_samples_leaf=base_params['min_samples_leaf'],
    max_leaf_nodes=base_params['max_leaf_nodes'],
    class_weight=base_params['class_weight'],
    ccp_alpha=base_params['ccp_alpha'],
    random_state=1212
)

grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# 최적 패러미터 출력 
print("RandomForestClassifier Best Parameters found:")
print(grid_search_rf.best_params_)
print("RandomForestClassifier Best Accuracy Score:")
print(grid_search_rf.best_score_)

y_pred = grid_search_rf.best_estimator_.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy

Fitting 5 folds for each of 21 candidates, totalling 105 fits
RandomForestClassifier Best Parameters found:
{'max_features': None, 'n_estimators': 60}
RandomForestClassifier Best Accuracy Score:
0.832867132867133


0.8268156424581006

### Final Model with Random Forest

In [27]:
rf_model = RandomForestClassifier(
    n_estimators=60,
    max_depth=12,
    max_features=None,
    min_samples_split=8,
    min_samples_leaf=1,
    random_state=900,
    bootstrap=True
) 

rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.88268156424581

### 최종 모델과 함께 데이터 학습 진행

In [28]:
train = pd.read_csv('train.csv')
train_processed = prep(train)

X = train_processed
y = train['Survived']

test = pd.read_csv('test.csv')
test_processed = prep(test)

ids = test['PassengerId']
X_t = test_processed

In [None]:
rf_model.fit(X, y)
predict = rf_model.predict(X_t)
ids = test['PassengerId']

PredictionDF = pd.DataFrame({'PassengerId' : ids, 'Survived' : predict})
PredictionDF.to_csv('bg_submissions.csv', index=False)