In [36]:
# Data preprocessing - Titanic dataset
# Fill missing values in the Age column with the column mean
# Select the features to use
# One-hot encode the categorical features with pd.get_dummies
import numpy as np
import pandas as pd

# Load the Titanic training split; expects train.csv in the working directory.
train = pd.read_csv('train.csv')

# Impute missing ages with the mean of the observed ages.
train['Age'] = train['Age'].fillna(train['Age'].mean())

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# One-hot encode the non-numeric feature columns (drop_first removes one level
# per encoded column). Cast to float64 explicitly: recent pandas versions emit
# boolean dummy columns, and mixing those with numeric columns would otherwise
# produce an object-dtype array.
X = pd.get_dummies(train[features], drop_first=True).to_numpy(dtype=np.float64)
# Target labels as a NumPy array so they can be indexed with bootstrap indices.
y = train['Survived'].to_numpy()

In [65]:
# RandomForestClassifier
# n_estimators: Decision Tree 수
# max_depth: 각 Decision Tree의 최대 깊이
# max_features: random feature selection 수
# max_samples: bootstrap sampling에서 각 sample 수
from sklearn.tree import DecisionTreeClassifier

class RandomForestClassifier:
  """Minimal random forest built on top of DecisionTreeClassifier.

  Every tree is fitted on a bootstrap sample (rows drawn with replacement) of
  the training data; the forest predicts by majority vote over its trees.

  Parameters
  ----------
  n_estimators : int, default=100
    Number of decision trees in the forest.
  max_depth : int or None, default=None
    Forwarded unchanged to every DecisionTreeClassifier.
  max_features : default='sqrt'
    Forwarded unchanged to every DecisionTreeClassifier.
  max_samples : int or None, default=None
    Number of rows drawn for each bootstrap sample, capped at the number of
    training rows. None draws as many rows as the training set has.
  random_state : int or None, default=None
    Seed that makes the bootstrap draws and the per-tree seeds reproducible.
    None keeps the legacy behaviour: draws come from NumPy's global random
    stream and the trees are created without a seed.
  """

  def __init__ (self, n_estimators = 100, max_depth = None, max_features = 'sqrt', max_samples = None,
                random_state = None):
    self.n_estimators = n_estimators
    self.max_depth = max_depth
    self.max_features = max_features
    self.max_samples = max_samples
    self.random_state = random_state

    if random_state is None:
      # The np.random module exposes the same choice() API as a RandomState,
      # so np.random.seed() keeps controlling an unseeded forest.
      self._rng = np.random
      tree_seeds = [None] * n_estimators
    else:
      self._rng = np.random.RandomState(random_state)
      # Give each tree its own seed so the trees differ but stay reproducible.
      tree_seeds = [int(seed) for seed in
                    self._rng.randint(np.iinfo(np.int32).max, size = n_estimators)]

    self.estimators = [
      DecisionTreeClassifier(max_depth = max_depth, max_features = max_features, random_state = seed)
      for seed in tree_seeds
    ]

  def fit(self, X, y):
    """Fit every tree on its own bootstrap sample of (X, y).

    X and y are converted with np.asarray, so array-likes are accepted.
    Repeated calls refit the same tree objects and continue the forest's
    random stream rather than restarting it.

    Raises
    ------
    ValueError
      If X and y do not have the same number of rows.

    Returns
    -------
    self
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.shape[0] != y.shape[0]:
      raise ValueError(
        f"X and y must have the same number of rows, got {X.shape[0]} and {y.shape[0]}")

    for tree in self.estimators:
      x_sample, y_sample = self.sample(X, y)
      tree.fit(x_sample, y_sample)
    return self

  def sample(self, X, y):
    """Return one bootstrap sample (rows of X and y drawn with replacement).

    The sample has X.shape[0] rows when max_samples is None, otherwise
    min(max_samples, X.shape[0]) rows. The same indices are applied to X and
    y, so rows and labels stay paired.
    """
    n_rows = X.shape[0]
    if self.max_samples is None:
      n_samples = n_rows
    else:
      n_samples = min(self.max_samples, n_rows)

    indices = self._rng.choice(n_rows, n_samples, replace=True)
    return X[indices], y[indices]

  def predict(self, X):
    """Predict one label per row of X by majority vote over all trees.

    Labels may be any values np.unique can sort (including negative integers
    and strings). Ties go to the smallest label, which matches the argmax of
    a bincount for non-negative integer labels.

    Raises
    ------
    ValueError
      If the forest has no trees.
    """
    if len(self.estimators) == 0:
      raise ValueError("cannot predict with a forest that has no estimators")

    # Shape (n_estimators, n_rows): one row of predictions per tree.
    all_predictions = np.stack([np.asarray(tree.predict(X)) for tree in self.estimators])

    # Count the votes for each distinct label; np.unique returns sorted labels,
    # so argmax picks the smallest label among equally voted ones.
    classes = np.unique(all_predictions)
    votes = np.stack([(all_predictions == label).sum(axis=0) for label in classes])
    return classes[votes.argmax(axis=0)]

In [52]:
# Build the from-scratch RandomForestClassifier, fit its decision trees on the
# training matrix, and predict on that same matrix.
# Seed NumPy's global random stream first: the forest's bootstrap draws and its
# unseeded trees pull from it, so without a seed the printed accuracy changes
# on every run.
np.random.seed(42)

rf = RandomForestClassifier(n_estimators=50, max_depth=3, max_features=2)

rf.fit(X, y)
y_pred = rf.predict(X)

# Accuracy measured on the training rows themselves, so it is an optimistic figure.
print("train accuracy:", (y == y_pred).mean())

train accuracy: 0.8282828282828283


In [61]:
# Reference run with scikit-learn's random forest.
# oob_score: also score each row using only the trees whose bootstrap sample did not contain it.
# min_samples_split: minimum number of samples a node needs before it may be split.
# Imported under an alias so it does not shadow the from-scratch
# RandomForestClassifier class defined earlier in this notebook; re-running the
# earlier cell would otherwise silently use scikit-learn's implementation.
from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier

# random_state pins the bootstrap draws and feature subsets so the scores below are reproducible.
rf = SklearnRandomForestClassifier(n_estimators=50, max_depth=None, max_features=2,
                                   oob_score=True, min_samples_split=20,
                                   random_state=42)

rf.fit(X, y)
y_pred = rf.predict(X)

print("train accuracy:", (y == y_pred).mean())
print("oob score:", rf.oob_score_)

train accuracy: 0.8709315375982043
oob score: 0.8204264870931538


In [64]:
# Build a Kaggle submission: load test.csv, apply the same preprocessing as the
# training data, predict with the fitted `rf`, and write the result as CSV.
test = pd.read_csv("test.csv")

# Impute with statistics taken from the training split so both splits are
# transformed identically.
test['Age'] = test['Age'].fillna(train['Age'].mean())
# NOTE(review): the Kaggle test split is believed to contain a missing Fare;
# fill it so the model never receives NaN. Confirm against the actual file.
test['Fare'] = test['Fare'].fillna(train['Fare'].median())

features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

# Column layout the model was trained on (same encoding call as the training cell).
train_columns = pd.get_dummies(train[features], drop_first=True).columns

# Encode every level present in the test split, then select exactly the
# training columns in the training order. Encoding without drop_first avoids
# dropping a different level than training did when a category is absent here;
# a level that never occurs in the test split becomes an all-zero column.
X_test = (
  pd.get_dummies(test[features])
  .reindex(columns=train_columns, fill_value=0)
  .to_numpy(dtype=np.float64)
)
y_pred = rf.predict(X_test)

# Two-column file with a header row: PassengerId,Survived.
submission = pd.DataFrame({
  "PassengerId": test["PassengerId"].to_numpy(),
  "Survived": y_pred,
})
submission.to_csv("rf_result.csv", index=False)