# 전처리 모델 만들기 : BaseEstimator

In [1]:
from sklearn.base import BaseEstimator

In [2]:
class MyDummyClassifier(BaseEstimator):  # inherits from BaseEstimator
    """Rule-based dummy classifier that predicts from the 'Sex' column only.

    Nothing is learned from the training data: rows whose 'Sex' value
    equals 1 (male) are predicted as 0 (died) and every other row is
    predicted as 1 (survived).
    """

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the estimator interface.

        Returns:
            self, following the scikit-learn convention so that calls can
            be chained (e.g. ``clf.fit(X, y).predict(X)``).
        """
        return self

    def predict(self, X):
        """Predict survival for every row of ``X``.

        Args:
            X: DataFrame-like object exposing a 'Sex' column.

        Returns:
            A float array of shape (number of rows, 1) holding 0.0 where
            'Sex' equals 1 and 1.0 everywhere else.
        """
        # Vectorised form of the per-row rule: male -> died, otherwise -> survived.
        is_male = (X['Sex'] == 1).to_numpy()
        return np.where(is_male, 0.0, 1.0).reshape(-1, 1)

# 타이타닉 데이터 전처리 함수

In [3]:
# Null handling
def fillna(df):
    """Fill the missing values of the Titanic columns in place.

    'Age' is filled with the column mean, 'Cabin' and 'Embarked' with the
    placeholder 'N', and 'Fare' with 0.

    Args:
        df: DataFrame containing the 'Age', 'Cabin', 'Embarked' and 'Fare'
            columns. It is modified in place.

    Returns:
        The same DataFrame object, with the four columns filled.
    """
    # Assign the filled column back instead of calling
    # df[col].fillna(..., inplace=True): that chained in-place call acts on
    # a temporary under pandas Copy-on-Write and would leave df unchanged.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)

    return df

In [4]:
# Remove attributes that are not needed
def drop_features(df):
    """Drop the 'PassengerId', 'Name' and 'Ticket' columns in place.

    Args:
        df: DataFrame containing the three columns. It is modified in place.

    Returns:
        The same DataFrame object without those columns.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

In [5]:
# String-valued columns -> numeric codes
def format_features(df):
    """Label-encode the 'Cabin', 'Sex' and 'Embarked' columns in place.

    'Cabin' is first reduced to its leading character; then each of the
    three columns is replaced by the integer codes produced by a
    LabelEncoder fitted on that column.

    Args:
        df: DataFrame containing the three columns. It is modified in place.

    Returns:
        The same DataFrame object with the encoded columns.
    """
    # Keep only the first character of each cabin value.
    df['Cabin'] = df['Cabin'].str[:1]

    for column in ('Cabin', 'Sex', 'Embarked'):
        # A fresh encoder per column, fitted and applied on the same data.
        encoder = LabelEncoder().fit(df[column])
        df[column] = encoder.transform(df[column])

    return df

In [6]:
# Run the full preprocessing pipeline
def transform_features(df):
    """Apply fillna, drop_features and format_features, in that order.

    Args:
        df: DataFrame to preprocess; it is passed through each step in turn.

    Returns:
        The DataFrame returned by the last step (format_features).
    """
    return format_features(drop_features(fillna(df)))

# 타이타닉 데이터 전처리

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [8]:
# Load the Titanic training data, preprocess the features, keep the
# 'Survived' column as the label, and hold out 20% of the rows for testing.
titanic_df = pd.read_csv('./train.csv')
X_titanic_df = transform_features(titanic_df.drop(columns='Survived'))
y_titanic_df = titanic_df['Survived']
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df,
    y_titanic_df,
    test_size=0.2,
    random_state=0,
)

# MyDummyClassifier 이용하여 학습, 예측, 평가

In [9]:
import numpy as np

# Fit the rule-based dummy classifier, predict on the test split, and
# compute the accuracy of those predictions against the true labels.
myclf = MyDummyClassifier()
myclf.fit(X_train, y_train)
mypred = myclf.predict(X_test)
accuracy_score(y_test, mypred)

0.7877094972067039

# MNIST (scikit-learn load_digits 손글씨 숫자 데이터셋)

In [10]:
from sklearn.base import BaseEstimator
class MyDummy(BaseEstimator):
    """Dummy classifier that predicts False for every sample."""

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the estimator interface.

        Returns:
            self, following the scikit-learn convention so that calls can
            be chained.
        """
        return self

    def predict(self, X):
        """Return an all-False boolean array of shape (len(X), 1)."""
        return np.zeros((len(X), 1), dtype=bool)

In [11]:
from sklearn.datasets import load_digits
# Load scikit-learn's bundled digits dataset.
digits = load_digits()

In [12]:
# Display the loaded dataset object.
digits

{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  2., ..., 12.,  0.,  0.],
        [ 0.,  0., 10., ..., 12.,  1.,  0.]]),
 'target': array([0, 1, 2, ..., 8, 9, 8]),
 'target_names': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]),
 'images': array([[[ 0.,  0.,  5., ...,  1.,  0.,  0.],
         [ 0.,  0., 13., ..., 15.,  5.,  0.],
         [ 0.,  3., 15., ..., 11.,  8.,  0.],
         ...,
         [ 0.,  4., 11., ..., 12.,  7.,  0.],
         [ 0.,  2., 14., ..., 12.,  0.,  0.],
         [ 0.,  0.,  6., ...,  0.,  0.,  0.]],
 
        [[ 0.,  0.,  0., ...,  5.,  0.,  0.],
         [ 0.,  0.,  0., ...,  9.,  0.,  0.],
         [ 0.,  0.,  3., ...,  6.,  0.,  0.],
         ...,
         [ 0.,  0.,  1., ...,  6.,  0.,  0.],
         [ 0.,  0.,  1., ...,  6.,  0.,  0.],
         [ 0.,  0.,  0., ..., 10.,  0.,  0.]],
 
        [[ 0

In [13]:
# Binary target: 1 where the digit label is 7, 0 otherwise.
y = (digits.target == 7).astype(int)
# Hold out 25% of the samples for testing, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(digits.data, y, test_size=0.25, random_state=11)

In [14]:
# Fit the MyDummy estimator, predict on the test split, and compute the
# accuracy of those predictions against the true labels.
clf = MyDummy()
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy_score(y_test, pred)

0.9

In [15]:
import pandas as pd
# Count how many test labels fall in each class.
pd.Series(y_test).value_counts()

0    405
1     45
dtype: int64

# 오차 행렬

In [16]:
from sklearn.metrics import confusion_matrix

In [17]:
# Confusion matrix of the true test labels against the predictions.
confusion_matrix(y_test, pred)

array([[405,   0],
       [ 45,   0]], dtype=int64)

In [18]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
def get_clf_eval(y_test, pred):
    """Print the confusion matrix, accuracy, precision and recall.

    Args:
        y_test: True labels.
        pred: Predicted labels.
    """
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    # Precision is undefined when no sample is predicted positive (and
    # recall when there is no positive label). State the 0 fallback
    # explicitly so the metric does not emit an undefined-metric warning.
    precision = precision_score(y_test, pred, zero_division=0)
    recall = recall_score(y_test, pred, zero_division=0)
    print("오차 행렬")
    print(confusion)
    print(f"정확도 : {accuracy}, 정밀도 : {precision}, 재현율 : {recall}")

# Print the evaluation summary for the current test labels and predictions.
get_clf_eval(y_test, pred)

오차 행렬
[[405   0]
 [ 45   0]]
정확도 : 0.9, 정밀도 : 0.0, 재현율 : 0.0


  _warn_prf(average, modifier, msg_start, len(result))
