In [3]:
from sklearn.base import BaseEstimator
import numpy as np

class MyDummyClassifier(BaseEstimator):
    def fit(self, X, y=None):
        # Fit does nothing, as this is a dummy classifier
        pass

    def predict(self, X):
        pred = np.zeros(X.shape[0])  # Initialize predictions as zeros
        for i in range(X.shape[0]):  # Iterate over rows
            if X['Sex'].iloc[i] == 1:
                pred[i] = 0  # Class 0 for 'Sex' == 1
            else:
                pred[i] = 1  # Class 1 otherwise
        return pred


In [4]:
from sklearn.preprocessing import LabelEncoder

# Null 처리 함수
def fillna(df):
    df['Age'].fillna(df['Age'].mean(), inplace=True)
    df['Cabin'].fillna('N', inplace=True)
    df['Embarked'].fillna('N', inplace=True)
    df['Fare'].fillna(0, inplace=True)
    return df

# 머신러닝 알고리즘에 불필요한 피처 제거
def drop_features(df):
    df.drop(['PassengerId', 'Name', 'Ticket'], axis=1, inplace=True)
    return df

# 레이블 인코딩 수행.
def format_features(df):
    df['Cabin'] = df['Cabin'].str[:1]
    features = ['Cabin', 'Sex', 'Embarked']
    for feature in features:
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

# 앞에서 설정한 데이터 전처리 함수 호출
def transform_features(df):
    df = fillna(df)
    df = drop_features(df)
    df = format_features(df)
    return df

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# data load

titanic_df = pd.read_csv('titanic_train.csv')
y_titanic_df = titanic_df['Survived']
X_titanic_df = titanic_df.drop('Survived', axis=1)

#전처리
X_titanic_df = transform_features(X_titanic_df)
#데이터셋 분할
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df,y_titanic_df, test_size=0.2, random_state=0 )



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Cabin'].fillna('N', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always

In [22]:
# 모델 생성& 학습
myclf = MyDummyClassifier()
myclf.fit(X_train, y_train)

my_pred = myclf.predict(X_test)

accuracy_score(y_test, my_pred)

0.7877094972067039

In [23]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, my_pred) #row : 실제값, col : 예측

array([[92, 18],
       [20, 49]])

In [24]:
from sklearn.metrics import precision_score, recall_score
precision_score(y_test, my_pred), recall_score(y_test, my_pred)

(0.7313432835820896, 0.7101449275362319)

In [25]:
class MyFakeClassifier(BaseEstimator):
    def fit(self, X,y):
        pass

    def predict(self, X):
        return np.zeros( (len(X), 1), dtype=bool )

In [26]:
from sklearn.datasets import load_digits 


#Mnist dataset
digits = load_digits()
digits.data.shape  #1797개 샘플, 64개 특성 - 8x8 픽셀

(1797, 64)

In [27]:
#titanic_df
X_titanic_df.head(), y_titanic_df.head()

(   Pclass  Sex   Age  SibSp  Parch     Fare  Cabin  Embarked
 0       3    1  22.0      1      0   7.2500      7         3
 1       1    0  38.0      1      0  71.2833      2         0
 2       3    0  26.0      0      0   7.9250      7         3
 3       1    0  35.0      1      0  53.1000      2         3
 4       3    1  35.0      0      0   8.0500      7         3,
 0    0
 1    1
 2    1
 3    1
 4    0
 Name: Survived, dtype: int64)

In [28]:
#전처리
#X_titanic_df = transform_features(X_titanic_df)
X_train, X_test, y_train, y_test = train_test_split(X_titanic_df, y_titanic_df, test_size=0.2, random_state=11)

In [29]:
#p156
def get_clf_eval(y_test, pred):
    confusion = confusion_matrix(y_test, pred)
    accuracy = accuracy_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score(y_test, pred)

    print(confusion)
    print('*'*20)
    print(accuracy, precision, recall)

In [30]:
#로지스틱회귀 분류모델 생성
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression()
lr_clf.fit(X_train, y_train)
pred = lr_clf.predict(X_test)

#정확도, 정밀도, 재현율
get_clf_eval(y_test, pred)

[[104  14]
 [ 13  48]]
********************
0.8491620111731844 0.7741935483870968 0.7868852459016393


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
pred_proba = lr_clf.predict_proba(X_test)

In [32]:
pred_proba_result = np.concatenate([pred_proba, pred.reshape(-1,1)], axis=1)
pred_proba_result

array([[0.46192983, 0.53807017, 1.        ],
       [0.87873132, 0.12126868, 0.        ],
       [0.87720378, 0.12279622, 0.        ],
       [0.88258775, 0.11741225, 0.        ],
       [0.85523187, 0.14476813, 0.        ],
       [0.88221904, 0.11778096, 0.        ],
       [0.88846876, 0.11153124, 0.        ],
       [0.20877822, 0.79122178, 1.        ],
       [0.78285301, 0.21714699, 0.        ],
       [0.36928483, 0.63071517, 1.        ],
       [0.89978076, 0.10021924, 0.        ],
       [0.8750456 , 0.1249544 , 0.        ],
       [0.87719781, 0.12280219, 0.        ],
       [0.888423  , 0.111577  , 0.        ],
       [0.43664069, 0.56335931, 1.        ],
       [0.85904683, 0.14095317, 0.        ],
       [0.90373822, 0.09626178, 0.        ],
       [0.73343792, 0.26656208, 0.        ],
       [0.72468692, 0.27531308, 0.        ],
       [0.17173408, 0.82826592, 1.        ],
       [0.75360941, 0.24639059, 0.        ],
       [0.61899867, 0.38100133, 0.        ],
       [0.

In [33]:
# 이진화
custom_threshold = 0.4
pred_proba_1 = pred_proba[:, 1 ].reshape(-1,1) #새로운 예측값

In [34]:
from sklearn.preprocessing import Binarizer

binarizer = Binarizer(threshold = custom_threshold).fit(pred_proba_1) #새로운 예측값으로 이진화한 예측값
custom_predict = binarizer.transform(pred_proba_1)

get_clf_eval(y_test, custom_predict)

[[98 20]
 [10 51]]
********************
0.8324022346368715 0.7183098591549296 0.8360655737704918
