In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
x = np.load('./data/tatanic_X_train.npy')
y = np.load('./data/tatanic_y_train.npy')

In [3]:
len(x)

889

In [4]:
# 독립변수

x[:2]

array([[0.27345609, 0.01415106, 0.        , 1.        , 0.        ,
        0.125     , 0.        , 0.        , 0.        , 1.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.473882  , 0.13913574, 0.        , 0.        , 1.        ,
        0.125     , 0.25      , 1.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ]])

In [5]:
# 종속변수

y[:5]

array([0., 1., 1., 1., 0.])

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = \
        train_test_split(x,y, test_size=0.2, random_state=11)

##### votingclassifier

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

In [8]:
clfdt = DecisionTreeClassifier(random_state=1)
clflog = LogisticRegression(random_state=1)
clfknn = KNeighborsClassifier(n_neighbors=3)
clfgn = GaussianNB()

eclf_h = VotingClassifier(estimators = [('Log', clflog), ('rf', clfdt), ('gnb', clfgn)], voting='hard')
eclf_s = VotingClassifier(estimators = [('Log', clflog), ('rf', clfdt), ('gnb', clfgn)], voting='soft')
eclf_kh = VotingClassifier(estimators = [('Log', clflog), ('rf', clfdt), ('knn', clfknn)], voting='hard')
eclf_ks = VotingClassifier(estimators = [('Log', clflog), ('rf', clfdt), ('knn', clfknn)], voting='soft')
eclf_f = VotingClassifier(estimators = [('Log', clflog), ('rf', clfdt), ('knn', clfknn), ('gnb', clfgn)], voting='soft')

In [9]:
models = [eclf_h, eclf_s, eclf_kh, eclf_ks, eclf_f]

In [10]:
for m in models:
    m.fit(x_train, y_train)
    preds = m.predict(x_test)
    accuracy = m.score(x_test, y_test)
    print(m.__class__.__name__, ':', accuracy)

VotingClassifier : 0.8146067415730337
VotingClassifier : 0.8033707865168539
VotingClassifier : 0.848314606741573
VotingClassifier : 0.8426966292134831
VotingClassifier : 0.8033707865168539


##### breast_cancer

In [11]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [17]:
cancer

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [13]:
cancer.data

array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
        1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
        8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
        8.758e-02],
       ...,
       [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
        7.820e-02],
       [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
        1.240e-01],
       [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
        7.039e-02]])

In [14]:
cancer.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0,

In [15]:
cancer.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

##### x data를 dataframe으로 전환

In [18]:
cancer_data = pd.DataFrame(cancer.data, columns = cancer.feature_names)
cancer_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

##### train, test data로 분리

In [20]:
x_train, x_test, y_train, y_test = train_test_split(cancer_data, cancer.target, test_size=0.3, random_state=111)

In [21]:
models = [clfdt, clflog, clfknn, clfgn, eclf_h, eclf_s, eclf_kh, eclf_ks, eclf_f]

In [22]:
import warnings
warnings.filterwarnings(action='ignore')

In [23]:
for m in models:
    m.fit(x_train, y_train)
    preds = m.predict(x_test)
    accuracy = m.score(x_test, y_test)
    print(m.__class__.__name__, ':', accuracy)
    print('-'*20)

DecisionTreeClassifier : 0.9239766081871345
--------------------
LogisticRegression : 0.9590643274853801
--------------------
KNeighborsClassifier : 0.9122807017543859
--------------------
GaussianNB : 0.9649122807017544
--------------------
VotingClassifier : 0.9707602339181286
--------------------
VotingClassifier : 0.9707602339181286
--------------------
VotingClassifier : 0.9473684210526315
--------------------
VotingClassifier : 0.9473684210526315
--------------------
VotingClassifier : 0.9590643274853801
--------------------


##### bagging

In [24]:
from sklearn.ensemble import BaggingClassifier

In [25]:
eclfbag = BaggingClassifier(clfdt, oob_score=True)

In [26]:
eclfbag.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=1),
                  oob_score=True)

In [27]:
preds = eclfbag.predict(x_test)

In [28]:
eclfbag.score(x_test, y_test)

0.9590643274853801

##### RandomForest

In [29]:
from sklearn.ensemble import RandomForestClassifier

In [30]:
eclf_rf = RandomForestClassifier(n_estimators = 500, max_features = 10, oob_score=True)

In [31]:
eclf_rf.fit(x_train, y_train)
score = eclf_rf.score(x_test, y_test)
print(score)

0.9649122807017544


##### boosting : AdaBoost

In [32]:
from sklearn.ensemble import AdaBoostClassifier

In [33]:
eclf_ada = AdaBoostClassifier(n_estimators=3000, learning_rate=0.3)

In [34]:
eclf_ada.fit(x_train, y_train)
preds = eclf_ada.predict(x_test)
score = eclf_ada.score(x_test, y_test)
print(score)

0.9649122807017544


##### boosting : Gradientboost

In [35]:
from sklearn.ensemble import GradientBoostingClassifier

In [36]:
eclf_gra = GradientBoostingClassifier(n_estimators = 3000, learning_rate=0.1)

In [37]:
eclf_gra.fit(x_train, y_train)
score = eclf_gra.score(x_test, y_test)
print(score)

0.9649122807017544


### boosting

    Xgboost : gradientboosting을 개선
        1. 속도 개선
        2. 과적합 개선 - 의미없는 node제거..

    lgbm
        1. xgboost느린 점을 보완..

In [38]:
import xgboost as xgb
from xgboost import XGBClassifier

In [39]:
# !conda install -c anaconda py-xgboost

In [40]:
# !pip install xgboost

In [41]:
# !pip install lightgbm

In [42]:
from lightgbm import LGBMClassifier

In [43]:
xgb = XGBClassifier(n_estimators=3000, learning_rate=0.1)
xgb.fit(x_train, y_train)
preds = xgb.predict(x_test)
score = xgb.score(x_test, y_test)
print(score)

0.9766081871345029


In [44]:
lgbm = LGBMClassifier(n_estimators = 3000, learning_rate=0.1)
lgbm.fit(x_train, y_train)
score = lgbm.score(x_test, y_test)
print(score)

0.9824561403508771
