### 1. 사이킷런 소개와 특징

In [2]:
import sklearn
print(sklearn.__version__)

1.1.2


### 2. 붓꽃 품종 예측

In [4]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd

In [5]:
iris = load_iris()
iris_data = iris.data
iris_label = iris.target

In [16]:
iris_df = pd.DataFrame(data = iris_data, columns = iris.feature_names)
iris_df['label'] = iris.target
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label, test_size = 0.2, random_state = 11)

In [8]:
dt_clf = DecisionTreeClassifier(random_state = 11)
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)

In [9]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, pred))

0.9333333333333333


### 3. 사이킷런 기반 프레임워크 소개

### 사이킷런 주요 Estimator class  
[분류]  
* DecisionTreeClassifier  
* RandomForestClassifier  
* GradientBoostingClassifier  
* GaussianNB  
* SVC  

[회귀 구현 클래스]  
* LinearRegression  
* Ridge  
* Lasso  
* RandomForestRegressor  
* GradientBoostingRegressor  

### 사이킷런 주요 모듈
[예제 데이터]  
* datasets

[피처 처리]  
* preprocessing
* feature_selection  
* feature_extraction

[피처 처리 / 차원 축소]    
* decomposition

[데이터 분리, 검증, 파라미터 튜닝]      
* model_selection 

[평가]   
* metrics

[ML 알고리즘]    
* ensemble  
* linear_model  
* naive_bayes  
* neighbors  
* svm  
* tree  
* cluster 
 
[유틸리티]  
* pipeline

### 4. Model selection module

In [10]:
from sklearn.model_selection import KFold
import numpy as np
iris = load_iris()
features = iris.data
label = iris.target 
dt_clf = DecisionTreeClassifier(random_state = 156) 
kfold = KFold(n_splits = 5)
cv_accuracy = []

In [14]:
n_iter = 0
for train_index, test_index in kfold.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)
    n_iter += 1
    accuracy = np.round(accuracy_score(y_test, pred), 4)
    cv_accuracy.append(accuracy)
print(cv_accuracy)
print(np.mean(cv_accuracy))

[1.0, 0.9667, 0.8667, 0.9333, 0.7333, 1.0, 0.9667, 0.8667, 0.9333, 0.7333, 1.0, 0.9667, 0.8667, 0.9333, 0.7333]
0.9


In [17]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = 3)
n_iter = 0
for train_index, test_index in skf.split(iris_df, iris_df['label']):
    n_iter += 1
    label_train = iris_df['label'].iloc[train_index]
    label_test = iris_df['label'].iloc[test_index]
    print(label_train.value_counts())
    print(label_test.value_counts())

2    34
0    33
1    33
Name: label, dtype: int64
0    17
1    17
2    16
Name: label, dtype: int64
1    34
0    33
2    33
Name: label, dtype: int64
0    17
2    17
1    16
Name: label, dtype: int64
0    34
1    33
2    33
Name: label, dtype: int64
1    17
2    17
0    16
Name: label, dtype: int64


cross_val_score 이용

In [19]:
from sklearn.model_selection import cross_val_score, cross_validate
iris_data = load_iris()
dt_clf = DecisionTreeClassifier(random_state = 156)

In [20]:
data = iris_data.data
label = iris_data.target
scores = cross_val_score(dt_clf, data, label, scoring='accuracy', cv = 3)
print(scores)   

[0.98 0.94 0.98]


GridSearchCV 이용

In [24]:
X_train, X_test, y_train, y_test = train_test_split(iris_data.data, iris_data.target, test_size = 0.2, random_state = 121)
dtree = DecisionTreeClassifier()
parameters = {'max_depth': [1,2,3], 'min_samples_split': [2,3]}

In [25]:
from sklearn.model_selection import GridSearchCV
grid_dtree = GridSearchCV(dtree, param_grid = parameters, cv = 3, refit = True)

In [27]:
grid_dtree.fit(X_train, y_train)
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000323,2.7e-05,0.000199,2.3e-05,1,2,"{'max_depth': 1, 'min_samples_split': 2}",0.7,0.7,0.7,0.7,1.110223e-16,5
1,0.000244,1.8e-05,0.000163,1.2e-05,1,3,"{'max_depth': 1, 'min_samples_split': 3}",0.7,0.7,0.7,0.7,1.110223e-16,5
2,0.000268,2.4e-05,0.000159,9e-06,2,2,"{'max_depth': 2, 'min_samples_split': 2}",0.925,1.0,0.95,0.958333,0.03118048,3
3,0.000237,4e-06,0.000169,2.9e-05,2,3,"{'max_depth': 2, 'min_samples_split': 3}",0.925,1.0,0.95,0.958333,0.03118048,3
4,0.000248,1.2e-05,0.000162,1.5e-05,3,2,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1.0,0.95,0.975,0.02041241,1
5,0.000309,6.2e-05,0.000157,9e-06,3,3,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1.0,0.95,0.975,0.02041241,1


In [31]:
print(grid_dtree.best_params_)
print(grid_dtree.best_score_)

{'max_depth': 3, 'min_samples_split': 2}
0.975


In [33]:
pred = grid_dtree.best_estimator_.predict(X_test)
print(accuracy_score(y_test, pred))

0.9666666666666667


 ### 5. 데이터 전처리

label encoding

In [36]:
from sklearn.preprocessing import LabelEncoder
items = ['sadf', 'sf', 'qwer', 'werre', 'ssss', 'xcvx', 'asdff']
encoder = LabelEncoder()
labels = encoder.fit_transform(items)
labels

array([2, 3, 1, 5, 4, 6, 0])

In [37]:
print(encoder.classes_)
print(encoder.inverse_transform([3, 2, 5, 6, 0]))

['asdff' 'qwer' 'sadf' 'sf' 'ssss' 'werre' 'xcvx']
['sf' 'sadf' 'werre' 'xcvx' 'asdff']


one-hot encoding

In [41]:
from sklearn.preprocessing import OneHotEncoder
items = ['Tv', '냉장고', '전자레인지', '컴퓨터', '믹서', '믹서']
items = np.array(items).reshape(-1, 1)
oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit_transform(items)
print(oh_labels.toarray())
print(oh_labels.shape)

[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]]
(6, 5)


In [42]:
df = pd.DataFrame(items)
pd.get_dummies(df)

Unnamed: 0,0_Tv,0_냉장고,0_믹서,0_전자레인지,0_컴퓨터
0,1,0,0,0,0
1,0,1,0,0,0
2,0,0,0,1,0
3,0,0,0,0,1
4,0,0,1,0,0
5,0,0,1,0,0


### Scaling

In [53]:
from sklearn.preprocessing import StandardScaler
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns = iris.feature_names)
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


StandardScaler

In [54]:
scaler = StandardScaler()
df_st = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)
df_st.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,-1.468455e-15,-1.823726e-15,-1.610564e-15,-9.473903e-16
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


MinMax Scaler

In [59]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_mm = pd.DataFrame(scaler.fit_transform(df), columns = df.columns)
df_mm.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,0.428704,0.440556,0.467458,0.458056
std,0.230018,0.181611,0.299203,0.317599
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0


### 6, 7 생략