# 머신러닝 기초

## 0. 라이브러리 임포트

In [26]:
%pip install scikit-learn
from sklearn import datasets, model_selection, metrics, ensemble
import pandas as pd




In [4]:
# Load the wine dataset bundled with scikit-learn and wrap its feature matrix
# in a DataFrame whose column labels are the dataset's feature names.
wine_data = datasets.load_wine()
df = pd.DataFrame(data=wine_data.data, columns=wine_data.feature_names)
# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [5]:
# Target data: the class label of each wine sample.
target = wine_data.target
# Last expression of the cell: display the label array.
target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [6]:
# Split into training and test data: 20% of the samples are held out for
# testing, and random_state=0 makes the shuffle repeatable across runs.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df,
    target,
    test_size=0.2,
    random_state=0,
)
# Report the (rows, columns) shape of each feature split.
for label, features in (("Training Dataset:", X_train), ("Test Dataset:", X_test)):
    print(label, features.shape)

Training Dataset: (142, 13)
Test Dataset: (36, 13)


## 1. 모델 학습

- ensemble - RandomForest분류기 활용

In [7]:
# Train a random forest using scikit-learn's default hyperparameters.
classifier = ensemble.RandomForestClassifier()
classifier.fit(X_train, y_train)
# fit() returns the estimator itself, so `model` is the fitted `classifier`.
model = classifier
model

RandomForestClassifier()

## 2. 학습 평가

- model.score

In [8]:
# Evaluate the fitted model on the held-out test split and display the score.
test_accuracy = model.score(X_test, y_test)
test_accuracy

0.9722222222222222

## 3. 예측

- model.predict
- 정확도 보고: metrics.classification_report(정답, 예측값)

------

- precision : 정밀도(양성이라고 예측한 것 중 진짜 양성인 비율)
- recall : 재현율(실제 양성인 것 중 양성이라고 맞게 예측한 비율)
- accuracy : 전체 샘플 중 맞게 예측한 샘플 비율
- f1-score: precision과 recall의 조화평균. precision과 recall의 가중 조화평균(weighted harmonic mean)을 F-score라 하며, 두 값에 같은 가중치를 준 경우가 f1-score이다.
- support : 각 클래스에 속한 실제 샘플(행) 수
- macro avg : 평균에 평균을 내는 개념. 단순 평균. 각 클래스 별로 동일한 가중치를 부여한다. 즉 샘플 개수의 불균형을 고려하지 않는다. f1-macro-avg를 예로 들면 0부터 9까지 라벨의 f1-score 10개를 모두 더한 뒤 10으로 나누면 된다. (0.99+0.82+0.95+0.94+0.98+0.95+0.97+0.96+0.87+0.90) / 10 = 0.933 샘플 개수의 불균형을 고려하지 않기 때문에 소수 클래스에 대한 성능이 낮을 때 더 큰 페널티가 발생한다. (데이터가 불균형할 때 확인해보자)


In [9]:
# Predict labels for the test features, then print the per-class
# precision / recall / f1-score / support table against the true labels.
y_pred = model.predict(X_test)
report = metrics.classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      0.94      0.97        16
           2       0.86      1.00      0.92         6

    accuracy                           0.97        36
   macro avg       0.95      0.98      0.96        36
weighted avg       0.98      0.97      0.97        36



# 4. 다른 데이터셋으로 학습 - 피마 인디언 당뇨병

In [4]:
# Read the diabetes CSV from the working directory; note this rebinds `df`,
# replacing the wine DataFrame created earlier in the notebook.
df = pd.read_csv(filepath_or_buffer='diabetes.csv')
# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [11]:
# Separate the label column from the features.
# Work on a copy so the DataFrame loaded from the CSV (`df`) stays untouched.
df2 = df.copy()
# pop() removes 'Outcome' from df2 in place and returns it as a Series.
# This replaces `X.drop('Outcome', 1, inplace=True)`: passing `axis`
# positionally raises a FutureWarning on pandas 1.x and a TypeError on
# pandas >= 2.0.
Y = df2.pop('Outcome')  # target
X = df2  # features: every remaining column


  X.drop('Outcome',1,inplace =True)


In [18]:
# Split off the training data: 10% of the rows are held out for testing,
# and random_state=0 makes the shuffle repeatable across runs.
split = model_selection.train_test_split(X, Y, test_size=0.1, random_state=0)
X_train, X_test, y_train, y_test = split
# Report the (rows, columns) shape of each feature split.
print("Training Dataset:", X_train.shape)
print("Test Dataset:", X_test.shape)

Training Dataset: (691, 8)
Test Dataset: (77, 8)


In [19]:
# Model training: fit a fresh default random forest on the current
# training split. fit() returns the estimator itself, so both names
# refer to the same fitted object.
model = ensemble.RandomForestClassifier().fit(X_train, y_train)
classifier = model
model

RandomForestClassifier()

In [20]:
# Training evaluation: score the fitted model on the held-out test split
# and display the result.
test_accuracy = model.score(X_test, y_test)
test_accuracy

0.7922077922077922

In [16]:
# Prediction: label the test features, then print the per-class
# precision / recall / f1-score / support table against the true labels.
y_pred = model.predict(X_test)
report = metrics.classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.84      0.85      0.85       107
           1       0.65      0.64      0.65        47

    accuracy                           0.79       154
   macro avg       0.75      0.74      0.75       154
weighted avg       0.78      0.79      0.79       154



### (+) 의사 결정 나무로 분류하기

In [22]:
from sklearn import tree

# Fit a decision tree with default settings on the current training split.
clf = tree.DecisionTreeClassifier()
fitted_tree = clf.fit(X_train, y_train)
# Last expression of the cell: display what fit() returned.
fitted_tree

DecisionTreeClassifier()

In [23]:
# Score the fitted decision tree on the held-out test split and display it.
tree_accuracy = clf.score(X_test, y_test)
tree_accuracy

0.7402597402597403

In [25]:
# Predict with the decision tree (this rebinds `y_pred` and `report`),
# then print the per-class metrics table against the true labels.
y_pred = clf.predict(X_test)
report = metrics.classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.84      0.75      0.79        51
           1       0.59      0.73      0.66        26

    accuracy                           0.74        77
   macro avg       0.72      0.74      0.72        77
weighted avg       0.76      0.74      0.75        77

