<a href="https://colab.research.google.com/github/EunjaeHan/data/blob/main/20231002_%ED%95%9C%EC%9D%80%EC%9E%AC_5%EC%A3%BC%EC%B0%A8_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Load the white-wine quality dataset from the UCI repository
# (the CSV is semicolon-delimited).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv"
data = pd.read_csv(url, sep=';')

# Turn the target into a binary label in a single step:
# quality >= 6 becomes 1, everything else becomes 0.
data['quality'] = (data['quality'] >= 6).astype(int)

# Separate the label column from the feature columns.
y = data['quality']
X = data.drop(columns='quality')

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate tree-based classifiers, keyed by the display name used when
# reporting results.
#
# Every estimator receives the same seed. Without it, the randomised parts of
# training (bootstrap samples, feature sub-sampling, tie-breaking between
# equally good splits) differ on every run, so the selected hyper-parameters
# and the reported accuracies are not reproducible.
RANDOM_STATE = 42

models = {
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Extra Trees': ExtraTreesClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'Histogram Gradient Boosting': HistGradientBoostingClassifier(random_state=RANDOM_STATE),
}

# Hyper-parameter search space for each model. The keys must match the keys
# of `models` exactly, because the grid is looked up by model name.
param_grids = {
    'Decision Tree': {'max_depth': [None, 5, 10, 20], 'min_samples_split': [2, 5, 10]},
    'Random Forest': {'n_estimators': [100, 200, 500], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]},
    'Extra Trees': {'n_estimators': [100, 200, 500], 'max_depth': [None, 10, 20], 'min_samples_split': [2, 5, 10]},
    'Gradient Boosting': {'n_estimators': [100, 200, 500], 'learning_rate': [0.01, 0.1, 0.5]},
    'Histogram Gradient Boosting': {'max_iter': [100, 200, 500]},
}

# Tune every model with a 5-fold cross-validated grid search on the training
# split, then score the refitted best estimator on the held-out test split.
for model_name, model in models.items():
    param_grid = param_grids[model_name]
    grid_search = GridSearchCV(
        model,
        param_grid,
        cv=5,
        # Spells out the classifier default so that model selection visibly
        # uses the same metric that is reported below.
        scoring='accuracy',
        # Run the candidate/fold fits in parallel on all available cores;
        # this only affects wall-clock time, not the results.
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)
    # With the default refit=True, this is the best parameter combination
    # retrained on the whole training split.
    best_model = grid_search.best_estimator_

    # Evaluate the tuned model on data it has never seen.
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"Model: {model_name}")
    print(f"Best Parameters: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy}")


Model: Decision Tree
Best Parameters: {'max_depth': None, 'min_samples_split': 2}
Accuracy: 0.7928571428571428
Model: Random Forest
Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 0.8316326530612245
Model: Extra Trees
Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Accuracy: 0.85
Model: Gradient Boosting
Best Parameters: {'learning_rate': 0.1, 'n_estimators': 500}
Accuracy: 0.8020408163265306
Model: Histogram Gradient Boosting
Best Parameters: {'max_iter': 200}
Accuracy: 0.8295918367346938
