In [1]:
import numpy as np
from sklearn import datasets, tree, model_selection

In [1]:
import numpy as np
from xgboost import XGBClassifier
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import torch

# 데이터 로드
wdbc = datasets.load_breast_cancer()
X, y = wdbc.data, wdbc.target

torch.cuda.is_available()

# 하이퍼파라미터 그리드 설정
param_grid = {
    "n_estimators": [50, 100, 150, 200, 250],
    "learning_rate": [0.01, 0.05, 0.1, 0.15, 0.2],
    "max_depth": [3, 4, 5, 6, 7],
    "min_child_weight": [1, 3, 5],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
}

# 교차 검증 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)
best_score = 0
best_params = None
count = 0
n = 1
n = 6750

# 하이퍼파라미터 Grid Search
for n_estimators in param_grid["n_estimators"]:
    for learning_rate in param_grid["learning_rate"]:
        for max_depth in param_grid["max_depth"]:
            for min_child_weight in param_grid["min_child_weight"]:
                for subsample in param_grid["subsample"]:
                    for colsample_bytree in param_grid["colsample_bytree"]:
                        # 모델 정의
                        model = XGBClassifier(
                            n_estimators=n_estimators,
                            learning_rate=learning_rate,
                            max_depth=max_depth,
                            min_child_weight=min_child_weight,
                            subsample=subsample,
                            colsample_bytree=colsample_bytree,
                            eval_metric="mlogloss",
                        )

                        # 교차 검증 루프
                        cv_scores = []
                        for train_idx, val_idx in kf.split(X):
                            X_train, X_val = X[train_idx], X[val_idx]
                            y_train, y_val = y[train_idx], y[val_idx]

                            # 모델 학습
                            model.fit(X_train, y_train)

                            # 예측 및 정확도 계산
                            preds = model.predict(X_val)
                            acc = accuracy_score(y_val, preds)
                            cv_scores.append(acc)

                        # 평균 정확도 계산
                        mean_cv_score = np.mean(cv_scores)
                        count += 1
                        print(f"{count}/{n} | CV Score: {mean_cv_score:.3f}")

                        # 최고 성능 모델 업데이트
                        if mean_cv_score > best_score:
                            best_score = mean_cv_score
                            best_params = model.get_params()

print("\nBest Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)

1/6750 | CV Score: 0.930
2/6750 | CV Score: 0.935
3/6750 | CV Score: 0.930
4/6750 | CV Score: 0.935
5/6750 | CV Score: 0.937
6/6750 | CV Score: 0.940
7/6750 | CV Score: 0.937
8/6750 | CV Score: 0.935
9/6750 | CV Score: 0.933
10/6750 | CV Score: 0.935
11/6750 | CV Score: 0.935
12/6750 | CV Score: 0.931
13/6750 | CV Score: 0.933
14/6750 | CV Score: 0.937
15/6750 | CV Score: 0.931
16/6750 | CV Score: 0.928
17/6750 | CV Score: 0.923
18/6750 | CV Score: 0.924
19/6750 | CV Score: 0.923
20/6750 | CV Score: 0.921
21/6750 | CV Score: 0.921
22/6750 | CV Score: 0.923
23/6750 | CV Score: 0.923
24/6750 | CV Score: 0.926
25/6750 | CV Score: 0.930
26/6750 | CV Score: 0.930
27/6750 | CV Score: 0.931
28/6750 | CV Score: 0.930
29/6750 | CV Score: 0.930
30/6750 | CV Score: 0.930
31/6750 | CV Score: 0.931
32/6750 | CV Score: 0.930
33/6750 | CV Score: 0.924
34/6750 | CV Score: 0.919
35/6750 | CV Score: 0.919
36/6750 | CV Score: 0.919
37/6750 | CV Score: 0.919
38/6750 | CV Score: 0.921
39/6750 | CV Score: 0

KeyboardInterrupt: 

In [4]:
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import datasets

if __name__ == "__main__":
    wdbc = datasets.load_breast_cancer()

    # Define model - remove use_label_encoder parameter
    model = XGBClassifier(eval_metric='mlogloss')

    # Define parameter grid for XGBoost
    param_grid = {
        'n_estimators': [50],          # 5개 , 100, 150, 200, 250
        'learning_rate': [0.01],     # 5개 , 0.05, 0.1, 0.15, 0.2
        'max_depth': [3],                      # 5개 , 4, 5, 6, 7
        'min_child_weight': [1],                     # 3개 , 3, 5
        'subsample': [0.6],                      # 3개 , 0.8, 1.0
        'colsample_bytree': [0.5] # 6개 , 0.6, 0.7, 0.8, 0.9, 1.0
    }

    # Set up GridSearchCV
    grid_search = GridSearchCV(
        estimator=model, 
        param_grid=param_grid, 
        cv=5, 
        return_train_score=True,
        n_jobs=-1  # 모든 CPU 코어 사용
    )

    # Train the model with grid search
    print("Training XGBoost...")
    grid_search.fit(wdbc.data, wdbc.target)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Evaluate the best model
    acc_train = grid_search.cv_results_['mean_train_score'][grid_search.best_index_]
    acc_test = grid_search.best_score_
    print(f"* Best parameters: {grid_search.best_params_}")
    print(f"* Accuracy @ training data: {acc_train:.3f}")
    print(f"* Accuracy @ test data: {acc_test:.3f}")
    print(f"* Your score: {max(10 + 100 * (acc_test - 0.9), 0):.0f}")

Training XGBoost...
* Best parameters: {'colsample_bytree': 0.5, 'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50, 'subsample': 0.6}
* Accuracy @ training data: 0.959
* Accuracy @ test data: 0.935
* Your score: 14


In [1]:

if __name__ == "__main__":
    wdbc = datasets.load_breast_cancer()

    # Train a model
    cv_results = model_selection.cross_validate(
        model, wdbc.data, wdbc.target, cv=5, return_train_score=True
    )

    # Evaluate the model
    acc_train = np.mean(cv_results["train_score"])
    acc_test = np.mean(cv_results["test_score"])
    print(f"* Accuracy @ training data: {acc_train:.3f}")
    print(f"* Accuracy @ test data: {acc_test:.3f}")
    print(f"* Your score: {max(10 + 100 * (acc_test - 0.9), 0):.0f}")

* Accuracy @ training data: 1.000
* Accuracy @ test data: 0.917
* Your score: 12


In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

if __name__ == "__main__":
    wdbc = datasets.load_breast_cancer()

    # Define a model
    model = DecisionTreeClassifier()

    # Define parameter grid
    param_grid = {
        'max_depth': [3, 5, 7, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Set up GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, return_train_score=True)

    # Train the model with grid search
    grid_search.fit(wdbc.data, wdbc.target)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Evaluate the best model
    acc_train = grid_search.cv_results_['mean_train_score'][grid_search.best_index_]
    acc_test = grid_search.best_score_
    print(f"* Best parameters: {grid_search.best_params_}")
    print(f"* Accuracy @ training data: {acc_train:.3f}")
    print(f"* Accuracy @ test data: {acc_test:.3f}")
    print(f"* Your score: {max(10 + 100 * (acc_test - 0.9), 0):.0f}")

* Best parameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 10}
* Accuracy @ training data: 0.988
* Accuracy @ test data: 0.924
* Your score: 12


In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

if __name__ == "__main__":
    wdbc = datasets.load_breast_cancer()

    # Define a model
    model = DecisionTreeClassifier()

    # Define parameter grid with more parameters
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [3, 5, 7, 9, 11],
        'min_samples_split': [2, 3,4,5,6,7],
        'min_samples_leaf': [1, 2,3, 4],
        'max_features': [None, int, float, 'sqrt', 'log2'],
        'class_weight': [None, 'balanced']
    }

    # Set up GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, return_train_score=True)

    # Train the model with grid search
    grid_search.fit(wdbc.data, wdbc.target)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Evaluate the best model
    acc_train = grid_search.cv_results_['mean_train_score'][grid_search.best_index_]
    acc_test = grid_search.best_score_
    print(f"* Best parameters: {grid_search.best_params_}")
    print(f"* Accuracy @ training data: {acc_train:.3f}")
    print(f"* Accuracy @ test data: {acc_test:.3f}")
    print(f"* Your score: {max(10 + 100 * (acc_test - 0.9), 0):.0f}")

* Best parameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 11, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 5}
* Accuracy @ training data: 0.983
* Accuracy @ test data: 0.958
* Your score: 16


4800 fits failed out of a total of 12000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2400 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/env_py38/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/env_py38/lib/python3.8/site-packages/sklearn/base.py", line 1145, in wrapper
    estimator._validate_params()
  File "/opt/anaconda3/envs/env_py38/lib/python3.8/site-packages/sklearn/base.py", line 638, in _validate_params
    validate_parameter_constraints(
  File "/opt/anaconda3/envs/env_py38/lib/python3.8/site-packages/sklearn/utils/_param_validation.py", line 9

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

if __name__ == "__main__":
    wdbc = datasets.load_breast_cancer()

    # Define a model
    model = DecisionTreeClassifier()

    # Define parameter grid with corrected parameters
    param_grid = {
        "criterion": ["gini", "entropy"],
        "max_depth": [3, 5, 7, 9, 11],
        "min_samples_split": [2, 3, 4, 5, 6, 7],
        "min_samples_leaf": [1, 2, 3, 4],
        "max_features": [None, "sqrt", "log2"],  # Ensure valid values
        "class_weight": [None, "balanced"],
    }

    # Set up GridSearchCV
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, return_train_score=True, error_score='raise')

    # Train the model with grid search
    grid_search.fit(wdbc.data, wdbc.target)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Evaluate the best model
    acc_train = grid_search.cv_results_['mean_train_score'][grid_search.best_index_]
    acc_test = grid_search.best_score_
    print(f"* Best parameters: {grid_search.best_params_}")
    print(f"* Accuracy @ training data: {acc_train:.3f}")
    print(f"* Accuracy @ test data: {acc_test:.3f}")
    print(f"* Your score: {max(10 + 100 * (acc_test - 0.9), 0):.0f}")

* Best parameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 6}
* Accuracy @ training data: 0.972
* Accuracy @ test data: 0.953
* Your score: 15


In [8]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

if __name__ == "__main__":
    wdbc = datasets.load_breast_cancer()

    # Define models
    models = {
        'DecisionTree': DecisionTreeClassifier(),
        'RandomForest': RandomForestClassifier(),
        'SVC': SVC()
    }

    # Define parameter grids
    param_grids = {
        'DecisionTree': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [3, 5, 7, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': [None, 'sqrt', 'log2'],
            'class_weight': [None, 'balanced']
        },
        'RandomForest': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7, None],
            'max_features': ['sqrt', 'log2'],
            'class_weight': [None, 'balanced']
        },
        'SVC': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']
        }
    }

    # Perform Grid Search for each model
    for model_name, model in models.items():
        print(f"Training {model_name}...")
        grid_search = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=5, return_train_score=True)
        grid_search.fit(wdbc.data, wdbc.target)

        # Get the best model
        best_model = grid_search.best_estimator_

        # Evaluate the best model
        acc_train = grid_search.cv_results_['mean_train_score'][grid_search.best_index_]
        acc_test = grid_search.best_score_
        print(f"* {model_name} Best parameters: {grid_search.best_params_}")
        print(f"* {model_name} Accuracy @ training data: {acc_train:.3f}")
        print(f"* {model_name} Accuracy @ test data: {acc_test:.3f}")
        print(f"* {model_name} Your score: {max(10 + 100 * (acc_test - 0.9), 0):.0f}")

Training DecisionTree...
* DecisionTree Best parameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2}
* DecisionTree Accuracy @ training data: 0.978
* DecisionTree Accuracy @ test data: 0.949
* DecisionTree Your score: 15
Training RandomForest...
* RandomForest Best parameters: {'class_weight': 'balanced', 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 50}
* RandomForest Accuracy @ training data: 1.000
* RandomForest Accuracy @ test data: 0.968
* RandomForest Your score: 17
Training SVC...
* SVC Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
* SVC Accuracy @ training data: 0.969
* SVC Accuracy @ test data: 0.953
* SVC Your score: 15


In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn import datasets
import numpy as np

if __name__ == "__main__":
    wdbc = datasets.load_breast_cancer()

    # Define models
    models = {
        'DecisionTree': DecisionTreeClassifier(),
        'RandomForest': RandomForestClassifier(),
        'SVC': SVC(),
        'KNeighbors': KNeighborsClassifier(),
        'GradientBoosting': GradientBoostingClassifier(),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    }

    # Define parameter grids
    param_grids = {
        'DecisionTree': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [3, 5, 7, None],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'max_features': [None, 'sqrt', 'log2'],
            'class_weight': [None, 'balanced']
        },
        'RandomForest': {
            'n_estimators': [50, 100, 200],
            'max_depth': [3, 5, 7, None],
            'max_features': ['sqrt', 'log2'],
            'class_weight': [None, 'balanced']
        },
        'SVC': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf'],
            'gamma': ['scale', 'auto']
        },
        'KNeighbors': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan']
        },
        'GradientBoosting': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7]
        },
        'XGBoost': {
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7]
        }
    }

    # Perform Grid Search for each model
    for model_name, model in models.items():
        print(f"Training {model_name}...")
        grid_search = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=5, return_train_score=True)
        grid_search.fit(wdbc.data, wdbc.target)

        # Get the best model
        best_model = grid_search.best_estimator_

        # Evaluate the best model
        acc_train = grid_search.cv_results_['mean_train_score'][grid_search.best_index_]
        acc_test = grid_search.best_score_
        print(f"* {model_name} Best parameters: {grid_search.best_params_}")
        print(f"* {model_name} Accuracy @ training data: {acc_train:.3f}")
        print(f"* {model_name} Accuracy @ test data: {acc_test:.3f}")
        print(f"* {model_name} Your score: {max(10 + 100 * (acc_test - 0.9), 0):.0f}")

Training DecisionTree...
* DecisionTree Best parameters: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 5, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5}
* DecisionTree Accuracy @ training data: 0.977
* DecisionTree Accuracy @ test data: 0.949
* DecisionTree Your score: 15
Training RandomForest...
* RandomForest Best parameters: {'class_weight': None, 'max_depth': 7, 'max_features': 'log2', 'n_estimators': 100}
* RandomForest Accuracy @ training data: 0.999
* RandomForest Accuracy @ test data: 0.967
* RandomForest Your score: 17
Training SVC...
* SVC Best parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
* SVC Accuracy @ training data: 0.969
* SVC Accuracy @ test data: 0.953
* SVC Your score: 15
Training KNeighbors...
* KNeighbors Best parameters: {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'uniform'}
* KNeighbors Accuracy @ training data: 0.950
* KNeighbors Accuracy @ test data: 0.931
* KNeighbors Your score: 13
Training Gradi

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

* XGBoost Best parameters: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}
* XGBoost Accuracy @ training data: 1.000
* XGBoost Accuracy @ test data: 0.974
* XGBoost Your score: 17


In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn import datasets
import numpy as np

if __name__ == "__main__":
    wdbc = datasets.load_breast_cancer()

    # Define models
    models = {
        'DecisionTree': DecisionTreeClassifier(),
        'RandomForest': RandomForestClassifier(),
        'SVC': SVC(),
        'KNeighbors': KNeighborsClassifier(),
        'GradientBoosting': GradientBoostingClassifier(),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    }

    # Define parameter grids with more detailed values
    param_grids = {
        'DecisionTree': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [3, 5, 7, 9, 11, None],
            'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
            'min_samples_leaf': [1, 2, 3, 4, 5],
            'max_features': [None, 'sqrt', 'log2'],
            'class_weight': [None, 'balanced']
        },
        'RandomForest': {
            'n_estimators': [50, 100, 150, 200, 250],
            'max_depth': [3, 5, 7, 9, 11, None],
            'max_features': ['sqrt', 'log2'],
            'class_weight': [None, 'balanced']
        },
        'SVC': {
            'C': [0.1, 0.5, 1, 5, 10],
            'kernel': ['linear', 'rbf', 'poly'],
            'gamma': ['scale', 'auto', 0.01, 0.1, 1]
        },
        'KNeighbors': {
            'n_neighbors': [3, 4, 5, 6, 7],
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan', 'minkowski']
        },
        'GradientBoosting': {
            'n_estimators': [50, 100, 150, 200],
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'max_depth': [3, 4, 5, 6, 7]
        },
        'XGBoost': {
            'n_estimators': [50, 100, 150, 200],
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'max_depth': [3, 4, 5, 6, 7]
        }
    }

    # Perform Grid Search for each model
    for model_name, model in models.items():
        print(f"Training {model_name}...")
        grid_search = GridSearchCV(estimator=model, param_grid=param_grids[model_name], cv=5, return_train_score=True)
        grid_search.fit(wdbc.data, wdbc.target)

        # Get the best model
        best_model = grid_search.best_estimator_

        # Evaluate the best model
        acc_train = grid_search.cv_results_['mean_train_score'][grid_search.best_index_]
        acc_test = grid_search.best_score_
        print(f"* {model_name} Best parameters: {grid_search.best_params_}")
        print(f"* {model_name} Accuracy @ training data: {acc_train:.3f}")
        print(f"* {model_name} Accuracy @ test data: {acc_test:.3f}")
        print(f"* {model_name} Your score: {max(10 + 100 * (acc_test - 0.9), 0):.0f}")

Training DecisionTree...
* DecisionTree Best parameters: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 7, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 5}
* DecisionTree Accuracy @ training data: 0.984
* DecisionTree Accuracy @ test data: 0.956
* DecisionTree Your score: 16
Training RandomForest...
* RandomForest Best parameters: {'class_weight': None, 'max_depth': None, 'max_features': 'log2', 'n_estimators': 200}
* RandomForest Accuracy @ training data: 1.000
* RandomForest Accuracy @ test data: 0.970
* RandomForest Your score: 17
Training SVC...
