In [6]:
import pandas as pd
import numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

from sklearn.model_selection import train_test_split

In [7]:
# Load the raw breast-cancer-wisconsin table; the path is relative to the
# directory the notebook kernel is running in.
df = pd.read_csv(filepath_or_buffer="breast-cancer-wisconsin.csv")


### Feature / Target Extraction

In [8]:
def create_features(df):
    """Build the model inputs from a raw data frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the 30 measurement columns listed below plus a
        ``"diagnosis"`` column. The caller's frame is not modified; all work
        happens on a copy.

    Returns
    -------
    (X, y) : tuple
        ``X`` is the frame restricted to the measurement columns, in the
        order listed below. ``y`` is the ``"diagnosis"`` column after label
        encoding.

    Notes
    -----
    A new ``LabelEncoder`` is fitted on every call, so the integer assigned
    to each diagnosis value depends on which values occur in ``df``.
    """
    target_column = "diagnosis"

    # Measurement columns, grouped by statistic: *_mean, *_se, *_worst.
    # Note that the "concave points_*" names really do contain a space.
    feature_columns = [
        # mean
        "radius_mean",
        "texture_mean",
        "perimeter_mean",
        "area_mean",
        "smoothness_mean",
        "compactness_mean",
        "concavity_mean",
        "concave points_mean",
        "symmetry_mean",
        "fractal_dimension_mean",
        # standard error
        "radius_se",
        "texture_se",
        "perimeter_se",
        "area_se",
        "smoothness_se",
        "compactness_se",
        "concavity_se",
        "concave points_se",
        "symmetry_se",
        "fractal_dimension_se",
        # worst
        "radius_worst",
        "texture_worst",
        "perimeter_worst",
        "area_worst",
        "smoothness_worst",
        "compactness_worst",
        "concavity_worst",
        "concave points_worst",
        "symmetry_worst",
        "fractal_dimension_worst",
    ]

    # Work on a copy so the encoded target never leaks back to the caller.
    encoded = df.copy()
    encoded[target_column] = LabelEncoder().fit_transform(encoded[target_column])

    return encoded[feature_columns], encoded[target_column]

### Create Model (10-Fold Cross-Validation)

In [18]:
# Encode the target and select the feature columns ONCE, on the full frame.
# Calling create_features() separately on each train/test subset would refit
# the label encoder per subset, so the class -> integer mapping would depend
# on which classes happen to be present in that subset (a single-class fold
# would always be encoded as 0). Encoding up front guarantees one consistent
# mapping for every fold and avoids re-copying the frame 20 times.
X, y = create_features(df=df)

# 10-fold CV; shuffling with a fixed seed keeps the folds reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=0)
scores = []

for train_index, test_index in kf.split(X):
    # Positional slicing: KFold yields integer positions, not index labels.
    X_train, y_train = X.iloc[train_index], y.iloc[train_index]
    X_test, y_test = X.iloc[test_index], y.iloc[test_index]

    # A fresh, identically-seeded model per fold so no state carries over
    # from one fold to the next.
    model = GradientBoostingClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=3,
        random_state=0,
        verbose=0
    )

    model.fit(X_train, y_train)

    # Score the fitted model on the held-out fold and keep it for averaging.
    score = model.score(X_test, y_test)
    scores.append(score)

### Evaluate

In [19]:
# Unweighted mean of the per-fold scores collected in `scores`.
np.mean(scores)

0.9736842105263157