In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

In [2]:
# Load the complete dataset from disk.
df_raw = pd.read_csv("breast-cancer-wisconsin.csv")

# Shuffle once with a fixed seed so the split below is reproducible.
df_shuffled = df_raw.sample(frac=1, random_state=0)

# Hold out the last 20% of the shuffled rows for final validation.
# Positional slicing is used instead of np.split: np.split on a DataFrame
# goes through the deprecated DataFrame.swapaxes and emits a FutureWarning,
# while .iloc selects exactly the same rows.
n_train = int(.8 * len(df_shuffled))
df = df_shuffled.iloc[:n_train]
df_validate = df_shuffled.iloc[n_train:]

  return bound(*args, **kwds)


### Feature Engineering

In [3]:
def create_features(df, diagnosis_classes=("B", "M")):
    """Split a data frame into a feature matrix and an integer-encoded target.

    The ``diagnosis`` labels are encoded through a fixed lookup table built
    from ``diagnosis_classes`` instead of an encoder fitted on ``df`` itself.
    An encoder fitted per call derives its codes from whichever labels happen
    to be present, so a subset holding only one class (for example a small
    fold) would be encoded differently from the frame the model was trained
    on. A fixed table gives every subset the same codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named in ``features`` below plus
        ``"diagnosis"``. The frame is not modified.
    diagnosis_classes : sequence, default ("B", "M")
        Allowed diagnosis labels; a label's position is its integer code.
        The default matches the sorted order a label encoder would assign
        when both labels are present.
        NOTE(review): assumes the CSV stores the diagnosis as "B"/"M" --
        confirm against the data file.

    Returns
    -------
    X : pandas.DataFrame
        The feature columns, in the order listed in ``features``.
    y : pandas.Series
        int64 codes for ``diagnosis``, sharing ``df``'s index.

    Raises
    ------
    ValueError
        If ``diagnosis`` holds a value (including a missing value) that is
        not listed in ``diagnosis_classes``.
    """
    features = [
        "radius_mean","texture_mean","perimeter_mean","area_mean",
        "smoothness_mean","compactness_mean","concavity_mean",
        "concave points_mean","symmetry_mean","fractal_dimension_mean",
        "radius_se","texture_se","perimeter_se","area_se","smoothness_se",
        "compactness_se","concavity_se","concave points_se","symmetry_se",
        "fractal_dimension_se","radius_worst","texture_worst",
        "perimeter_worst","area_worst","smoothness_worst","compactness_worst",
        "concavity_worst","concave points_worst","symmetry_worst",
        "fractal_dimension_worst",
    ]

    # label -> integer code, independent of which labels occur in this df
    code_of = {label: code for code, label in enumerate(diagnosis_classes)}

    labels = df["diagnosis"]
    known = labels.isin(list(code_of))
    if not known.all():
        # Fail loudly rather than silently producing NaN / shifted codes.
        unexpected = sorted(repr(value) for value in labels[~known].unique())
        raise ValueError(
            f"unexpected diagnosis value(s): {', '.join(unexpected)}; "
            f"expected one of {list(code_of)}"
        )

    X = df[features]
    y = labels.map(code_of).astype("int64")

    return X, y

### Create Model

In [4]:
def get_model(n_estimators=50, max_depth=3, random_state=0):
    """Build a new, unfitted random-forest classifier.

    The defaults reproduce the configuration used throughout this notebook,
    so ``get_model()`` behaves as before; the keyword arguments allow trying
    other settings without editing this function.

    Parameters
    ----------
    n_estimators : int, default 50
        Passed to ``RandomForestClassifier(n_estimators=...)``.
    max_depth : int or None, default 3
        Passed to ``RandomForestClassifier(max_depth=...)``.
    random_state : int or None, default 0
        Passed to ``RandomForestClassifier(random_state=...)``; the fixed
        default keeps repeated runs reproducible.

    Returns
    -------
    RandomForestClassifier
        A fresh estimator instance on every call.
    """
    return RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        verbose=0,
    )

In [5]:
# 10-fold cross-validation on the training portion of the data.
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# Build the features and encode the labels ONCE for the whole frame, then
# slice per fold. Calling create_features separately on each train and test
# subset repeats loop-invariant work and encodes the labels independently
# for each subset, so the two sides of a fold are not guaranteed to share
# the same label codes.
X_cv, y_cv = create_features(df=df)

scores = []
for train_index, test_index in kf.split(df):
    X_train, y_train = X_cv.iloc[train_index], y_cv.iloc[train_index]
    X_test, y_test = X_cv.iloc[test_index], y_cv.iloc[test_index]

    # A fresh model per fold so nothing learned leaks between folds.
    model = get_model()
    model.fit(X_train, y_train)

    score = model.score(X_test, y_test)
    scores.append(score)

# Mean accuracy first, then the individual fold accuracies.
print(np.average(scores))
print(scores)

0.945072463768116
[0.9565217391304348, 0.9565217391304348, 0.9565217391304348, 1.0, 0.8478260869565217, 0.9777777777777777, 0.8888888888888888, 1.0, 0.9333333333333333, 0.9333333333333333]


### Validation

In [6]:
# Final check: train on the whole training portion, then score the model on
# the hold-out rows that were never used during cross-validation.
X, y = create_features(df=df)
X_val, y_val = create_features(df=df_validate)

# fit() returns the estimator itself, so construction and fitting can chain.
model = get_model().fit(X, y)

# Last expression of the cell: accuracy on the hold-out set.
model.score(X_val, y_val)

0.9736842105263158