In [1]:
import pandas as pd

# Name of the column holding the class label to predict.
target_name = "Class"

# Read the full table once, then separate the label column from the
# remaining columns, which are used as features.
blood_transfusion = pd.read_csv("../datasets/blood_transfusion.csv")
target = blood_transfusion[target_name]
data = blood_transfusion.drop(columns=[target_name])

In [2]:
# Preview the first five rows of the raw table (features and label).
blood_transfusion.head(n=5)

Unnamed: 0,Recency,Frequency,Monetary,Time,Class
0,2,50,12500,98,donated
1,0,13,3250,28,donated
2,1,16,4000,35,donated
3,2,20,5000,45,donated
4,1,24,6000,77,not donated


In [3]:
# Count the samples per class to see how imbalanced the label is.
blood_transfusion[target_name].value_counts()

not donated    570
donated        178
Name: Class, dtype: int64

In [6]:
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import ShuffleSplit, cross_validate

# Baseline: a classifier that always predicts the most frequent class.
# `Pipeline.fit` returns the fitted pipeline, so it can be chained directly.
model = make_pipeline(
    StandardScaler(),
    DummyClassifier(strategy="most_frequent"),
).fit(data, target)

# NOTE: despite its name, `predictions` holds the value returned by
# `model.score` on the data the model was fitted on, not predicted labels.
predictions = model.score(data, target)
predictions



0.7620320855614974

In [5]:
# Seeded random-permutation splitter so the evaluation is reproducible.
cv = ShuffleSplit(random_state=0)

# The splitter has to be handed to `cross_validate` explicitly; without
# `cv=cv` the object above is never used and scikit-learn silently falls back
# to its default cross-validation strategy.
# `model` is the pipeline currently held by the kernel (in file order, the
# dummy baseline defined in the previous cell).
cv_results = cross_validate(
    estimator=model, X=data, y=target, cv=cv, scoring="balanced_accuracy"
)
cv_results = pd.DataFrame(cv_results)

# Report mean and standard deviation of the balanced accuracy over the splits.
print(f"model accuracy is {cv_results.test_score.mean():.2f}"
      f" +/- {cv_results.test_score.std():.2}")


model accuracy is 0.50 +/- 0.0


In [7]:
from sklearn.neighbors import KNeighborsClassifier

# Scaled k-nearest-neighbours classifier using a single neighbour (k=1).
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=1))

# Cross-validate with cv=10 and balanced accuracy; training scores are
# returned as well so they can be compared with the test scores.
cv_results = pd.DataFrame(
    cross_validate(
        estimator=model,
        X=data,
        y=target,
        cv=10,
        scoring="balanced_accuracy",
        return_train_score=True,
    )
)
cv_results[["train_score", "test_score"]].mean()

train_score    0.882552
test_score     0.483987
dtype: float64

In [8]:
# Same pipeline as before, now with k=2 neighbours.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=2))

# cv=10 cross-validation on balanced accuracy, keeping the training scores.
cv_results = pd.DataFrame(
    cross_validate(
        estimator=model,
        X=data,
        y=target,
        cv=10,
        scoring="balanced_accuracy",
        return_train_score=True,
    )
)
cv_results[["train_score", "test_score"]].mean()

train_score    0.834712
test_score     0.519634
dtype: float64

In [9]:
# Scaled k-nearest-neighbours classifier with k=5 neighbours.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))

# cv=10 cross-validation on balanced accuracy, keeping the training scores.
cv_results = pd.DataFrame(
    cross_validate(
        estimator=model,
        X=data,
        y=target,
        cv=10,
        scoring="balanced_accuracy",
        return_train_score=True,
    )
)
cv_results[["train_score", "test_score"]].mean()

train_score    0.693856
test_score     0.560707
dtype: float64

In [10]:
# Scaled k-nearest-neighbours classifier with k=10 neighbours.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=10))

# cv=10 cross-validation on balanced accuracy, keeping the training scores.
cv_results = pd.DataFrame(
    cross_validate(
        estimator=model,
        X=data,
        y=target,
        cv=10,
        scoring="balanced_accuracy",
        return_train_score=True,
    )
)
cv_results[["train_score", "test_score"]].mean()

train_score    0.684816
test_score     0.550017
dtype: float64

In [11]:
# Scaled k-nearest-neighbours classifier with k=20 neighbours.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=20))

# cv=10 cross-validation on balanced accuracy, keeping the training scores.
cv_results = pd.DataFrame(
    cross_validate(
        estimator=model,
        X=data,
        y=target,
        cv=10,
        scoring="balanced_accuracy",
        return_train_score=True,
    )
)
cv_results[["train_score", "test_score"]].mean()

train_score    0.667389
test_score     0.589800
dtype: float64

In [12]:
# Scaled k-nearest-neighbours classifier with k=50 neighbours.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=50))

# cv=10 cross-validation on balanced accuracy, keeping the training scores.
cv_results = pd.DataFrame(
    cross_validate(
        estimator=model,
        X=data,
        y=target,
        cv=10,
        scoring="balanced_accuracy",
        return_train_score=True,
    )
)
cv_results[["train_score", "test_score"]].mean()

train_score    0.621075
test_score     0.572386
dtype: float64

In [13]:
# Scaled k-nearest-neighbours classifier with k=100 neighbours.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=100))

# cv=10 cross-validation on balanced accuracy, keeping the training scores.
cv_results = pd.DataFrame(
    cross_validate(
        estimator=model,
        X=data,
        y=target,
        cv=10,
        scoring="balanced_accuracy",
        return_train_score=True,
    )
)
cv_results[["train_score", "test_score"]].mean()

train_score    0.546007
test_score     0.541830
dtype: float64

In [14]:
# Scaled k-nearest-neighbours classifier with k=200 neighbours.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=200))

# cv=10 cross-validation on balanced accuracy, keeping the training scores.
cv_results = pd.DataFrame(
    cross_validate(
        estimator=model,
        X=data,
        y=target,
        cv=10,
        scoring="balanced_accuracy",
        return_train_score=True,
    )
)
cv_results[["train_score", "test_score"]].mean()

train_score    0.5
test_score     0.5
dtype: float64

In [15]:
# Scaled k-nearest-neighbours classifier with k=500 neighbours.
model = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=500))

# cv=10 cross-validation on balanced accuracy, keeping the training scores.
cv_results = pd.DataFrame(
    cross_validate(
        estimator=model,
        X=data,
        y=target,
        cv=10,
        scoring="balanced_accuracy",
        return_train_score=True,
    )
)
cv_results[["train_score", "test_score"]].mean()

train_score    0.5
test_score     0.5
dtype: float64

In [None]:
%%time
from sklearn.model_selection import validation_curve
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier


# Pipeline whose `n_neighbors` hyper-parameter is swept below.
model = make_pipeline(StandardScaler(), KNeighborsClassifier())

# Values of k to evaluate; the same ones explored one cell at a time above.
param_range = [1, 2, 5, 10, 20, 50, 100, 200, 500]

# A parameter of a pipeline step must be addressed as "<step>__<param>".
# `make_pipeline` names each step after its lowercased class name, hence
# "kneighborsclassifier__n_neighbors"; a bare "n_neighbors" is not a parameter
# of the Pipeline object itself and is rejected by `set_params`.
# The full `param_range` is passed (not a hard-coded `[1, 2]`) and `cv=10`
# matches the cross-validation used in the manual cells above.
# Both returned arrays have shape (len(param_range), n_cv_folds).
train_scores, test_scores = validation_curve(
    estimator=model,
    X=data,
    y=target,
    param_name="kneighborsclassifier__n_neighbors",
    param_range=param_range,
    cv=10,
    scoring="balanced_accuracy",
    n_jobs=2,
)

In [None]:
# pyplot is not imported anywhere else in the notebook, so bring it in here.
import matplotlib.pyplot as plt

# Plot the mean score across folds (axis=1) for each value of n_neighbors.
# `param_range`, `train_scores` and `test_scores` come from the
# validation-curve cell above.
plt.plot(param_range, train_scores.mean(axis=1), label="Training score")
plt.plot(param_range, test_scores.mean(axis=1), label="Testing score")
# The tested values span 1 to 500, so a log axis keeps the small k readable.
plt.xscale("log")
plt.legend()

plt.xlabel("Number of neighbors (n_neighbors)")
plt.ylabel("Balanced accuracy")
_ = plt.title("Validation curve for k-nearest neighbors")