In [37]:
import os

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Directory that holds the dataset. The WDBC_DATA_DIR environment variable
# overrides it so the notebook can run on machines other than the author's;
# when it is unset, the original hard-coded location is used unchanged.
path = os.environ.get("WDBC_DATA_DIR", "F:/for learn/Python_MachineLearning/")
# header=None: the file has no header row, so columns get integer labels.
# os.path.join accepts the directory with or without a trailing separator.
df = pd.read_csv(os.path.join(path, "wdbc.data"), header=None)

In [8]:
# Column 1 holds the class label and columns 2 onward hold the features;
# column 0 is not used (presumably a sample identifier — confirm against the
# dataset description).
labels_raw = df.iloc[:, 1].values
X = df.iloc[:, 2:].values

# Map the raw label values to integer class codes.
le = LabelEncoder()
y = le.fit_transform(labels_raw)

# Hold out 20% of the samples as a test set; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0, test_size=0.2
)

In [14]:
# Chain the preprocessing and the classifier into a single estimator:
# standardise the features, project them onto two principal components,
# then fit a logistic regression on the projected data.
pipeline_steps = [
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=2)),
    ("lr", LogisticRegression(random_state=1)),
]
pipe_lr = Pipeline(pipeline_steps)
pipe_lr.fit(X_train, y_train)

# Accuracy on the held-out test set.
test_acc = pipe_lr.score(X_test, y_test)
print("the Test ACC is: %.3f" % test_acc)

the Test ACC is: 0.921


In [22]:
# 10-fold stratified splitter. shuffle is left at its default (False), so the
# folds are already deterministic and random_state would have no effect.
# scikit-learn >= 0.24 raises ValueError when random_state is set without
# shuffle=True, so the argument is omitted; the resulting folds are unchanged.
kfold = StratifiedKFold(n_splits=10)

In [35]:
# Manual stratified k-fold cross-validation over the training set.
# Each fold trains a fresh, unfitted copy of the pipeline so that `pipe_lr`
# itself is not re-fitted on a fold subset as a side effect of this cell.
scores = []
for k, (train, test) in enumerate(kfold.split(X_train, y_train)):
    fold_model = clone(pipe_lr)
    fold_model.fit(X_train[train], y_train[train])
    score = fold_model.score(X_train[test], y_train[test])
    scores.append(score)
    # np.bincount shows how many training samples of each encoded class
    # ended up in this fold's training portion.
    print("Fold: %d, Class dist: %s, ACC: %.3f" % (k+1, np.bincount(y_train[train]), score))

Fold: 1, Class dist: [261 148], ACC: 0.978
Fold: 2, Class dist: [261 148], ACC: 0.978
Fold: 3, Class dist: [261 148], ACC: 0.957
Fold: 4, Class dist: [261 148], ACC: 0.978
Fold: 5, Class dist: [261 148], ACC: 0.891
Fold: 6, Class dist: [261 149], ACC: 0.956
Fold: 7, Class dist: [261 149], ACC: 0.956
Fold: 8, Class dist: [261 149], ACC: 0.956
Fold: 9, Class dist: [261 149], ACC: 1.000
Fold: 10, Class dist: [261 149], ACC: 0.978


In [36]:
# Summarise the per-fold accuracies as mean +/- standard deviation.
cv_mean = np.mean(scores)
cv_std = np.std(scores)
print("CV ACC: %.3f +/- %.3f" % (cv_mean, cv_std))

CV ACC: 0.963 +/- 0.028


In [39]:
# Run 10-fold cross-validation with scikit-learn's built-in helper instead of
# the hand-written loop; n_jobs=-1 asks it to use all available CPU cores.
scores = cross_val_score(pipe_lr, X_train, y_train, cv=10, n_jobs=-1)
print("CV ACC scores: %s" % (scores,))

CV ACC scores: [ 0.97826087  0.97826087  0.95652174  0.97826087  0.89130435  0.95555556
  0.95555556  0.95555556  1.          0.97777778]


In [40]:
# Mean and standard deviation of the cross-validation accuracies in `scores`.
acc_mean, acc_std = np.mean(scores), np.std(scores)
print("CV ACC: %.3f +/- %.3f" % (acc_mean, acc_std))

CV ACC: 0.963 +/- 0.028
