In [33]:
# Plotly's offline API is used for every figure in this notebook.
import plotly.offline as py
py.init_notebook_mode(connected=True) # offline notebook mode
# `go` provides the trace/layout classes; `tools` is used later to build the subplot grid.
import plotly.graph_objs as go
from plotly import tools

In [34]:
from sklearn import datasets
from sklearn.model_selection import cross_val_predict
from sklearn import linear_model

# Cross Validated Predictions in Scikit-learn

## 计算

In [35]:
# Linear regression evaluated on the Boston housing data.
lr = linear_model.LinearRegression()
# NOTE(review): load_boston is no longer shipped by recent scikit-learn
# releases -- confirm the pinned version before re-running this cell.
boston = datasets.load_boston()
y = boston.target
# cv=10 requests 10-fold cross-validation; `predicted` holds one value per sample.
predicted = cross_val_predict(lr, boston.data, y, cv=10)

In [36]:
# Sanity check: the prediction vector should have one entry per sample.
predicted.shape

(506,)

## 绘制交叉验证预测

In [37]:
# Measured target (x) against its cross-validated prediction (y), one marker per sample.
trace1 = go.Scatter(
    x=y,
    y=predicted,
    mode='markers',
    marker={
        'size': 8,
        'color': 'rgb(0, 0, 255)',
        'line': {'width': 2, 'color': 'rgb(0, 0, 0)'},
    },
)

# Dashed diagonal across the measured range: points on it are predicted exactly.
trace2 = go.Scatter(
    x=[y.min(), y.max()],
    y=[y.min(), y.max()],
    line={'color': 'rgb(0, 0, 0)', 'width': 5, 'dash': 'dash'},
)

data = [trace1, trace2]

layout = go.Layout(
    showlegend=False,
    xaxis={'title': 'Measured'},
    yaxis={'title': 'Predicted', 'range': [-10, 60], 'zeroline': False},
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename="c-v-predict")

# Isotonic Regression in Scikit-learn

In [38]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.isotonic import IsotonicRegression
from sklearn.utils import check_random_state

## 计算

In [39]:
# Synthetic data: a logarithmic trend plus integer noise from a seeded RandomState.
n = 100
x = np.arange(n)
rs = check_random_state(0)
y = rs.randint(-50, 50, size=(n,)) + 50. * np.log(1 + x)

# Isotonic fit of y against x.
ir = IsotonicRegression()
y_ = ir.fit_transform(x, y)

# Linear fit for comparison; LinearRegression needs a 2-D feature matrix.
lr = LinearRegression()
lr.fit(x.reshape(-1, 1), y)

# One vertical segment per sample, joining the observation to its isotonic fit.
segments = [
    [[i, observed], [i, fitted]]
    for i, (observed, fitted) in enumerate(zip(y, y_))
]

## Plotting Isotonic Regression

In [40]:
# Raw observations.
trace1 = go.Scatter(x=x, y=y, mode='markers',
                    marker=dict(size=10, color='red'),
                    name="Data")
# Isotonic fit.
trace2 = go.Scatter(x=x, y=y_, mode='lines+markers',
                    marker=dict(size=10, color='green'),
                    name="Isotonic Fit")
# Linear fit, evaluated on the same x values.
trace3 = go.Scatter(x=x, y=lr.predict(x[:, np.newaxis]),
                    mode='lines',
                    line=dict(color='blue'),
                    name="Linear Fit")

# Draw every residual segment with ONE trace instead of one trace per segment:
# a None entry leaves a gap in the line, so consecutive segments stay
# disconnected while the figure carries 4 traces rather than len(segments) + 3.
segment_x, segment_y = [], []
for (x_start, y_start), (x_end, y_end) in segments:
    segment_x.extend([x_start, x_end, None])
    segment_y.extend([float(y_start), float(y_end), None])

trace4_data = go.Scatter(x=segment_x, y=segment_y,
                         mode='lines',
                         showlegend=False,
                         line=dict(color='black', width=0.5))

# Segments go first so the data and fit traces are drawn on top of them.
data = [trace4_data, trace1, trace2, trace3]

layout = go.Layout(title="Isotonic regression")

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename="Isotonic_Regression")

# Pipelining in Scikit-learn

In [41]:
import numpy as np
from sklearn import linear_model, decomposition, datasets
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

## 计算

In [42]:
# Classifier used as the final step of the pipeline.
logistic = linear_model.LogisticRegression()

# Chain PCA and the classifier. The step names ('pca', 'logistic') are the
# prefixes used later to address each step's parameters in the grid search.
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

# Digits dataset: feature matrix and target labels.
digits = datasets.load_digits()
X_digits = digits.data
y_digits = digits.target

## PCA谱图

In [43]:
# Fit the stand-alone PCA on the digits features to obtain its variance spectrum.
pca.fit(X_digits)

# Explained variance per component, plotted against the component index.
trace1 = go.Scatter(
    y=pca.explained_variance_,
    mode="lines",
    line={'width': 2, 'color': 'blue'},
    name="PCA Spectrum",
)
layout1 = go.Layout(
    xaxis={'title': "n_components"},
    yaxis={'title': "explained_variance_"},
)

fig1 = go.Figure(data=[trace1], layout=layout1)

py.iplot(fig1, filename="PCA-Spectrum")

## 预测图

In [44]:
# Candidate values searched for each pipeline step.
n_components = [20, 40, 64]
Cs = np.logspace(-4, 4, 3)

# Parameters of pipeline steps are addressed as '<step name>__<parameter>'.
estimator = GridSearchCV(pipe,
                         dict(pca__n_components=n_components,
                              logistic__C=Cs))

estimator.fit(X_digits, y_digits)
# Number of PCA components used by the best pipeline found by the search.
x_ = estimator.best_estimator_.named_steps['pca'].n_components

# Vertical dotted line marking the selected number of components.
trace2 = go.Scatter(x=[x_, x_], y=[0, 1],
                    mode="lines",
                    line=dict(width=2, dash='dot'),
                    name="n_components chosen")

layout2 = go.Layout(showlegend=True)

fig2 = go.Figure(data=[trace2], layout=layout2)

py.iplot(fig2, filename="Prediction")

## 结合两张图

In [45]:
# Height of the marker line: the top of the PCA spectrum drawn by trace1.
# Derived from the fitted PCA instead of a hard-coded constant so the line
# still spans the curve if the dataset or the PCA settings change.
spectrum_top = float(pca.explained_variance_.max())

# Vertical dotted line at the number of components chosen by the grid search.
trace2 = go.Scatter(x=[x_, x_], y=[0, spectrum_top],
                    mode="lines",
                    line=dict(width=1,
                              dash='dot',
                              color="rgb(10 ,10 , 240)"),
                    name="n_components chosen")

layout3 = go.Layout(xaxis=dict(title="n_components"),
                    yaxis=dict(title="explained_variance_"))

# Overlay the spectrum (trace1) and the chosen-component marker (trace2).
fig3 = go.Figure(data=[trace1, trace2], layout=layout3)

py.iplot(fig3, filename="pipeline")

# Dimensionality Reduction in Scikit-learn

In [46]:
from __future__ import print_function, division
import numpy as np

from sklearn.datasets import load_digits
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA, NMF
from sklearn.feature_selection import SelectKBest, chi2

## 计算

In [47]:
print(__doc__)  # print the module docstring

# Two-step pipeline; the 'reduce_dim' step is itself swapped by the grid search.
pipe = Pipeline([
    ('reduce_dim', PCA()),
    ('classify', LinearSVC())
])

N_FEATURES_OPTIONS = [2, 4, 8]
C_OPTIONS = [1, 10, 100, 1000]
param_grid = [
    {
        'reduce_dim': [PCA(iterated_power=7), NMF()],
        'reduce_dim__n_components': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES_OPTIONS,
        'classify__C': C_OPTIONS
    },
]
# One label per reducer, in the order the reducers appear across param_grid.
reducer_labels = ['PCA', 'NMF', 'KBest(chi2)']

grid = GridSearchCV(pipe, cv=3, n_jobs=2, param_grid=param_grid)

digits = load_digits()

grid.fit(digits.data, digits.target)

mean_scores = np.array(grid.cv_results_['mean_test_score'])

# The dicts of param_grid are evaluated one after the other, so the flat score
# vector is [all candidates of dict 0, all candidates of dict 1]. Inside a dict
# the keys are iterated alphabetically: classify__C varies slowest, then
# reduce_dim, then the number of features. Reshaping the whole vector at once
# would therefore mix scores of different reducers and C values; reshape each
# sub-grid on its own and stack the results along the reducer axis instead.
scores_per_grid = []
start = 0
for sub_grid in param_grid:
    n_reducers = len(sub_grid['reduce_dim'])
    stop = start + len(C_OPTIONS) * n_reducers * len(N_FEATURES_OPTIONS)
    scores_per_grid.append(
        mean_scores[start:stop].reshape(len(C_OPTIONS), n_reducers,
                                        len(N_FEATURES_OPTIONS)))
    start = stop

# Shape: (len(C_OPTIONS), len(reducer_labels), len(N_FEATURES_OPTIONS)).
mean_scores = np.concatenate(scores_per_grid, axis=1)

# select score for best C
mean_scores = mean_scores.max(axis=0)
# Left edge of each group of bars: one group per feature count, with one slot
# per reducer plus one empty slot between groups.
bar_offsets = (np.arange(len(N_FEATURES_OPTIONS)) *
               (len(reducer_labels) + 1) + .5)

Automatically created module for IPython interactive environment


## 绘制特征减少技术的比较

In [48]:
# One bar trace per reducer; trace i is shifted by i slots inside every group.
COLORS = ['blue', 'green', 'red']
data = [
    go.Bar(x=bar_offsets + i, y=reducer_scores, name=label,
           marker=dict(color=COLORS[i]))
    for i, (label, reducer_scores) in enumerate(zip(reducer_labels, mean_scores))
]

# Label each group of bars with the number of features it stands for. Without
# explicit ticks the axis shows raw bar offsets, which contradicts its title.
group_centers = bar_offsets + (len(reducer_labels) - 1) / 2.0

layout = go.Layout(
    title="Comparing feature reduction techniques",
    xaxis=dict(
        tickmode='array',
        tickvals=group_centers.tolist(),
        ticktext=[str(n_features) for n_features in N_FEATURES_OPTIONS],
        title="Reduced number of features"),
    yaxis=dict(
        title="Digit classification accuracy",
        range=[0, 1]))

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename="dimensionality-reduction")

# Multilabel classification in Scikit-learn

In [49]:
from sklearn.datasets import make_multilabel_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelBinarizer
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA

## 计算

In [50]:
# Print the module docstring (see the cell output for what it contains here).
print(__doc__)
# 2x2 subplot grid, one titled panel per (dataset variant, projection) pair.
# plot_subfigure appends its traces to this module-level `fig`.
fig = tools.make_subplots(rows=2, cols=2, 
                          subplot_titles=('With unlabeled samples + CCA',
                                          'With unlabeled samples + PCA', 
                                          'Without unlabeled samples + CCA',
                                          'Without unlabeled samples + PCA')
                         )
def plot_hyperplane(clf, min_x, max_x, name, shape, leg):
    """Build the line trace of a fitted linear classifier's decision boundary.

    Parameters
    ----------
    clf : fitted estimator exposing ``coef_`` and ``intercept_``; only the
        first row of ``coef_`` and the first intercept are used.
    min_x, max_x : bounds of the data along the first axis; the line is
        drawn 5 units beyond each bound.
    name : legend label of the trace.
    shape : dash style passed to the line (e.g. 'dash', 'dashdot').
    leg : the trace is shown in the legend only when ``leg == 1``.

    Returns
    -------
    go.Scatter drawn in 'lines' mode.
    """
    weights = clf.coef_[0]
    # Boundary w0*x + w1*y + b = 0 rewritten as y = slope*x - offset.
    slope = -weights[0] / weights[1]
    offset = (clf.intercept_[0]) / weights[1]

    # Extend past the data range so the line is long enough.
    xx = np.linspace(min_x - 5, max_x + 5)
    yy = slope * xx - offset

    return go.Scatter(
        x=xx,
        y=yy,
        name=name,
        mode="lines",
        showlegend=bool(leg == 1),
        line={'color': 'black', 'width': 1.5, 'dash': shape},
    )

def plot_subfigure(X, Y, subplot_row, subplot_col, transform, leg):
    """Project the data to 2-D, fit one linear SVC per label and plot one panel.

    The traces are appended to the module-level ``fig`` at
    (``subplot_row``, ``subplot_col``). The axis styling of all four panels is
    (re)applied on every call.

    Parameters
    ----------
    X : feature matrix; reduced to two components before plotting.
    Y : label indicator matrix; columns 0 and 1 are plotted as
        "Class 1" and "Class 2".
    subplot_row, subplot_col : position of the target panel in ``fig``.
    transform : 'pca' or 'cca', the projection used to reduce ``X``.
    leg : whether this panel's traces are listed in the legend.

    Raises
    ------
    ValueError
        If ``transform`` is neither 'pca' nor 'cca'.
    """
    if transform == "pca":
        X = PCA(n_components=2).fit_transform(X)
    elif transform == "cca":
        X = CCA(n_components=2).fit(X, Y).transform(X)
    else:
        raise ValueError(
            "transform must be 'pca' or 'cca', got %r" % (transform,))

    # Horizontal extent of the projected data; sets the boundary line length.
    min_x = np.min(X[:, 0])
    max_x = np.max(X[:, 0])

    classif = OneVsRestClassifier(SVC(kernel='linear'))
    classif.fit(X, Y)

    # Boolean masks of the samples carrying each of the two labels.
    in_class_1 = Y[:, 0] != 0
    in_class_2 = Y[:, 1] != 0

    # Every sample as a small gray marker.
    trace1 = go.Scatter(x=X[:, 0], y=X[:, 1], mode="markers",
                        showlegend=False,
                        marker=dict(color='gray', size=10,
                                    line=dict(width=2, color="black")))

    # Larger ringed markers for the members of each class.
    trace2 = go.Scatter(x=X[in_class_1, 0], y=X[in_class_1, 1],
                        name="Class 1", showlegend=leg,
                        mode='markers',
                        marker=dict(size=14, color='white',
                                    line=dict(width=3, color='blue')))

    trace3 = go.Scatter(x=X[in_class_2, 0], y=X[in_class_2, 1],
                        name='Class 2', showlegend=leg,
                        mode='markers',
                        marker=dict(size=14, color='white',
                                    line=dict(width=3, color='orange')))

    # Class rings are appended before the gray markers, so the gray markers
    # are the later traces in the panel.
    fig.append_trace(trace2, subplot_row, subplot_col)
    fig.append_trace(trace3, subplot_row, subplot_col)
    fig.append_trace(trace1, subplot_row, subplot_col)

    trace4 = plot_hyperplane(classif.estimators_[0], min_x, max_x,
                             'Boundary<br>for class 1', 'dash', leg)
    trace5 = plot_hyperplane(classif.estimators_[1], min_x, max_x,
                             'Boundary<br>for class 2', 'dashdot', leg)

    fig.append_trace(trace4, subplot_row, subplot_col)
    fig.append_trace(trace5, subplot_row, subplot_col)

    # Per-axis settings of the four panels; zero line and grid are hidden on all.
    axis_settings = (
        ('xaxis1', dict(range=[-3, 3])),
        ('yaxis1', dict(range=[-5, 5])),
        ('xaxis3', dict(range=[-4, 4])),
        ('yaxis3', dict(range=[-4, 4])),
        ('xaxis4', dict(range=[-8, 8])),
        ('yaxis4', dict(range=[-10, 10])),
        ('xaxis2', dict(title='First principal component', range=[-3, 8])),
        ('yaxis2', dict(title='Second principal component', range=[-10, 10])),
    )
    for axis_name, settings in axis_settings:
        fig['layout'][axis_name].update(zeroline=False, showgrid=False,
                                        **settings)
    fig['layout'].update(height=900, width=1000)

Automatically created module for IPython interactive environment
This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y2 ]
[ (2,1) x3,y3 ]  [ (2,2) x4,y4 ]



## 绘图结果

In [51]:
# Row 1 uses a dataset that may contain unlabeled samples, row 2 one that may
# not. Within a row, column 1 is the CCA projection and column 2 the PCA one.
# Only the first panel (row 1, CCA) contributes legend entries.
for row, allow_unlabeled in enumerate((True, False), start=1):
    X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
                                          allow_unlabeled=allow_unlabeled,
                                          random_state=1)
    plot_subfigure(X, Y, row, 1, "cca", row == 1)
    plot_subfigure(X, Y, row, 2, "pca", False)

py.iplot(fig, filename="multilabel-classification")

# 欣赏

https://plot.ly/scikit-learn/plot-kernel-ridge-regression/

https://plot.ly/scikit-learn/plot-kernel-approximation/