广义线性模型

In [1]:
from sklearn import linear_model

reg = linear_model.LinearRegression()
reg.fit([[0,0],[1,1],[2,2]], [0,1,2])

LinearRegression()

In [3]:
reg.predict([[1, 1]])

array([1.])

In [4]:
# 这种预处理可以通过 Pipeline 工具进行简化。
# 可以创建一个表示简单多项式回归的单个对象，使用方法如下所示:

In [5]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
import numpy as np
model = Pipeline([('poly', PolynomialFeatures(degree=3)),
                    ('linear', LinearRegression(fit_intercept=False))])
# fit to an order-3 polynomial data
x = np.arange(5)
y = 3 - 2 * x + x ** 2 - x ** 3
model = model.fit(x[:, np.newaxis], y)
model.named_steps['linear'].coef_

array([ 3., -2.,  1., -1.])

In [None]:
# 模型评估与验证

In [11]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import ShuffleSplit
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

iris = datasets.load_iris()
clf = svm.SVC(kernel='linear', C=1)

In [13]:
n_samples = iris.data.shape[0]
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
cross_val_score(clf, iris.data, iris.target, cv=cv)

array([0.97777778, 0.97777778, 1.        , 0.95555556, 1.        ])

# 机器学习】使用 scikit-learn 构建模型的万能模板

In [None]:
## 1、加载数据集

from sklearn.datasets import load_iris
data = load_iris()
x = data.data
y = data.target

In [17]:
# 2、数据集拆分

from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(x,y,test_size=0.1,random_state=0)

In [20]:
#  方法1
# 1. 构建SVM分类模型

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svm_model = SVC()

svm_model.fit(train_x,train_y)

pred1 = svm_model.predict(train_x)
accuracy1 = accuracy_score(train_y,pred1)
print('在训练集上的精确度: %.4f'%accuracy1)

pred2 = svm_model.predict(test_x)
accuracy2 = accuracy_score(test_y,pred2)
print('在测试集上的精确度: %.4f'%accuracy2)

在训练集上的精确度: 0.9630
在测试集上的精确度: 1.0000


In [21]:
# 2. 构建LR分类模型
# LogisticRegression分类器

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score #评分函数用精确度评估

lr_model = LogisticRegression()

lr_model.fit(train_x,train_y)

pred1 = lr_model.predict(train_x)
accuracy1 = accuracy_score(train_y,pred1)
print('在训练集上的精确度: %.4f'%accuracy1)

pred2 = lr_model.predict(test_x)
accuracy2 = accuracy_score(test_y,pred2)
print('在测试集上的精确度: %.4f'%accuracy2)

在训练集上的精确度: 0.9704
在测试集上的精确度: 1.0000


In [None]:
# 3、构建随机森林分类模型



In [25]:
# 万能模板V2.0版
# 加入交叉验证，让算法模型评估更加科学
import warnings
warnings.filterwarnings('ignore')

### svm分类器

from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

svm_model = SVC()
svm_model.fit(train_x,train_y)

scores1 = cross_val_score(svm_model,train_x,train_y,cv=5, scoring='accuracy')
# 输出精确度的平均值和置信度区间
print("训练集上的精确度: %0.2f (+/- %0.2f)" % (scores1.mean(), scores1.std() * 2))

scores2 = cross_val_score(svm_model,test_x,test_y,cv=5, scoring='accuracy')
# 输出精确度的平均值和置信度区间
print("测试集上的平均精确度: %0.2f (+/- %0.2f)" % (scores2.mean(), scores2.std() * 2))


print(scores1)
print(scores2)

训练集上的精确度: 0.96 (+/- 0.07)
测试集上的平均精确度: 0.73 (+/- 0.27)
[0.96296296 0.92592593 1.         1.         0.92592593]
[0.66666667 0.66666667 0.66666667 0.66666667 1.        ]


In [26]:
# LogisticRegression分类器

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

lr_model = LogisticRegression()
lr_model.fit(train_x,train_y)

scores1 = cross_val_score(lr_model,train_x,train_y,cv=5, scoring='accuracy')
# 输出精确度的平均值和置信度区间
print("训练集上的精确度: %0.2f (+/- %0.2f)" % (scores1.mean(), scores1.std() * 2))

scores2 = cross_val_score(lr_model,test_x,test_y,cv=5, scoring='accuracy')
# 输出精确度的平均值和置信度区间
print("测试集上的平均精确度: %0.2f (+/- %0.2f)" % (scores2.mean(), scores2.std() * 2))

print(scores1)
print(scores2)

训练集上的精确度: 0.96 (+/- 0.09)
测试集上的平均精确度: 0.80 (+/- 0.33)
[0.96296296 0.88888889 1.         1.         0.92592593]
[0.66666667 0.66666667 1.         0.66666667 1.        ]


In [27]:
# 万能模板V3.0版

###1、svm分类器
from sklearn.model_selection import cross_val_score,GridSearchCV
from sklearn.svm import SVC

svm_model = SVC()

params = [
        {'kernel': ['linear'], 'C': [1, 10, 100, 100]},
        {'kernel': ['poly'], 'C': [1], 'degree': [2, 3]},
        {'kernel': ['rbf'], 'C': [1, 10, 100, 100], 'gamma':[1, 0.1, 0.01, 0.001]}
        ]

best_model = GridSearchCV(svm_model, param_grid=params,cv = 5,scoring = 'accuracy')
best_model.fit(train_x,train_y)

GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [1, 10, 100, 100], 'kernel': ['linear']},
                         {'C': [1], 'degree': [2, 3], 'kernel': ['poly']},
                         {'C': [1, 10, 100, 100],
                          'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['rbf']}],
             scoring='accuracy')

In [33]:
best_model.best_score_
best_model.best_params_
best_model.best_estimator_
best_model.cv_results_

{'mean_fit_time': array([0.00144401, 0.00103574, 0.0011816 , 0.00133066, 0.00129461,
        0.00096045, 0.00279484, 0.003508  , 0.00418305, 0.00489459,
        0.00334477, 0.00308371, 0.00304551, 0.00367069, 0.00406837,
        0.00264535, 0.00245862, 0.00385575, 0.00361261, 0.00313792,
        0.00257778, 0.00266976]),
 'std_fit_time': array([4.89114297e-04, 6.65667747e-05, 4.13113480e-04, 4.20740595e-04,
        4.48958657e-04, 2.18475817e-04, 7.45882449e-04, 5.34369293e-04,
        1.14495094e-03, 6.53738920e-04, 5.27376616e-04, 9.24100251e-04,
        5.89283521e-04, 5.52355988e-04, 7.00248526e-04, 5.34485463e-04,
        4.94522973e-04, 2.73693663e-04, 7.92121775e-04, 7.06916910e-04,
        4.97384632e-04, 3.73392220e-04]),
 'mean_score_time': array([0.00062022, 0.00075941, 0.00059814, 0.00028248, 0.00066667,
        0.00093184, 0.00143023, 0.00190735, 0.00199533, 0.00281448,
        0.0016551 , 0.00160403, 0.00174713, 0.00232286, 0.00182834,
        0.00156822, 0.00158453, 0.00