In [1]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor, Ridge

In [2]:
x = [[80, 86],
     [82, 80],
     [85, 78],
     [90, 90],
     [86, 82],
     [82, 90],
     [78, 80],
     [92, 94]]
y = [84.2, 80.6, 80.1, 90, 83.2, 87.6, 79.4, 93.4]

In [3]:
estimator = LinearRegression()
estimator.fit(x, y)
estimator.coef_

array([0.3, 0.7])

In [4]:
estimator.predict([[100, 80]])

array([86.])

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston

In [8]:
# 正规方程

def linear_model1():
    """
    线性回归:正规方程
    :return:None
    """
    # 1.获取数据
    data = load_boston()

    # 2.数据集划分
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=22)

    # 3.特征工程-标准化
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.fit_transform(x_test)

    # 4.机器学习-线性回归(正规方程)
    estimator = LinearRegression()
    estimator.fit(x_train, y_train)

    # 5.模型评估
    # 5.1 获取系数等值
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 评价
    # 均方误差
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)
    print('score为\n', estimator.score(x_test, y_test))
linear_model1()

预测值为:
 [28.14790667 31.30481159 20.5173895  31.4803076  19.01576648 18.26058425
 20.57439825 18.45232382 18.46065155 32.93661269 20.3603692  27.24886071
 14.81691426 19.20872297 37.01503458 18.32036009  7.71389628 17.56196944
 30.18543811 23.60655873 18.14917545 33.84385342 28.48976083 16.9967041
 34.76065063 26.22246312 34.83857168 26.62310118 18.64402278 13.21154037
 30.37364532 14.70785748 37.18173708  8.88049446 15.06699441 16.14502168
  7.19990762 19.17049423 39.56848262 28.23663    24.62411509 16.75182833
 37.84465582  5.71770376 21.21547924 24.63882018 18.8561516  19.93416672
 15.19839712 26.29892968  7.4274177  27.14300763 29.18745146 16.27895854
  7.99799673 35.46394958 32.38905222 20.83161049 16.41464618 20.87141783
 22.92150844 23.60828508 19.32245804 38.33751529 23.87463642 18.98494066
 12.63480997  6.12915396 41.44675745 21.08894595 16.27561572 21.48546861
 40.74502107 20.4839158  36.82098808 27.0452329  19.79437176 19.64484428
 24.58763105 21.08454269 30.91968983 19.33266


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [10]:
# 梯度下降

def linear_model2():
    """
    线性回归:梯度下降法
    :return:None
    """
    # 1.获取数据
    data = load_boston()

    # 2.数据集划分
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=22)

    # 3.特征工程-标准化
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.fit_transform(x_test)

    # 4.机器学习-线性回归(特征方程)
    estimator = SGDRegressor(max_iter=1000)
    estimator.fit(x_train, y_train)

    # 5.模型评估
    # 5.1 获取系数等值
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 评价
    # 均方误差
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)
    print('score为\n', estimator.score(x_test, y_test))
linear_model2()

预测值为:
 [28.25331402 31.52470216 20.83796468 31.57002609 19.41091526 18.00524898
 20.77517508 18.51757368 18.75989851 32.77701149 20.67242036 26.69481901
 14.5672424  19.19934272 37.21553461 17.94622926  7.94406877 17.74849614
 30.65792399 23.77033989 17.84214374 33.96463236 28.16006665 16.41375411
 34.86235936 26.11233022 34.55711987 26.91051238 18.20794869 14.56198671
 30.4491424  12.85842881 37.82812402  8.90596015 15.28535322 15.28945488
  6.77548042 18.63188249 39.85113666 28.99612523 24.81185708 16.55360062
 38.46850609  4.85865222 20.62309902 24.3782866  19.2585154  19.97421616
 14.79688325 26.10203025  8.24085645 26.88444138 29.37541408 15.87452373
  7.62717603 35.51805819 31.48126771 22.38238978 16.44256817 21.00087503
 23.03836128 23.41027134 19.72188996 38.28183772 25.41474838 18.50317715
 12.45893441  5.29935202 42.05544162 21.12034583 15.38194715 21.92949426
 41.24316235 21.08744857 37.0235879  26.97912277 21.26988335 19.83144728
 24.87747083 23.3298545  31.5353626  19.2885


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [13]:
def linear_model3():
    """
    线性回归:岭回归
    :return:
    """
    # 1.获取数据
    data = load_boston()

    # 2.数据集划分
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=22)

    # 3.特征工程-标准化
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.fit_transform(x_test)

    # 4.机器学习-线性回归(岭回归)
    estimator = Ridge(alpha=1)
    # estimator = RidgeCV(alphas=(0.1, 1, 10))
    estimator.fit(x_train, y_train)

    # 5.模型评估
    # 5.1 获取系数等值
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 评价
    # 均方误差
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)
    print('score为\n', estimator.score(x_test, y_test))
linear_model3()

预测值为:
 [28.13514381 31.28742806 20.54637256 31.45779505 19.05568933 18.26035004
 20.59277879 18.46395399 18.49310689 32.89149735 20.38916336 27.19539571
 14.82641534 19.22385973 36.98699955 18.29852297  7.78481347 17.58930015
 30.19228148 23.61186682 18.14688039 33.81334203 28.44588593 16.97492092
 34.72357533 26.19400705 34.77212916 26.62689656 18.63066492 13.34246426
 30.35128911 14.59472585 37.18259957  8.93178571 15.10673508 16.1072542
  7.22299512 19.14535184 39.53308652 28.26937936 24.62676357 16.76310494
 37.85719041  5.71249289 21.17777272 24.60640023 18.90197753 19.95020929
 15.1922374  26.27853095  7.55102357 27.10160025 29.17947182 16.275476
  8.02888564 35.42165713 32.28262473 20.9525814  16.43494393 20.88177884
 22.92764493 23.58271167 19.35870763 38.27704421 23.98459232 18.96691367
 12.66552625  6.122414   41.44033214 21.09214394 16.23412117 21.51649375
 40.72274345 20.53192898 36.78646575 27.01972904 19.91315009 19.66906691
 24.59629369 21.2589005  30.93402996 19.3338604


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [14]:
import sklearn
sklearn.__version__

'1.0.2'

In [None]:
# 模型保存
import joblib
def load_dump_demo():
    """
    模型保存和加载
    :return:
    """
    # 1.获取数据
    data = load_boston()

    # 2.数据集划分
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, random_state=22)

    # 3.特征工程-标准化
    transfer = StandardScaler()
    x_train = transfer.fit_transform(x_train)
    x_test = transfer.fit_transform(x_test)

    # 4.机器学习-线性回归(岭回归)
    # # 4.1 模型训练
    # estimator = Ridge(alpha=1)
    # estimator.fit(x_train, y_train)
    #
    # # 4.2 模型保存
    # joblib.dump(estimator, "./data/test.pkl")

    # 4.3 模型加载
    estimator = joblib.load("./data/test.pkl")

    # 5.模型评估
    # 5.1 获取系数等值
    y_predict = estimator.predict(x_test)
    print("预测值为:\n", y_predict)
    print("模型中的系数为:\n", estimator.coef_)
    print("模型中的偏置为:\n", estimator.intercept_)

    # 5.2 评价
    # 均方误差
    error = mean_squared_error(y_test, y_predict)
    print("误差为:\n", error)