In [12]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier,export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression,SGDRegressor,\
    Ridge,LogisticRegression, Lasso
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error,classification_report,roc_auc_score


## 1、	练习岭回归，lasso回归

In [2]:
# 岭回归
house_price = load_boston()
x_train,x_test,y_train,y_test = train_test_split(
    house_price.data,house_price.target,test_size=0.25,random_state=12)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np

        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_ho

In [3]:
# 数据集进行标准化
stand = StandardScaler()
x_train = stand.fit_transform(x_train)
x_test = stand.transform(x_test)

In [6]:
# 对目标值进行标准化
std = StandardScaler()
y_train = y_train.reshape(-1,1)
y_test = y_test.reshape(-1,1)
y_train = std.fit_transform(y_train)
y_test = std.transform(y_test)

In [94]:
# 利用岭回归进行房价预测
# ridge = Ridge(0.5)  # 0.2671640681649236
# ridge = Ridge(1.0)  # 0.2669920859086866
# ridge = Ridge(1.5)  # 0.2668356134940821
# ridge = Ridge(1.9)  # 0.2667209196027645
# ridge = Ridge(4)  # 0.2662532860821976
# ridge = Ridge(5)  # 0.26609930187659925
# ridge = Ridge(6)  # 0.2659828805934769
ridge = Ridge(10) # 0.2658197961390732
ridge.fit(x_train,y_train)


In [95]:
# 模型评估
y_predict = ridge.predict(x_test)
mse = mean_squared_error(y_test,y_predict)
print(mse)

0.2658197961390732


In [136]:
# Lasso回归
# lasso = Lasso(alpha=0.01) # 0.2651680331001297
lasso = Lasso(alpha=0.006) # 0.2646419022195875
lasso.fit(x_train,y_train)

In [137]:
# 模型评估
y_predict = lasso.predict(x_test)
mse = mean_squared_error(y_test,y_predict)
print(mse)

0.2646419022195875


## 2、	练习逻辑回归案例，调整参数，得到最佳精确率及召回率

In [56]:
data = pd.read_csv("breast-cancer-wisconsin.data")

In [57]:
data.columns = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell                   Size', 'Uniformity of Cell Shape','Marginal Adhesion',
                'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli','Mitoses', 'Class']

In [58]:
# 处理缺失值
data = data.replace(to_replace='?', value=np.nan)
data = data.dropna()
print(data.isnull().count())

Sample code number                           682
Clump Thickness                              682
Uniformity of Cell                   Size    682
Uniformity of Cell Shape                     682
Marginal Adhesion                            682
Single Epithelial Cell Size                  682
Bare Nuclei                                  682
Bland Chromatin                              682
Normal Nucleoli                              682
Mitoses                                      682
Class                                        682
dtype: int64


In [59]:
# 划分数据集
x_train,x_test,y_train,y_test = train_test_split(
    data[data.columns[1:10]],data[data.columns[10]],
    test_size=0.25,random_state=1)

In [60]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(511, 9)
(511,)
(171, 9)
(171,)


In [61]:
# 标准化
stand = StandardScaler()
x_train = stand.fit_transform(x_train)
x_test = stand.transform(x_test)

In [65]:
# 利用逻辑回归进行预测
lg = LogisticRegression(C=0.8,solver='liblinear')
lg.fit(x_train,y_train)

In [66]:
# 模型评估
score = lg.score(x_test,y_test)
print(score)

0.9532163742690059


In [67]:
# 利用模型进行预测
y_predict = lg.predict(x_test)
print(y_predict)

[2 4 4 4 2 4 2 2 4 4 2 2 4 4 2 2 2 2 2 4 2 4 2 4 4 2 2 4 4 4 4 2 2 2 2 4 2
 2 4 2 4 2 2 2 4 2 2 2 2 2 2 2 4 2 4 2 2 2 2 4 2 2 4 2 2 2 2 2 4 2 4 4 2 2
 4 2 2 2 4 2 4 2 2 2 2 2 4 2 2 2 2 2 4 2 4 2 4 4 2 2 4 2 2 2 2 2 2 2 2 2 4
 4 4 2 2 2 4 2 2 4 4 2 2 2 4 4 2 2 4 2 2 2 2 2 2 2 2 2 2 4 4 4 2 2 4 2 4 2
 4 2 2 2 2 2 2 4 2 2 4 2 4 2 4 2 2 2 2 4 2 4 2]


In [69]:
recall = classification_report(y_test,y_predict,target_names=["良性","恶性"])
print(recall)

              precision    recall  f1-score   support

          良性       0.96      0.97      0.96       112
          恶性       0.95      0.92      0.93        59

    accuracy                           0.95       171
   macro avg       0.95      0.94      0.95       171
weighted avg       0.95      0.95      0.95       171

