In [None]:
# 数据处理，制图
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
# 回归包：线性回归；k紧邻回归;决策树回归；随机森林回归；支持向量机；lightGBM
from sklearn.linear_model import  LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import lightgbm as lgb

In [None]:
# 切分数据和评价指标
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import SGDRegressor
# from sklearn.model_selection import learning_curve
# from sklearn.model_selection import ShuffleSplit

In [None]:
# 数据读取
train_data_file = "../data/zhengqi_train.txt"
test_data_file = "../data/zhengqi_test.txt"
train_data = pd.read_csv(train_data_file,sep='\t',encoding='utf-8')
test_data = pd.read_csv(test_data_file,sep='\t',encoding='utf-8')


In [None]:
# 归一化处理,先fit拟合到train的方差和均值，再对测试与训练集进行归一化（保证模型的统一）
from sklearn import preprocessing
feature_colunms = [col for col in train_data.columns if col not in ['target']]
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler = min_max_scaler.fit(train_data[feature_colunms])
train_data_scaler = min_max_scaler.transform(train_data[feature_colunms])
test_data_scaler = min_max_scaler.transform(test_data[feature_colunms])
train_data_scaler = pd.DataFrame(train_data_scaler)
train_data_scaler.columns = feature_colunms
test_data_scaler = pd.DataFrame(test_data_scaler)
test_data_scaler.columns = feature_colunms
train_data_scaler['target'] = train_data['target']


In [None]:
# PCA降维
from sklearn.decomposition import PCA
# 保留16个主成分
pca = PCA(n_components=16)
# 剔除target
new_train_pca_16 = pca.fit_transform(train_data_scaler.iloc[:,0:-1])
new_test_pca_16 = pca.transform(test_data_scaler)
new_train_pca_16 = pd.DataFrame(new_train_pca_16)
new_train_pca_16['target'] = train_data_scaler['target']
new_test_pca_16 = pd.DataFrame(new_test_pca_16)


In [None]:
# 切分数据集
new_train_pca_16 =new_train_pca_16.fillna(0)
train = new_train_pca_16[new_test_pca_16.columns]
target = new_train_pca_16['target']
train_data,test_data,train_target,test_target= train_test_split(train,target,test_size=0.2,random_state=0)

In [None]:
# 欠拟合模拟
clf = SGDRegressor(max_iter=500,tol=1e-2)
clf.fit(train_data,train_target)
score_train = mean_squared_error(train_target,clf.predict(train_data))
score_test = mean_squared_error(test_target,clf.predict(test_data))
print("SGB train MSE:",score_train)
print("SGB test MSE:",score_test)


In [None]:
# 过拟合模拟
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(5)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)
clf = SGDRegressor(max_iter=1000,tol=1e-3)
clf.fit(train_data_poly,train_target)
score_train = mean_squared_error(train_target,clf.predict(train_data_poly))
score_test = mean_squared_error(test_target,clf.predict(test_data_poly))
print("SGB train MSE:",score_train)
print("SGB test MSE:",score_test)

In [None]:
# 正常拟合
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)
clf = SGDRegressor(max_iter=1000,tol=1e-3)
clf.fit(train_data_poly,train_target)
score_train = mean_squared_error(train_target,clf.predict(train_data_poly))
score_test = mean_squared_error(test_target,clf.predict(test_data_poly))
print("SGB train MSE:",score_train)
print("SGB test MSE:",score_test)

In [None]:
# 如何解决过拟合
# 正则化（L2）
poly = PolynomialFeatures(3)
train_data_poly = poly.fit_transform(train_data)
test_data_poly = poly.transform(test_data)
clf = SGDRegressor(max_iter=1000,tol=1e-3,penalty='L2',alpha=0.0001)
# 联合L1和L2范数加权正则化
# clf = SGDRegressor(max_iter=1000,tol=1e-3,penalty='elasticnet',alpha=0.0001)
clf.fit(train_data_poly,train_target)
score_train = mean_squared_error(train_target,clf.predict(train_data_poly))
score_test = mean_squared_error(test_target,clf.predict(test_data_poly))
print("SGB train MSE:",score_train)
print("SGB test MSE:",score_test)

In [None]:
# 模型校验
# 5折交叉验证;划分为5个子集，5次循环，每次（4个训练，一个测试）
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
for k,(train_index,test_index) in enumerate(kf.split(train)):
    train_data,test_data,train_target,test_target = train.values[train_index],train.values[test_index],target[train_index],target[test_index]
    clf = SGDRegressor(max_iter=1000,tol=1e-3)
    clf.fit(train_data,train_target)
    score_train = mean_squared_error(train_target,clf.predict(train_data))
    score_test = mean_squared_error(test_target,clf.predict(test_data))
    print(k,"折","SGB train MSE:",score_train)
    print(k,"折","SGB test MSE:",score_test,'\n')

In [None]:
# 留一法
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
num = 100
for k,(train_index,test_index) in enumerate(loo.split(train)):
    train_data,test_data,train_target,test_target = train.values[train_index],train.values[test_index],target[train_index],target[test_index]
    clf = SGDRegressor(max_iter=1000,tol=1e-3)
    clf.fit(train_data,train_target)
    score_train = mean_squared_error(train_target,clf.predict(train_data))
    score_test = mean_squared_error(test_target,clf.predict(test_data))
    print(k,"个","SGB train MSE:",score_train)
    print(k,"个","SGB test MSE:",score_test,'\n')
    if k>30:
        break


In [35]:
# 留P法交叉验证
from sklearn.model_selection import LeavePOut
lpo = LeavePOut(p=10)
num = 100
for k,(train_index,test_index) in enumerate(loo.split(train)):
    train_data,test_data,train_target,test_target = train.values[train_index],train.values[test_index],target[train_index],target[test_index]
    clf = SGDRegressor(max_iter=1000,tol=1e-3)
    clf.fit(train_data,train_target)
    score_train = mean_squared_error(train_target,clf.predict(train_data))
    score_test = mean_squared_error(test_target,clf.predict(test_data))
    print(k," 10个","SGB train MSE:",score_train)
    print(k," 10个","SGB test MSE:",score_test,'\n')
    if k>30:
        break

0  10个 SGB train MSE: 0.141589610326405
0  10个 SGB test MSE: 0.011950247889628162 

1  10个 SGB train MSE: 0.1415549895815349
1  10个 SGB test MSE: 0.12599965958586606 

2  10个 SGB train MSE: 0.14154402696152402
2  10个 SGB test MSE: 0.037397321105189486 

3  10个 SGB train MSE: 0.14157920068459187
3  10个 SGB test MSE: 0.0037706832607911256 

4  10个 SGB train MSE: 0.14170542600264072
4  10个 SGB test MSE: 0.013065837012642474 

5  10个 SGB train MSE: 0.14149969314180227
5  10个 SGB test MSE: 0.13617504149350784 

6  10个 SGB train MSE: 0.1415515802873285
6  10个 SGB test MSE: 0.025152175814853645 

7  10个 SGB train MSE: 0.14094377491123947
7  10个 SGB test MSE: 0.000782984733135422 

8  10个 SGB train MSE: 0.14159194881331255
8  10个 SGB test MSE: 0.09163086160239475 

9  10个 SGB train MSE: 0.14160465173351186
9  10个 SGB test MSE: 0.05272489073076243 

10  10个 SGB train MSE: 0.14168381993896964
10  10个 SGB test MSE: 0.006617579772373373 

11  10个 SGB train MSE: 0.1415958344395051
11  10个 SGB test 