# 多元线性回归模型

### 读取数据

In [1]:
# Load the credit-scoring workbook into a DataFrame and preview it.
import pandas as pd

df = pd.read_excel(io='信用评分卡模型.xlsx')
df.head(n=5)  # last expression: the first five rows become the cell output

Unnamed: 0,月收入,年龄,性别,历史授信额度,历史违约次数,信用评分
0,7783,29,0,32274,3,73
1,7836,40,1,6681,4,72
2,6398,25,0,26038,2,74
3,6483,23,1,24584,4,65
4,5167,23,1,6710,3,73


### 提取特征变量和目标变量

In [2]:
# Target is the credit-score column; features are every remaining column.
Y = df.loc[:, '信用评分']
X = df.drop(labels='信用评分', axis=1)

### 模型训练

In [3]:
from sklearn.linear_model import LinearRegression

# Fit a multiple linear regression on all rows; fit() hands back the estimator,
# which is then shown as the cell output.
model = LinearRegression().fit(X, Y)
model

LinearRegression()

### 线性回归方程构造

In [4]:
# Report the fitted slope for each feature (in column order of X) and the intercept.
# print() stringifies each argument; sep='' keeps the label and value adjacent.
print('各系数为:', model.coef_, sep='')
print('常数项系数k0为:', model.intercept_, sep='')

各系数为:[ 5.58658996e-04  1.62842002e-01  2.18430276e-01  6.69996665e-05
 -1.51063940e+00]
常数项系数k0为:67.16686603853304


### 模型评估

In [5]:
import statsmodels.api as sm

# add_constant supplies the column reported as `const` in the summary table,
# i.e. the intercept term of the regression.
X2 = sm.add_constant(X)
est = sm.OLS(endog=Y, exog=X2).fit()
est.summary()  # R-squared, per-coefficient t statistics and p-values, diagnostics

0,1,2,3
Dep. Variable:,信用评分,R-squared:,0.629
Model:,OLS,Adj. R-squared:,0.628
Method:,Least Squares,F-statistic:,337.6
Date:,"Wed, 27 Apr 2022",Prob (F-statistic):,2.32e-211
Time:,03:16:27,Log-Likelihood:,-2969.8
No. Observations:,1000,AIC:,5952.0
Df Residuals:,994,BIC:,5981.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,67.1669,1.121,59.906,0.000,64.967,69.367
月收入,0.0006,8.29e-05,6.735,0.000,0.000,0.001
年龄,0.1628,0.022,7.420,0.000,0.120,0.206
性别,0.2184,0.299,0.730,0.466,-0.369,0.806
历史授信额度,6.7e-05,7.78e-06,8.609,0.000,5.17e-05,8.23e-05
历史违约次数,-1.5106,0.140,-10.811,0.000,-1.785,-1.236

0,1,2,3
Omnibus:,13.18,Durbin-Watson:,1.996
Prob(Omnibus):,0.001,Jarque-Bera (JB):,12.534
Skew:,-0.236,Prob(JB):,0.0019
Kurtosis:,2.721,Cond. No.,427000.0


# GBDT回归模型

### 读取数据

In [6]:
# Re-read the credit-scoring workbook so this section starts from the raw data.
import pandas as pd

df = pd.read_excel(io='信用评分卡模型.xlsx')
df.head(n=5)  # preview of the first five rows

Unnamed: 0,月收入,年龄,性别,历史授信额度,历史违约次数,信用评分
0,7783,29,0,32274,3,73
1,7836,40,1,6681,4,72
2,6398,25,0,26038,2,74
3,6483,23,1,24584,4,65
4,5167,23,1,6710,3,73


### 提取特征变量和目标变量

In [9]:
# Separate the target (credit score) from the feature columns.
y = df.loc[:, '信用评分']
X = df.drop(columns=['信用评分'])

### 划分训练集和测试集

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

### 模型训练

In [11]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted regression trees with default hyperparameters, trained on the
# training split only. The seed is pinned (same value as the train/test split)
# so that re-running the notebook rebuilds the same trees and the same scores.
model = GradientBoostingRegressor(random_state=123)
model.fit(X_train, y_train)

GradientBoostingRegressor()

### 模型预测及评估

In [12]:
# Predict credit scores for the held-out rows and show the first ten values.
y_pred = model.predict(X_test)
print(y_pred[:10])

[70.77631652 71.40032104 73.73465155 84.52533945 71.09188294 84.9327599
 73.72232388 83.44560704 82.61221486 84.86927209]


In [13]:
# Side-by-side table of predicted and actual scores for the test rows.
# Both columns are converted to plain lists so they line up by position
# rather than by y_test's original row labels.
a = pd.DataFrame({
    '预测值': list(y_pred),
    '实际值': list(y_test),
})
a.head()

Unnamed: 0,预测值,实际值
0,70.776317,79
1,71.400321,80
2,73.734652,62
3,84.525339,89
4,71.091883,80


In [14]:
from sklearn.metrics import r2_score

# Coefficient of determination of the model's predictions on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=model.predict(X_test))
print(r2)

0.6755949459507654


In [15]:
# The estimator's own score() on the test split; it prints the same value as
# the r2_score call in the previous cell.
model.score(X=X_test, y=y_test)

0.6755949459507654

# XGBoost回归模型

### 读取数据

In [17]:
# Load the credit-scoring workbook again for the XGBoost section and preview it.
import pandas as pd

df = pd.read_excel(io='信用评分卡模型.xlsx')
df.head(n=5)  # first five rows as the cell output

Unnamed: 0,月收入,年龄,性别,历史授信额度,历史违约次数,信用评分
0,7783,29,0,32274,3,73
1,7836,40,1,6681,4,72
2,6398,25,0,26038,2,74
3,6483,23,1,24584,4,65
4,5167,23,1,6710,3,73


### 提取特征变量和目标变量

In [18]:
# y is the credit-score column; X keeps all other columns as features.
y = df.loc[:, '信用评分']
X = df.drop(labels=['信用评分'], axis='columns')

### 划分训练集和测试集

In [19]:
from sklearn.model_selection import train_test_split

# 80/20 train/test partition with a fixed seed (same settings as the GBDT section).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=123,
)

### 模型训练

In [20]:
from xgboost import XGBRegressor

# XGBoost regressor with default hyperparameters, trained on the training split.
# fit() is the last expression, so the fitted estimator's repr is the cell output.
model = XGBRegressor()
model.fit(X=X_train, y=y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [21]:
# Score the held-out rows with the XGBoost model; print the first ten predictions.
y_pred = model.predict(X_test)
print(y_pred[:10])

[74.62306  69.01495  76.393486 83.88998  71.5683   86.257324 76.0784
 81.38994  81.05504  83.24717 ]


In [22]:
# Compare predictions with the true scores, aligned by position in the test split.
a = pd.DataFrame({'预测值': list(y_pred), '实际值': list(y_test)})
a.head()

Unnamed: 0,预测值,实际值
0,74.623062,79
1,69.014954,80
2,76.393486,62
3,83.889977,89
4,71.568298,80


In [23]:
from sklearn.metrics import r2_score

# R-squared of the default XGBoost model on the test split.
r2 = r2_score(y_true=y_test, y_pred=model.predict(X_test))
print(r2)

0.5715437436791975


In [24]:
# Same metric through the estimator's score() method; the output matches the
# r2_score value printed in the previous cell.
model.score(X=X_test, y=y_test)

0.5715437436791975

### 查看特征重要性

In [25]:
# Pair each feature name with the importance the fitted model assigns to it,
# then list the features from most to least important.
features = X.columns
importances = model.feature_importances_
importances_df = pd.DataFrame({
    '特征名称': features,
    '特征重要性': importances,
})
importances_df.sort_values(by='特征重要性', ascending=False)

Unnamed: 0,特征名称,特征重要性
0,月收入,0.324461
4,历史违约次数,0.307467
3,历史授信额度,0.202864
1,年龄,0.098869
2,性别,0.066339


### 参数调优

In [26]:
from sklearn.model_selection import GridSearchCV

# Candidate values per hyperparameter: 3 * 3 * 4 = 36 combinations,
# each evaluated with 5-fold cross-validation on the training split.
parameters = {'max_depth': [1, 3, 5],
              'n_estimators': [50, 100, 150],
              'learning_rate': [0.01, 0.05, 0.1, 0.2]}

# Search over the fresh, unfitted estimator created for this purpose rather
# than the `model` fitted in an earlier cell, so the search does not depend on
# that earlier training step.
clf = XGBRegressor()
grid_search = GridSearchCV(clf, parameters, scoring='r2', cv=5)  # rank candidates by R-squared
grid_search.fit(X_train, y_train)
grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}

In [27]:
# Rebuild the regressor from the hyperparameters the grid search selected
# (keys: learning_rate, max_depth, n_estimators) instead of retyping them by
# hand, so this cell cannot drift out of sync with the search result.
model = XGBRegressor(**grid_search.best_params_)
model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=50, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [28]:
from sklearn.metrics import r2_score

# R-squared of the tuned XGBoost model on the held-out test split.
r2 = r2_score(y_true=y_test, y_pred=model.predict(X_test))
print(r2)

0.6884486054771359
