# 模型搭建

### 读取数据

In [1]:
import pandas as pd
# Load the advertising/revenue workbook into a DataFrame.
df = pd.read_excel(io='广告收益数据.xlsx')
# Preview the first five rows to sanity-check the columns.
df.head(n=5)

Unnamed: 0,电视,广播,报纸,收益
0,230.1,37.8,69.2,331.5
1,44.5,39.3,45.1,156.0
2,17.2,45.9,69.3,139.5
3,151.5,41.3,58.5,277.5
4,180.8,10.8,58.4,193.5


### 提取特征变量和目标变量

In [2]:
# Target: the '收益' column on its own.
y = df.loc[:, '收益']
# Features: every remaining column (drop returns a new frame; df is untouched).
X = df.drop(columns=['收益'])

### 划分训练集和测试集

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

### 模型训练

In [4]:
from lightgbm import LGBMRegressor
# Baseline regressor with LightGBM's default hyper-parameters.
model = LGBMRegressor()
# fit() is left as the last expression so the cell displays the estimator.
model.fit(X=X_train, y=y_train)

LGBMRegressor()

# 模型预测及评估

### 预测测试数据

In [5]:
# Predict the target for the held-out rows.
y_pred = model.predict(X_test)
# Show only the first five predictions.
y_pred[:5]

array([192.6139063 , 295.11999665, 179.92649365, 293.45888909,
       166.86159398])

In [6]:
# Side-by-side table of predicted vs. actual values.
# Both inputs are converted to plain lists so they are paired by position;
# y_test still carries the row labels it had in df, and passing it as a
# Series would make pandas align on that index instead.
a = pd.DataFrame({
    '预测值': list(y_pred),
    '实际值': list(y_test),
})
a.head(n=5)

Unnamed: 0,预测值,实际值
0,192.613906,190.5
1,295.119997,292.5
2,179.926494,171.0
3,293.458889,324.0
4,166.861594,144.0


### 查看 R-squared(R²,决定系数)

In [7]:
from sklearn.metrics import r2_score
# Coefficient of determination of the model on the held-out rows.
r2 = r2_score(
    y_true=y_test,
    y_pred=model.predict(X_test),
)
r2

0.9570203214455993

In [8]:
model.score(X_test, y_test)

0.9570203214455993

### 特征重要性

In [9]:
model.feature_importances_

array([ 950, 1049,  963])

# 模型参数调优

In [10]:
from sklearn.model_selection import GridSearchCV
# Candidate values per hyper-parameter; the search evaluates every
# combination (3 * 4 * 4 = 48 settings).
parameters = dict(
    num_leaves=[15, 31, 62],
    n_estimators=[20, 30, 50, 70],
    learning_rate=[0.1, 0.2, 0.3, 0.4],
)
# Fresh, unfitted estimator used as the template for the search.
model = LGBMRegressor()
# 5-fold cross-validation on the training split, ranked by R^2.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    scoring='r2',
    cv=5,
)
grid_search.fit(X_train, y_train)
# Best-scoring combination found by the search.
grid_search.best_params_

{'learning_rate': 0.3, 'n_estimators': 50, 'num_leaves': 31}

In [11]:
# Rebuild the regressor from the combination the grid search selected.
# Reading best_params_ directly (instead of re-typing the printed values)
# keeps this cell in sync whenever the grid, the data, or the seed changes.
model = LGBMRegressor(**grid_search.best_params_)
model.fit(X_train, y_train)
# R^2 of the tuned model on the held-out rows. In the saved outputs this is
# marginally below the default model's test score, so the cross-validated
# optimum did not translate into a better result on this particular split.
model.score(X_test, y_test)

0.9558624845475153