In [77]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [78]:
# 2. 手动构造真实逻辑的数据集（100个样本，含业务逻辑+随机噪声）
np.random.seed(42)  # 固定随机种子，结果可复现
n_samples = 100  # 100条广告投放记录

# 构造4个特征（符合业务常识的取值范围）
TV_ad = np.random.uniform(10, 100, n_samples)  # 电视广告：10-100万
Social_ad = np.random.uniform(5, 50, n_samples)  # 社交媒体：5-50万
Outdoor_ad = np.random.uniform(3, 30, n_samples)  # 户外广告：3-30万
Ad_days = np.random.randint(7, 31, n_samples)  # 投放天数：7-30天

In [79]:
# 业务逻辑：电视广告权重最高（0.8），其次是社交媒体（0.5）、投放天数（0.3）、户外广告（0.2）
Sales = (0.8 * TV_ad + 0.5 * Social_ad + 0.2 * Outdoor_ad + 0.3 * Ad_days 
         + np.random.randn(n_samples) * 2)  # 噪声：模拟不可控因素

In [80]:
# 转换为DataFrame（方便查看和处理）
data={'电视广告投放金额':TV_ad,'设计媒体广告投放金额':Social_ad,'户外广告投放金额':Outdoor_ad,'广告投放天数':Ad_days,'产品销售额':Sales}
df=pd.DataFrame(data)

In [81]:
# # 查看数据集基本信息
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   电视广告投放金额    100 non-null    float64
 1   设计媒体广告投放金额  100 non-null    float64
 2   户外广告投放金额    100 non-null    float64
 3   广告投放天数      100 non-null    int32  
 4   产品销售额       100 non-null    float64
dtypes: float64(4), int32(1)
memory usage: 3.6 KB


In [82]:
# 查看数据统计描述
df.describe()

Unnamed: 0,电视广告投放金额,设计媒体广告投放金额,户外广告投放金额,广告投放天数,产品销售额
count,100.0,100.0,100.0,100.0,100.0
mean,52.316267,27.402428,16.975236,18.47,64.374224
std,26.774047,13.190006,7.922509,7.425665,21.999484
min,10.496991,5.312846,3.136663,7.0,22.275827
25%,27.388068,15.890204,10.475756,11.0,44.418629
50%,51.772821,27.753119,18.188983,19.0,61.997424
75%,75.718281,39.478262,23.313907,25.0,81.573866
max,98.819824,49.35427,29.731454,30.0,107.325215


In [83]:
df.head()

Unnamed: 0,电视广告投放金额,设计媒体广告投放金额,户外广告投放金额,广告投放天数,产品销售额
0,43.708611,6.414313,20.334854,19,48.161615
1,95.564288,33.638469,5.271779,26,101.264969
2,75.879455,19.146019,7.363975,21,80.509791
3,63.879264,27.885681,27.260963,9,70.615995
4,24.041678,45.840491,19.373585,29,57.83815


In [84]:
# 3. 数据预处理
# 3.1 分离特征（X）和标签（y）
x=df[['电视广告投放金额','设计媒体广告投放金额','户外广告投放金额','广告投放天数']]
x

Unnamed: 0,电视广告投放金额,设计媒体广告投放金额,户外广告投放金额,广告投放天数
0,43.708611,6.414313,20.334854,19
1,95.564288,33.638469,5.271779,26
2,75.879455,19.146019,7.363975,21
3,63.879264,27.885681,27.260963,9
4,24.041678,45.840491,19.373585,29
...,...,...,...,...
95,54.441604,20.714431,17.100568,18
96,57.045955,37.668006,23.789826,19
97,48.478692,45.369962,8.827168,18
98,12.287721,44.918889,19.818043,20


In [85]:
y=df['产品销售额']
y

0      48.161615
1     101.264969
2      80.509791
3      70.615995
4      57.838150
         ...    
95     62.260915
96     72.954273
97     67.651041
98     39.834985
99     42.454547
Name: 产品销售额, Length: 100, dtype: float64

In [86]:
# 3.2 划分训练集（80%）和测试集（20%）
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80 entries, 70 to 59
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   电视广告投放金额    80 non-null     float64
 1   设计媒体广告投放金额  80 non-null     float64
 2   户外广告投放金额    80 non-null     float64
 3   广告投放天数      80 non-null     int32  
dtypes: float64(3), int32(1)
memory usage: 2.8 KB


In [87]:
# 4. 训练多元线性回归模型
#标准化要训练的数据
from sklearn.preprocessing import StandardScaler
#实例化标准库
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)
x_train_scaled

array([[ 1.03026256e+00,  5.55934352e-01,  1.03416007e+00,
         7.47090239e-02],
       [-1.19194578e+00, -1.24928750e+00,  9.88681663e-01,
         1.56888950e+00],
       [-3.43883999e-01, -1.34110914e+00, -4.46132388e-01,
        -1.55530604e+00],
       [-1.11196228e+00,  1.28641192e+00,  2.26208004e-01,
         1.16138574e+00],
       [ 8.08888613e-01,  4.47657924e-01,  3.52705417e-02,
         4.82212791e-01],
       [ 1.13166798e+00, -1.08550163e+00,  6.75932995e-01,
        -1.55530604e+00],
       [-1.38758267e+00, -3.37182303e-01, -1.33852120e+00,
         1.02555115e+00],
       [-1.49817069e+00,  1.25631101e+00,  4.09153741e-01,
         2.10543613e-01],
       [ 1.59651389e+00, -1.22463841e+00,  1.17418188e+00,
         4.82212791e-01],
       [ 1.63448433e+00,  4.18367616e-01, -1.39661262e+00,
         1.02555115e+00],
       [ 1.34827416e+00,  8.16623454e-01,  5.45273346e-01,
         4.82212791e-01],
       [-9.40603138e-02,  1.53942612e+00, -1.08499641e+00,
      

In [None]:
model =LinearRegression()
model.fit(x_train,y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [89]:
# 5. 模型评估
y_pred=model.predict(x_test)
r2 = r2_score(y_test,y_pred)
print(r2)

0.9864431628163263


In [90]:
#均方误差
mse = mean_squared_error(y_test,y_pred)
print(mse)

5.72288300490447
