In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('./Data/50_Startups.csv')
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [9]:
X, y = data.iloc[:, :-1], data.Profit
print(X.shape, y.shape)
obj_cols = [cname for cname in X.columns if X[cname].dtype == 'object']
num_cols = [cname for cname in X.columns if X[cname].dtype in ['int64', 'float64']]
print(obj_cols)
print(num_cols)

(50, 4) (50,)
['State']
['R&D Spend', 'Administration', 'Marketing Spend']


In [4]:
# 数据预处理 onehot
X = pd.get_dummies(X)
X.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,False,False,True
1,162597.7,151377.59,443898.53,True,False,False
2,153441.51,101145.55,407934.54,False,True,False
3,144372.41,118671.85,383199.62,False,False,True
4,142107.34,91391.77,366168.42,False,True,False


In [5]:
# 划分数据集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
print(X_train.iloc[0])

R&D Spend            63408.86
Administration      129219.61
Marketing Spend      46085.25
State_California         True
State_Florida           False
State_New York          False
Name: 32, dtype: object


In [13]:
# 标准化
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train[num_cols] = sc.fit_transform(X_train[num_cols])
X_test[num_cols] = sc.transform(X_test[num_cols])
X_train.iloc[0]

R&D Spend           -0.32133
Administration      0.187003
Marketing Spend    -1.370084
State_California        True
State_Florida          False
State_New York         False
Name: 32, dtype: object

In [14]:
# 选择模型
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)



In [17]:
# 模型预测及评估
from sklearn.metrics import mean_squared_error, r2_score
y_pred = model.predict(X_test)
mse = mean_squared_error(y_pred, y_test)
print(f'线性回归预测mse为: {mse:.3f}')
r2s = r2_score(y_pred, y_test)
print(f'线性回归预测r2_score: {r2s:.3f}')

线性回归预测mse为: 79495441.504
线性回归预测r2_score: 0.955
