In [1]:
import pandas as pd
import numpy as np
data = pd.read_csv('./datasets/daily-bike-share.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   dteday      731 non-null    object 
 2   season      731 non-null    int64  
 3   yr          731 non-null    int64  
 4   mnth        731 non-null    int64  
 5   holiday     731 non-null    int64  
 6   weekday     731 non-null    int64  
 7   workingday  731 non-null    int64  
 8   weathersit  731 non-null    int64  
 9   temp        731 non-null    float64
 10  atemp       731 non-null    float64
 11  hum         731 non-null    float64
 12  windspeed   731 non-null    float64
 13  rentals     731 non-null    int64  
dtypes: float64(4), int64(9), object(1)
memory usage: 80.1+ KB


In [2]:
data.head()

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,rentals
0,1,1/1/2011,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331
1,2,1/2/2011,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131
2,3,1/3/2011,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120
3,4,1/4/2011,1,0,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108
4,5,1/5/2011,1,0,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82


In [3]:
data.isnull().sum()

instant       0
dteday        0
season        0
yr            0
mnth          0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
rentals       0
dtype: int64

In [4]:
data = data[['season'
             , 'mnth'
             , 'holiday'
             , 'weekday'
             , 'workingday'
             , 'weathersit'
             , 'temp'
             , 'atemp'
             , 'hum'
             , 'windspeed'
             , 'rentals']]
data.head()

Unnamed: 0,season,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,rentals
0,1,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331
1,1,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131
2,1,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120
3,1,1,0,2,1,1,0.2,0.212122,0.590435,0.160296,108
4,1,1,0,3,1,1,0.226957,0.22927,0.436957,0.1869,82


In [5]:
data.shape

(731, 11)

In [6]:
from sklearn.model_selection import train_test_split
# 拆分出特征变量
X = data.drop('rentals',axis=1)
# 拆分出目标变量
y = data['rentals']
# 将数据集拆分成训练集和测试机
# 训练集：训练模型 X_train y_train组成
# 测试集：评估模型 X_test y_test 组成
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.2,
                                                    random_state=123)

In [7]:
# Pipeline思想是将机器学习建模的步骤进行封装，
# 封装成pipeline对象

In [8]:
# 标准化、顺序编码
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
# SimpleImputer 填充缺失值
from sklearn.impute import SimpleImputer

# 对特征应用某种数据预处理操作
from sklearn.compose import ColumnTransformer

# Pipeline模型构造器
from sklearn.pipeline import Pipeline

In [10]:
# 对数值型特征做缺失值填充及标准化
numeric_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='mean')),  # 对数值型变量进行缺失值填充，填充的值是平均值
       ('scaler', StandardScaler()) # 对数值型变量进行标准化
])

In [11]:
# 对类别型变量做顺序编码
categorical_transformer = Pipeline(steps=[
       ('imputer', SimpleImputer(strategy='constant')) # 对类别型变量用众数填充
      ,('encoder', OrdinalEncoder()) # 将类别型变量进行顺序编码
])

In [12]:
# 筛选出数据集中的数值型的连续变量
numeric_features = ['temp', 'atemp', 'hum', 'windspeed']


# 类别型变量
categorical_features = ['season', 'mnth', 'holiday', 
                        'weekday', 'workingday', 'weathersit']

# 分别针对连续型变量和类别型变量使用不同的预处理过程
preprocessor = ColumnTransformer(
   transformers=[
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features)
]) 

In [14]:
# 使用随机森林算法
from sklearn.ensemble import RandomForestRegressor

# pipline构建模型（Pipeline类似套娃）
rf_model = Pipeline(steps = [
               ('preprocessor', preprocessor) # 第一步：数据预处理，有分别针对连续型变量和类别型变量的处理器
              ,('regressor',RandomForestRegressor()) # 第二步，实例化算法模型
           ])

In [16]:
rf_model.fit(X_train,y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['temp', 'atemp', 'hum',
                                                   'windspeed']),
                                                 ('categorical',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='constant')),
                                                                  ('encoder',
                                                                   OrdinalEncoder())]),
                        

In [17]:
# metrics 专门存放模型评估相关的方法和模块
# 导入R方模型评估方法
from sklearn.metrics import r2_score

# 使用模型对测试集中的样本做预测
predictions = rf_model.predict(X_test)
# 使用R方对模型进行评估
print (r2_score(y_test, predictions))

0.7679798723452534


+ R方评估范围是 $(- \infty,1)$
+ R方评估指标越接近1，模型效果越好