In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from feature_engine.creation import CyclicalFeatures
from feature_engine.datetime import DatetimeFeatures

from sklearn.pipeline import Pipeline
plt.style.use("dark_background")

data = pd.read_csv("AirQualityUCI.csv")
data["date_time"] = data["Date"] + " " + data["Time"]
data.head()

Unnamed: 0,Date,Time,CO,PT08S1,NMHC,C6H6,PT08S2,NOx,PT08S3,NO2,PT08S4,PT08S5,T,RH,AH,date_time
0,3/10/2004,18:00:00,2.6,1360,150,11.9,1046,166,1056,113,1692,1268,13.6,48.9,0.7578,3/10/2004 18:00:00
1,3/10/2004,19:00:00,2.0,1292,112,9.4,955,103,1174,92,1559,972,13.3,47.7,0.7255,3/10/2004 19:00:00
2,3/10/2004,20:00:00,2.2,1402,88,9.0,939,131,1140,114,1555,1074,11.9,54.0,0.7502,3/10/2004 20:00:00
3,3/10/2004,21:00:00,2.2,1376,80,9.2,948,172,1092,122,1584,1203,11.0,60.0,0.7867,3/10/2004 21:00:00
4,3/10/2004,22:00:00,1.6,1272,51,6.5,836,131,1205,116,1490,1110,11.2,59.6,0.7888,3/10/2004 22:00:00


In [3]:
data["date_time"] = pd.to_datetime(data["date_time"])

In [4]:
df = data[["date_time","CO"]]
df.head()

Unnamed: 0,date_time,CO
0,2004-03-10 18:00:00,2.6
1,2004-03-10 19:00:00,2.0
2,2004-03-10 20:00:00,2.2
3,2004-03-10 21:00:00,2.2
4,2004-03-10 22:00:00,1.6


In [7]:
df.CO.nunique()

97

In [8]:
pipe = Pipeline(steps=[
    ("datetime",DatetimeFeatures(variables=["date_time"],
                                 features_to_extract=["month","hour"],
                                 drop_original=True
                                 )),
    ("cyclical",CyclicalFeatures(variables=["date_time_month","date_time_hour"]))
])

In [9]:
df = pipe.fit_transform(df)
df.head()

Unnamed: 0,CO,date_time_month,date_time_hour,date_time_month_sin,date_time_month_cos,date_time_hour_sin,date_time_hour_cos
0,2.6,3,18,1.0,6.123234000000001e-17,-0.979084,0.203456
1,2.0,3,19,1.0,6.123234000000001e-17,-0.887885,0.460065
2,2.2,3,20,1.0,6.123234000000001e-17,-0.730836,0.682553
3,2.2,3,21,1.0,6.123234000000001e-17,-0.519584,0.854419
4,1.6,3,22,1.0,6.123234000000001e-17,-0.269797,0.962917


In [10]:
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor
lasso = Lasso(random_state=10)
forest = RandomForestRegressor(max_depth=5,random_state=10)

In [13]:
from sklearn.model_selection import cross_val_score
x1 = df[["date_time_month","date_time_hour"]]
y1 = df["CO"]

cross_val_score(lasso,x1,y1,cv=10,n_jobs=-1,scoring="r2").mean()

-0.3085114034991276

In [14]:
from sklearn.model_selection import cross_val_score
x2 = df.drop("CO",axis=1)
y2 = df["CO"]

cross_val_score(forest,x2,y2,cv=10,n_jobs=-1,scoring="r2").mean()

-0.1978878058247325

In [27]:
lasso.fit(x1,y1)
lasso.score(x1,y1)

0.027091751146741117

In [28]:
lasso.fit(x2,y2)
lasso.score(x2,y2)

0.06163233954146419

In [25]:
forest.fit(x1,y1)
forest.score(x1,y1)

0.15280635614777582

In [26]:
forest.fit(x2,y2)
forest.score(x2,y2)

0.16808398609425512

In [29]:
trig_vars = [var for var in df.columns if "sin" in var or "cos" in var]
trig_vars

['date_time_month_sin',
 'date_time_month_cos',
 'date_time_hour_sin',
 'date_time_hour_cos']

In [30]:
df[trig_vars]

Unnamed: 0,date_time_month_sin,date_time_month_cos,date_time_hour_sin,date_time_hour_cos
0,1.000000,6.123234e-17,-0.979084,0.203456
1,1.000000,6.123234e-17,-0.887885,0.460065
2,1.000000,6.123234e-17,-0.730836,0.682553
3,1.000000,6.123234e-17,-0.519584,0.854419
4,1.000000,6.123234e-17,-0.269797,0.962917
...,...,...,...,...
9352,0.866025,-5.000000e-01,0.398401,-0.917211
9353,0.866025,-5.000000e-01,0.136167,-0.990686
9354,0.866025,-5.000000e-01,-0.136167,-0.990686
9355,0.866025,-5.000000e-01,-0.398401,-0.917211


In [31]:
lasso.fit(df[trig_vars],y1)
lasso.score(df[trig_vars],y1)

0.06166180199568949

In [32]:
forest.fit(df[trig_vars],y1)
forest.score(df[trig_vars],y1)

0.16716016360397334