In [1]:
import pandas as pd
import numpy as np

import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

In [2]:
# Directory that holds the competition CSV files. Kept in one place so the
# notebook can be pointed at another location by editing a single line.
DATA_DIR = 'D:/Data/Tabular Playground July'

# Read the training and test tables from DATA_DIR.
data_train = pd.read_csv(DATA_DIR + '/train.csv')
data_test = pd.read_csv(DATA_DIR + '/test.csv')

In [3]:
# Show the training frame as the cell output (the rendered table elides the middle rows).
data_train

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,12.0,167.7
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,9.9,98.9
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,9.2,127.1
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,9.7,177.2
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,6.4,121.8
...,...,...,...,...,...,...,...,...,...,...,...,...
7106,2010-12-31 20:00:00,9.2,32.0,0.3871,1000.5,811.2,873.0,909.0,910.5,1.3,5.1,191.1
7107,2010-12-31 21:00:00,9.1,33.2,0.3766,1022.7,790.0,951.6,912.9,903.4,1.4,5.8,221.3
7108,2010-12-31 22:00:00,9.6,34.6,0.4310,1044.4,767.3,861.9,889.2,1159.1,1.6,5.2,227.4
7109,2010-12-31 23:00:00,8.0,40.7,0.4085,952.8,691.9,908.5,917.0,1206.3,1.5,4.6,199.8


# Dataset Overview
Unfortunately, there is no official information about the features. We can only guess their meaning:
<ul>
    <li>date_time - the date and time at which the sensor readings were recorded, at 1-hour intervals</li>
    <li>deg_C - is a temperature measured in Celsius</li>
    <li>relative_humidity - the amount of water vapor in the air <b>RELATIVE</b> to the maximum amount the air can hold at its current temperature (expressed as a percentage)</li>
    <li>absolute_humidity - also measures water vapor, but as the actual amount present in the air, regardless of temperature</li>
</ul>

We also have three columns in the database whose values we want to predict using machine learning. What do they mean?
<ul>
    <li>target_carbon_monoxide - is a colorless, odorless, tasteless, flammable gas that is slightly less dense than air. Thermal combustion is the most common source of carbon monoxide.</li>
    <li>target_benzene - is an organic chemical compound. Because it contains only carbon and hydrogen atoms, benzene is classed as a hydrocarbon.</li>
    <li>target_nitrogen_oxides - in atmospheric chemistry, <b>NO<sub>x</sub></b> is a generic term for the nitrogen oxides that are most relevant for air pollution, namely nitric oxide (NO) and nitrogen dioxide (NO<sub>2</sub>). These gases contribute to the formation of smog and acid rain.</li>
</ul>

## Dataset description

In [4]:
# Print the frame summary: index range, per-column non-null counts, dtypes and memory usage.
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7111 entries, 0 to 7110
Data columns (total 12 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   date_time               7111 non-null   object 
 1   deg_C                   7111 non-null   float64
 2   relative_humidity       7111 non-null   float64
 3   absolute_humidity       7111 non-null   float64
 4   sensor_1                7111 non-null   float64
 5   sensor_2                7111 non-null   float64
 6   sensor_3                7111 non-null   float64
 7   sensor_4                7111 non-null   float64
 8   sensor_5                7111 non-null   float64
 9   target_carbon_monoxide  7111 non-null   float64
 10  target_benzene          7111 non-null   float64
 11  target_nitrogen_oxides  7111 non-null   float64
dtypes: float64(11), object(1)
memory usage: 666.8+ KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data_train.describe()

Unnamed: 0,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
count,7111.0,7111.0,7111.0,7111.0,7111.0,7111.0,7111.0,7111.0,7111.0,7111.0,7111.0
mean,20.878034,47.561004,1.110309,1091.5721,938.06497,883.903305,1513.238349,998.335565,2.086219,10.237083,204.066784
std,7.937917,17.398731,0.39895,218.537554,281.978988,310.456355,350.18031,381.537695,1.447109,7.694426,193.927723
min,1.3,8.9,0.1988,620.3,364.0,310.6,552.9,242.7,0.1,0.1,1.9
25%,14.9,33.7,0.8559,930.25,734.9,681.05,1320.35,722.85,1.0,4.5,76.45
50%,20.7,47.3,1.0835,1060.5,914.2,827.8,1513.1,928.7,1.7,8.5,141.0
75%,25.8,60.8,1.40415,1215.8,1124.1,1008.85,1720.4,1224.7,2.8,14.2,260.0
max,46.1,90.8,2.231,2088.3,2302.6,2567.4,2913.8,2594.6,12.5,63.7,1472.3


## Null values

In [6]:
# Count missing values per column.
data_train.isnull().sum()

date_time                 0
deg_C                     0
relative_humidity         0
absolute_humidity         0
sensor_1                  0
sensor_2                  0
sensor_3                  0
sensor_4                  0
sensor_5                  0
target_carbon_monoxide    0
target_benzene            0
target_nitrogen_oxides    0
dtype: int64

# Data cleaning
First we will look for outliers using box plot and examine the distribution of the features. 

In [7]:
# Display the training frame again before plotting.
# NOTE(review): this is the same expression as cell In [3]; nothing has modified the frame in between.
data_train

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,12.0,167.7
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,9.9,98.9
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,9.2,127.1
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,9.7,177.2
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,6.4,121.8
...,...,...,...,...,...,...,...,...,...,...,...,...
7106,2010-12-31 20:00:00,9.2,32.0,0.3871,1000.5,811.2,873.0,909.0,910.5,1.3,5.1,191.1
7107,2010-12-31 21:00:00,9.1,33.2,0.3766,1022.7,790.0,951.6,912.9,903.4,1.4,5.8,221.3
7108,2010-12-31 22:00:00,9.6,34.6,0.4310,1044.4,767.3,861.9,889.2,1159.1,1.6,5.2,227.4
7109,2010-12-31 23:00:00,8.0,40.7,0.4085,952.8,691.9,908.5,917.0,1206.3,1.5,4.6,199.8


In [8]:
# Temperature: box plot on the left, histogram on the right.
temperature = data_train['deg_C']

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Temperature Box Plot', 'Temperature Distribution'),
)

temperature_box = go.Box(y=temperature, marker_color='#990099', name='')
temperature_hist = go.Histogram(x=temperature, marker_color='#660066')

fig.add_trace(temperature_box, row=1, col=1)
fig.add_trace(temperature_hist, row=1, col=2)

# Kept as the final expression so its return value is the cell output.
fig.update_layout(template='plotly_dark', width=1000, showlegend=False)

In [9]:
# Relative humidity: box plot on the left, histogram on the right.
relative_humidity = data_train['relative_humidity']

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Relative Humidity Box Plot', 'Relative Humidity Distribution'),
)

relative_humidity_box = go.Box(y=relative_humidity, marker_color='#b30000', name='')
relative_humidity_hist = go.Histogram(x=relative_humidity, marker_color='#b30000')

fig.add_trace(relative_humidity_box, row=1, col=1)
fig.add_trace(relative_humidity_hist, row=1, col=2)

# Kept as the final expression so its return value is the cell output.
fig.update_layout(template='plotly_dark', width=1000, showlegend=False)

In [10]:
# Absolute humidity: box plot on the left, histogram on the right.
absolute_humidity = data_train['absolute_humidity']

fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=('Absolute Humidity Box Plot', 'Absolute Humidity Distribution'),
)

absolute_humidity_box = go.Box(y=absolute_humidity, marker_color='#b3b300', name='')
absolute_humidity_hist = go.Histogram(x=absolute_humidity, marker_color='#b3b300')

fig.add_trace(absolute_humidity_box, row=1, col=1)
fig.add_trace(absolute_humidity_hist, row=1, col=2)

# Kept as the final expression so its return value is the cell output.
fig.update_layout(template='plotly_dark', width=1000, showlegend=False)

It looks like there aren't any outliers in the above features. Admittedly, we can see a couple of dots in the box plots, but such outliers are irrelevant. Next we will take a closer look at the measurements of the sensors.

In [11]:
# One box per sensor column, all added to the same figure.
fig = go.Figure()

for sensor_number in range(1, 6):
    fig.add_trace(go.Box(
        y=data_train['sensor_{}'.format(sensor_number)],
        name='Sensor {}'.format(sensor_number),
    ))

# Kept as the final expression so its return value is the cell output.
fig.update_layout(template='plotly_dark', title_text='Box plot sensors')

In this case every sensor has quite a few outlier candidates, but it's highly unlikely that they are real outliers. Most likely, these measurements were taken at times when the concentration of the given component in the air was at its highest.

In [12]:
# Histograms of the five sensor columns on a 3x2 grid, filled row by row
# (the sixth grid position stays empty).
sensor_numbers = range(1, 6)

fig = make_subplots(
    rows=3,
    cols=2,
    subplot_titles=tuple(
        'Distribution of sensor {} measurements'.format(number)
        for number in sensor_numbers
    ),
)

for position, number in enumerate(sensor_numbers):
    # divmod turns the running position into zero-based (row, column) on a two-column grid.
    grid_row, grid_col = divmod(position, 2)
    fig.add_trace(
        go.Histogram(
            x=data_train['sensor_{}'.format(number)],
            name='Sensor {}'.format(number),
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

# Kept as the final expression so its return value is the cell output.
fig.update_layout(height=700, width=1000, template='plotly_dark', showlegend=False)

What do the sensors represent?
I'm not really sure, but after some brief research and reading other notebooks I found the following.
<ul>
    <li>Sensor_1 - (tin oxide) hourly averaged sensor response (nominally CO targeted)</li>
    <li>Sensor_2 - (titania) hourly averaged sensor response (nominally NMHC targeted)</li>
    <li>Sensor_3 - (tungsten oxide) hourly averaged sensor response (nominally NOx targeted)</li>
    <li>Sensor_4 - (tungsten oxide) hourly averaged sensor response (nominally NO2 targeted)</li>
    <li>Sensor_5 - (indium oxide) hourly averaged sensor response (nominally O3 targeted)</li>
</ul>

# Data visualization 

In [13]:
# Temperature readings against their timestamps, with a horizontal line at the overall mean.
mean = np.mean(data_train['deg_C'])
mean_label = 'Mean of {}'.format(mean.round(2))

temperature_line = go.Scatter(
    x=data_train['date_time'],
    y=data_train['deg_C'],
    marker_color='#3366ff',
)

fig = go.Figure()
fig.add_trace(temperature_line)
fig.add_hline(y=mean, annotation_text=mean_label, annotation_position='bottom right')

# Kept as the final expression so its return value is the cell output.
fig.update_layout(template='plotly_dark', width=1000, title_text='Temperature over time')

In [14]:
# Relative humidity readings against their timestamps, with a horizontal line at the overall mean.
mean = np.mean(data_train['relative_humidity'])
mean_label = 'Mean of {}'.format(mean.round(2))

relative_humidity_line = go.Scatter(
    x=data_train['date_time'],
    y=data_train['relative_humidity'],
    marker_color='#e6ac00',
)

fig = go.Figure()
fig.add_trace(relative_humidity_line)
fig.add_hline(y=mean, annotation_text=mean_label, annotation_position='bottom right')

# Kept as the final expression so its return value is the cell output.
fig.update_layout(template='plotly_dark', width=1000, title_text='Relative humidity over time')

In [15]:
# Absolute humidity readings against their timestamps, with a horizontal line at the overall mean.
fig = go.Figure()

mean = np.mean(data_train['absolute_humidity'])

fig.add_trace(go.Scatter(
    x = data_train['date_time'],
    y = data_train['absolute_humidity'],
    marker_color = '#ff7800'
))

fig.add_hline(
    y = mean,
    annotation_text = 'Mean of {}'.format(mean.round(2)),
    annotation_position = 'bottom right'
)

# The title previously read 'Relative humidity over time' (copied from the
# preceding cell) although this figure plots the absolute_humidity column.
fig.update_layout(
    template = 'plotly_dark',
    width = 1000,
    title_text = 'Absolute humidity over time'
)

In [16]:
# One stacked subplot per sensor, each showing that sensor's readings over time.
# Earlier, rows 3-5 all plotted sensor_2 under the titles for sensors 3-5;
# building the traces in a loop ties every row to the matching column.
sensor_columns = ['sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']

fig = make_subplots(rows=len(sensor_columns), cols=1, subplot_titles=(
    'Sensor 1 measurements over time',
    'Sensor 2 measurements over time',
    'Sensor 3 measurements over time',
    'Sensor 4 measurements over time',
    'Sensor 5 measurements over time'
))

# Subplot rows are 1-based, hence start=1.
for subplot_row, sensor_column in enumerate(sensor_columns, start=1):
    fig.add_trace(go.Scatter(
        x = data_train['date_time'],
        y = data_train[sensor_column]
    ), row=subplot_row, col=1)

fig.update_layout(
    height = 1000,
    template = 'plotly_dark',
    showlegend = False,
)

Unfortunately we can't see much in the above visualizations. It is possible to spot some trend or seasonality, but that doesn't change the fact that the raw hourly measurements are too noisy to be very helpful. We need to resample our features to perform a proper analysis.

# Resampling

In [17]:
# Copy the training frame under a separate name; the resampling cells below work on this copy.
data_train_resamp = data_train.copy()
# Display the copy as the cell output.
data_train_resamp

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,12.0,167.7
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,9.9,98.9
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,9.2,127.1
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,9.7,177.2
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,6.4,121.8
...,...,...,...,...,...,...,...,...,...,...,...,...
7106,2010-12-31 20:00:00,9.2,32.0,0.3871,1000.5,811.2,873.0,909.0,910.5,1.3,5.1,191.1
7107,2010-12-31 21:00:00,9.1,33.2,0.3766,1022.7,790.0,951.6,912.9,903.4,1.4,5.8,221.3
7108,2010-12-31 22:00:00,9.6,34.6,0.4310,1044.4,767.3,861.9,889.2,1159.1,1.6,5.2,227.4
7109,2010-12-31 23:00:00,8.0,40.7,0.4085,952.8,691.9,908.5,917.0,1206.3,1.5,4.6,199.8


In [18]:
# Parse the timestamp strings into datetimes and store them on the resampling copy.
parsed_timestamps = pd.to_datetime(
    data_train['date_time'],
    format='%Y-%m-%d %H:%M:%S',
)
data_train_resamp['date_time'] = parsed_timestamps

In [19]:
# Daily mean temperature: resample on the parsed timestamps, then plot with
# a horizontal line at the mean of the daily values.
to_plot_deg_C = data_train_resamp[['date_time', 'deg_C']]
resampled_to_plot = to_plot_deg_C.resample('D', on='date_time').mean().reset_index(drop=False)

mean = np.mean(resampled_to_plot['deg_C'])

fig = go.Figure()

fig.add_trace(go.Scatter(
    x = resampled_to_plot['date_time'],
    y = resampled_to_plot['deg_C'],
    marker_color = '#00b36b'
))

fig.add_hline(
    y = mean,
    annotation_text = 'Mean of {}'.format(mean.round(2)),
    annotation_position = 'bottom right'
)

# Title fixed from the misspelled 'def_C by day' so it names the plotted deg_C column.
fig.update_layout(
    template = 'plotly_dark',
    width = 1000,
    title_text = 'deg_C by day'
)

As expected, after the change we can clearly see an increase in temperature from Jun to Oct. Between these months the temperature is above average. At first glance I don't see any other patterns.

In [20]:
# Daily mean relative humidity, with a horizontal line at the mean of the daily values.
to_plot_relative_humidity = data_train_resamp[['date_time', 'relative_humidity']]
daily_means = to_plot_relative_humidity.resample('D', on='date_time').mean()
resampled_to_plot = daily_means.reset_index(drop=False)

mean = np.mean(resampled_to_plot['relative_humidity'])
mean_label = 'Mean of {}'.format(mean.round(2))

daily_line = go.Scatter(
    x=resampled_to_plot['date_time'],
    y=resampled_to_plot['relative_humidity'],
    marker_color='#0099ff',
)

fig = go.Figure()
fig.add_trace(daily_line)
fig.add_hline(y=mean, annotation_text=mean_label, annotation_position='bottom right')

# Kept as the final expression so its return value is the cell output.
fig.update_layout(template='plotly_dark', width=1000, title_text='relative_humidity by day')

It looks like humidity is correlated with temperature: when the temperature is higher, humidity goes down. We will check whether this is true using the correlation coefficient and the scatter plot below.

In [21]:
# Pearson correlation between temperature and relative humidity,
# computed on the un-resampled rows of data_train_resamp.
x_corr = data_train_resamp['deg_C']
y_corr = data_train_resamp['relative_humidity']

# corr is a matrix; the [0, 1] entry pairs the first input with the second.
corr = np.corrcoef(x_corr, y_corr)
pearson_r = corr[0, 1]

print('Pearson correlation coefficient: {}'.format(pearson_r))

Pearson correlation coefficient: -0.6680019661025169


In [22]:
# Scatter of relative humidity against temperature, one marker per row.
temperature_vs_humidity = go.Scatter(
    x=data_train_resamp['deg_C'],
    y=data_train_resamp['relative_humidity'],
    mode='markers',
)

fig = go.Figure()
fig.add_trace(temperature_vs_humidity)

# Kept as the final expression so its return value is the cell output.
fig.update_layout(
    template='plotly_dark',
    xaxis_title='Temperature (°C)',
    yaxis_title=' Relative humidity',
    title_text='Temperature vs Relative humidity',
)

I was partly right: the correlation between those two features is about -0.67, which means that when temperature rises, humidity drops in most cases.

In [23]:
# Daily mean absolute humidity, with a horizontal line at the mean of the daily values.
to_plot_absolute_humidity = data_train_resamp[['date_time', 'absolute_humidity']]
daily_means = to_plot_absolute_humidity.resample('D', on='date_time').mean()
resampled_to_plot = daily_means.reset_index(drop=False)

mean = np.mean(resampled_to_plot['absolute_humidity'])
mean_label = 'Mean of {}'.format(mean.round(2))

daily_line = go.Scatter(
    x=resampled_to_plot['date_time'],
    y=resampled_to_plot['absolute_humidity'],
    marker_color='#cc3300',
)

fig = go.Figure()
fig.add_trace(daily_line)
fig.add_hline(y=mean, annotation_text=mean_label, annotation_position='bottom right')

# Kept as the final expression so its return value is the cell output.
fig.update_layout(template='plotly_dark', width=1000, title_text='absolute_humidity by day')

In [24]:
# Daily mean of every sensor column. The selection used to be stored in
# `to_plot_absolute_humidity`, which mislabelled the sensor data and overwrote
# the humidity frame of the same name; it now has its own variable.
to_plot_sensors = data_train_resamp[['date_time', 'sensor_1', 'sensor_2', 'sensor_3', 'sensor_4', 'sensor_5']]
# `resampled_to_plot` keeps its name because the sensor plotting cell reads it.
resampled_to_plot = to_plot_sensors.resample('D', on='date_time').mean().reset_index(drop=False)

In [25]:
# One stacked subplot per sensor, each showing that sensor's column of resampled_to_plot.
sensor_numbers = range(1, 6)

fig = make_subplots(
    rows=5,
    cols=1,
    subplot_titles=tuple(
        'Sensor {} measurements by day'.format(number) for number in sensor_numbers
    ),
)

# Sensor N goes into subplot row N.
for number in sensor_numbers:
    daily_trace = go.Scatter(
        x=resampled_to_plot['date_time'],
        y=resampled_to_plot['sensor_{}'.format(number)],
    )
    fig.add_trace(daily_trace, row=number, col=1)

# Kept as the final expression so its return value is the cell output.
fig.update_layout(height=1000, template='plotly_dark', showlegend=False)

In [26]:
# Display the working copy as the cell output.
data_train_resamp

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,12.0,167.7
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,9.9,98.9
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,9.2,127.1
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,9.7,177.2
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,6.4,121.8
...,...,...,...,...,...,...,...,...,...,...,...,...
7106,2010-12-31 20:00:00,9.2,32.0,0.3871,1000.5,811.2,873.0,909.0,910.5,1.3,5.1,191.1
7107,2010-12-31 21:00:00,9.1,33.2,0.3766,1022.7,790.0,951.6,912.9,903.4,1.4,5.8,221.3
7108,2010-12-31 22:00:00,9.6,34.6,0.4310,1044.4,767.3,861.9,889.2,1159.1,1.6,5.2,227.4
7109,2010-12-31 23:00:00,8.0,40.7,0.4085,952.8,691.9,908.5,917.0,1206.3,1.5,4.6,199.8


# Machine Learning

In [27]:
# Display the training frame (features plus the three target_* columns) before modelling.
data_train

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,target_carbon_monoxide,target_benzene,target_nitrogen_oxides
0,2010-03-10 18:00:00,13.1,46.0,0.7578,1387.2,1087.8,1056.0,1742.8,1293.4,2.5,12.0,167.7
1,2010-03-10 19:00:00,13.2,45.3,0.7255,1279.1,888.2,1197.5,1449.9,1010.9,2.1,9.9,98.9
2,2010-03-10 20:00:00,12.6,56.2,0.7502,1331.9,929.6,1060.2,1586.1,1117.0,2.2,9.2,127.1
3,2010-03-10 21:00:00,11.0,62.4,0.7867,1321.0,929.0,1102.9,1536.5,1263.2,2.2,9.7,177.2
4,2010-03-10 22:00:00,11.9,59.0,0.7888,1272.0,852.7,1180.9,1415.5,1132.2,1.5,6.4,121.8
...,...,...,...,...,...,...,...,...,...,...,...,...
7106,2010-12-31 20:00:00,9.2,32.0,0.3871,1000.5,811.2,873.0,909.0,910.5,1.3,5.1,191.1
7107,2010-12-31 21:00:00,9.1,33.2,0.3766,1022.7,790.0,951.6,912.9,903.4,1.4,5.8,221.3
7108,2010-12-31 22:00:00,9.6,34.6,0.4310,1044.4,767.3,861.9,889.2,1159.1,1.6,5.2,227.4
7109,2010-12-31 23:00:00,8.0,40.7,0.4085,952.8,691.9,908.5,917.0,1206.3,1.5,4.6,199.8


In [28]:
# Display the test frame; its rendered output lists the feature columns but no target_* columns.
data_test

Unnamed: 0,date_time,deg_C,relative_humidity,absolute_humidity,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5
0,2011-01-01 00:00:00,8.0,41.3,0.4375,1108.8,745.7,797.1,880.0,1273.1
1,2011-01-01 01:00:00,5.1,51.7,0.4564,1249.5,864.9,687.9,972.8,1714.0
2,2011-01-01 02:00:00,5.8,51.5,0.4689,1102.6,878.0,693.7,941.9,1300.8
3,2011-01-01 03:00:00,5.0,52.3,0.4693,1139.7,916.2,725.6,1011.0,1283.0
4,2011-01-01 04:00:00,4.5,57.5,0.4650,1022.4,838.5,871.5,967.0,1142.3
...,...,...,...,...,...,...,...,...,...
2242,2011-04-04 10:00:00,23.2,28.7,0.7568,1340.3,1023.9,522.8,1374.0,1659.8
2243,2011-04-04 11:00:00,24.5,22.5,0.7119,1232.8,955.1,616.1,1226.1,1269.0
2244,2011-04-04 12:00:00,26.6,19.0,0.6406,1187.7,1052.4,572.8,1253.4,1081.1
2245,2011-04-04 13:00:00,29.1,12.7,0.5139,1053.2,1009.0,702.0,1009.8,808.5


<b>Multi target regression</b> is the term used when there are multiple dependent variables. If the target variables are categorical, then it is called multi-label or multi-target classification, and if the target variables are numeric, then multi-target (or multi-output) regression is the name commonly used.