# Training and Validation of Predictive Model - Predictive Maintance LUO

Luo: Build a binary classifier in the style of the predictive maintenance examples for the mset 2 (time series data)

    (a) import the data and just use the hour (not the rest of the datetime!), the temperature, humidity, light, CO2 and humidity ratio (remove everything else, occupancy is the label); 
    (b) standardize the numeric features and add the moving average and the moving standard deviation for them; 
    (c) now train a classifier using a Random Forest (500 trees with a max. depth of 6); 
    (d) calculate the accuracy of the classifier on the testdata

Imports:

In [220]:
import pandas as pd
import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from datetime import datetime, date, time, timedelta
from sklearn.metrics import accuracy_score

## A 
Import the data and just use the hour (not the rest of the datetime!), the temperature, humidity, light, CO2 and humidity ratio (remove everything else, occupancy is the label)

Load data:

In [221]:
data = pd.read_csv("data2_timeseries_occupancy_training.txt",delimiter=",")


Extract and sort by time out of date:

In [222]:
data.date = data.date.apply(lambda x : datetime.strptime(x, "%Y-%m-%d %H:%M:%S").time() )
data=data.sort_values(by="date")
data

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
4690,00:00:00,19.600,29.390,0.0,438.5,0.004143,0
6130,00:00:00,19.500,27.100,0.0,459.0,0.003795,0
1810,00:00:00,20.200,21.290,0.0,438.0,0.003110,0
7570,00:00:00,20.390,32.950,0.0,498.5,0.004884,0
370,00:00:00,21.245,25.245,0.0,456.5,0.003938,0
...,...,...,...,...,...,...,...
4689,23:58:59,19.600,29.390,0.0,438.0,0.004143,0
7569,23:58:59,20.390,33.000,0.0,506.5,0.004891,0
1809,23:58:59,20.200,21.200,0.0,444.0,0.003097,0
6129,23:58:59,19.500,27.100,0.0,458.0,0.003795,0


## B

 - Standardize the numeric features 
and 
 - add the moving average 
and
 - the moving standard deviation for them

### Standardize numeric features

In [223]:

data[['Temperature',"Humidity","Light","CO2","HumidityRatio"]] = StandardScaler().fit_transform(data[['Temperature',"Humidity","Light","CO2","HumidityRatio"]])
data.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
4690,00:00:00,-1.002193,0.661468,-0.613726,-0.534666,0.32952,0
6130,00:00:00,-1.100535,0.247428,-0.613726,-0.469442,-0.079534,0
1810,00:00:00,-0.412137,-0.803039,-0.613726,-0.536256,-0.882903,0
7570,00:00:00,-0.225287,1.305128,-0.613726,-0.343766,1.198526,0
370,00:00:00,0.615542,-0.087962,-0.613726,-0.477396,0.088786,0


### Add moving average and moving standard deviation

CAVE : As not further described, a rolling window of 5 is taken

In [224]:
data["TemperatureMA"]=data.Temperature.rolling(5,min_periods=1).mean()
data["HumidityMA"]=data.Humidity.rolling(5,min_periods=1).mean()
data["LightMA"]=data.Light.rolling(5,min_periods=1).mean()
data["CO2MA"]=data.CO2.rolling(5,min_periods=1).mean()
data["HumidityRatioMA"]=data.HumidityRatio.rolling(5,min_periods=1).mean()

data["TemperatureStdev"]=data.Temperature.rolling(5,min_periods=1).std()
data["HumidityStdev"]=data.Humidity.rolling(5,min_periods=1).std()
data["LightStdev"]=data.Light.rolling(5,min_periods=1).std()
data["CO2Stdev"]=data.CO2.rolling(5,min_periods=1).std()
data["HumidityRatioStdev"]=data.HumidityRatio.rolling(5,min_periods=1).std()

data

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,TemperatureMA,HumidityMA,LightMA,CO2MA,HumidityRatioMA,TemperatureStdev,HumidityStdev,LightStdev,CO2Stdev,HumidityRatioStdev
4690,00:00:00,-1.002193,0.661468,-0.613726,-0.534666,0.329520,0,-1.002193,0.661468,-0.613726,-0.534666,0.329520,,,,,
6130,00:00:00,-1.100535,0.247428,-0.613726,-0.469442,-0.079534,0,-1.051364,0.454448,-0.613726,-0.502054,0.124993,0.069539,0.292770,0.000000e+00,0.046120,0.289245
1810,00:00:00,-0.412137,-0.803039,-0.613726,-0.536256,-0.882903,0,-0.838288,0.035285,-0.613726,-0.513455,-0.210972,0.372319,0.754949,0.000000e+00,0.038125,0.616806
7570,00:00:00,-0.225287,1.305128,-0.613726,-0.343766,1.198526,0,-0.685038,0.352746,-0.613726,-0.471032,0.141402,0.431691,0.884924,0.000000e+00,0.090374,0.866201
370,00:00:00,0.615542,-0.087962,-0.613726,-0.477396,0.088786,0,-0.424922,0.264604,-0.613726,-0.472305,0.130879,0.691426,0.791304,0.000000e+00,0.078318,0.750521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4689,23:58:59,-1.002193,0.661468,-0.613726,-0.536256,0.329520,0,-0.256756,-0.017268,-0.613726,-0.495213,-0.094154,0.839272,0.544093,5.053230e-08,0.030318,0.472861
7569,23:58:59,-0.225287,1.314168,-0.613726,-0.318313,1.207290,0,-0.219386,0.409428,-0.613726,-0.455124,0.326985,0.834771,0.592294,5.053230e-08,0.081210,0.513453
1809,23:58:59,-0.412137,-0.819312,-0.613726,-0.517166,-0.898406,0,-0.081707,0.196080,-0.613726,-0.464669,0.163211,0.698807,0.815355,5.053230e-08,0.085978,0.751124
6129,23:58:59,-1.100535,0.247428,-0.613726,-0.472623,-0.079534,0,-0.433773,0.261531,-0.613726,-0.464669,0.125314,0.674862,0.800673,5.053230e-08,0.085978,0.759220


## C 
Now train a classifier using a Random Forest (500 trees with a max. depth of 6)

Turn into seconds to have metric float value:

In [225]:
data.date = data.date.apply(lambda x :(datetime.combine(date.min, x)-datetime.min).total_seconds())


Drop one rone with nan value in it

In [226]:
data=data.drop(4690,axis=0)

Create and train model:

In [227]:
clf = RandomForestClassifier(max_depth=6,n_estimators=500)
clf.fit(data.drop("Occupancy",axis=1),data.Occupancy)

RandomForestClassifier(max_depth=6, n_estimators=500)

## D
Calculate the accuracy of the classifier on the testdata

## Test Set 1 

### Transformation/Preprocessing wie oben

In [228]:
test1 = pd.read_csv("data2_timeseries_occupancy_test.txt",delimiter=",")
test1.date = test1.date.apply(lambda x :datetime.strptime(x, "%Y-%m-%d %H:%M:%S").time() )
test1=test1.sort_values(by="date")
test1[['Temperature',"Humidity","Light","CO2","HumidityRatio"]] = StandardScaler().fit_transform(test1[['Temperature',"Humidity","Light","CO2","HumidityRatio"]])
test1["TemperatureMA"]=test1.Temperature.rolling(5,min_periods=1).mean()
test1["HumidityMA"]=test1.Humidity.rolling(5,min_periods=1).mean()
test1["LightMA"]=test1.Light.rolling(5,min_periods=1).mean()
test1["CO2MA"]=test1.CO2.rolling(5,min_periods=1).mean()
test1["HumidityRatioMA"]=test1.HumidityRatio.rolling(5,min_periods=1).mean()

test1["TemperatureStdev"]=test1.Temperature.rolling(5,min_periods=1).std()
test1["HumidityStdev"]=test1.Humidity.rolling(5,min_periods=1).std()
test1["LightStdev"]=test1.Light.rolling(5,min_periods=1).std()
test1["CO2Stdev"]=test1.CO2.rolling(5,min_periods=1).std()
test1["HumidityRatioStdev"]=test1.HumidityRatio.rolling(5,min_periods=1).std()

test1.date = test1.date.apply(lambda x :(datetime.combine(date.min, x)-datetime.min).total_seconds())
test1=test1.dropna()


In [229]:
test1.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,TemperatureMA,HumidityMA,LightMA,CO2MA,HumidityRatioMA,TemperatureStdev,HumidityStdev,LightStdev,CO2Stdev,HumidityRatioStdev
3433,0.0,-0.883532,0.9356,-0.591074,-0.833863,0.568118,0,-0.49652,0.411267,-0.591074,0.337535,0.198851,0.547317,0.741519,0.0,1.656607,0.522221
553,0.0,-0.393643,-0.820188,-0.591074,-0.825448,-1.116382,0,-0.462228,0.000782,-0.591074,-0.050126,-0.23956,0.391543,0.883413,0.0,1.350191,0.844376
6313,0.0,-0.726767,-0.108428,-0.591074,-0.060224,-0.490397,0,-0.528363,-0.026521,-0.591074,-0.052651,-0.302269,0.345976,0.723368,0.0,1.102438,0.700745
7753,0.0,-0.785554,0.126014,-0.591074,-0.132034,-0.263529,0,-0.579801,0.003986,-0.591074,-0.068527,-0.294521,0.320942,0.630158,0.0,0.955399,0.60711
4873,0.0,-1.089285,1.480805,-0.591074,-0.732881,1.036187,0,-0.775756,0.322761,-0.591074,-0.51689,-0.0532,0.254176,0.901055,0.0,0.386972,0.857108


### Accuracy

In [230]:
accuracy_score(test1.Occupancy, clf.predict(test1.drop("Occupancy",axis=1))) 

0.9586709055481489

## Test Set 2 

### Transformation/Preprocessing wie oben

In [231]:
test2 = pd.read_csv("data2_timeseries_occupancy_test2.txt",delimiter=",")
test2.date = test2.date.apply(lambda x :datetime.strptime(x, "%Y-%m-%d %H:%M:%S").time() )
test2=test2.sort_values(by="date")
test2[['Temperature',"Humidity","Light","CO2","HumidityRatio"]] = StandardScaler().fit_transform(test2[['Temperature',"Humidity","Light","CO2","HumidityRatio"]])
test2["TemperatureMA"]=test2.Temperature.rolling(5,min_periods=1).mean()
test2["HumidityMA"]=test2.Humidity.rolling(5,min_periods=1).mean()
test2["LightMA"]=test2.Light.rolling(5,min_periods=1).mean()
test2["CO2MA"]=test2.CO2.rolling(5,min_periods=1).mean()
test2["HumidityRatioMA"]=test2.HumidityRatio.rolling(5,min_periods=1).mean()

test2["TemperatureStdev"]=test2.Temperature.rolling(5,min_periods=1).std()
test2["HumidityStdev"]=test2.Humidity.rolling(5,min_periods=1).std()
test2["LightStdev"]=test2.Light.rolling(5,min_periods=1).std()
test2["CO2Stdev"]=test2.CO2.rolling(5,min_periods=1).std()
test2["HumidityRatioStdev"]=test2.HumidityRatio.rolling(5,min_periods=1).std()

test2.date = test2.date.apply(lambda x :(datetime.combine(date.min, x)-datetime.min).total_seconds())
test2=test2.dropna()


In [232]:
test2.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy,TemperatureMA,HumidityMA,LightMA,CO2MA,HumidityRatioMA,TemperatureStdev,HumidityStdev,LightStdev,CO2Stdev,HumidityRatioStdev
721,0.0,-0.811297,-1.294515,-0.772404,-0.910397,-1.149636,0,-0.670223,-0.734943,-0.772404,-0.735828,-0.757631,0.199508,0.791355,0.0,0.246877,0.554378
2162,60.0,-0.529149,-0.172361,-0.772404,-0.565589,-0.363782,0,-0.623199,-0.547415,-0.772404,-0.679082,-0.626348,0.162898,0.647009,0.0,0.200336,0.453181
722,60.0,-0.811297,-1.294515,-0.772404,-0.897582,-1.149636,0,-0.670223,-0.73419,-0.772404,-0.733707,-0.75717,0.162898,0.647008,0.0,0.196703,0.453181
723,120.0,-0.811297,-1.294515,-0.772404,-0.897582,-1.149636,0,-0.698438,-0.846255,-0.772404,-0.766482,-0.835663,0.154539,0.613806,0.0,0.185445,0.429925
2163,120.0,-0.529149,-0.175371,-0.772404,-0.560121,-0.365626,0,-0.698438,-0.846255,-0.772404,-0.766254,-0.835663,0.154539,0.613806,0.0,0.185761,0.429925


### Accuracy

In [233]:
accuracy_score(test2.Occupancy, clf.predict(test2.drop("Occupancy",axis=1))) 

0.9170420420420421