# Preparation

In [3]:
import pandas as pd

import sys
sys.path.append( '../../src/' )

from ml.preprocessing import *

In [None]:
%%time
sheets = [ 'Friday 15 May', 'Sunday 24 May', 'Monday 25 May', 'Tuesday 26 May', 
          'Wednesday 27 May', 'Thursday 28 May', 'Friday 29 May', 'Sunday May 31', 
          'Monday June 1', 'Thursday June 4', 'Friday June 5' ]

df = read_sheets( '../../datasets/binary files/dataset-residential.xlsx', sheets )

## Removing null values

In [None]:
# Report the row count, then the number of missing values in each column
# (displayed as the cell's output).
print( 'Length: ', len( df ) )
print( 'Null values:' )
df.isna().sum()

In [None]:
# Keep only the rows where both `occ` and `hum` are present.
df = df.dropna( subset = [ 'occ', 'hum' ], how = 'any' )

In [None]:
# Report the row count, then the number of missing values in each column
# (displayed as the cell's output).
print( 'Length: ', len( df ) )
print( 'Null values:' )
df.isna().sum()

In [None]:
# Persist the 1-second dataset; the index is written along with the columns
# (to_csv default).
df.to_csv( path_or_buf = '../../datasets/ds-residential-1sec.csv' )

## Generating other resolutions

In [4]:
from sklearn.model_selection import train_test_split 
import os

# Path components, relative to the notebook's directory, of the datasets
# folder and of the sub-folder that receives the resampled train/test files.
path = [ '..', '..', 'datasets' ]
out_path = path + [ 'resolutions' ]

def split_and_save( df, out_path, resolution ):
    """Split `df` into train/test partitions and write each one to a CSV file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns plus the `occ` target column.
    out_path : sequence of str
        Path components of the output directory (joined with `os.path.join`).
        The directory is created when it does not exist yet.
    resolution : str
        Tag inserted into the file names: `ds-<resolution>-train.csv` and
        `ds-<resolution>-test.csv`.

    Notes
    -----
    20 % of the rows go to the test file. `random_state = 0` is fixed so
    that re-running the notebook reproduces the same partition. In both
    files `occ` is written as the last column.
    """
    # Select the features by name instead of by position, so no feature
    # column is lost when `occ` is not the last column of `df`.
    features = df.drop( 'occ', axis = 1 )
    x_train, x_test, y_train, y_test = train_test_split( features, df[ 'occ' ], test_size = 0.20, random_state = 0 )

    # assign() builds new frames instead of writing into the split results,
    # which avoids pandas' SettingWithCopyWarning.
    train = x_train.assign( occ = y_train )
    test = x_test.assign( occ = y_test )

    # Make sure the target directory exists before writing.
    if out_path:
        os.makedirs( os.path.join( *out_path ), exist_ok = True )

    train.to_csv( os.path.join( *out_path, 'ds-%s-train.csv' % ( resolution ) ) )
    test.to_csv( os.path.join( *out_path, 'ds-%s-test.csv' % ( resolution ) ) )

In [5]:
# Reload the 1-second dataset with `date` parsed as the index, and leave the
# `ven` column out of the frame used to build the other resolutions.
df = (
    pd.read_csv( os.path.join( *path, 'ds-residential-1sec.csv' ), index_col = 'date', parse_dates = [ 'date' ] )
    .drop( columns = 'ven' )
)

### 10 seconds (sampled and averaged)

In [6]:
# 10-second resolution: one train/test pair from resample_df and one from
# resample_df_avg (values rounded to 2 decimals).
split_and_save( df = resample_df( df, '10S' ), out_path = out_path, resolution = 'home-10sec' )
split_and_save( df = resample_df_avg( df, '10S' ).round( 2 ), out_path = out_path, resolution = 'home-10sec-avg' )

### 30 seconds (sampled and averaged)

In [7]:
# 30-second resolution: one train/test pair from resample_df and one from
# resample_df_avg (values rounded to 2 decimals).
split_and_save( df = resample_df( df, '30S' ), out_path = out_path, resolution = 'home-30sec' )
split_and_save( df = resample_df_avg( df, '30S' ).round( 2 ), out_path = out_path, resolution = 'home-30sec-avg' )

### 1 minute (sampled and averaged)

In [8]:
# 1-minute resolution: one train/test pair from resample_df and one from
# resample_df_avg (values rounded to 2 decimals).
split_and_save( df = resample_df( df, '1min' ), out_path = out_path, resolution = 'home-1min' )
split_and_save( df = resample_df_avg( df, '1min' ).round( 2 ), out_path = out_path, resolution = 'home-1min-avg' )

### 5 minutes (sampled and averaged)

In [9]:
# 5-minute resolution: one train/test pair from resample_df and one from
# resample_df_avg (values rounded to 2 decimals).
split_and_save( df = resample_df( df, '5min' ), out_path = out_path, resolution = 'home-5min' )
split_and_save( df = resample_df_avg( df, '5min' ).round( 2 ), out_path = out_path, resolution = 'home-5min-avg' )

## Visualization

In [None]:
from ml.visualization import *

### Complete dataset

In [None]:
%%time
plot_env_vars( df.index.astype( str ), df[ 'tem' ], df[ 'hum' ], df[ 'occ' ] )

# plot_single( df.index.astype( str ), df[ 'tem' ], 'Temperature' )
# plot_single( df.index.astype( str ), df[ 'hum' ], 'Humidity' )
# plot_single( df.index.astype( str ), df[ 'pre' ], 'Pressure' )

### By day

In [None]:
# Daily aggregates of the environment variables; days without complete data
# are dropped.
# `ven` is removed from `df` when the 1-second CSV is reloaded earlier in this
# notebook, so the spec is filtered to the columns actually present; otherwise
# this cell raises a KeyError on a fresh Restart & Run All.
daily_spec = { 'pre': 'mean', 'hum': 'mean', 'tem': 'mean', 'ven': [ 'mean', 'count' ] }
df_temp = df.resample( 'D' ).agg( { col: fn for col, fn in daily_spec.items() if col in df.columns } ).dropna( how = 'any' )
df_temp

In [None]:
from collections import Counter

# Group the rows by calendar day (index floored to midnight), then plot each
# day and print its row count and occupancy class counts.
df_temp = df.groupby( df.index.floor( 'D' ) )
for _, day_rows in df_temp:
    day_label = day_rows.index[ 0 ].strftime( '%Y-%b-%d' )
    title = "Temperature and Humidity (" + day_label + ")"
    plot_env_vars( day_rows.index, day_rows[ 'tem' ], day_rows[ 'hum' ], day_rows[ 'occ' ], title )

    print( 'Total:', len( day_rows ) )
    print( Counter( day_rows.occ ) )