### Import libraries.

In [181]:
import pandas as pd
import string
from functions import to_snake_case
import pickle

#### Data Source [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/datasets/Occupancy+Detection+#)

### Read the dataset into a dataframe object.

In [160]:
df1 = pd.read_csv('../datasets/datatraining.txt')
df2 = pd.read_csv('../datasets/datatest.txt')
df3 = pd.read_csv('../datasets/datatest2.txt')

### Merge all datasets into one large dataframe for cleaning purposes.

In [161]:
df = pd.concat([df1, df2], ignore_index=True)
df = pd.concat([df1, df3], ignore_index=True)

### Check for duplicate observations. 
#### There are none.

In [162]:
df.duplicated().value_counts()

False    17895
dtype: int64

### Check the shape of the dataframe.

In [163]:
df.shape

(17895, 7)

### Peek at the data.

In [164]:
df.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
1,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
2,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
3,2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
4,2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


### Sort all observations in the dataframe by date.
#### We need the date sorted for time series indexing later.

In [165]:
df.sort_values(by=['date'], inplace=True)

In [166]:
df.head()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
0,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
1,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
2,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
3,2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
4,2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


In [167]:
df.tail()

Unnamed: 0,date,Temperature,Humidity,Light,CO2,HumidityRatio,Occupancy
17890,2015-02-18 09:15:00,20.815,27.7175,429.75,1505.25,0.004213,1
17891,2015-02-18 09:16:00,20.865,27.745,423.5,1514.5,0.00423,1
17892,2015-02-18 09:16:59,20.89,27.745,423.5,1521.5,0.004237,1
17893,2015-02-18 09:17:59,20.89,28.0225,418.75,1632.0,0.004279,1
17894,2015-02-18 09:19:00,21.0,28.1,409.0,1864.0,0.004321,1


### Check for missing values.
#### There are none.

In [168]:
df.isnull().sum()

date             0
Temperature      0
Humidity         0
Light            0
CO2              0
HumidityRatio    0
Occupancy        0
dtype: int64

### Check the data types for each column.
#### Notice date is not in DateTime format.  
#### We will need to convert to allow time series indexing.

In [169]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17895 entries, 0 to 17894
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           17895 non-null  object 
 1   Temperature    17895 non-null  float64
 2   Humidity       17895 non-null  float64
 3   Light          17895 non-null  float64
 4   CO2            17895 non-null  float64
 5   HumidityRatio  17895 non-null  float64
 6   Occupancy      17895 non-null  int64  
dtypes: float64(5), int64(1), object(1)
memory usage: 1.1+ MB


### Rename all columns to follow snake-case conventions.

In [170]:
df.columns = df.columns.map(lambda x: to_snake_case(x))

In [171]:
df.head()

Unnamed: 0,date,temperature,humidity,light,co2,humidity_ratio,occupancy
0,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
1,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
2,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
3,2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
4,2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


### Sort the observations by the date column.
#### We need the date column sorted in ascending order in preparation for time series indexing.

In [172]:
df.sort_values(by='date', inplace=True)

In [173]:
df.head()

Unnamed: 0,date,temperature,humidity,light,co2,humidity_ratio,occupancy
0,2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
1,2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
2,2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
3,2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
4,2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


### Save the cleaned dataset to a csv file for later use.

In [150]:
df.to_csv('../datasets/occupancy.csv')

### Create time series index using the date column.

#### Convert the date column from object/string type to DateTime type.

In [174]:
df['date'] = df['date'].map(lambda x: pd.to_datetime(x))

In [175]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17895 entries, 0 to 17894
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   date            17895 non-null  datetime64[ns]
 1   temperature     17895 non-null  float64       
 2   humidity        17895 non-null  float64       
 3   light           17895 non-null  float64       
 4   co2             17895 non-null  float64       
 5   humidity_ratio  17895 non-null  float64       
 6   occupancy       17895 non-null  int64         
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 1.1 MB


In [176]:
df.set_index('date', inplace=True)

In [177]:
df.head()

Unnamed: 0_level_0,temperature,humidity,light,co2,humidity_ratio,occupancy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


In [178]:
df.sort_index(inplace=True)

In [179]:
df.head()

Unnamed: 0_level_0,temperature,humidity,light,co2,humidity_ratio,occupancy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-02-04 17:51:00,23.18,27.272,426.0,721.25,0.004793,1
2015-02-04 17:51:59,23.15,27.2675,429.5,714.0,0.004783,1
2015-02-04 17:53:00,23.15,27.245,426.0,713.5,0.004779,1
2015-02-04 17:54:00,23.15,27.2,426.0,708.25,0.004772,1
2015-02-04 17:55:00,23.1,27.2,426.0,704.5,0.004757,1


In [180]:
df.tail()

Unnamed: 0_level_0,temperature,humidity,light,co2,humidity_ratio,occupancy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-02-18 09:15:00,20.815,27.7175,429.75,1505.25,0.004213,1
2015-02-18 09:16:00,20.865,27.745,423.5,1514.5,0.00423,1
2015-02-18 09:16:59,20.89,27.745,423.5,1521.5,0.004237,1
2015-02-18 09:17:59,20.89,28.0225,418.75,1632.0,0.004279,1
2015-02-18 09:19:00,21.0,28.1,409.0,1864.0,0.004321,1


### Picke the data for later use. This will preserve the time series indexing.

In [185]:
df.to_pickle('../datasets/occupancy.p')