In [30]:
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [17]:
data = pd.read_csv("./dataset/occupancy_est.csv")
#average out s1_temp, s2_temp, s3_temp and s4_temp into temperature
data.dtypes

Date                     object
Time                     object
S1_Temp                 float64
S2_Temp                 float64
S3_Temp                 float64
S4_Temp                 float64
S1_Light                  int64
S2_Light                  int64
S3_Light                  int64
S4_Light                  int64
S1_Sound                float64
S2_Sound                float64
S3_Sound                float64
S4_Sound                float64
S5_CO2                    int64
S5_CO2_Slope            float64
S6_PIR                    int64
S7_PIR                    int64
Room_Occupancy_Count      int64
dtype: object

## Data Description

In [12]:
data.describe()

Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR,Room_Occupancy_Count
count,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0,10129.0
mean,25.454012,25.546059,25.056621,25.754125,25.445059,26.01629,34.248494,13.220259,0.168178,0.120066,0.158119,0.10384,460.860401,-0.00483,0.090137,0.079574,0.398559
std,0.351351,0.586325,0.427283,0.356434,51.011264,67.30417,58.400744,19.602219,0.316709,0.266503,0.413637,0.120683,199.96494,1.16499,0.286392,0.270645,0.893633
min,24.94,24.75,24.44,24.94,0.0,0.0,0.0,0.0,0.06,0.04,0.04,0.05,345.0,-6.296154,0.0,0.0,0.0
25%,25.19,25.19,24.69,25.44,0.0,0.0,0.0,0.0,0.07,0.05,0.06,0.06,355.0,-0.046154,0.0,0.0,0.0
50%,25.38,25.38,24.94,25.75,0.0,0.0,0.0,0.0,0.08,0.05,0.06,0.08,360.0,0.0,0.0,0.0,0.0
75%,25.63,25.63,25.38,26.0,12.0,14.0,50.0,22.0,0.08,0.06,0.07,0.1,465.0,0.0,0.0,0.0,0.0
max,26.38,29.0,26.19,26.56,165.0,258.0,280.0,74.0,3.88,3.44,3.67,3.4,1270.0,8.980769,1.0,1.0,3.0


## Check for misising or null values

In [9]:
data.isnull().sum()

Date                    0
Time                    0
S1_Temp                 0
S2_Temp                 0
S3_Temp                 0
S4_Temp                 0
S1_Light                0
S2_Light                0
S3_Light                0
S4_Light                0
S1_Sound                0
S2_Sound                0
S3_Sound                0
S4_Sound                0
S5_CO2                  0
S5_CO2_Slope            0
S6_PIR                  0
S7_PIR                  0
Room_Occupancy_Count    0
dtype: int64

# Data Pre-processing
## 1. Split the data into training and testing sets
## 2. Feature Scaling

In [61]:
#splitting the data into train and test
x_train, x_test, y_train, y_test = train_test_split(data.drop('Room_Occupancy_Count', axis=1), data['Room_Occupancy_Count'], test_size=0.2, random_state=42)
# Scaling the data
scaler = StandardScaler()
# Select only the numeric columns
numeric_columns = x_train.select_dtypes(include=np.number).columns
categorical_columns = ["Date"]

# Scale the numeric columns
x_train[numeric_columns].head()
x_test[numeric_columns].head()
# x_train[numeric_columns] = scaler.fit_transform(x_train[numeric_columns])
# x_test[numeric_columns] = scaler.transform(x_test[numeric_columns])



Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,...,S5_CO2_Slope,S6_PIR,S7_PIR,0,1,2,3,4,5,6
8855,-0.409902,-0.402627,-0.13252,-0.881342,-0.498837,-0.386567,-0.586468,-0.67446,-0.310008,-0.300448,...,0.004146,-0.314749,-0.294029,0.0,0.0,0.0,0.0,0.0,1.0,0.0
532,1.724823,0.450184,1.482417,1.559623,1.834096,0.074051,0.423843,1.264192,-0.278432,-0.262923,...,0.644659,-0.314749,-0.294029,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1155,0.671692,0.347846,0.593031,0.521511,-0.498837,-0.386567,-0.586468,-0.67446,-0.310008,-0.262923,...,-1.890979,-0.314749,-0.294029,1.0,0.0,0.0,0.0,0.0,0.0,0.0
7769,-0.751458,-0.607302,-0.9985,-0.881342,-0.498837,-0.386567,-0.586468,-0.67446,-0.278432,-0.225398,...,-0.071791,-0.314749,-0.294029,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4922,-0.751458,-0.607302,-0.9985,-0.713,-0.498837,-0.386567,-0.586468,-0.67446,-0.341584,-0.262923,...,-0.015664,-0.314749,-0.294029,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [78]:
# label the categorical columnn (0,1,2,3,4,5,6) by prefixing them with 'day' 

x_train = pd.get_dummies(x_train, columns=[0,1,2,3,4,5,6], drop_first=True, prefix='day')
x_train.head()


Unnamed: 0,Time,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,...,S5_CO2_Slope,S6_PIR,S7_PIR,day_1.0,day_1.0.1,day_1.0.2,day_1.0.3,day_1.0.4,day_1.0.5,day_1.0.6
1937,04:06:10,-0.751458,-0.607302,-0.9985,-0.544657,-0.498837,-0.386567,-0.586468,-0.67446,-0.278432,...,0.004146,-0.314749,-0.294029,False,True,False,False,False,False,False
4477,02:02:57,-0.210661,-0.283234,-0.577212,-0.011573,-0.498837,-0.386567,-0.586468,-0.67446,-0.310008,...,0.004146,-0.314749,-0.294029,False,False,True,False,False,False,False
8550,19:26:17,0.301673,0.143172,0.593031,-0.179915,-0.498837,-0.386567,-0.586468,-0.67446,-0.310008,...,-2.22114,-0.314749,-0.294029,False,False,False,False,False,True,False
1346,23:00:19,0.130895,-0.078559,0.007909,0.353169,-0.498837,-0.386567,-0.586468,-0.67446,-0.310008,...,-0.547223,-0.314749,-0.294029,True,False,False,False,False,False,False
7296,02:23:04,-0.210661,-0.283234,-0.413378,-0.011573,-0.498837,-0.386567,-0.586468,-0.67446,-0.310008,...,0.004146,-0.314749,-0.294029,False,False,False,False,True,False,False


We notice that we have particular days from our EDA. So we can categorize the dates but not the time.



In [82]:
linear_reg_model = LinearRegression()
copy_x_train = x_train.copy()
copy_y_train = y_train.copy()
numeric_columns = copy_x_train.select_dtypes(include=np.number).columns
copy_x_train[numeric_columns].head()
# print(x_train[numeric_columns].shape , y_train.shape)


Unnamed: 0,S1_Temp,S2_Temp,S3_Temp,S4_Temp,S1_Light,S2_Light,S3_Light,S4_Light,S1_Sound,S2_Sound,S3_Sound,S4_Sound,S5_CO2,S5_CO2_Slope,S6_PIR,S7_PIR
1937,-0.751458,-0.607302,-0.9985,-0.544657,-0.498837,-0.386567,-0.586468,-0.67446,-0.278432,-0.262923,-0.237223,-0.446154,-0.504415,0.004146,-0.314749,-0.294029
4477,-0.210661,-0.283234,-0.577212,-0.011573,-0.498837,-0.386567,-0.586468,-0.67446,-0.310008,-0.262923,-0.237223,-0.363288,-0.47941,0.004146,-0.314749,-0.294029
8550,0.301673,0.143172,0.593031,-0.179915,-0.498837,-0.386567,-0.586468,-0.67446,-0.310008,-0.262923,-0.2614,-0.197556,0.345776,-2.22114,-0.314749,-0.294029
1346,0.130895,-0.078559,0.007909,0.353169,-0.498837,-0.386567,-0.586468,-0.67446,-0.310008,-0.300448,-0.213046,-0.363288,-0.354382,-0.547223,-0.314749,-0.294029
7296,-0.210661,-0.283234,-0.413378,-0.011573,-0.498837,-0.386567,-0.586468,-0.67446,-0.310008,-0.262923,-0.237223,-0.031824,-0.529421,0.004146,-0.314749,-0.294029
