# **Capstone III Data Wrangling**

### **Section 1: Imports and Initial Inspection**

In [71]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

In [2]:
house_measurements = pd.read_csv("../Raw Data/train.csv")

In [3]:
house_measurements.head()

Unnamed: 0,Id,Date,Time,CO2_(dinning-room),CO2_room,Relative_humidity_(dinning-room),Relative_humidity_room,Lighting_(dinning-room),Lighting_room,Meteo_Rain,Meteo_Sun_dusk,Meteo_Wind,Meteo_Sun_light_in_west_facade,Meteo_Sun_light_in_east_facade,Meteo_Sun_light_in_south_facade,Meteo_Sun_irradiance,Outdoor_relative_humidity_Sensor,Day_of_the_week,Indoor_temperature_room
0,0,13/03/2012,11:45,216.56,221.92,39.9125,42.415,81.665,113.52,0.0,623.36,1.42625,9690.24,12604.2,95436.8,758.88,48.375,2.0,17.8275
1,1,13/03/2012,12:00,219.947,220.363,39.9267,42.2453,81.7413,113.605,0.0,623.211,1.592,11022.0,10787.2,95436.8,762.069,47.808,2.0,18.1207
2,2,13/03/2012,12:15,219.403,218.933,39.772,42.2267,81.424,113.6,0.0,622.656,1.89133,13960.5,9669.63,95398.6,766.251,47.432,2.0,18.4367
3,3,13/03/2012,12:30,218.613,217.045,39.776,42.0987,81.5013,113.344,0.0,622.571,1.828,18511.2,9648.13,95360.3,766.037,47.024,2.0,18.7513
4,4,13/03/2012,12:45,217.714,216.08,39.7757,42.0686,81.4657,113.034,0.0,622.4,2.36071,26349.0,9208.32,95354.9,762.743,45.4743,2.0,19.0414


In [4]:
house_measurements.shape

(2764, 19)

In [5]:
# With the exception of Date and Time, all columns are the correct data type
house_measurements.dtypes

Id                                    int64
Date                                 object
Time                                 object
CO2_(dinning-room)                  float64
CO2_room                            float64
Relative_humidity_(dinning-room)    float64
Relative_humidity_room              float64
Lighting_(dinning-room)             float64
Lighting_room                       float64
Meteo_Rain                          float64
Meteo_Sun_dusk                      float64
Meteo_Wind                          float64
Meteo_Sun_light_in_west_facade      float64
Meteo_Sun_light_in_east_facade      float64
Meteo_Sun_light_in_south_facade     float64
Meteo_Sun_irradiance                float64
Outdoor_relative_humidity_Sensor    float64
Day_of_the_week                     float64
Indoor_temperature_room             float64
dtype: object

In [6]:
# No NaN values
house_measurements.isna().sum()

Id                                  0
Date                                0
Time                                0
CO2_(dinning-room)                  0
CO2_room                            0
Relative_humidity_(dinning-room)    0
Relative_humidity_room              0
Lighting_(dinning-room)             0
Lighting_room                       0
Meteo_Rain                          0
Meteo_Sun_dusk                      0
Meteo_Wind                          0
Meteo_Sun_light_in_west_facade      0
Meteo_Sun_light_in_east_facade      0
Meteo_Sun_light_in_south_facade     0
Meteo_Sun_irradiance                0
Outdoor_relative_humidity_Sensor    0
Day_of_the_week                     0
Indoor_temperature_room             0
dtype: int64

### **Section 2: Format Changes**

Here we'll make some format changes. This includes creating a combined date and time column, changing the data types of the date columns, and changing column names.

In [7]:
house_measurements.head()

Unnamed: 0,Id,Date,Time,CO2_(dinning-room),CO2_room,Relative_humidity_(dinning-room),Relative_humidity_room,Lighting_(dinning-room),Lighting_room,Meteo_Rain,Meteo_Sun_dusk,Meteo_Wind,Meteo_Sun_light_in_west_facade,Meteo_Sun_light_in_east_facade,Meteo_Sun_light_in_south_facade,Meteo_Sun_irradiance,Outdoor_relative_humidity_Sensor,Day_of_the_week,Indoor_temperature_room
0,0,13/03/2012,11:45,216.56,221.92,39.9125,42.415,81.665,113.52,0.0,623.36,1.42625,9690.24,12604.2,95436.8,758.88,48.375,2.0,17.8275
1,1,13/03/2012,12:00,219.947,220.363,39.9267,42.2453,81.7413,113.605,0.0,623.211,1.592,11022.0,10787.2,95436.8,762.069,47.808,2.0,18.1207
2,2,13/03/2012,12:15,219.403,218.933,39.772,42.2267,81.424,113.6,0.0,622.656,1.89133,13960.5,9669.63,95398.6,766.251,47.432,2.0,18.4367
3,3,13/03/2012,12:30,218.613,217.045,39.776,42.0987,81.5013,113.344,0.0,622.571,1.828,18511.2,9648.13,95360.3,766.037,47.024,2.0,18.7513
4,4,13/03/2012,12:45,217.714,216.08,39.7757,42.0686,81.4657,113.034,0.0,622.4,2.36071,26349.0,9208.32,95354.9,762.743,45.4743,2.0,19.0414


In [8]:
# Set the extra Id column to be the index of the data frame.
house_measurements.set_index('Id', inplace = True)

In [9]:
# We'd like these column names to be formatted differently.
house_measurements.columns

Index(['Date', 'Time', 'CO2_(dinning-room)', 'CO2_room',
       'Relative_humidity_(dinning-room)', 'Relative_humidity_room',
       'Lighting_(dinning-room)', 'Lighting_room', 'Meteo_Rain',
       'Meteo_Sun_dusk', 'Meteo_Wind', 'Meteo_Sun_light_in_west_facade',
       'Meteo_Sun_light_in_east_facade', 'Meteo_Sun_light_in_south_facade',
       'Meteo_Sun_irradiance', 'Outdoor_relative_humidity_Sensor',
       'Day_of_the_week', 'Indoor_temperature_room'],
      dtype='object')

In [10]:
# Function to remove parentheses, make lower case, and change dashes to underscores in a list of strings.
def column_renamer(df):
    new_names = []
    for column in df.columns:
        new_name = column.lower()
        new_name = new_name.replace('(', '').replace(')', '').replace('-', '_').replace('dinning', 'dining')
        new_names.append(new_name)
    return(new_names)

In [11]:
# Replace column names with the formatted names using our column_renamer function.
new_columns = column_renamer(house_measurements)
mapper = dict(zip(house_measurements.columns, new_columns))
house_measurements.rename(mapper, axis = 'columns', inplace = True)
house_measurements.head()

Unnamed: 0_level_0,date,time,co2_dining_room,co2_room,relative_humidity_dining_room,relative_humidity_room,lighting_dining_room,lighting_room,meteo_rain,meteo_sun_dusk,meteo_wind,meteo_sun_light_in_west_facade,meteo_sun_light_in_east_facade,meteo_sun_light_in_south_facade,meteo_sun_irradiance,outdoor_relative_humidity_sensor,day_of_the_week,indoor_temperature_room
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,13/03/2012,11:45,216.56,221.92,39.9125,42.415,81.665,113.52,0.0,623.36,1.42625,9690.24,12604.2,95436.8,758.88,48.375,2.0,17.8275
1,13/03/2012,12:00,219.947,220.363,39.9267,42.2453,81.7413,113.605,0.0,623.211,1.592,11022.0,10787.2,95436.8,762.069,47.808,2.0,18.1207
2,13/03/2012,12:15,219.403,218.933,39.772,42.2267,81.424,113.6,0.0,622.656,1.89133,13960.5,9669.63,95398.6,766.251,47.432,2.0,18.4367
3,13/03/2012,12:30,218.613,217.045,39.776,42.0987,81.5013,113.344,0.0,622.571,1.828,18511.2,9648.13,95360.3,766.037,47.024,2.0,18.7513
4,13/03/2012,12:45,217.714,216.08,39.7757,42.0686,81.4657,113.034,0.0,622.4,2.36071,26349.0,9208.32,95354.9,762.743,45.4743,2.0,19.0414


In [12]:
# Add a new column that combines the Date and Time columns 
house_measurements['date_time'] = house_measurements['date'] + ' ' + house_measurements['time']
house_measurements['date_time'].sample(10)

Id
1183    25/03/2012 19:30
2417    07/04/2012 16:00
2160    04/04/2012 23:45
965     23/03/2012 13:00
1742    31/03/2012 15:15
2761    11/04/2012 06:00
1304    27/03/2012 01:45
796     21/03/2012 18:45
355     17/03/2012 04:30
1044    24/03/2012 08:45
Name: date_time, dtype: object

In [109]:
# These should be datetime objects.
house_measurements[['date', 'time', 'date_time']].dtypes

date         object
time         object
date_time    object
dtype: object

In [98]:
# Change Date, Time, and Date_time columns to be datetime objects instead of strings.
df['date'] = pd.to_datetime(df['date'], format = '%d/%m/%Y')
df['date_time'] = pd.to_datetime(df['date_time'], format = "%d/%m/%Y %H:%M")

df[['date','date_time']].sample(10)

Unnamed: 0_level_0,date,date_time
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
2530,2012-04-08,2012-04-08 20:15:00
2617,2012-04-09,2012-04-09 18:00:00
340,2012-03-17,2012-03-17 00:45:00
630,2012-03-20,2012-03-20 01:15:00
374,2012-03-17,2012-03-17 09:15:00
1342,2012-03-27,2012-03-27 11:15:00
1759,2012-03-31,2012-03-31 19:30:00
1811,2012-04-01,2012-04-01 08:30:00
823,2012-03-22,2012-03-22 01:30:00
944,2012-03-23,2012-03-23 07:45:00


In [114]:
df[['date','date_time']].sample(10)

Unnamed: 0_level_0,date,date_time
Id,Unnamed: 1_level_1,Unnamed: 2_level_1
623,2012-03-19,2012-03-19 23:30:00
1682,2012-03-31,2012-03-31 00:15:00
2605,2012-04-09,2012-04-09 15:00:00
1180,2012-03-25,2012-03-25 18:45:00
46,2012-03-13,2012-03-13 23:15:00
2759,2012-04-11,2012-04-11 05:30:00
775,2012-03-21,2012-03-21 13:30:00
792,2012-03-21,2012-03-21 17:45:00
746,2012-03-21,2012-03-21 06:15:00
2146,2012-04-04,2012-04-04 20:15:00


In [46]:
house_measurements.dtypes

date                                datetime64[ns]
time                                        object
co2_dining_room                            float64
co2_room                                   float64
relative_humidity_dining_room              float64
relative_humidity_room                     float64
lighting_dining_room                       float64
lighting_room                              float64
meteo_rain                                 float64
meteo_sun_dusk                             float64
meteo_wind                                 float64
meteo_sun_light_in_west_facade             float64
meteo_sun_light_in_east_facade             float64
meteo_sun_light_in_south_facade            float64
meteo_sun_irradiance                       float64
outdoor_relative_humidity_sensor           float64
day_of_the_week                            float64
indoor_temperature_room                    float64
date_time                                   object
dtype: object

In [99]:
# # Change Date, Time, and Date_time columns to be datetime objects instead of strings.
# house_measurements['date'] = pd.to_datetime(house_measurements['date'], errors = 'coerce', format='%d/%m/%y')
# house_measurements['date_time'] = pd.to_datetime(house_measurements['date_time'])

# # Using .sample() has revealed to us some inconsistencies in the date format. We'll address this in the next section.
# house_measurements['date_time'].sample(10)

In [115]:
house_measurements['date'].dtypes

dtype('<M8[ns]')

In [38]:
house_measurements['date_time'].sample(15)

Id
35      13/03/2012 20:30
68      14/03/2012 04:45
1998    03/04/2012 07:15
16      13/03/2012 15:45
1148    25/03/2012 10:45
82      14/03/2012 08:15
229     15/03/2012 21:00
276     16/03/2012 08:45
1997    03/04/2012 07:00
793     21/03/2012 18:00
1515    29/03/2012 06:30
592     19/03/2012 15:45
2418    07/04/2012 16:15
153     15/03/2012 02:00
453     18/03/2012 05:00
Name: date_time, dtype: object

### **Section 3: Data Validation**

Now we'll inspect the contents of our columns to make sure the data makes sense. We know there are no missing values, but we need to make sure the data isn't invalid in other ways. For example, we shouldn't see negative values columns like the co2 columns, which are in parts per million units. We also need to see if any measurements were skipped. They should occur every fifteen minutes.

In [None]:
house_measurements.columns

In [None]:
# Function to check if the minimum values of the numeric data types are above zero
def min_max_checker(df):
    invalid_value_columns = []
    for column in df.columns:
        try:
            df[column][0] + 1 # Checks to see if column is of numeric type
            if df[column].min() < 0: # If it is numeric, and if the minimum value is below zero, add it to the list
                invalid_value_columns.append(column)
        except:
            pass # Does nothing if the column is not numeric
    return(invalid_value_columns)

In [None]:
min_max_checker(house_measurements)

In [None]:
house_measurements[min_max_checker(house_measurements)].describe()

In [None]:
negative_irrad_rows = len(house_measurements[house_measurements['meteo_sun_irradiance'] < 0])
all_rows = len(house_measurements)
print(round(negative_irrad_rows / all_rows * 100, 2), 'percent of the irradiance readings are below zero.')

It seems to be relatively common to have small negative irradiance values at night time due to measurement error, but these should usually be positive values. The reason for this might be because the value is recorded on a log scale, with an arbitrary zero point at, say, dawn or dusk. At night (which comprises half of the measurments), the absence of sun falls just below this zero irradiance point. During the day, it is vastly higher.

In [None]:
# I'm not sure what Meteo_Sun_dusk means. Let's check it out.
house_measurements['meteo_sun_dusk'].describe()

In [None]:
# Measurements are supposed to have been taken every 15 minutes. Let's find out if any rows are missing.
house_measurements['date_time'].diff().drop([0]).max()

The maximum time delta should be 0 days and fifteen minutes. If measurements were taken every fifteen minutes, we shouldn't be seeing a difference of 30 days between any two consecutive measurements, which is what the .diff() function is showing us.

In [None]:
# Bring in a fresh subsetted version of the data for experimentation
df = pd.read_csv("../Raw Data/train.csv", usecols = ['Date'])
df.head()

In [None]:
# It seems 12 dates were unconverted, and so are in an inconsistent date format
df['date_obj'] = pd.to_datetime(df['Date'], errors = 'ignore', format='%d/%m/%y')

In [None]:
df['date_obj'].unique()

In [None]:
house_measurements['date'].unique()

In [None]:
house_measurements['date'].head()

In [None]:
house_measurements.dtypes

In [None]:
len(df), len(house_measurements)

In [None]:
house_measurements = house_measurements.sort_values('Date_time')
house_measurements['diff_datetime'] = house_measurements['Date_time'].diff().drop([0])
house_measurements[house_measurements['diff_datetime'] == house_measurements['Date_time'].diff().drop([0]).max()]

In [None]:
#df = house_measurements.set_index('Date_time')

In [None]:
#df.index.diff().drop([0]).max()