# Cleaning all of the data: Making all values numeric to be able to train an run KNN model

In [1]:
# Import libraries
import numpy as np      
import pandas as pd    
from IPython.display import display
import matplotlib.pyplot as plt
import random
from scipy.stats.mstats import winsorize


In [2]:
# Read in the data 
filename = '../CleanedData/AllFebData.csv'
all_feb_data = pd.read_csv(filename)      
print(f"{filename} : file read into a pandas dataframe.")

../CleanedData/AllFebData.csv : file read into a pandas dataframe.


In [7]:
# Look at the data 
display(all_feb_data)

Unnamed: 0,Account Number,Meter Number,Consumption,Date,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F)
0,0001-0001-09,53884583,33.910,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4
1,0001-0002-01,53962882,61.640,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4
2,0001-0003-02,53884582,56.590,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4
3,0001-0004-07,53962881,62.870,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4
4,0001-0005-07,53884605,107.090,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511439,0165-1175-01,54423961,10.100,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0
511440,0165-1176-01,54423960,13.270,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0
511441,0165-1177-01,54423962,31.550,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0
511442,0165-1178-01,54359361,15.960,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0


In [3]:
# Look at the data's info
all_feb_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 511444 entries, 0 to 511443
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Account Number        511444 non-null  object 
 1   Meter Number          511444 non-null  object 
 2   Consumption           511444 non-null  object 
 3   Date                  511444 non-null  object 
 4   ETo (in)              511444 non-null  float64
 5   Precip (in)           511444 non-null  float64
 6   Sol Rad (Ly/day)      511444 non-null  int64  
 7   Avg Vap Pres (mBars)  511444 non-null  float64
 8   Max Air Temp (F)      511444 non-null  float64
 9   Min Air Temp (F)      511444 non-null  float64
 10  Avg Air Temp (F)      511444 non-null  float64
 11  Max Rel Hum (%)       511444 non-null  int64  
 12  Min Rel Hum (%)       511444 non-null  int64  
 13  Avg Rel Hum (%)       511444 non-null  int64  
 14  Dew Point (F)         511444 non-null  float64
 15  

### 1) Turning Account Number Column Values into Integers

In [4]:
is_num = all_feb_data["Account Number"] != "Account Number"

all_feb_data2 = all_feb_data[is_num]

all_feb_data2

all_feb_data = all_feb_data2

In [5]:
all_feb_data["AccountNumber"] = all_feb_data["Account Number"].apply(lambda x: x.replace('-', ''))

In [6]:
display(all_feb_data)
all_feb_data.info()

Unnamed: 0,Account Number,Meter Number,Consumption,Date,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),AccountNumber
0,0001-0001-09,53884583,33.910,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,0001000109
1,0001-0002-01,53962882,61.640,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,0001000201
2,0001-0003-02,53884582,56.590,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,0001000302
3,0001-0004-07,53962881,62.870,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,0001000407
4,0001-0005-07,53884605,107.090,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,0001000507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511439,0165-1175-01,54423961,10.100,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,0165117501
511440,0165-1176-01,54423960,13.270,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,0165117601
511441,0165-1177-01,54423962,31.550,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,0165117701
511442,0165-1178-01,54359361,15.960,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,0165117801


<class 'pandas.core.frame.DataFrame'>
Int64Index: 511419 entries, 0 to 511443
Data columns (total 19 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Account Number        511419 non-null  object 
 1   Meter Number          511419 non-null  object 
 2   Consumption           511419 non-null  object 
 3   Date                  511419 non-null  object 
 4   ETo (in)              511419 non-null  float64
 5   Precip (in)           511419 non-null  float64
 6   Sol Rad (Ly/day)      511419 non-null  int64  
 7   Avg Vap Pres (mBars)  511419 non-null  float64
 8   Max Air Temp (F)      511419 non-null  float64
 9   Min Air Temp (F)      511419 non-null  float64
 10  Avg Air Temp (F)      511419 non-null  float64
 11  Max Rel Hum (%)       511419 non-null  int64  
 12  Min Rel Hum (%)       511419 non-null  int64  
 13  Avg Rel Hum (%)       511419 non-null  int64  
 14  Dew Point (F)         511419 non-null  float64
 15  

In [7]:
all_feb_data = all_feb_data.drop(["Account Number"], axis=1)

In [8]:
all_feb_data["Account Number"] = all_feb_data["AccountNumber"].apply(lambda x: int(x))

In [9]:
all_feb_data = all_feb_data.drop(["AccountNumber"], axis=1)

In [10]:
display(all_feb_data)
all_feb_data.info()

Unnamed: 0,Meter Number,Consumption,Date,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number
0,53884583,33.910,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000109
1,53962882,61.640,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000201
2,53884582,56.590,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000302
3,53962881,62.870,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000407
4,53884605,107.090,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511439,54423961,10.100,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117501
511440,54423960,13.270,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117601
511441,54423962,31.550,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117701
511442,54359361,15.960,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117801


<class 'pandas.core.frame.DataFrame'>
Int64Index: 511419 entries, 0 to 511443
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Meter Number          511419 non-null  object 
 1   Consumption           511419 non-null  object 
 2   Date                  511419 non-null  object 
 3   ETo (in)              511419 non-null  float64
 4   Precip (in)           511419 non-null  float64
 5   Sol Rad (Ly/day)      511419 non-null  int64  
 6   Avg Vap Pres (mBars)  511419 non-null  float64
 7   Max Air Temp (F)      511419 non-null  float64
 8   Min Air Temp (F)      511419 non-null  float64
 9   Avg Air Temp (F)      511419 non-null  float64
 10  Max Rel Hum (%)       511419 non-null  int64  
 11  Min Rel Hum (%)       511419 non-null  int64  
 12  Avg Rel Hum (%)       511419 non-null  int64  
 13  Dew Point (F)         511419 non-null  float64
 14  Avg Wind Speed (mph)  511419 non-null  float64
 15  

In [11]:
# Reset the Indices 
all_feb_data.reset_index(drop=True, inplace=True)
display(all_feb_data)

Unnamed: 0,Meter Number,Consumption,Date,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number
0,53884583,33.910,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000109
1,53962882,61.640,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000201
2,53884582,56.590,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000302
3,53962881,62.870,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000407
4,53884605,107.090,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000507
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511414,54423961,10.100,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117501
511415,54423960,13.270,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117601
511416,54423962,31.550,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117701
511417,54359361,15.960,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117801


### Turning Meter Number column values into integers

In [12]:
all_feb_data["MeterNumber"] = all_feb_data["Meter Number"].apply(lambda x: x.replace('a', ''))

In [13]:
display(all_feb_data)

Unnamed: 0,Meter Number,Consumption,Date,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number,MeterNumber
0,53884583,33.910,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000109,53884583
1,53962882,61.640,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000201,53962882
2,53884582,56.590,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000302,53884582
3,53962881,62.870,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000407,53962881
4,53884605,107.090,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000507,53884605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511414,54423961,10.100,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117501,54423961
511415,54423960,13.270,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117601,54423960
511416,54423962,31.550,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117701,54423962
511417,54359361,15.960,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117801,54359361


In [14]:
# Turn the data in the Meter Number column from strings to ints

all_feb_data["MeterNumber"] = pd.to_numeric(all_feb_data["MeterNumber"])
display(all_feb_data)
all_feb_data.info()

Unnamed: 0,Meter Number,Consumption,Date,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number,MeterNumber
0,53884583,33.910,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000109,53884583
1,53962882,61.640,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000201,53962882
2,53884582,56.590,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000302,53884582
3,53962881,62.870,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000407,53962881
4,53884605,107.090,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000507,53884605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511414,54423961,10.100,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117501,54423961
511415,54423960,13.270,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117601,54423960
511416,54423962,31.550,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117701,54423962
511417,54359361,15.960,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117801,54359361


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511419 entries, 0 to 511418
Data columns (total 19 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Meter Number          511419 non-null  object 
 1   Consumption           511419 non-null  object 
 2   Date                  511419 non-null  object 
 3   ETo (in)              511419 non-null  float64
 4   Precip (in)           511419 non-null  float64
 5   Sol Rad (Ly/day)      511419 non-null  int64  
 6   Avg Vap Pres (mBars)  511419 non-null  float64
 7   Max Air Temp (F)      511419 non-null  float64
 8   Min Air Temp (F)      511419 non-null  float64
 9   Avg Air Temp (F)      511419 non-null  float64
 10  Max Rel Hum (%)       511419 non-null  int64  
 11  Min Rel Hum (%)       511419 non-null  int64  
 12  Avg Rel Hum (%)       511419 non-null  int64  
 13  Dew Point (F)         511419 non-null  float64
 14  Avg Wind Speed (mph)  511419 non-null  float64
 15  

### Turn the Consumption Data into ints 

In [15]:
# Turn the data in the Consumption column from strings to ints

all_feb_data["consumption"] = pd.to_numeric(all_feb_data["Consumption"])
display(all_feb_data)
all_feb_data.info()

Unnamed: 0,Meter Number,Consumption,Date,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number,MeterNumber,consumption
0,53884583,33.910,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000109,53884583,33.91
1,53962882,61.640,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000201,53962882,61.64
2,53884582,56.590,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000302,53884582,56.59
3,53962881,62.870,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000407,53962881,62.87
4,53884605,107.090,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000507,53884605,107.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511414,54423961,10.100,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117501,54423961,10.10
511415,54423960,13.270,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117601,54423960,13.27
511416,54423962,31.550,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117701,54423962,31.55
511417,54359361,15.960,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117801,54359361,15.96


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511419 entries, 0 to 511418
Data columns (total 20 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Meter Number          511419 non-null  object 
 1   Consumption           511419 non-null  object 
 2   Date                  511419 non-null  object 
 3   ETo (in)              511419 non-null  float64
 4   Precip (in)           511419 non-null  float64
 5   Sol Rad (Ly/day)      511419 non-null  int64  
 6   Avg Vap Pres (mBars)  511419 non-null  float64
 7   Max Air Temp (F)      511419 non-null  float64
 8   Min Air Temp (F)      511419 non-null  float64
 9   Avg Air Temp (F)      511419 non-null  float64
 10  Max Rel Hum (%)       511419 non-null  int64  
 11  Min Rel Hum (%)       511419 non-null  int64  
 12  Avg Rel Hum (%)       511419 non-null  int64  
 13  Dew Point (F)         511419 non-null  float64
 14  Avg Wind Speed (mph)  511419 non-null  float64
 15  

### Turn the Date into a numeric value for the Day of the Week

In [16]:
all_feb_data["date"] = pd.to_datetime(all_feb_data["Date"])
display(all_feb_data)

Unnamed: 0,Meter Number,Consumption,Date,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),...,Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number,MeterNumber,consumption,date
0,53884583,33.910,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,...,7,20,22.7,2.8,67.8,54.4,1000109,53884583,33.91,2022-02-13
1,53962882,61.640,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,...,7,20,22.7,2.8,67.8,54.4,1000201,53962882,61.64,2022-02-13
2,53884582,56.590,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,...,7,20,22.7,2.8,67.8,54.4,1000302,53884582,56.59,2022-02-13
3,53962881,62.870,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,...,7,20,22.7,2.8,67.8,54.4,1000407,53962881,62.87,2022-02-13
4,53884605,107.090,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,...,7,20,22.7,2.8,67.8,54.4,1000507,53884605,107.09,2022-02-13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511414,54423961,10.100,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,...,12,23,23.4,3.7,90.0,51.0,165117501,54423961,10.10,2022-02-08
511415,54423960,13.270,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,...,12,23,23.4,3.7,90.0,51.0,165117601,54423960,13.27,2022-02-08
511416,54423962,31.550,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,...,12,23,23.4,3.7,90.0,51.0,165117701,54423962,31.55,2022-02-08
511417,54359361,15.960,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,...,12,23,23.4,3.7,90.0,51.0,165117801,54359361,15.96,2022-02-08


In [17]:
all_feb_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511419 entries, 0 to 511418
Data columns (total 21 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   Meter Number          511419 non-null  object        
 1   Consumption           511419 non-null  object        
 2   Date                  511419 non-null  object        
 3   ETo (in)              511419 non-null  float64       
 4   Precip (in)           511419 non-null  float64       
 5   Sol Rad (Ly/day)      511419 non-null  int64         
 6   Avg Vap Pres (mBars)  511419 non-null  float64       
 7   Max Air Temp (F)      511419 non-null  float64       
 8   Min Air Temp (F)      511419 non-null  float64       
 9   Avg Air Temp (F)      511419 non-null  float64       
 10  Max Rel Hum (%)       511419 non-null  int64         
 11  Min Rel Hum (%)       511419 non-null  int64         
 12  Avg Rel Hum (%)       511419 non-null  int64         
 13 

In [20]:
all_feb_data["DayOfWeek"] = all_feb_data["date"].dt.dayofweek
display(all_feb_data)
all_feb_data.info()

Unnamed: 0,Meter Number,Consumption,Date,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),...,Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number,MeterNumber,consumption,date,DayOfWeek
0,53884583,33.910,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,...,20,22.7,2.8,67.8,54.4,1000109,53884583,33.91,2022-02-13,6
1,53962882,61.640,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,...,20,22.7,2.8,67.8,54.4,1000201,53962882,61.64,2022-02-13,6
2,53884582,56.590,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,...,20,22.7,2.8,67.8,54.4,1000302,53884582,56.59,2022-02-13,6
3,53962881,62.870,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,...,20,22.7,2.8,67.8,54.4,1000407,53962881,62.87,2022-02-13,6
4,53884605,107.090,2-13-22,0.13,0.00,424,4.2,85.5,47.1,64.8,...,20,22.7,2.8,67.8,54.4,1000507,53884605,107.09,2022-02-13,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511414,54423961,10.100,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,...,23,23.4,3.7,90.0,51.0,165117501,54423961,10.10,2022-02-08,1
511415,54423960,13.270,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,...,23,23.4,3.7,90.0,51.0,165117601,54423960,13.27,2022-02-08,1
511416,54423962,31.550,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,...,23,23.4,3.7,90.0,51.0,165117701,54423962,31.55,2022-02-08,1
511417,54359361,15.960,2-8-22,0.14,0.01,386,4.3,79.0,42.9,62.0,...,23,23.4,3.7,90.0,51.0,165117801,54359361,15.96,2022-02-08,1


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511419 entries, 0 to 511418
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   Meter Number          511419 non-null  object        
 1   Consumption           511419 non-null  object        
 2   Date                  511419 non-null  object        
 3   ETo (in)              511419 non-null  float64       
 4   Precip (in)           511419 non-null  float64       
 5   Sol Rad (Ly/day)      511419 non-null  int64         
 6   Avg Vap Pres (mBars)  511419 non-null  float64       
 7   Max Air Temp (F)      511419 non-null  float64       
 8   Min Air Temp (F)      511419 non-null  float64       
 9   Avg Air Temp (F)      511419 non-null  float64       
 10  Max Rel Hum (%)       511419 non-null  int64         
 11  Min Rel Hum (%)       511419 non-null  int64         
 12  Avg Rel Hum (%)       511419 non-null  int64         
 13 

In [21]:
all_feb_data = all_feb_data.drop(["Meter Number", "Consumption", "Date", "date"], axis = 1)
display(all_feb_data)

Unnamed: 0,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number,MeterNumber,consumption,DayOfWeek
0,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000109,53884583,33.91,6
1,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000201,53962882,61.64,6
2,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000302,53884582,56.59,6
3,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000407,53962881,62.87,6
4,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000507,53884605,107.09,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511414,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117501,54423961,10.10,1
511415,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117601,54423960,13.27,1
511416,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117701,54423962,31.55,1
511417,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117801,54359361,15.96,1


In [22]:
# Move the consumption column to the end of the data frame 
df1 = all_feb_data.pop('consumption') 
all_feb_data['consumption']= df1 

display(all_feb_data)
all_feb_data.info()

Unnamed: 0,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number,MeterNumber,DayOfWeek,consumption
0,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000109,53884583,6,33.91
1,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000201,53962882,6,61.64
2,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000302,53884582,6,56.59
3,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000407,53962881,6,62.87
4,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000507,53884605,6,107.09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
511414,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117501,54423961,1,10.10
511415,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117601,54423960,1,13.27
511416,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117701,54423962,1,31.55
511417,0.14,0.01,386,4.3,79.0,42.9,62.0,48,12,23,23.4,3.7,90.0,51.0,165117801,54359361,1,15.96


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511419 entries, 0 to 511418
Data columns (total 18 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   ETo (in)              511419 non-null  float64
 1   Precip (in)           511419 non-null  float64
 2   Sol Rad (Ly/day)      511419 non-null  int64  
 3   Avg Vap Pres (mBars)  511419 non-null  float64
 4   Max Air Temp (F)      511419 non-null  float64
 5   Min Air Temp (F)      511419 non-null  float64
 6   Avg Air Temp (F)      511419 non-null  float64
 7   Max Rel Hum (%)       511419 non-null  int64  
 8   Min Rel Hum (%)       511419 non-null  int64  
 9   Avg Rel Hum (%)       511419 non-null  int64  
 10  Dew Point (F)         511419 non-null  float64
 11  Avg Wind Speed (mph)  511419 non-null  float64
 12  Wind Run (miles)      511419 non-null  float64
 13  Avg Soil Temp (F)     511419 non-null  float64
 14  Account Number        511419 non-null  int64  
 15  

## Add the pressure zone data column using the meter number

In [23]:
# Read in the pressure zone data 
filename = '../CleanedData/MeterNumberAndPressureZoneData.csv'
pressure_zone_data = pd.read_csv(filename)      
print(f"{filename} : file read into a pandas dataframe.")

../CleanedData/MeterNumberAndPressureZoneData.csv : file read into a pandas dataframe.


In [24]:
# Display the data
display(pressure_zone_data)
pressure_zone_data.info()

Unnamed: 0,MeterNumber,Pressure Zone
63,770020,0.0
65,770039,0.0
86,630061,0.0
93,770026,0.0
96,64280,0.0
...,...,...
34,54729627.0,12.0
35,54724768.0,12.0
36,54725515.0,12.0
37,54725929.0,12.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 22667 entries, 63 to 38
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MeterNumber    22667 non-null  object 
 1   Pressure Zone  22667 non-null  float64
dtypes: float64(1), object(1)
memory usage: 531.3+ KB


In [25]:
# Remove letters from the Meter Numbers, is this an issue? 
pressure_zone_data["MeterNumber"] = pressure_zone_data["MeterNumber"].apply(lambda x: x.replace('B', ''))
pressure_zone_data["MeterNumber"] = pressure_zone_data["MeterNumber"].apply(lambda x: x.replace('P', ''))
pressure_zone_data["MeterNumber"] = pressure_zone_data["MeterNumber"].apply(lambda x: x.replace('A', ''))
pressure_zone_data["MeterNumber"] = pressure_zone_data["MeterNumber"].apply(lambda x: x.replace('S', ''))

In [26]:
# Turn the data in the Meter Number column from strings to ints
pressure_zone_data["MeterNumber"] = pd.to_numeric(pressure_zone_data["MeterNumber"])
display(pressure_zone_data)
pressure_zone_data.info()

Unnamed: 0,MeterNumber,Pressure Zone
63,770020.0,0.0
65,770039.0,0.0
86,630061.0,0.0
93,770026.0,0.0
96,64280.0,0.0
...,...,...
34,54729627.0,12.0
35,54724768.0,12.0
36,54725515.0,12.0
37,54725929.0,12.0


<class 'pandas.core.frame.DataFrame'>
Int64Index: 22667 entries, 63 to 38
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MeterNumber    22667 non-null  float64
 1   Pressure Zone  22667 non-null  float64
dtypes: float64(2)
memory usage: 531.3 KB


In [27]:
# Merging the two data frames together

merged_inner = pd.merge(left=all_feb_data, right=pressure_zone_data, left_on='MeterNumber', right_on='MeterNumber')
display(merged_inner)


Unnamed: 0,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number,MeterNumber,DayOfWeek,consumption,Pressure Zone
0,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000109,53884583,6,33.91,7.0
1,0.11,0.00,373,3.4,66.4,36.1,50.6,49,14,27,18.2,3.6,86.2,49.2,1000109,53884583,4,23.10,7.0
2,0.13,0.01,446,4.0,64.2,33.2,48.8,67,16,34,21.6,3.4,82.6,51.5,1000109,53884583,4,15.82,7.0
3,0.11,0.00,445,4.0,58.1,30.2,43.9,71,18,41,21.7,3.0,72.2,51.4,1000109,53884583,3,56.20,7.0
4,0.10,0.00,388,3.4,71.9,34.4,51.8,50,10,26,17.8,3.0,71.4,49.1,1000109,53884583,5,16.24,7.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384936,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155105704,54450246,2,380.92,0.0
384937,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155288100,54450892,2,1035.52,0.0
384938,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155288200,54450882,2,170.04,0.0
384939,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,165110101,54583926,2,574.39,6.0


In [28]:
# Move the consumption column to the end of the data frame 
df1 = merged_inner.pop('consumption') 
merged_inner['consumption']= df1 

display(merged_inner)
merged_inner.info()

Unnamed: 0,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number,MeterNumber,DayOfWeek,Pressure Zone,consumption
0,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000109,53884583,6,7.0,33.91
1,0.11,0.00,373,3.4,66.4,36.1,50.6,49,14,27,18.2,3.6,86.2,49.2,1000109,53884583,4,7.0,23.10
2,0.13,0.01,446,4.0,64.2,33.2,48.8,67,16,34,21.6,3.4,82.6,51.5,1000109,53884583,4,7.0,15.82
3,0.11,0.00,445,4.0,58.1,30.2,43.9,71,18,41,21.7,3.0,72.2,51.4,1000109,53884583,3,7.0,56.20
4,0.10,0.00,388,3.4,71.9,34.4,51.8,50,10,26,17.8,3.0,71.4,49.1,1000109,53884583,5,7.0,16.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384936,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155105704,54450246,2,0.0,380.92
384937,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155288100,54450892,2,0.0,1035.52
384938,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155288200,54450882,2,0.0,170.04
384939,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,165110101,54583926,2,6.0,574.39


<class 'pandas.core.frame.DataFrame'>
Int64Index: 384941 entries, 0 to 384940
Data columns (total 19 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   ETo (in)              384941 non-null  float64
 1   Precip (in)           384941 non-null  float64
 2   Sol Rad (Ly/day)      384941 non-null  int64  
 3   Avg Vap Pres (mBars)  384941 non-null  float64
 4   Max Air Temp (F)      384941 non-null  float64
 5   Min Air Temp (F)      384941 non-null  float64
 6   Avg Air Temp (F)      384941 non-null  float64
 7   Max Rel Hum (%)       384941 non-null  int64  
 8   Min Rel Hum (%)       384941 non-null  int64  
 9   Avg Rel Hum (%)       384941 non-null  int64  
 10  Dew Point (F)         384941 non-null  float64
 11  Avg Wind Speed (mph)  384941 non-null  float64
 12  Wind Run (miles)      384941 non-null  float64
 13  Avg Soil Temp (F)     384941 non-null  float64
 14  Account Number        384941 non-null  int64  
 15  

# Winsorizing the Data Values 


In [29]:
# Rename Data Frame 
feb_data = merged_inner

In [30]:
# Display the Data 
display(feb_data)

Unnamed: 0,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number,MeterNumber,DayOfWeek,Pressure Zone,consumption
0,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000109,53884583,6,7.0,33.91
1,0.11,0.00,373,3.4,66.4,36.1,50.6,49,14,27,18.2,3.6,86.2,49.2,1000109,53884583,4,7.0,23.10
2,0.13,0.01,446,4.0,64.2,33.2,48.8,67,16,34,21.6,3.4,82.6,51.5,1000109,53884583,4,7.0,15.82
3,0.11,0.00,445,4.0,58.1,30.2,43.9,71,18,41,21.7,3.0,72.2,51.4,1000109,53884583,3,7.0,56.20
4,0.10,0.00,388,3.4,71.9,34.4,51.8,50,10,26,17.8,3.0,71.4,49.1,1000109,53884583,5,7.0,16.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384936,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155105704,54450246,2,0.0,380.92
384937,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155288100,54450892,2,0.0,1035.52
384938,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155288200,54450882,2,0.0,170.04
384939,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,165110101,54583926,2,6.0,574.39


In [31]:
# Get the Max, Min, Mean, and Median 
print(feb_data['consumption'].max())
print(feb_data['consumption'].min())
print(feb_data['consumption'].mean())
print(feb_data['consumption'].median())

20000340.0
-599790.0
234.73809898140487
34.4


## Given that there are extremes in the data set we want to remove negative values and then Winsorize 

In [32]:
# Remove negative values

filtered_feb_data = feb_data[(feb_data.consumption > 0)]
display(filtered_feb_data)

Unnamed: 0,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number,MeterNumber,DayOfWeek,Pressure Zone,consumption
0,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000109,53884583,6,7.0,33.91
1,0.11,0.00,373,3.4,66.4,36.1,50.6,49,14,27,18.2,3.6,86.2,49.2,1000109,53884583,4,7.0,23.10
2,0.13,0.01,446,4.0,64.2,33.2,48.8,67,16,34,21.6,3.4,82.6,51.5,1000109,53884583,4,7.0,15.82
3,0.11,0.00,445,4.0,58.1,30.2,43.9,71,18,41,21.7,3.0,72.2,51.4,1000109,53884583,3,7.0,56.20
4,0.10,0.00,388,3.4,71.9,34.4,51.8,50,10,26,17.8,3.0,71.4,49.1,1000109,53884583,5,7.0,16.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384936,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155105704,54450246,2,0.0,380.92
384937,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155288100,54450892,2,0.0,1035.52
384938,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155288200,54450882,2,0.0,170.04
384939,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,165110101,54583926,2,6.0,574.39


In [33]:
# Check the New Max, Min, Mean, and Median 
print(filtered_feb_data['consumption'].max())
print(filtered_feb_data['consumption'].min())
print(filtered_feb_data['consumption'].mean())
print(filtered_feb_data['consumption'].median())

20000340.0
0.01
240.93251217550724
34.5


## Winsorize the Data

In [34]:
# Winsorize the Consumption Data
winsorized_consumption = winsorize(filtered_feb_data.consumption,(0.05,0.05))

print(winsorized_consumption)

[ 33.91  23.1   15.82 ... 170.04 186.49   4.5 ]


In [35]:
filtered_feb_data["WinConsumption"] = winsorized_consumption
winsorized_feb_data = filtered_feb_data

display(winsorized_feb_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number,MeterNumber,DayOfWeek,Pressure Zone,consumption,WinConsumption
0,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000109,53884583,6,7.0,33.91,33.91
1,0.11,0.00,373,3.4,66.4,36.1,50.6,49,14,27,18.2,3.6,86.2,49.2,1000109,53884583,4,7.0,23.10,23.10
2,0.13,0.01,446,4.0,64.2,33.2,48.8,67,16,34,21.6,3.4,82.6,51.5,1000109,53884583,4,7.0,15.82,15.82
3,0.11,0.00,445,4.0,58.1,30.2,43.9,71,18,41,21.7,3.0,72.2,51.4,1000109,53884583,3,7.0,56.20,56.20
4,0.10,0.00,388,3.4,71.9,34.4,51.8,50,10,26,17.8,3.0,71.4,49.1,1000109,53884583,5,7.0,16.24,16.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384936,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155105704,54450246,2,0.0,380.92,186.49
384937,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155288100,54450892,2,0.0,1035.52,186.49
384938,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155288200,54450882,2,0.0,170.04,170.04
384939,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,165110101,54583926,2,6.0,574.39,186.49


In [36]:
# Check the new Max, Min, Mean, and Median
print(winsorized_feb_data['WinConsumption'].max())
print(winsorized_feb_data['WinConsumption'].min())
print(winsorized_feb_data['WinConsumption'].mean())
print(winsorized_feb_data['WinConsumption'].median())

186.49
4.5
51.50979824192958
34.5


In [37]:
# Remove the un-winsorized consumption data 
winsorized_feb_data.drop("consumption", axis = 1)

Unnamed: 0,ETo (in),Precip (in),Sol Rad (Ly/day),Avg Vap Pres (mBars),Max Air Temp (F),Min Air Temp (F),Avg Air Temp (F),Max Rel Hum (%),Min Rel Hum (%),Avg Rel Hum (%),Dew Point (F),Avg Wind Speed (mph),Wind Run (miles),Avg Soil Temp (F),Account Number,MeterNumber,DayOfWeek,Pressure Zone,WinConsumption
0,0.13,0.00,424,4.2,85.5,47.1,64.8,40,7,20,22.7,2.8,67.8,54.4,1000109,53884583,6,7.0,33.91
1,0.11,0.00,373,3.4,66.4,36.1,50.6,49,14,27,18.2,3.6,86.2,49.2,1000109,53884583,4,7.0,23.10
2,0.13,0.01,446,4.0,64.2,33.2,48.8,67,16,34,21.6,3.4,82.6,51.5,1000109,53884583,4,7.0,15.82
3,0.11,0.00,445,4.0,58.1,30.2,43.9,71,18,41,21.7,3.0,72.2,51.4,1000109,53884583,3,7.0,56.20
4,0.10,0.00,388,3.4,71.9,34.4,51.8,50,10,26,17.8,3.0,71.4,49.1,1000109,53884583,5,7.0,16.24
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384936,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155105704,54450246,2,0.0,186.49
384937,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155288100,54450892,2,0.0,186.49
384938,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,155288200,54450882,2,0.0,170.04
384939,0.11,0.00,406,7.4,62.8,37.1,49.7,92,34,61,36.9,3.7,89.8,52.8,165110101,54583926,2,6.0,186.49


# Save the New Cleaned Data

In [38]:
# Write current data frame into a new cvs file 
winsorized_feb_data.to_csv("FDCleanedUpdated.csv", index_label=False)

# Pressure Zone Number Key:

['Intermediate.xls', 'Baldridge Cyn.xls', 'Lower.xls', '34 Hydro.xls', 'Mercedes.xls', 'Highland Upper.xls', 'Foothill.xls', 'Upper.xls', '149 Hydro.xls', 'Mountain.xls', '101 Hydro.xls', 'Canal.xls', '59 Hydro.xls']

- 0: Intermediate
- 1: Baldridge Cyn
- 2: Lower
- 3: 34 Hydro
- 4: Mercedes
- 5: Highland Upper
- 6: Foothill
- 7: Upper
- 8: 149 Hydro
- 9: Mountain
- 10: 101 Hydro
- 11: Canal
- 12: 59 Hydro

# Day Of Week Number Key:

- 0: Monday
- 1: Tuesday
- 2: Wednesday
- 3: Thursday
- 4: Friday
- 5: Saturday
- 6: Sunday

# Consumption Categories:

- 0: <20
- 1: 20-40
- 2: 40-60
- 3: 60-80
- 4: 80-100
- 5: 100-120
- 6: 120-140
- 7: 140-160
- 8: 160-180
- 9: 180-200
- 10: 200-220
- 11: 220-240
