## Data Collection & Preparation

* ``Pandas`` for managing the data
* ``NumPy`` for math operations
* ``Sklearn`` for Machine Learning
* ``Seaborn`` for advanced visualization
* ``Matplotlib`` for additional plotting

In [2]:
# Import libraries 

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore", category=UserWarning) 
%matplotlib inline

In [3]:
#read data files

cs = pd.read_csv("charging_sessions.csv")
wba = pd.read_csv("weather_burbank_airport.csv")

In [4]:
#print the first 5 rows of the charging_sessions dataset
cs.head()

Unnamed: 0.1,Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs
0,0,5e23b149f9af8b5fe4b973cf,2020-01-02 13:08:54+00:00,2020-01-02 19:11:15+00:00,2020-01-02 17:31:35+00:00,25.016,1_1_179_810_2020-01-02 13:08:53.870034,1,AG-3F30,1-1-179-810,America/Los_Angeles,194.0,"[{'WhPerMile': 250, 'kWhRequested': 25.0, 'mil..."
1,1,5e23b149f9af8b5fe4b973d0,2020-01-02 13:36:50+00:00,2020-01-02 22:38:21+00:00,2020-01-02 20:18:05+00:00,33.097,1_1_193_825_2020-01-02 13:36:49.599853,1,AG-1F01,1-1-193-825,America/Los_Angeles,4275.0,"[{'WhPerMile': 280, 'kWhRequested': 70.0, 'mil..."
2,2,5e23b149f9af8b5fe4b973d1,2020-01-02 13:56:35+00:00,2020-01-03 00:39:22+00:00,2020-01-02 16:35:06+00:00,6.521,1_1_193_829_2020-01-02 13:56:35.214993,1,AG-1F03,1-1-193-829,America/Los_Angeles,344.0,"[{'WhPerMile': 400, 'kWhRequested': 8.0, 'mile..."
3,3,5e23b149f9af8b5fe4b973d2,2020-01-02 13:59:58+00:00,2020-01-02 16:38:39+00:00,2020-01-02 15:18:45+00:00,2.355,1_1_193_820_2020-01-02 13:59:58.309319,1,AG-1F04,1-1-193-820,America/Los_Angeles,1117.0,"[{'WhPerMile': 400, 'kWhRequested': 8.0, 'mile..."
4,4,5e23b149f9af8b5fe4b973d3,2020-01-02 14:00:01+00:00,2020-01-02 22:08:40+00:00,2020-01-02 18:17:30+00:00,13.375,1_1_193_819_2020-01-02 14:00:00.779967,1,AG-1F06,1-1-193-819,America/Los_Angeles,334.0,"[{'WhPerMile': 400, 'kWhRequested': 16.0, 'mil..."


In [5]:
#print out the first 5 rows of the weather burbank airport dataset
wba.head()

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature
0,Burbank,2018-01-01 08:53:00,9.0,33.0,Fair,991.75,9.0,0.0,8.0
1,Burbank,2018-01-01 09:53:00,9.0,33.0,Fair,992.08,0.0,0.0,9.0
2,Burbank,2018-01-01 10:53:00,9.0,21.0,Haze,992.08,0.0,0.0,9.0
3,Burbank,2018-01-01 11:53:00,9.0,29.0,Partly Cloudy,992.08,0.0,0.0,9.0
4,Burbank,2018-01-01 12:53:00,8.0,33.0,Fair,992.08,0.0,0.0,8.0


In [6]:
#print out the columns of charging sessions dataset
cs.columns

Index(['Unnamed: 0', 'id', 'connectionTime', 'disconnectTime',
       'doneChargingTime', 'kWhDelivered', 'sessionID', 'siteID', 'spaceID',
       'stationID', 'timezone', 'userID', 'userInputs'],
      dtype='object')

In [7]:
#print out the columns of weather burbank airport dataset
wba.columns

Index(['city', 'timestamp', 'temperature', 'cloud_cover',
       'cloud_cover_description', 'pressure', 'windspeed', 'precipitation',
       'felt_temperature'],
      dtype='object')

In [8]:
#print out the tail of cs dataset to see how many entries there are
cs.tail()

Unnamed: 0.1,Unnamed: 0,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs
66445,10083,5d574ad2f9af8b4c10c03652,2019-07-31 18:08:04+00:00,2019-07-31 23:29:18+00:00,2019-07-31 23:30:18+00:00,28.787,1_1_179_809_2019-07-31 18:08:04.432654,1,AG-3F27,1-1-179-809,America/Los_Angeles,393.0,"[{'WhPerMile': 240, 'kWhRequested': 31.2, 'mil..."
66446,10084,5d574ad2f9af8b4c10c03653,2019-07-31 18:40:41+00:00,2019-08-01 00:59:42+00:00,2019-07-31 21:44:23+00:00,7.787,1_1_179_810_2019-07-31 18:40:40.900203,1,AG-3F30,1-1-179-810,America/Los_Angeles,220.0,"[{'WhPerMile': 333, 'kWhRequested': 6.66, 'mil..."
66447,10085,5d574ad2f9af8b4c10c03654,2019-07-31 19:04:40+00:00,2019-07-31 22:44:22+00:00,2019-07-31 22:45:21+00:00,11.274,1_1_191_795_2019-07-31 19:04:40.098273,1,AG-4F51,1-1-191-795,America/Los_Angeles,1974.0,"[{'WhPerMile': 333, 'kWhRequested': 19.98, 'mi..."
66448,10086,5d574ad2f9af8b4c10c03655,2019-07-31 19:19:47+00:00,2019-08-01 00:34:51+00:00,2019-07-31 21:25:30+00:00,11.589,1_1_191_778_2019-07-31 19:19:46.919358,1,AG-4F43,1-1-191-778,America/Los_Angeles,942.0,"[{'WhPerMile': 275, 'kWhRequested': 22.0, 'mil..."
66449,10087,5d574ad2f9af8b4c10c03656,2019-07-31 19:21:47+00:00,2019-07-31 22:00:04+00:00,2019-07-31 20:51:34+00:00,0.897,1_1_178_817_2019-07-31 19:21:46.727697,1,AG-1F09,1-1-178-817,America/Los_Angeles,,


In [9]:
#print out the tail of cs dataset to see how many entries there are
wba.tail()

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature
29239,Burbank,2021-01-01 03:53:00,13.0,33.0,Fair,986.81,0.0,0.0,13.0
29240,Burbank,2021-01-01 04:53:00,12.0,33.0,Fair,986.81,11.0,0.0,12.0
29241,Burbank,2021-01-01 05:53:00,12.0,33.0,Fair,987.47,9.0,0.0,12.0
29242,Burbank,2021-01-01 06:53:00,11.0,33.0,Fair,987.14,13.0,0.0,11.0
29243,Burbank,2021-01-01 07:53:00,10.0,33.0,Fair,987.8,6.0,0.0,10.0


### We will focus on the charging_sessions dataset for a while

In [10]:
#print information about cs dataset
cs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66450 entries, 0 to 66449
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0        66450 non-null  int64  
 1   id                66450 non-null  object 
 2   connectionTime    66450 non-null  object 
 3   disconnectTime    66450 non-null  object 
 4   doneChargingTime  62362 non-null  object 
 5   kWhDelivered      66450 non-null  float64
 6   sessionID         66450 non-null  object 
 7   siteID            66450 non-null  int64  
 8   spaceID           66450 non-null  object 
 9   stationID         66450 non-null  object 
 10  timezone          66450 non-null  object 
 11  userID            49187 non-null  float64
 12  userInputs        49187 non-null  object 
dtypes: float64(2), int64(2), object(9)
memory usage: 6.6+ MB


We can see that this dataset has an unnamed column that represents the entry number. For better readability, we will rename this column

In [11]:
#rename the unnamed column
cs.rename( columns={'Unnamed: 0':'number'}, inplace=True )

In [12]:
#check if the column was renamed
cs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66450 entries, 0 to 66449
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   number            66450 non-null  int64  
 1   id                66450 non-null  object 
 2   connectionTime    66450 non-null  object 
 3   disconnectTime    66450 non-null  object 
 4   doneChargingTime  62362 non-null  object 
 5   kWhDelivered      66450 non-null  float64
 6   sessionID         66450 non-null  object 
 7   siteID            66450 non-null  int64  
 8   spaceID           66450 non-null  object 
 9   stationID         66450 non-null  object 
 10  timezone          66450 non-null  object 
 11  userID            49187 non-null  float64
 12  userInputs        49187 non-null  object 
dtypes: float64(2), int64(2), object(9)
memory usage: 6.6+ MB


From looking at the count of every column, we can see that the "doneChargingTime" has missing values. The missing values in the "userID" and "userInputs" columns are because the user was not registered, so we can ignore them for now.

**Detecting missing numerical data**

In [13]:
#check if we can spot null-values
cs.isnull()

Unnamed: 0,number,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs
0,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66445,False,False,False,False,False,False,False,False,False,False,False,False,False
66446,False,False,False,False,False,False,False,False,False,False,False,False,False
66447,False,False,False,False,False,False,False,False,False,False,False,False,False
66448,False,False,False,False,False,False,False,False,False,False,False,False,False


In [14]:
#drop all rows containing null values
cs.dropna(axis=0, inplace=False)

Unnamed: 0,number,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs
0,0,5e23b149f9af8b5fe4b973cf,2020-01-02 13:08:54+00:00,2020-01-02 19:11:15+00:00,2020-01-02 17:31:35+00:00,25.016,1_1_179_810_2020-01-02 13:08:53.870034,1,AG-3F30,1-1-179-810,America/Los_Angeles,194.0,"[{'WhPerMile': 250, 'kWhRequested': 25.0, 'mil..."
1,1,5e23b149f9af8b5fe4b973d0,2020-01-02 13:36:50+00:00,2020-01-02 22:38:21+00:00,2020-01-02 20:18:05+00:00,33.097,1_1_193_825_2020-01-02 13:36:49.599853,1,AG-1F01,1-1-193-825,America/Los_Angeles,4275.0,"[{'WhPerMile': 280, 'kWhRequested': 70.0, 'mil..."
2,2,5e23b149f9af8b5fe4b973d1,2020-01-02 13:56:35+00:00,2020-01-03 00:39:22+00:00,2020-01-02 16:35:06+00:00,6.521,1_1_193_829_2020-01-02 13:56:35.214993,1,AG-1F03,1-1-193-829,America/Los_Angeles,344.0,"[{'WhPerMile': 400, 'kWhRequested': 8.0, 'mile..."
3,3,5e23b149f9af8b5fe4b973d2,2020-01-02 13:59:58+00:00,2020-01-02 16:38:39+00:00,2020-01-02 15:18:45+00:00,2.355,1_1_193_820_2020-01-02 13:59:58.309319,1,AG-1F04,1-1-193-820,America/Los_Angeles,1117.0,"[{'WhPerMile': 400, 'kWhRequested': 8.0, 'mile..."
4,4,5e23b149f9af8b5fe4b973d3,2020-01-02 14:00:01+00:00,2020-01-02 22:08:40+00:00,2020-01-02 18:17:30+00:00,13.375,1_1_193_819_2020-01-02 14:00:00.779967,1,AG-1F06,1-1-193-819,America/Los_Angeles,334.0,"[{'WhPerMile': 400, 'kWhRequested': 16.0, 'mil..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66444,10082,5d574ad2f9af8b4c10c03651,2019-07-31 14:50:17+00:00,2019-08-01 01:01:18+00:00,2019-07-31 20:29:24+00:00,27.174,1_1_191_784_2019-07-31 14:50:17.037367,1,AG-4F40,1-1-191-784,America/Los_Angeles,448.0,"[{'WhPerMile': 200, 'kWhRequested': 28.0, 'mil..."
66445,10083,5d574ad2f9af8b4c10c03652,2019-07-31 18:08:04+00:00,2019-07-31 23:29:18+00:00,2019-07-31 23:30:18+00:00,28.787,1_1_179_809_2019-07-31 18:08:04.432654,1,AG-3F27,1-1-179-809,America/Los_Angeles,393.0,"[{'WhPerMile': 240, 'kWhRequested': 31.2, 'mil..."
66446,10084,5d574ad2f9af8b4c10c03653,2019-07-31 18:40:41+00:00,2019-08-01 00:59:42+00:00,2019-07-31 21:44:23+00:00,7.787,1_1_179_810_2019-07-31 18:40:40.900203,1,AG-3F30,1-1-179-810,America/Los_Angeles,220.0,"[{'WhPerMile': 333, 'kWhRequested': 6.66, 'mil..."
66447,10085,5d574ad2f9af8b4c10c03654,2019-07-31 19:04:40+00:00,2019-07-31 22:44:22+00:00,2019-07-31 22:45:21+00:00,11.274,1_1_191_795_2019-07-31 19:04:40.098273,1,AG-4F51,1-1-191-795,America/Los_Angeles,1974.0,"[{'WhPerMile': 333, 'kWhRequested': 19.98, 'mi..."


In [15]:
#check how many rows are getting dropped
len(cs)-len(cs.dropna())

20442

This number is higher than expected, but because this factors in the null values from userID and userInputs(unregistered users or users with no Input), we want so specifically look for missing values in the "doneChargingTime"-column

In [16]:
len(cs)-len(cs.dropna(subset = ['doneChargingTime']))

4088

This number tells us that there are 4088 sessions with no timestamp for when the EV got done charging, meaning that it didn't fully charge. Let's look at this data.

In [17]:
#display records with 'doneCharging = Null
cs_notdone = cs[cs['doneChargingTime'].isnull()]
cs_notdone

Unnamed: 0,number,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs
3756,3756,5ea3848df9af8b428bf97225,2020-04-08 16:36:53+00:00,2020-04-09 03:43:29+00:00,,25.183,1_1_194_822_2020-04-08 16:36:53.316264,1,AG-1F12,1-1-194-822,America/Los_Angeles,507.0,"[{'WhPerMile': 250, 'kWhRequested': 75.0, 'mil..."
3757,3757,5ea4d60df9af8b46573f1ee7,2020-04-09 13:23:52+00:00,2020-04-09 17:59:02+00:00,,14.136,1_1_193_827_2020-04-09 13:23:51.689520,1,AG-1F02,1-1-193-827,America/Los_Angeles,419.0,"[{'WhPerMile': 400, 'kWhRequested': 40.0, 'mil..."
3758,3758,5ea4d60df9af8b46573f1ee8,2020-04-09 13:40:03+00:00,2020-04-09 21:43:22+00:00,,14.203,1_1_178_823_2020-04-09 13:39:55.115497,1,AG-1F08,1-1-178-823,America/Los_Angeles,5065.0,"[{'WhPerMile': 400, 'kWhRequested': 24.0, 'mil..."
3759,3759,5ea4d60df9af8b46573f1ee9,2020-04-09 14:01:53+00:00,2020-04-09 16:33:40+00:00,,13.041,1_1_178_828_2020-04-09 14:01:53.467693,1,AG-1F10,1-1-178-828,America/Los_Angeles,651.0,"[{'WhPerMile': 400, 'kWhRequested': 20.0, 'mil..."
3760,3760,5ea4d60df9af8b46573f1eea,2020-04-09 15:01:06+00:00,2020-04-09 23:38:08+00:00,,5.797,1_1_193_820_2020-04-09 15:01:06.343454,1,AG-1F04,1-1-193-820,America/Los_Angeles,466.0,"[{'WhPerMile': 300, 'kWhRequested': 6.0, 'mile..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60411,4049,5cb3d2a8f9af8b4551261e75,2019-03-30 00:32:10+00:00,2019-03-30 02:23:14+00:00,,11.148,1_1_193_829_2019-03-30 00:32:10.128651,1,AG-1F03,1-1-193-829,America/Los_Angeles,194.0,"[{'WhPerMile': 400, 'kWhRequested': 24.0, 'mil..."
61097,4735,5cc8e891f9af8b45d975b819,2019-04-14 16:19:12+00:00,2019-04-14 20:38:39+00:00,,11.678,1_1_193_816_2019-04-14 16:19:11.864262,1,AG-1F05,1-1-193-816,America/Los_Angeles,364.0,"[{'WhPerMile': 400, 'kWhRequested': 56.0, 'mil..."
61103,4741,5cca3a54f9af8b49aaa4cbac,2019-04-15 13:00:59+00:00,2019-04-15 13:55:25+00:00,,3.343,1_1_193_819_2019-04-15 13:00:58.722284,1,AG-1F06,1-1-193-819,America/Los_Angeles,651.0,"[{'WhPerMile': 400, 'kWhRequested': 20.0, 'mil..."
61117,4755,5cca3a54f9af8b49aaa4cbba,2019-04-15 13:52:45+00:00,2019-04-15 14:47:01+00:00,,4.558,1_1_179_800_2019-04-15 13:52:44.693153,1,AG-3F32,1-1-179-800,America/Los_Angeles,194.0,"[{'WhPerMile': 400, 'kWhRequested': 12.0, 'mil..."


### Dealing with missing values

When dealing with missing values, we can either **eliminate** them from the dataset or **impute** the null values with estimates. Because the missing data points are timestamps, it's hard them because they simply do not exist. The EV never got done charging, so estimating a value doesn't make any sense. And because our job is to optimize utilization, looking at data points where the EV never fully charged(and thus didn't stay plugged in without charging and taking up charging space) is not adequate. That is why we will drop these data points.

In [18]:
#we take every row where "doneChargingTime" is not null into our new dataset cs_clean
charging_sessions = cs[cs['doneChargingTime'].notna()]

In [19]:
charging_sessions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62362 entries, 0 to 66449
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   number            62362 non-null  int64  
 1   id                62362 non-null  object 
 2   connectionTime    62362 non-null  object 
 3   disconnectTime    62362 non-null  object 
 4   doneChargingTime  62362 non-null  object 
 5   kWhDelivered      62362 non-null  float64
 6   sessionID         62362 non-null  object 
 7   siteID            62362 non-null  int64  
 8   spaceID           62362 non-null  object 
 9   stationID         62362 non-null  object 
 10  timezone          62362 non-null  object 
 11  userID            46008 non-null  float64
 12  userInputs        46008 non-null  object 
dtypes: float64(2), int64(2), object(9)
memory usage: 6.7+ MB


### Quality Checks
Now, we have to do some **quality checks** to ensure that our data does not have any errors and that the entries make sense. First, lets check station and spaceIDs.

In [20]:
charging_sessions['stationID'].nunique()

106

In [21]:
charging_sessions['spaceID'].nunique()

106

In [22]:
charging_sessions['stationID'].unique()

array(['1-1-179-810', '1-1-193-825', '1-1-193-829', '1-1-193-820',
       '1-1-193-819', '1-1-194-821', '1-1-178-817', '1-1-191-804',
       '1-1-194-826', '1-1-178-823', '1-1-179-788', '1-1-194-818',
       '1-1-178-824', '1-1-194-822', '1-1-179-783', '1-1-193-816',
       '1-1-179-798', '1-1-178-828', '1-1-179-787', '1-1-191-792',
       '1-1-193-827', '1-1-179-777', '1-1-179-799', '1-1-179-796',
       '1-1-179-815', '1-1-179-797', '1-1-179-781', '1-1-179-800',
       '1-1-179-809', '1-1-179-791', '1-1-179-794', '1-1-179-779',
       '1-1-179-801', '1-1-179-790', '1-1-191-802', '1-1-191-789',
       '1-1-191-806', '1-1-191-785', '1-1-191-808', '1-1-191-782',
       '1-1-179-813', '1-1-191-795', '1-1-191-807', '1-1-191-812',
       '1-1-191-803', '1-1-191-778', '1-1-191-811', '1-1-191-780',
       '1-1-191-786', '1-1-191-793', '1-1-191-784', '1-1-191-805',
       '2-39-138-566', '2-39-79-379', '2-39-79-378', '2-39-139-28',
       '2-39-79-377', '2-39-125-21', '2-39-127-19', '2-39-79-

In [23]:
charging_sessions['spaceID'].unique()

array(['AG-3F30', 'AG-1F01', 'AG-1F03', 'AG-1F04', 'AG-1F06', 'AG-1F13',
       'AG-1F09', 'AG-4F34', 'AG-1F11', 'AG-1F08', 'AG-3F22', 'AG-1F14',
       'AG-1F07', 'AG-1F12', 'AG-3F29', 'AG-1F05', 'AG-3F18', 'AG-1F10',
       'AG-3F16', 'AG-4F37', 'AG-1F02', 'AG-3F28', 'AG-3F25', 'AG-3F24',
       'AG-3F33', 'AG-3F23', 'AG-3F31', 'AG-3F32', 'AG-3F27', 'AG-3F26',
       'AG-3F20', 'AG-3F17', 'AG-3F21', 'AG-3F19', 'AG-4F44', 'AG-4F52',
       'AG-4F41', 'AG-4F48', 'AG-4F35', 'AG-4F50', 'AG-3F15', 'AG-4F51',
       'AG-4F47', 'AG-4F46', 'AG-4F49', 'AG-4F43', 'AG-4F42', 'AG-4F45',
       'AG-4F36', 'AG-4F38', 'AG-4F40', 'AG-4F39', 'CA-512', 'CA-327',
       'CA-326', 'CA-303', 'CA-325', 'CA-311', 'CA-309', 'CA-491',
       'CA-493', 'CA-499', 'CA-323', 'CA-313', 'CA-324', 'CA-513',
       'CA-305', 'CA-317', 'CA-510', 'CA-315', 'CA-321', 'CA-490',
       'CA-497', 'CA-489', 'CA-319', 'CA-492', 'CA-494', 'CA-502',
       'CA-500', 'CA-495', 'CA-498', 'CA-304', 'CA-307', 'CA-306',
       'CA

We see that both stationID and spaceID have 106 entries, so we will assume that they are mapped 1:1. spaceID most likely describes the parking space and stationID the charging station besides that parking space.

Now, lets turn timestamp data into the right datatypes

In [48]:
charging_sessions['connectionTime'] = pd.to_datetime(charging_sessions['connectionTime'])
charging_sessions['disconnectTime'] = pd.to_datetime(charging_sessions['disconnectTime'])
charging_sessions['doneChargingTime'] = pd.to_datetime(charging_sessions['doneChargingTime'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  charging_sessions['connectionTime'] = pd.to_datetime(charging_sessions['connectionTime'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  charging_sessions['disconnectTime'] = pd.to_datetime(charging_sessions['disconnectTime'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  charging_sessions['doneCh

In [49]:
charging_sessions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62335 entries, 0 to 66449
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   number            62335 non-null  int64              
 1   id                62335 non-null  object             
 2   connectionTime    62335 non-null  datetime64[ns, UTC]
 3   disconnectTime    62335 non-null  datetime64[ns, UTC]
 4   doneChargingTime  62335 non-null  datetime64[ns, UTC]
 5   kWhDelivered      62335 non-null  float64            
 6   sessionID         62335 non-null  object             
 7   siteID            62335 non-null  int64              
 8   spaceID           62335 non-null  object             
 9   stationID         62335 non-null  object             
 10  timezone          62335 non-null  object             
dtypes: datetime64[ns, UTC](3), float64(1), int64(2), object(5)
memory usage: 5.7+ MB


### Logic checks
Now we want to check if the data makes sense. For that, we will check three things. First, we will check if every **connectionTime** timestamp comes before the corresponding **disconnectTime** timestamp. An EV first has to connect to the station if it wants to disconnect from it. Second, we want to check if every **doneChargingTime** comes after every **connectionTime**, because an EV has to be still connected to be done charging. Lastly, we want to check if there are **doneChargingTime**s that come after **disconnectTime**. This would make no sense, because an unplugged EV can not be done charging after it got unplugged.


In [24]:
len(charging_sessions[charging_sessions['connectionTime']>charging_sessions['disconnectTime']])

0

There are no entries where connectionTime comes before disconnectTime

In [25]:
len(charging_sessions[charging_sessions['connectionTime']>charging_sessions['doneChargingTime']])

27

There are 27 entries where connectionTime comes after doneChargingTime. Lets further investigate.

In [26]:
charging_sessions[charging_sessions['connectionTime']>charging_sessions['doneChargingTime']]

Unnamed: 0,number,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs
22219,2040,5c942ca4f9af8b06b04b3bb4,2019-03-05 19:13:55+00:00,2019-03-05 22:50:39+00:00,2019-03-05 19:12:56+00:00,0.706655,2_39_78_367_2019-03-05 19:13:55.113078,2,CA-494,2-39-78-367,America/Los_Angeles,,
22253,2074,5c957e1cf9af8b42f440af03,2019-03-06 20:26:30+00:00,2019-03-07 01:48:54+00:00,2019-03-06 20:25:34+00:00,1.046381,2_39_78_367_2019-03-06 20:26:30.479644,2,CA-494,2-39-78-367,America/Los_Angeles,,
23562,3383,5cca3a22f9af8b49aaa4cba0,2019-04-15 20:24:13+00:00,2019-04-15 23:39:04+00:00,2019-04-15 20:23:14+00:00,0.635278,2_39_78_367_2019-04-15 20:24:13.365605,2,CA-494,2-39-78-367,America/Los_Angeles,1154.0,"[{'WhPerMile': 308, 'kWhRequested': 9.24, 'mil..."
23586,3407,5ccb8ba6f9af8b4d9721df00,2019-04-16 16:11:08+00:00,2019-04-16 19:10:48+00:00,2019-04-16 16:10:11+00:00,0.585977,2_39_78_367_2019-04-16 16:11:07.939710,2,CA-494,2-39-78-367,America/Los_Angeles,1154.0,"[{'WhPerMile': 308, 'kWhRequested': 6.16, 'mil..."
27689,7510,5d856f1ff9af8b0c7bdf245c,2019-09-04 16:35:04+00:00,2019-09-05 00:44:27+00:00,2019-09-04 16:34:05+00:00,1.5845,2_39_78_367_2019-09-04 16:35:04.129327,2,CA-494,2-39-78-367,America/Los_Angeles,,
27740,7561,5d86c0a5f9af8b1022a81870,2019-09-05 18:44:57+00:00,2019-09-06 00:55:19+00:00,2019-09-05 18:43:57+00:00,1.06723,2_39_78_360_2019-09-05 18:44:57.410168,2,CA-322,2-39-78-360,America/Los_Angeles,,
29295,9116,5dcdffbdf9af8b220a19be8b,2019-10-29 17:22:32+00:00,2019-10-31 01:57:20+00:00,2019-10-29 17:21:33+00:00,6.31621,2_39_78_367_2019-10-29 17:22:32.086306,2,CA-494,2-39-78-367,America/Los_Angeles,1470.0,"[{'WhPerMile': 292, 'kWhRequested': 14.6, 'mil..."
31285,492,5bc91740f9af8b0dc677b860,2018-05-04 19:08:37+00:00,2018-05-04 22:07:47+00:00,2018-05-04 19:07:40+00:00,0.551722,2_39_78_363_2018-05-04 19:08:36.642114,2,CA-320,2-39-78-363,America/Los_Angeles,,
31287,494,5bc91740f9af8b0dc677b862,2018-05-04 19:23:52+00:00,2018-05-05 00:04:15+00:00,2018-05-04 19:22:52+00:00,0.912297,2_39_78_367_2018-05-04 19:23:51.897392,2,CA-494,2-39-78-367,America/Los_Angeles,,
31403,610,5bc917d0f9af8b0dc677b8d6,2018-05-07 20:47:51+00:00,2018-05-08 02:16:00+00:00,2018-05-07 20:47:50+00:00,14.967,2_39_139_567_2018-05-07 20:47:50.862655,2,CA-513,2-39-139-567,America/Los_Angeles,,


If we take a closer look at these datapoints, we can see that for every datapoint where connectionTime > doneChargingTime holds true, the doneChargingTime occured round about a minute before the connectionTime. This can either be the system telling us that the EV was already full, or this could simply be an erroneous entry. Either way, only 27 entires are affected, so we decided to drop the entries.

In [27]:
charging_sessions.drop(charging_sessions[charging_sessions['connectionTime']>charging_sessions['doneChargingTime']].index,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  charging_sessions.drop(charging_sessions[charging_sessions['connectionTime']>charging_sessions['doneChargingTime']].index,inplace=True)


In [28]:
len(charging_sessions[charging_sessions['connectionTime']>charging_sessions['doneChargingTime']])

0

We succesfully got rid of these entries. Now lets check for the third requirement.

In [29]:
len(charging_sessions[charging_sessions['doneChargingTime']>charging_sessions['disconnectTime']])

4692

Almost 4700 entries, which is quite a lot. We can not simply delete these sessions, so lets further investigate.

In [30]:
charging_sessions[charging_sessions['doneChargingTime']>charging_sessions['disconnectTime']]

Unnamed: 0,number,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone,userID,userInputs
12,12,5e23b149f9af8b5fe4b973db,2020-01-02 15:04:38+00:00,2020-01-02 22:08:39+00:00,2020-01-02 22:09:36+00:00,25.567,1_1_178_824_2020-01-02 15:04:38.051735,1,AG-1F07,1-1-178-824,America/Los_Angeles,528.0,"[{'WhPerMile': 250, 'kWhRequested': 50.0, 'mil..."
20,20,5e23b149f9af8b5fe4b973e3,2020-01-02 15:28:47+00:00,2020-01-02 19:01:54+00:00,2020-01-02 19:02:51+00:00,7.417,1_1_193_827_2020-01-02 15:28:46.685366,1,AG-1F02,1-1-193-827,America/Los_Angeles,1283.0,"[{'WhPerMile': 350, 'kWhRequested': 42.0, 'mil..."
25,25,5e23b149f9af8b5fe4b973e8,2020-01-02 15:42:05+00:00,2020-01-02 21:58:45+00:00,2020-01-02 21:59:42+00:00,36.701,1_1_179_797_2020-01-02 15:42:05.217965,1,AG-3F23,1-1-179-797,America/Los_Angeles,474.0,"[{'WhPerMile': 400, 'kWhRequested': 32.0, 'mil..."
26,26,5e23b149f9af8b5fe4b973e9,2020-01-02 15:57:24+00:00,2020-01-02 16:35:37+00:00,2020-01-02 16:36:34+00:00,3.689,1_1_179_781_2020-01-02 15:57:23.951170,1,AG-3F31,1-1-179-781,America/Los_Angeles,724.0,"[{'WhPerMile': 400, 'kWhRequested': 8.0, 'mile..."
33,33,5e23b149f9af8b5fe4b973f0,2020-01-02 16:34:35+00:00,2020-01-02 18:49:41+00:00,2020-01-02 18:50:38+00:00,7.120,1_1_179_790_2020-01-02 16:34:34.999200,1,AG-3F19,1-1-179-790,America/Los_Angeles,2276.0,"[{'WhPerMile': 600, 'kWhRequested': 18.0, 'mil..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
66432,10070,5d574ad2f9af8b4c10c03645,2019-07-31 14:36:02+00:00,2019-07-31 22:34:10+00:00,2019-07-31 22:35:10+00:00,9.459,1_1_191_803_2019-07-31 14:36:02.181346,1,AG-4F49,1-1-191-803,America/Los_Angeles,826.0,"[{'WhPerMile': 250, 'kWhRequested': 35.0, 'mil..."
66439,10077,5d574ad2f9af8b4c10c0364c,2019-07-31 14:45:02+00:00,2019-07-31 23:43:20+00:00,2019-07-31 23:44:19+00:00,8.289,1_1_191_804_2019-07-31 14:45:01.555937,1,AG-4F34,1-1-191-804,America/Los_Angeles,572.0,"[{'WhPerMile': 231, 'kWhRequested': 9.24, 'mil..."
66440,10078,5d574ad2f9af8b4c10c0364d,2019-07-31 14:45:29+00:00,2019-07-31 22:01:32+00:00,2019-07-31 22:02:32+00:00,31.376,1_1_191_811_2019-07-31 14:45:29.388046,1,AG-4F42,1-1-191-811,America/Los_Angeles,1626.0,"[{'WhPerMile': 200, 'kWhRequested': 38.0, 'mil..."
66445,10083,5d574ad2f9af8b4c10c03652,2019-07-31 18:08:04+00:00,2019-07-31 23:29:18+00:00,2019-07-31 23:30:18+00:00,28.787,1_1_179_809_2019-07-31 18:08:04.432654,1,AG-3F27,1-1-179-809,America/Los_Angeles,393.0,"[{'WhPerMile': 240, 'kWhRequested': 31.2, 'mil..."


It becomes pretty apparent that the doneChargingTime also lies within a minute of the disconnectTime. This could mean that the station always delivers electricity for approx. another minute before it stops supplying energy. We will keep those entries, because discarding them would make us lose too many data points.

In [31]:
charging_sessions.nunique()

number              15277
id                  60923
connectionTime      60734
disconnectTime      60801
doneChargingTime    60610
kWhDelivered        24656
sessionID           60923
siteID                  2
spaceID               106
stationID             106
timezone                1
userID                888
userInputs          44638
dtype: int64

### Handle userInput and userID 
Working with the userInput and userID columns can be problematic. According to the task manual, they can change overtime. The amount of miles and Kwh request per user can change, which results in multiple userInputs per row. The data is complex to preprocess. And for the given task, we have not thought of a way on how we are going to use this data yet. This may change in the future, but as of now, we will ignore these columns, so we drop them.

In [32]:
charging_sessions.drop(columns=['userID', 'userInputs'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  charging_sessions.drop(columns=['userID', 'userInputs'],inplace=True)


In [33]:
charging_sessions.info()

<class 'pandas.core.frame.DataFrame'>
Index: 62335 entries, 0 to 66449
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   number            62335 non-null  int64  
 1   id                62335 non-null  object 
 2   connectionTime    62335 non-null  object 
 3   disconnectTime    62335 non-null  object 
 4   doneChargingTime  62335 non-null  object 
 5   kWhDelivered      62335 non-null  float64
 6   sessionID         62335 non-null  object 
 7   siteID            62335 non-null  int64  
 8   spaceID           62335 non-null  object 
 9   stationID         62335 non-null  object 
 10  timezone          62335 non-null  object 
dtypes: float64(1), int64(2), object(8)
memory usage: 5.7+ MB


### Outliers
Last thing we need to do, is delete outlier data form our dataset. An outlier is a data point that differs significantly from other observations. In our case, it would be a session which was too long, lets say more than 20 hours. Lets look for these sessions. To do that, we need to first figure out the length of the session. We will handle outliers in 02_Descriptive Analytics.

## Weather data preparation

We first have to think of a way we want to utilize the weather data and manipulate the dataframe based on our plan.

In [35]:
wba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29244 entries, 0 to 29243
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   city                     29244 non-null  object 
 1   timestamp                29244 non-null  object 
 2   temperature              29219 non-null  float64
 3   cloud_cover              29224 non-null  float64
 4   cloud_cover_description  29224 non-null  object 
 5   pressure                 29236 non-null  float64
 6   windspeed                29158 non-null  float64
 7   precipitation            29244 non-null  float64
 8   felt_temperature         29218 non-null  float64
dtypes: float64(6), object(3)
memory usage: 2.0+ MB


We change the timestamp column to the timedate datatype.

In [41]:
wba['timestamp'] = pd.to_datetime(wba['timestamp'])

In [42]:
wba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29244 entries, 0 to 29243
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   city                     29244 non-null  object        
 1   timestamp                29244 non-null  datetime64[ns]
 2   temperature              29219 non-null  float64       
 3   cloud_cover              29224 non-null  float64       
 4   cloud_cover_description  29224 non-null  object        
 5   pressure                 29236 non-null  float64       
 6   windspeed                29158 non-null  float64       
 7   precipitation            29244 non-null  float64       
 8   felt_temperature         29218 non-null  float64       
dtypes: datetime64[ns](1), float64(6), object(2)
memory usage: 2.0+ MB


Lets further investigate the columns, so see which ones will be interesting for us.

In [38]:
wba.describe(include = "all")

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature
count,29244,29244,29219.0,29224.0,29224,29236.0,29158.0,29244.0,29218.0
unique,1,29244,,,23,,,,
top,Burbank,2018-01-01 08:53:00,,,Fair,,,,
freq,29244,1,,,17122,,,,
mean,,,17.885622,30.107993,,986.8794,8.566568,0.05936,17.731433
std,,,6.454604,5.400388,,3.607065,6.699889,0.471682,6.3211
min,,,2.0,4.0,,971.0,0.0,0.0,0.0
25%,,,13.0,26.0,,984.5,6.0,0.0,13.0
50%,,,17.0,33.0,,986.48,7.0,0.0,17.0
75%,,,22.0,34.0,,989.11,13.0,0.0,22.0


The describe.(include ="all") function gives us a good look on all the variables. Based on this, we can see what columns we can drop and what columns might help us for the prediction model part. For now, we will keep the temperature variable and the percipation.

In [44]:
wba.cloud_cover_description.unique()

array(['Fair', 'Haze', 'Partly Cloudy', 'Mostly Cloudy', 'Cloudy', 'Fog',
       'Light Rain', 'Rain', 'Heavy Rain', 'Heavy Rain / Windy',
       'Light Rain / Windy', 'T-Storm', 'Fair / Windy', 'Cloudy / Windy',
       'Mostly Cloudy / Windy', 'Partly Cloudy / Windy',
       'Thunder in the Vicinity', 'Thunder', nan, 'Smoke',
       'Light Rain with Thunder', 'Heavy T-Storm', 'Rain / Windy',
       'Blowing Dust'], dtype=object)

The cloud cover description might also come in handy, so we will keep this column also.

We now want to look at from when to when the weather data is. For that, we first need to sort the dataframe, then look at the tail.

In [45]:
wba.sort_values(by='timestamp',inplace=True)

wba.head()

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature
0,Burbank,2018-01-01 08:53:00,9.0,33.0,Fair,991.75,9.0,0.0,8.0
1,Burbank,2018-01-01 09:53:00,9.0,33.0,Fair,992.08,0.0,0.0,9.0
2,Burbank,2018-01-01 10:53:00,9.0,21.0,Haze,992.08,0.0,0.0,9.0
3,Burbank,2018-01-01 11:53:00,9.0,29.0,Partly Cloudy,992.08,0.0,0.0,9.0
4,Burbank,2018-01-01 12:53:00,8.0,33.0,Fair,992.08,0.0,0.0,8.0


In [46]:
wba.tail()

Unnamed: 0,city,timestamp,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature
29239,Burbank,2021-01-01 03:53:00,13.0,33.0,Fair,986.81,0.0,0.0,13.0
29240,Burbank,2021-01-01 04:53:00,12.0,33.0,Fair,986.81,11.0,0.0,12.0
29241,Burbank,2021-01-01 05:53:00,12.0,33.0,Fair,987.47,9.0,0.0,12.0
29242,Burbank,2021-01-01 06:53:00,11.0,33.0,Fair,987.14,13.0,0.0,11.0
29243,Burbank,2021-01-01 07:53:00,10.0,33.0,Fair,987.8,6.0,0.0,10.0


The dataframe has records from the first of January 2018 to the first of January 2021.

In [52]:
charging_sessions_sorted= charging_sessions.sort_values(by='connectionTime')

In [53]:
charging_sessions_sorted.tail()

Unnamed: 0,number,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone
49115,3030,61550519f9af8b76960e169b,2021-09-13 21:17:04+00:00,2021-09-14 01:01:49+00:00,2021-09-13 23:18:07+00:00,6.715,2_39_123_23_2021-09-13 21:16:44.026068,2,CA-313,2-39-123-23,America/Los_Angeles
20176,5873,6155053bf9af8b76960e16cf,2021-09-13 21:37:59+00:00,2021-09-14 00:12:49+00:00,2021-09-14 00:12:36+00:00,8.547,1_1_178_824_2021-09-13 21:37:44.554924,1,AG-1F07,1-1-178-824,America/Los_Angeles
49117,3032,61550519f9af8b76960e169d,2021-09-13 23:11:12+00:00,2021-09-14 01:43:11+00:00,2021-09-14 00:13:35+00:00,2.018,2_39_91_437_2021-09-13 23:10:59.528292,2,CA-317,2-39-91-437,America/Los_Angeles
20177,5874,6155053bf9af8b76960e16d0,2021-09-14 01:08:16+00:00,2021-09-14 01:31:24+00:00,2021-09-14 01:31:19+00:00,1.253,1_1_179_783_2021-09-14 01:08:03.220292,1,AG-3F29,1-1-179-783,America/Los_Angeles
20178,5875,6155053bf9af8b76960e16d1,2021-09-14 05:43:39+00:00,2021-09-14 14:46:28+00:00,2021-09-14 14:46:22+00:00,53.937,1_1_178_817_2021-09-14 05:43:27.354300,1,AG-1F09,1-1-178-817,America/Los_Angeles


In [54]:
charging_sessions_sorted.head()

Unnamed: 0,number,id,connectionTime,disconnectTime,doneChargingTime,kWhDelivered,sessionID,siteID,spaceID,stationID,timezone
30793,0,5bc90cb9f9af8b0d7fe77cd2,2018-04-25 11:08:04+00:00,2018-04-25 13:20:10+00:00,2018-04-25 13:21:10+00:00,7.932,2_39_78_362_2018-04-25 11:08:04.400812,2,CA-496,2-39-78-362,America/Los_Angeles
30794,1,5bc90cb9f9af8b0d7fe77cd3,2018-04-25 13:45:10+00:00,2018-04-26 00:56:16+00:00,2018-04-25 16:44:15+00:00,10.013,2_39_95_27_2018-04-25 13:45:09.617470,2,CA-319,2-39-95-27,America/Los_Angeles
30795,2,5bc90cb9f9af8b0d7fe77cd4,2018-04-25 13:45:50+00:00,2018-04-25 23:04:45+00:00,2018-04-25 14:51:44+00:00,5.257,2_39_79_380_2018-04-25 13:45:49.962001,2,CA-489,2-39-79-380,America/Los_Angeles
30796,3,5bc90cb9f9af8b0d7fe77cd5,2018-04-25 14:37:06+00:00,2018-04-25 23:55:34+00:00,2018-04-25 16:05:22+00:00,5.177,2_39_79_379_2018-04-25 14:37:06.460772,2,CA-327,2-39-79-379,America/Los_Angeles
30797,4,5bc90cb9f9af8b0d7fe77cd6,2018-04-25 14:40:34+00:00,2018-04-25 23:03:12+00:00,2018-04-25 17:40:30+00:00,10.119,2_39_79_381_2018-04-25 14:40:33.638896,2,CA-490,2-39-79-381,America/Los_Angeles


We can see that the weather data does not quite reach the charging session dates. That means, we have weather data up until **January 2021**, but charging sessions till **September 2021**. Also, the charging session data starts at the 25th of April 2018. We will have to truncate the data such that the charging session daterange and the weather dateranges match(25-04-2018 till 01-01-2021).

In [55]:
all_hours = pd.date_range(start='2018-04-25', end='2021-01-01', freq='H')
full_df = pd.DataFrame({'timestamp': all_hours})

In [57]:
weather_range = pd.merge_asof(full_df, wba, on='timestamp', direction='backward', suffixes=('_new', '_original'))

In [59]:
weather_range

Unnamed: 0,timestamp,city,temperature,cloud_cover,cloud_cover_description,pressure,windspeed,precipitation,felt_temperature
0,2018-04-25 00:00:00,Burbank,24.0,34.0,Fair,986.81,15.0,0.0,24.0
1,2018-04-25 01:00:00,Burbank,24.0,30.0,Partly Cloudy,986.48,17.0,0.0,24.0
2,2018-04-25 02:00:00,Burbank,21.0,30.0,Partly Cloudy,986.81,13.0,0.0,21.0
3,2018-04-25 03:00:00,Burbank,19.0,29.0,Partly Cloudy,987.14,11.0,0.0,19.0
4,2018-04-25 04:00:00,Burbank,18.0,29.0,Partly Cloudy,987.47,7.0,0.0,18.0
...,...,...,...,...,...,...,...,...,...
23564,2020-12-31 20:00:00,Burbank,17.0,34.0,Fair,985.82,19.0,0.0,17.0
23565,2020-12-31 21:00:00,Burbank,18.0,34.0,Fair,984.50,26.0,0.0,18.0
23566,2020-12-31 22:00:00,Burbank,19.0,34.0,Fair,985.16,19.0,0.0,19.0
23567,2020-12-31 23:00:00,Burbank,18.0,34.0,Fair,984.83,31.0,0.0,18.0


In [60]:
weather_range['temperature'].fillna(method='ffill', inplace=True)
weather_range['precipitation'].fillna(method='ffill', inplace=True)
weather_range['cloud_cover_description'].fillna(method='ffill', inplace=True)
weather_range.drop(columns='city',inplace=True)
weather_range.drop(columns='cloud_cover',inplace=True)
weather_range.drop(columns='pressure',inplace=True)
weather_range.drop(columns='windspeed',inplace=True)
weather_range.drop(columns='felt_temperature',inplace=True)
weather_range.head(1000)

  weather_range['temperature'].fillna(method='ffill', inplace=True)
  weather_range['precipitation'].fillna(method='ffill', inplace=True)
  weather_range['cloud_cover_description'].fillna(method='ffill', inplace=True)


Unnamed: 0,timestamp,temperature,cloud_cover_description,precipitation
0,2018-04-25 00:00:00,24.0,Fair,0.0
1,2018-04-25 01:00:00,24.0,Partly Cloudy,0.0
2,2018-04-25 02:00:00,21.0,Partly Cloudy,0.0
3,2018-04-25 03:00:00,19.0,Partly Cloudy,0.0
4,2018-04-25 04:00:00,18.0,Partly Cloudy,0.0
...,...,...,...,...
995,2018-06-05 11:00:00,16.0,Cloudy,0.0
996,2018-06-05 12:00:00,16.0,Cloudy,0.0
997,2018-06-05 13:00:00,16.0,Cloudy,0.0
998,2018-06-05 14:00:00,16.0,Cloudy,0.0


In [61]:
weather_range.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23569 entries, 0 to 23568
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   timestamp                23569 non-null  datetime64[ns]
 1   temperature              23569 non-null  float64       
 2   cloud_cover_description  23569 non-null  object        
 3   precipitation            23569 non-null  float64       
dtypes: datetime64[ns](1), float64(2), object(1)
memory usage: 736.7+ KB
