In [1]:
!pip install wget



In [1]:
# import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas import datetime
import wget
import seaborn as sns # you can use other packages such as matplotlib too
sns.set(style="whitegrid")

In [2]:
# read the clean data file and load into dataframe for preprocessing
df_pedstran_preprocessing = pd.read_csv("Datafile post wrangling.csv")

In [5]:
# drop the unnecessary column
df_pedstran_preprocessing.drop(['Unnamed: 0'],axis = 1,inplace = True)

In [3]:
df_pedstran_preprocessing

Unnamed: 0.1,Unnamed: 0,Year,Month,Mdate,Day,Time,Hourly_Counts,Date_Time,Sensor_ID
0,0.0,2009,5,1,4,0,53.0,2009-05-01 00:00:00,1
1,1.0,2009,5,1,4,1,43.0,2009-05-01 01:00:00,1
2,2.0,2009,5,1,4,2,10.0,2009-05-01 02:00:00,1
3,3.0,2009,5,1,4,3,5.0,2009-05-01 03:00:00,1
4,4.0,2009,5,1,4,4,25.0,2009-05-01 04:00:00,1
...,...,...,...,...,...,...,...,...,...
3237150,451.0,2020,3,31,1,19,115.0,2020-03-31 19:00:00,65
3237151,452.0,2020,3,31,1,20,61.0,2020-03-31 20:00:00,65
3237152,453.0,2020,3,31,1,21,42.0,2020-03-31 21:00:00,65
3237153,454.0,2020,3,31,1,22,47.0,2020-03-31 22:00:00,65


In [7]:
# drop the duplicate records if exists
df_pedstran_preprocessing.drop_duplicates(subset=['Date_Time','Sensor_ID'], inplace=True)

In [8]:
df_pedstran_preprocessing

Unnamed: 0,Year,Month,Mdate,Day,Time,Hourly_Counts,Date_Time,Sensor_ID
0,2009,5,1,4,0,53.0,2009-05-01 00:00:00,1
1,2009,5,1,4,1,43.0,2009-05-01 01:00:00,1
2,2009,5,1,4,2,10.0,2009-05-01 02:00:00,1
3,2009,5,1,4,3,5.0,2009-05-01 03:00:00,1
4,2009,5,1,4,4,25.0,2009-05-01 04:00:00,1
...,...,...,...,...,...,...,...,...
3237150,2020,3,31,1,19,115.0,2020-03-31 19:00:00,65
3237151,2020,3,31,1,20,61.0,2020-03-31 20:00:00,65
3237152,2020,3,31,1,21,42.0,2020-03-31 21:00:00,65
3237153,2020,3,31,1,22,47.0,2020-03-31 22:00:00,65


In [9]:
# preprocessing for sensor id 1
sensor_id_1 = df_pedstran_preprocessing[df_pedstran_preprocessing['Sensor_ID'] == 1]
cnt = len(sensor_id_1)
cnt

95712

In [10]:
# check duplicate records exist
chk = sensor_id_1.duplicated(subset=['Date_Time','Sensor_ID']).any()
chk

False

In [11]:
# convert all the column datatypes to category
sensor_id_1 = sensor_id_1.astype('category')

In [12]:
# information of datatypes
sensor_id_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95712 entries, 0 to 95711
Data columns (total 8 columns):
Year             95712 non-null category
Month            95712 non-null category
Mdate            95712 non-null category
Day              95712 non-null category
Time             95712 non-null category
Hourly_Counts    95712 non-null category
Date_Time        95712 non-null category
Sensor_ID        95712 non-null category
dtypes: category(8)
memory usage: 5.4 MB


In [13]:
# convert the dependent variable dataype to integer
sensor_id_1['Hourly_Counts'] = sensor_id_1['Hourly_Counts'].astype('int64')

In [14]:
sensor_id_1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95712 entries, 0 to 95711
Data columns (total 8 columns):
Year             95712 non-null category
Month            95712 non-null category
Mdate            95712 non-null category
Day              95712 non-null category
Time             95712 non-null category
Hourly_Counts    95712 non-null int64
Date_Time        95712 non-null category
Sensor_ID        95712 non-null category
dtypes: category(7), int64(1)
memory usage: 5.6 MB


In [15]:
# arrange the columns
sensor_id_1 = sensor_id_1[['Date_Time','Year','Month','Mdate','Day','Time','Sensor_ID','Hourly_Counts']]

In [16]:
sensor_id_1

Unnamed: 0,Date_Time,Year,Month,Mdate,Day,Time,Sensor_ID,Hourly_Counts
0,2009-05-01 00:00:00,2009,5,1,4,0,1,53
1,2009-05-01 01:00:00,2009,5,1,4,1,1,43
2,2009-05-01 02:00:00,2009,5,1,4,2,1,10
3,2009-05-01 03:00:00,2009,5,1,4,3,1,5
4,2009-05-01 04:00:00,2009,5,1,4,4,1,25
...,...,...,...,...,...,...,...,...
95707,2020-03-31 19:00:00,2020,3,31,1,19,1,244
95708,2020-03-31 20:00:00,2020,3,31,1,20,1,84
95709,2020-03-31 21:00:00,2020,3,31,1,21,1,69
95710,2020-03-31 22:00:00,2020,3,31,1,22,1,39


In [17]:
# check if there is null value
sensor_id_1.isnull().sum()

Date_Time        0
Year             0
Month            0
Mdate            0
Day              0
Time             0
Sensor_ID        0
Hourly_Counts    0
dtype: int64

In [19]:
# drop columns using list of column names
sensor_id_1.drop(['Date_Time', 'Sensor_ID'], axis=1,inplace = True)

KeyError: "['Date_Time' 'Sensor_ID'] not found in axis"

In [20]:
sensor_id_1

Unnamed: 0,Year,Month,Mdate,Day,Time,Hourly_Counts
0,2009,5,1,4,0,53
1,2009,5,1,4,1,43
2,2009,5,1,4,2,10
3,2009,5,1,4,3,5
4,2009,5,1,4,4,25
...,...,...,...,...,...,...
95707,2020,3,31,1,19,244
95708,2020,3,31,1,20,84
95709,2020,3,31,1,21,69
95710,2020,3,31,1,22,39


In [45]:
# create dummy variable for categorical variables
sensor_id1_dummy = pd.get_dummies(sensor_id_1, columns=['Year','Month','Mdate','Day','Time'])

In [46]:
# Arrange the dependend variable to last column
list = sensor_id1_dummy.columns.tolist() # list the columns in the df
list.insert(87, list.pop(list.index('Hourly_Counts'))) # Assign new position
sensor_id1_dummy = sensor_id1_dummy.reindex(columns= list) # Now move 'Hourly_Counts' to ist new position

In [47]:
sensor_id1_dummy

Unnamed: 0,Year_2009,Year_2010,Year_2011,Year_2012,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,Year_2018,...,Time_15,Time_16,Time_17,Time_18,Time_19,Time_20,Time_21,Time_22,Time_23,Hourly_Counts
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,53
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,43
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95707,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,244
95708,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,84
95709,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,69
95710,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,39


In [49]:
# divide the data into 80:20 ratio. 80% for train data and 20% for test data for dependent and independent variables respectively
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(sensor_id1_dummy.iloc[:,:-1],sensor_id1_dummy['Hourly_Counts'], 
                                                    test_size=0.2)

In [56]:
# load the data into file after removing duplicate records
Y_train.to_csv('Dependent_Train_data1.csv',index=False)
Y_test.to_csv('Dependent_Test_data1.csv',index = False)
X_train.to_csv('Independent_Train_data1.csv',index = False)
X_test.to_csv('Independent_Test_data1.csv',index = False)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [59]:
# Preprocessing for all the sensors
var_catgry = df_pedstran_preprocessing.astype('category') # convert all column datatypes to category
var_catgry['Hourly_Counts'] = var_catgry['Hourly_Counts'].astype('int64') # convert the dependent variable datatype as int
# Re-arrange the corder of columns
var_catgry = var_catgry[['Date_Time','Year','Month','Mdate','Day','Time','Sensor_ID','Hourly_Counts']]
var_catgry.drop(['Date_Time'],axis = 1,inplace = True) # drop the columns which are not required
# create the dummy variable
var_catgry_dummy = pd.get_dummies(var_catgry, columns=['Year','Month','Mdate','Day','Time'])
# Arrange the dependend variable to last column
list = var_catgry_dummy.columns.tolist() # list the columns in the df
list.insert(88, list.pop(list.index('Hourly_Counts'))) # Assign new position 
var_catgry_dummy_argnd = var_catgry_dummy.reindex(columns= list) # Now move 'Hourly_Counts' to ist new position
var_catgry_dummy_argnd

Unnamed: 0,Sensor_ID,Year_2009,Year_2010,Year_2011,Year_2012,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,...,Time_15,Time_16,Time_17,Time_18,Time_19,Time_20,Time_21,Time_22,Time_23,Hourly_Counts
0,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,53
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,43
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3237150,65,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,115
3237151,65,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,61
3237152,65,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,42
3237153,65,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,47


In [71]:
"""divide the data into 80:20 ratio. 80% for train data and 20% for test data for dependent and independent variables 
respectively and load the data into files"""
colmns = var_catgry_dummy_argnd.columns  # headers for the columns
for i in var_catgry_dummy_argnd['Sensor_ID'].unique():
    print('working on sensor',i)
    sensor_id=var_catgry_dummy_argnd[var_catgry_dummy_argnd['Sensor_ID']==i]
    X_train, X_test, Y_train, Y_test = train_test_split(sensor_id.iloc[:,:-1],sensor_id['Hourly_Counts'], test_size=0.2)
    sensor_id.columns = colmns
    Y_train.to_csv('Dependent_Train_data{n}.csv'.format(n=i),index = False)
    Y_test.to_csv('Dependent_Test_data{n}.csv'.format(n=i),index = False)
    X_train.to_csv('Independent_Train_data{n}.csv'.format(n=i),index = False)
    X_test.to_csv('Independent_Test_data{n}.csv'.format(n=i),index = False)

working on sensor 1


  import sys
  


working on sensor 2
working on sensor 3
working on sensor 4
working on sensor 5
working on sensor 6
working on sensor 7
working on sensor 8
working on sensor 9
working on sensor 10
working on sensor 11
working on sensor 12
working on sensor 13
working on sensor 14
working on sensor 15
working on sensor 16
working on sensor 17
working on sensor 18
working on sensor 19
working on sensor 20
working on sensor 21
working on sensor 22
working on sensor 23
working on sensor 24
working on sensor 25
working on sensor 26
working on sensor 27
working on sensor 28
working on sensor 29
working on sensor 30
working on sensor 31
working on sensor 32
working on sensor 33
working on sensor 34
working on sensor 35
working on sensor 36
working on sensor 37
working on sensor 38
working on sensor 39
working on sensor 40
working on sensor 41
working on sensor 42
working on sensor 43
working on sensor 44
working on sensor 45
working on sensor 46
working on sensor 47
working on sensor 48
working on sensor 49
