In [1]:
# homogeneous multidimensional arrays

import numpy as np

a = np.array([[1, 2, 3],
              [3, 2, 1]])
a

array([[1, 2, 3],
       [3, 2, 1]])

In [2]:
print(a.ndim)  #number of dimensions

2


In [3]:
print(a.shape) # shape, rows x columns

(2, 3)


In [4]:
np.zeros((2, 3, 2)) # 2 x 3 x 2 array of zeros

array([[[0., 0.],
        [0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.],
        [0., 0.]]])

In [5]:
np.ones_like(a)

array([[1, 1, 1],
       [1, 1, 1]])

In [6]:
np.arange(1, 10, 1)

array([1, 2, 3, 4, 5, 6, 7, 8, 9])

In [7]:
np.arange(0, 1, 0.1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

In [8]:
# seed used to create a reproducible random example
np.random.seed(3)
np.random.randint(1, 10, (3, 4))

array([[9, 4, 9, 9],
       [1, 6, 4, 6],
       [8, 7, 1, 5]])

In [9]:
# create a 1D array through repetition
np.repeat(10, 5)

array([10, 10, 10, 10, 10])

In [10]:
# create a 2D array through repetition
onedim_arr = np.array([1, 2, 3, 4, 5])
onedim_arr

array([1, 2, 3, 4, 5])

In [11]:
multidim_arr = np.tile(onedim_arr, (5, 1))
multidim_arr

array([[1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5],
       [1, 2, 3, 4, 5]])

In [12]:
# np.repeat on the 1D array repeats each element 5 times in a row, so despite the
# variable name the result here is still 1D (compare with the np.tile result, which is 5 x 5)
multidim_arr = np.repeat(onedim_arr, 5)
multidim_arr

array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 5, 5,
       5, 5, 5])

In [13]:
# create the sample Generator
rng = np.random.default_rng(seed=42)

# create a 5 x 4 array w/ normally distributed data
# mean 10, std 2.5
randrng = rng.normal(10, 2.5, (5, 4))
randrng

array([[10.7617927 ,  7.40003973, 11.87612799, 12.35141179],
       [ 5.12241203,  6.74455123, 10.31960101,  9.20939352],
       [ 9.95799711,  7.86739018, 12.19849494, 11.94447984],
       [10.16507674, 12.81810302, 11.16877336,  7.85176884],
       [10.92187696,  7.6027935 , 12.19612575,  9.87518522]])

In [14]:
# using the same Generator to draw a random sample from a Poisson distribution
# lam is 1 and size is 5x6

poissonrng = rng.poisson(1, (5, 6))
poissonrng

array([[1, 3, 1, 0, 3, 1],
       [0, 0, 1, 1, 2, 2],
       [0, 1, 0, 0, 3, 1],
       [0, 0, 1, 2, 1, 0],
       [0, 1, 1, 0, 0, 0]])

In [15]:
b = np.array([[5, 4, 3, 2, 1, 0],
              [10, 8, 6, 4, 2, 0]])
b

array([[ 5,  4,  3,  2,  1,  0],
       [10,  8,  6,  4,  2,  0]])

In [16]:
#transpose
b.T

array([[ 5, 10],
       [ 4,  8],
       [ 3,  6],
       [ 2,  4],
       [ 1,  2],
       [ 0,  0]])

In [17]:
b

array([[ 5,  4,  3,  2,  1,  0],
       [10,  8,  6,  4,  2,  0]])

In [18]:
# change dimensions
b.reshape(4, 3)

array([[ 5,  4,  3],
       [ 2,  1,  0],
       [10,  8,  6],
       [ 4,  2,  0]])

In [19]:
b.flatten()

array([ 5,  4,  3,  2,  1,  0, 10,  8,  6,  4,  2,  0])

In [20]:
a

array([[1, 2, 3],
       [3, 2, 1]])

In [21]:
b

array([[ 5,  4,  3,  2,  1,  0],
       [10,  8,  6,  4,  2,  0]])

In [22]:
# stack a and b horizontally
np.hstack((b, a))

array([[ 5,  4,  3,  2,  1,  0,  1,  2,  3],
       [10,  8,  6,  4,  2,  0,  3,  2,  1]])

In [23]:
# stack a and b vertically
np.vstack((b.reshape(4, 3), a))

array([[ 5,  4,  3],
       [ 2,  1,  0],
       [10,  8,  6],
       [ 4,  2,  0],
       [ 1,  2,  3],
       [ 3,  2,  1]])

In [24]:
arr1 = np.array([5, 10, 15, 20])
arr1

array([ 5, 10, 15, 20])

In [25]:
arr2 = np.arange(5, 9)
arr2

array([5, 6, 7, 8])

In [26]:
arr1 - arr2

array([ 0,  4,  8, 12])

In [27]:
arr1 * 2

array([10, 20, 30, 40])

In [29]:
# arr2 has 4 elements and the other operand has 2: the lengths differ and neither is 1,
# so the shapes cannot be broadcast together and numpy raises ValueError.
# Catch the error so this demonstration does not stop a Restart & Run All.
try:
  arr2 + np.array([1, 2])
except ValueError as err:
  print(f'ValueError: {err}')

ValueError: ignored

In [30]:
arr1

array([ 5, 10, 15, 20])

In [31]:
print(f'''arr1 sums to {arr1.sum()}.
Its max value is {arr1.max()}, and its mean is {arr1.mean()}.''')

arr1 sums to 50.
Its max value is 20, and its mean is 12.5.


In [32]:
np.median(arr1)

12.5

In [34]:
# unlike sum(), max() and mean(), median is not available as an array method;
# use the function form np.median(arr1) instead.
# Catch the error so this demonstration does not stop a Restart & Run All.
try:
  arr1.median()
except AttributeError as err:
  print(f'AttributeError: {err}')

AttributeError: ignored

In [35]:
tens = np.arange(0, 120, 10).reshape(3, 4)
tens

array([[  0,  10,  20,  30],
       [ 40,  50,  60,  70],
       [ 80,  90, 100, 110]])

In [36]:
horizontal = np.array([-5, -10, -15, -20])
horizontal

array([ -5, -10, -15, -20])

In [37]:
tens + horizontal

array([[-5,  0,  5, 10],
       [35, 40, 45, 50],
       [75, 80, 85, 90]])

In [38]:
vertical = np.array([[100],
                     [200],
                     [300]])

In [39]:
vertical

array([[100],
       [200],
       [300]])

In [40]:
tens + vertical

array([[100, 110, 120, 130],
       [240, 250, 260, 270],
       [380, 390, 400, 410]])

In [41]:
tens

array([[  0,  10,  20,  30],
       [ 40,  50,  60,  70],
       [ 80,  90, 100, 110]])

In [42]:
# axis = 0 --> calculating values for each column
tens.mean(axis=0)

array([40., 50., 60., 70.])

In [43]:
# axis = 1 --> calculating values for each row
tens.mean(axis=1)

array([15., 55., 95.])

In [44]:
arr1

array([ 5, 10, 15, 20])

In [45]:
arr1[1]

10

In [46]:
arr1[1:3]

array([10, 15])

In [47]:
tens

array([[  0,  10,  20,  30],
       [ 40,  50,  60,  70],
       [ 80,  90, 100, 110]])

In [48]:
# index according to row, column
tens[1, 2]

60

In [49]:
# get the first row
tens[0]

array([ 0, 10, 20, 30])

In [50]:
# get the first column
tens[:,0]

array([ 0, 40, 80])

In [51]:
# slice rows 0 and 1, columns 1 and 2
tens[0:2, 1:3]

array([[10, 20],
       [50, 60]])

In [52]:
# create a 3 x 4 array of random integers from 1 to 10 (the upper bound 11 is exclusive):
# draw 12 values, then reshape them into 3 rows x 4 columns
# NOTE(review): this uses the legacy global RNG that was seeded with np.random.seed(3) in an
# earlier cell, so the values shown presumably depend on the cells being run in order — confirm
matrix = np.random.randint(1, 11, 12).reshape(3, 4)
matrix

array([[8, 9, 2, 7],
       [3, 3, 2, 4],
       [6, 9, 2, 9]])

In [53]:
# plain assignment does not copy: matrix2 is a second name for the same array as matrix,
# so a change made through matrix2 also shows up in matrix
matrix2 = matrix

In [54]:
# .copy() creates an independent array: later changes to matrix do not affect matrix3
matrix3 = matrix.copy()

In [55]:
matrix2[1] = [0, 0, 0, 0]
matrix2

array([[8, 9, 2, 7],
       [0, 0, 0, 0],
       [6, 9, 2, 9]])

In [56]:
matrix

array([[8, 9, 2, 7],
       [0, 0, 0, 0],
       [6, 9, 2, 9]])

In [57]:
matrix3

array([[8, 9, 2, 7],
       [3, 3, 2, 4],
       [6, 9, 2, 9]])

In [58]:
tens

array([[  0,  10,  20,  30],
       [ 40,  50,  60,  70],
       [ 80,  90, 100, 110]])

In [59]:
# evaluate whether each element is divisible by 3

tens % 3 == 0


array([[ True, False, False,  True],
       [False, False,  True, False],
       [False,  True, False, False]])

In [60]:
tens[tens % 3 == 0]

array([ 0, 30, 60, 90])

In [61]:
mask = (tens % 3 == 0)
mask

array([[ True, False, False,  True],
       [False, False,  True, False],
       [False,  True, False, False]])

In [62]:
tens[mask]

array([ 0, 30, 60, 90])

In [63]:
tens

array([[  0,  10,  20,  30],
       [ 40,  50,  60,  70],
       [ 80,  90, 100, 110]])

In [64]:
tens % 3 == 0

array([[ True, False, False,  True],
       [False, False,  True, False],
       [False,  True, False, False]])

In [65]:
np.where(tens % 3 == 0, #condition
         tens, # return the element if True
         0) # return 0 if False

array([[ 0,  0,  0, 30],
       [ 0,  0, 60,  0],
       [ 0, 90,  0,  0]])

In [66]:
# pandas --> data analysis and manipulation

import numpy as np

# pd is the conventional alias for pandas
import pandas as pd

# display all columns
pd.set_option("display.max_columns", None)

In [67]:
# columns are labeled with their names
# rows have a label, or index --> pandas uses integers starting at 0 by default
# each column is a Series: a one-dimensional array whose values share the same data type
# unlike numpy arrays, DataFrames can have columns of different data types
# DataFrames are mutable

In [68]:
trees = pd.DataFrame({
    'name': ['sugar maple', 'black oak', 'white ash', 'douglas fir'],
    'avg_lifespan': [300, 100, 260, 450],
    'quantity': [53, 207, 178, 93]
})

In [69]:
trees

Unnamed: 0,name,avg_lifespan,quantity
0,sugar maple,300,53
1,black oak,100,207
2,white ash,260,178
3,douglas fir,450,93


In [70]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [71]:
thefts = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/bicycle-thefts - 4326.csv')

In [72]:
thefts.shape

(25569, 33)

In [73]:
thefts.dtypes

_id                        int64
OBJECTID                   int64
event_unique_id           object
Primary_Offence           object
Occurrence_Date           object
Occurrence_Year            int64
Occurrence_Month          object
Occurrence_DayOfWeek      object
Occurrence_DayOfMonth      int64
Occurrence_DayOfYear       int64
Occurrence_Hour            int64
Report_Date               object
Report_Year                int64
Report_Month              object
Report_DayOfWeek          object
Report_DayOfMonth          int64
Report_DayOfYear           int64
Report_Hour                int64
Division                  object
City                      object
Hood_ID                   object
NeighbourhoodName         object
Location_Type             object
Premises_Type             object
Bike_Make                 object
Bike_Model                object
Bike_Type                 object
Bike_Speed                 int64
Bike_Colour               object
Cost_of_Bike             float64
Status    

In [74]:
thefts.head()

Unnamed: 0,_id,OBJECTID,event_unique_id,Primary_Offence,Occurrence_Date,Occurrence_Year,Occurrence_Month,Occurrence_DayOfWeek,Occurrence_DayOfMonth,Occurrence_DayOfYear,Occurrence_Hour,Report_Date,Report_Year,Report_Month,Report_DayOfWeek,Report_DayOfMonth,Report_DayOfYear,Report_Hour,Division,City,Hood_ID,NeighbourhoodName,Location_Type,Premises_Type,Bike_Make,Bike_Model,Bike_Type,Bike_Speed,Bike_Colour,Cost_of_Bike,Status,ObjectId2,geometry
0,1,17744,GO-20179016397,THEFT UNDER,2017-10-03T00:00:00,2017,October,Tuesday,3,276,14,2017-10-03T00:00:00,2017,October,Tuesday,3,276,18,D22,Toronto,15,Kingsway South (15),"Streets, Roads, Highways (Bicycle Path, Privat...",Outside,GI,ESCAPE 2,OT,7,BLK,700.0,STOLEN,1,"{'type': 'Point', 'coordinates': (-79.50655965..."
1,2,17759,GO-20172033056,THEFT UNDER - BICYCLE,2017-11-08T00:00:00,2017,November,Wednesday,8,312,3,2017-11-08T00:00:00,2017,November,Wednesday,8,312,22,D22,Toronto,15,Kingsway South (15),"Single Home, House (Attach Garage, Cottage, Mo...",House,UNKNOWN MAKE,,TO,1,BLK,1100.0,RECOVERED,2,"{'type': 'Point', 'coordinates': (-79.50484874..."
2,3,17906,GO-20189030822,THEFT UNDER - BICYCLE,2018-09-14T00:00:00,2018,September,Friday,14,257,9,2018-09-17T00:00:00,2018,September,Monday,17,260,16,D22,Toronto,15,Kingsway South (15),Ttc Subway Station,Transit,OT,CROSSTRAIL,MT,24,BLK,904.0,STOLEN,3,"{'type': 'Point', 'coordinates': (-79.51170915..."
3,4,17962,GO-2015804467,THEFT UNDER,2015-05-07T00:00:00,2015,May,Thursday,7,127,18,2015-05-14T00:00:00,2015,May,Thursday,14,134,14,D22,Toronto,15,Kingsway South (15),Ttc Subway Station,Transit,GT,,TO,10,BLKDGR,400.0,STOLEN,4,"{'type': 'Point', 'coordinates': (-79.51170915..."
4,5,17963,GO-20159002781,THEFT UNDER,2015-05-16T00:00:00,2015,May,Saturday,16,136,12,2015-05-16T00:00:00,2015,May,Saturday,16,136,15,D22,Toronto,15,Kingsway South (15),Ttc Subway Station,Transit,GI,,MT,6,RED,600.0,STOLEN,5,"{'type': 'Point', 'coordinates': (-79.51132657..."


In [75]:
thefts.tail(2)

Unnamed: 0,_id,OBJECTID,event_unique_id,Primary_Offence,Occurrence_Date,Occurrence_Year,Occurrence_Month,Occurrence_DayOfWeek,Occurrence_DayOfMonth,Occurrence_DayOfYear,Occurrence_Hour,Report_Date,Report_Year,Report_Month,Report_DayOfWeek,Report_DayOfMonth,Report_DayOfYear,Report_Hour,Division,City,Hood_ID,NeighbourhoodName,Location_Type,Premises_Type,Bike_Make,Bike_Model,Bike_Type,Bike_Speed,Bike_Colour,Cost_of_Bike,Status,ObjectId2,geometry
25567,25568,11695,GO-20161170896,THEFT UNDER,2016-07-04T00:00:00,2016,July,Monday,4,186,20,2016-07-04T00:00:00,2016,July,Monday,4,186,20,D42,Toronto,132,Malvern (132),Other Commercial / Corporate Places (For Profi...,Commercial,UNKNOWN MAKE,,SC,1,,3000.0,STOLEN,25568,"{'type': 'Point', 'coordinates': (-79.20060719..."
25568,25569,11883,GO-20169007653,THEFT UNDER - BICYCLE,2016-07-22T00:00:00,2016,July,Friday,22,204,9,2016-07-23T00:00:00,2016,July,Saturday,23,205,11,D42,Toronto,132,Malvern (132),"Parking Lots (Apt., Commercial Or Non-Commercial)",Outside,SU,ASCENT MOUNTAIN,MT,21,ONG,200.0,STOLEN,25569,"{'type': 'Point', 'coordinates': (-79.23734742..."


In [76]:
thefts = thefts.rename(columns=str.lower)

In [77]:
thefts.head()

Unnamed: 0,_id,objectid,event_unique_id,primary_offence,occurrence_date,occurrence_year,occurrence_month,occurrence_dayofweek,occurrence_dayofmonth,occurrence_dayofyear,occurrence_hour,report_date,report_year,report_month,report_dayofweek,report_dayofmonth,report_dayofyear,report_hour,division,city,hood_id,neighbourhoodname,location_type,premises_type,bike_make,bike_model,bike_type,bike_speed,bike_colour,cost_of_bike,status,objectid2,geometry
0,1,17744,GO-20179016397,THEFT UNDER,2017-10-03T00:00:00,2017,October,Tuesday,3,276,14,2017-10-03T00:00:00,2017,October,Tuesday,3,276,18,D22,Toronto,15,Kingsway South (15),"Streets, Roads, Highways (Bicycle Path, Privat...",Outside,GI,ESCAPE 2,OT,7,BLK,700.0,STOLEN,1,"{'type': 'Point', 'coordinates': (-79.50655965..."
1,2,17759,GO-20172033056,THEFT UNDER - BICYCLE,2017-11-08T00:00:00,2017,November,Wednesday,8,312,3,2017-11-08T00:00:00,2017,November,Wednesday,8,312,22,D22,Toronto,15,Kingsway South (15),"Single Home, House (Attach Garage, Cottage, Mo...",House,UNKNOWN MAKE,,TO,1,BLK,1100.0,RECOVERED,2,"{'type': 'Point', 'coordinates': (-79.50484874..."
2,3,17906,GO-20189030822,THEFT UNDER - BICYCLE,2018-09-14T00:00:00,2018,September,Friday,14,257,9,2018-09-17T00:00:00,2018,September,Monday,17,260,16,D22,Toronto,15,Kingsway South (15),Ttc Subway Station,Transit,OT,CROSSTRAIL,MT,24,BLK,904.0,STOLEN,3,"{'type': 'Point', 'coordinates': (-79.51170915..."
3,4,17962,GO-2015804467,THEFT UNDER,2015-05-07T00:00:00,2015,May,Thursday,7,127,18,2015-05-14T00:00:00,2015,May,Thursday,14,134,14,D22,Toronto,15,Kingsway South (15),Ttc Subway Station,Transit,GT,,TO,10,BLKDGR,400.0,STOLEN,4,"{'type': 'Point', 'coordinates': (-79.51170915..."
4,5,17963,GO-20159002781,THEFT UNDER,2015-05-16T00:00:00,2015,May,Saturday,16,136,12,2015-05-16T00:00:00,2015,May,Saturday,16,136,15,D22,Toronto,15,Kingsway South (15),Ttc Subway Station,Transit,GI,,MT,6,RED,600.0,STOLEN,5,"{'type': 'Point', 'coordinates': (-79.51132657..."


In [78]:
thefts = thefts.rename(columns={'cost_of_bike':'bike_cost'})

In [79]:
thefts.head()

Unnamed: 0,_id,objectid,event_unique_id,primary_offence,occurrence_date,occurrence_year,occurrence_month,occurrence_dayofweek,occurrence_dayofmonth,occurrence_dayofyear,occurrence_hour,report_date,report_year,report_month,report_dayofweek,report_dayofmonth,report_dayofyear,report_hour,division,city,hood_id,neighbourhoodname,location_type,premises_type,bike_make,bike_model,bike_type,bike_speed,bike_colour,bike_cost,status,objectid2,geometry
0,1,17744,GO-20179016397,THEFT UNDER,2017-10-03T00:00:00,2017,October,Tuesday,3,276,14,2017-10-03T00:00:00,2017,October,Tuesday,3,276,18,D22,Toronto,15,Kingsway South (15),"Streets, Roads, Highways (Bicycle Path, Privat...",Outside,GI,ESCAPE 2,OT,7,BLK,700.0,STOLEN,1,"{'type': 'Point', 'coordinates': (-79.50655965..."
1,2,17759,GO-20172033056,THEFT UNDER - BICYCLE,2017-11-08T00:00:00,2017,November,Wednesday,8,312,3,2017-11-08T00:00:00,2017,November,Wednesday,8,312,22,D22,Toronto,15,Kingsway South (15),"Single Home, House (Attach Garage, Cottage, Mo...",House,UNKNOWN MAKE,,TO,1,BLK,1100.0,RECOVERED,2,"{'type': 'Point', 'coordinates': (-79.50484874..."
2,3,17906,GO-20189030822,THEFT UNDER - BICYCLE,2018-09-14T00:00:00,2018,September,Friday,14,257,9,2018-09-17T00:00:00,2018,September,Monday,17,260,16,D22,Toronto,15,Kingsway South (15),Ttc Subway Station,Transit,OT,CROSSTRAIL,MT,24,BLK,904.0,STOLEN,3,"{'type': 'Point', 'coordinates': (-79.51170915..."
3,4,17962,GO-2015804467,THEFT UNDER,2015-05-07T00:00:00,2015,May,Thursday,7,127,18,2015-05-14T00:00:00,2015,May,Thursday,14,134,14,D22,Toronto,15,Kingsway South (15),Ttc Subway Station,Transit,GT,,TO,10,BLKDGR,400.0,STOLEN,4,"{'type': 'Point', 'coordinates': (-79.51170915..."
4,5,17963,GO-20159002781,THEFT UNDER,2015-05-16T00:00:00,2015,May,Saturday,16,136,12,2015-05-16T00:00:00,2015,May,Saturday,16,136,15,D22,Toronto,15,Kingsway South (15),Ttc Subway Station,Transit,GI,,MT,6,RED,600.0,STOLEN,5,"{'type': 'Point', 'coordinates': (-79.51132657..."


In [80]:
thefts['status'].unique()

array(['STOLEN', 'RECOVERED', 'UNKNOWN'], dtype=object)

In [81]:
thefts['status'].value_counts()

STOLEN       24807
UNKNOWN        454
RECOVERED      308
Name: status, dtype: int64

In [82]:
thefts['bike_cost'].median()

600.0

In [83]:
thefts['bike_cost'].quantile(0.9)

2000.0

In [84]:
thefts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25569 entries, 0 to 25568
Data columns (total 33 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   _id                    25569 non-null  int64  
 1   objectid               25569 non-null  int64  
 2   event_unique_id        25569 non-null  object 
 3   primary_offence        25569 non-null  object 
 4   occurrence_date        25569 non-null  object 
 5   occurrence_year        25569 non-null  int64  
 6   occurrence_month       25569 non-null  object 
 7   occurrence_dayofweek   25569 non-null  object 
 8   occurrence_dayofmonth  25569 non-null  int64  
 9   occurrence_dayofyear   25569 non-null  int64  
 10  occurrence_hour        25569 non-null  int64  
 11  report_date            25569 non-null  object 
 12  report_year            25569 non-null  int64  
 13  report_month           25569 non-null  object 
 14  report_dayofweek       25569 non-null  object 
 15  re

In [85]:
thefts['occurrence_date']

0        2017-10-03T00:00:00
1        2017-11-08T00:00:00
2        2018-09-14T00:00:00
3        2015-05-07T00:00:00
4        2015-05-16T00:00:00
                ...         
25564    2015-04-01T00:00:00
25565    2016-05-16T00:00:00
25566    2016-06-04T00:00:00
25567    2016-07-04T00:00:00
25568    2016-07-22T00:00:00
Name: occurrence_date, Length: 25569, dtype: object

In [86]:
thefts['occurrence_date'] = pd.to_datetime(thefts['occurrence_date'])
thefts['occurrence_date']

0       2017-10-03
1       2017-11-08
2       2018-09-14
3       2015-05-07
4       2015-05-16
           ...    
25564   2015-04-01
25565   2016-05-16
25566   2016-06-04
25567   2016-07-04
25568   2016-07-22
Name: occurrence_date, Length: 25569, dtype: datetime64[ns]

In [87]:
thefts['report_date'] = pd.to_datetime(thefts['report_date'])
thefts['report_date']

0       2017-10-03
1       2017-11-08
2       2018-09-17
3       2015-05-14
4       2015-05-16
           ...    
25564   2015-04-01
25565   2016-05-16
25566   2016-06-07
25567   2016-07-04
25568   2016-07-23
Name: report_date, Length: 25569, dtype: datetime64[ns]

In [88]:
thefts['status'] = thefts['status'].astype('category')
thefts['status']

0           STOLEN
1        RECOVERED
2           STOLEN
3           STOLEN
4           STOLEN
           ...    
25564       STOLEN
25565       STOLEN
25566       STOLEN
25567       STOLEN
25568       STOLEN
Name: status, Length: 25569, dtype: category
Categories (3, object): ['RECOVERED', 'STOLEN', 'UNKNOWN']

In [89]:
thefts['premises_type'].unique()

array(['Outside', 'House', 'Transit', 'Other', 'Educational', 'Apartment',
       'Commercial'], dtype=object)

In [90]:
thefts[['location_type', 'premises_type']] = thefts[['location_type', 'premises_type']].astype('category')

In [91]:
thefts[['location_type', 'premises_type']].dtypes

location_type    category
premises_type    category
dtype: object

In [92]:
# describe() summarizes only numeric columns by default; pass `include='all'` to consider all columns
# NOTE(review): `datetime_is_numeric` appears to be accepted only by pandas 1.x (pandas 2.0 removed it
# and always summarizes datetimes numerically) — confirm against the installed pandas version

thefts.describe(include='all', datetime_is_numeric=True)

Unnamed: 0,_id,objectid,event_unique_id,primary_offence,occurrence_date,occurrence_year,occurrence_month,occurrence_dayofweek,occurrence_dayofmonth,occurrence_dayofyear,occurrence_hour,report_date,report_year,report_month,report_dayofweek,report_dayofmonth,report_dayofyear,report_hour,division,city,hood_id,neighbourhoodname,location_type,premises_type,bike_make,bike_model,bike_type,bike_speed,bike_colour,bike_cost,status,objectid2,geometry
count,25569.0,25569.0,25569,25569,25569,25569.0,25569,25569,25569.0,25569.0,25569.0,25569,25569.0,25569,25569,25569.0,25569.0,25569.0,25569,25569,25569.0,25569,25569,25569,25448,15923,25569,25569.0,23508,23825.0,25569,25569.0,25569
unique,,,22771,66,,,12,7,,,,,,12,7,,,,18,2,141.0,141,42,7,820,8097,13,,252,,3,,5816
top,,,GO-20201550944,THEFT UNDER,,,July,Friday,,,,,,July,Monday,,,,D14,Toronto,77.0,Waterfront Communities-The Island (77),"Apartment (Rooming House, Condo)",Outside,OT,UNKNOWN,MT,,BLK,,STOLEN,,"{'type': 'Point', 'coordinates': (-79.38372586..."
freq,,,14,11904,,,4002,3924,,,,,,3988,4318,,,,4580,25560,2576.0,2576,5887,7960,4991,304,8245,,7422,,24807,,167
mean,12785.0,12909.173218,,,2017-09-04 03:39:28.321013504,2017.124174,,,15.616684,202.227698,13.274395,2017-09-12 12:02:37.127771904,2017.143572,,,15.92487,203.493723,14.224139,,,,,,,,,,14.164144,,949.542371,,12785.0,
min,1.0,1.0,,,2009-09-01 00:00:00,2009.0,,,1.0,1.0,0.0,2014-01-01 00:00:00,2014.0,,,1.0,1.0,0.0,,,,,,,,,,0.0,,0.0,,1.0,
25%,6393.0,6456.0,,,2016-01-06 00:00:00,2016.0,,,8.0,153.0,9.0,2016-01-22 00:00:00,2016.0,,,9.0,154.0,11.0,,,,,,,,,,6.0,,350.0,,6393.0,
50%,12785.0,12918.0,,,2017-09-05 00:00:00,2017.0,,,16.0,205.0,14.0,2017-09-12 00:00:00,2017.0,,,16.0,206.0,14.0,,,,,,,,,,15.0,,600.0,,12785.0,
75%,19177.0,19360.0,,,2019-06-20 00:00:00,2019.0,,,23.0,259.0,19.0,2019-06-26 00:00:00,2019.0,,,23.0,260.0,18.0,,,,,,,,,,21.0,,1000.0,,19177.0,
max,25569.0,25806.0,,,2020-12-30 00:00:00,2020.0,,,31.0,366.0,23.0,2020-12-31 00:00:00,2020.0,,,31.0,366.0,23.0,,,,,,,,,,99.0,,120000.0,,25569.0,


In [93]:
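# read_csv() loads CSV files; read_excel() loads Excel workbooks
# sheet_name=None reads every sheet and returns a dict (sheet name --> DataFrame)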

delays = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/data/ttc-subway-delay-data-2021.xlsx', sheet_name=None)

In [94]:
type(delays)

dict

In [95]:
# delays is a dict of key, value pairs
# keys --> sheet name
# values --> DataFrame with that sheet's contents

for sheet, values in delays.items(): # e.g. sheet <-- 'January21', values <-- that sheet's DataFrame
  # print the number of rows
  print(f'Adding {values.shape[0]} rows from {sheet}')

# stack all sheets in a single concat call instead of growing a DataFrame inside the loop,
# which would re-copy the accumulated rows on every iteration
sheet_frames = list(delays.values())
all_delays = (pd.concat(sheet_frames,
                        axis=0,
                        ignore_index=True) #reset row labels
              if sheet_frames else pd.DataFrame()) # pd.concat raises on an empty list



Adding 1216 rows from January21
Adding 1245 rows from Feb 21
Adding 1167 rows from March '21
Adding 1170 rows from April '21
Adding 1168 rows from May '21
Adding 1265 rows from June 21
Adding 1244 rows from July 21
Adding 1273 rows from August 21
Adding 1433 rows from Sept 21
Adding 1560 rows from Oct 21
Adding 1771 rows from Nov 21
Adding 1858 rows from December21


In [96]:
all_delays.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16370 entries, 0 to 16369
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       16370 non-null  datetime64[ns]
 1   Time       16370 non-null  object        
 2   Day        16370 non-null  object        
 3   Station    16370 non-null  object        
 4   Code       16370 non-null  object        
 5   Min Delay  16370 non-null  int64         
 6   Min Gap    16370 non-null  int64         
 7   Bound      12119 non-null  object        
 8   Line       16318 non-null  object        
 9   Vehicle    16370 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(6)
memory usage: 1.2+ MB


In [97]:
all_delays.head()

Unnamed: 0,Date,Time,Day,Station,Code,Min Delay,Min Gap,Bound,Line,Vehicle
0,2021-01-01,00:33,Friday,BLOOR STATION,MUPAA,0,0,N,YU,6046
1,2021-01-01,00:39,Friday,SHERBOURNE STATION,EUCO,5,9,E,BD,5250
2,2021-01-01,01:07,Friday,KENNEDY BD STATION,EUCD,5,9,E,BD,5249
3,2021-01-01,01:41,Friday,ST CLAIR STATION,MUIS,0,0,,YU,0
4,2021-01-01,02:04,Friday,SHEPPARD WEST STATION,MUIS,0,0,,YU,0


In [98]:
all_delays.tail()

Unnamed: 0,Date,Time,Day,Station,Code,Min Delay,Min Gap,Bound,Line,Vehicle
16365,2021-12-31,01:10,Friday,MUSEUM STATION,SUUT,0,0,N,YU,5591
16366,2021-12-31,01:12,Friday,FINCH STATION,SUDP,5,10,S,YU,5983
16367,2021-12-31,01:21,Friday,EGLINTON WEST STATION,PUOPO,3,8,N,YU,6046
16368,2021-12-31,01:37,Friday,SHEPPARD WEST STATION,SUDP,0,0,S,YU,5536
16369,2021-12-31,07:00,Friday,DON MILLS STATION,TUSC,0,0,E,SHP,6146


In [99]:
dr = pd.read_excel('/content/drive/MyDrive/Colab Notebooks/data/ttc-subway-delay-codes.xlsx', sheet_name=None)

type(dr)

dict

In [100]:
# dr maps sheet name --> DataFrame; stack all sheets in a single concat call
# instead of growing a DataFrame inside a loop
code_frames = list(dr.values())
delay_reasons = (pd.concat(code_frames,
                           axis=0,
                           ignore_index=True) # reset row labels
                 if code_frames else pd.DataFrame()) # pd.concat raises on an empty list
delay_reasons

Unnamed: 0,RMENU CODE,CODE DESCRIPTION,SUB OR SRT
0,EUAC,Air Conditioning,SUB
1,EUAL,Alternating Current,SUB
2,EUATC,ATC RC&S Equipment,SUB
3,EUBK,Brakes,SUB
4,EUBO,Body,SUB
...,...,...,...
195,TRNOA,No Operator Immediately Available,SRT
196,TRO,Transportation Department - Other,SRT
197,TRSET,Train Controls Improperly Shut Down,SRT
198,TRST,Storm Trains,SRT


In [101]:
def clean_names(string):
  """Return a snake_case version of a column label.

  The label is converted to text, lower-cased, and every run of whitespace
  is replaced by a single underscore. Leading and trailing whitespace is
  dropped, so a padded header such as ' Min  Gap ' becomes 'min_gap'
  rather than '_min__gap_'.

  Parameters
  ----------
  string : str or any label
      Column label to clean. Non-string labels (e.g. integers) are
      converted with str() first instead of raising AttributeError.

  Returns
  -------
  str
      The cleaned label, e.g. 'RMENU CODE' -> 'rmenu_code'.
  """
  # split() with no arguments splits on any whitespace run and ignores the edges
  return '_'.join(str(string).lower().split())

print(list(delay_reasons))
print(list(all_delays))

['RMENU CODE', 'CODE DESCRIPTION', 'SUB OR SRT']
['Date', 'Time', 'Day', 'Station', 'Code', 'Min Delay', 'Min Gap', 'Bound', 'Line', 'Vehicle']


In [102]:
delay_reasons = delay_reasons.rename(columns=clean_names)
all_delays = all_delays.rename(columns=clean_names)

In [103]:
print(list(delay_reasons))
print(list(all_delays))

['rmenu_code', 'code_description', 'sub_or_srt']
['date', 'time', 'day', 'station', 'code', 'min_delay', 'min_gap', 'bound', 'line', 'vehicle']
