In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [11]:
# read the trimmed Chicago dataset from its CSV file into a DataFrame
csv_file = '../datasets/chicago_trimmed_10000.csv'
df = pd.read_csv(filepath_or_buffer=csv_file)

In [12]:
# preview the first five rows of the freshly loaded frame
df.head(n=5)

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude
0,2010-10-03 11:30:00+00:00,BATTERY,SIMPLE,SIDEWALK,False,41.745809,-87.547926
1,2005-10-31 03:55:00+00:00,BATTERY,SIMPLE,SIDEWALK,False,41.751061,-87.569876
2,2003-10-07 02:30:00+00:00,BATTERY,SIMPLE,APARTMENT,True,41.751215,-87.571098
3,2009-01-22 02:30:00+00:00,BATTERY,SIMPLE,"SCHOOL, PUBLIC, BUILDING",False,41.73365,-87.557845
4,2020-07-03 12:15:00+00:00,BATTERY,SIMPLE,RESIDENCE,False,41.749243,-87.549214


In [13]:
# sanity check: list the distinct character lengths of the raw date strings
# (a single value in the result means every string has the same length)
pd.unique(df['date'].str.len())

array([25])

In [14]:
# keep only the first 10 characters of each date string (the calendar date,
# dropping the time-of-day and offset suffix), then parse that text into
# datetime values
df['date'] = pd.to_datetime(df['date'].str[:10])

In [15]:
# day of the week as an integer, Monday=0 ... Sunday=6
# (`.dt.weekday` is pandas' alias of `.dt.dayofweek`)
df['date_dayofweek'] = df['date'].dt.weekday

df.head()

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude,date_dayofweek
0,2010-10-03,BATTERY,SIMPLE,SIDEWALK,False,41.745809,-87.547926,6
1,2005-10-31,BATTERY,SIMPLE,SIDEWALK,False,41.751061,-87.569876,0
2,2003-10-07,BATTERY,SIMPLE,APARTMENT,True,41.751215,-87.571098,1
3,2009-01-22,BATTERY,SIMPLE,"SCHOOL, PUBLIC, BUILDING",False,41.73365,-87.557845,3
4,2020-07-03,BATTERY,SIMPLE,RESIDENCE,False,41.749243,-87.549214,4


In [17]:
# bucket every row by its day of the month:
#   day below 10        -> 0
#   day from 10 up to 19 -> 1
#   anything else        -> 2
day = df['date'].dt.day
df['time_of_month'] = np.where(day < 10, 0, np.where(day < 20, 1, 2))
df.head()

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude,date_dayofweek,time_of_month
0,2010-10-03,BATTERY,SIMPLE,SIDEWALK,False,41.745809,-87.547926,6,0
1,2005-10-31,BATTERY,SIMPLE,SIDEWALK,False,41.751061,-87.569876,0,2
2,2003-10-07,BATTERY,SIMPLE,APARTMENT,True,41.751215,-87.571098,1,0
3,2009-01-22,BATTERY,SIMPLE,"SCHOOL, PUBLIC, BUILDING",False,41.73365,-87.557845,3,2
4,2020-07-03,BATTERY,SIMPLE,RESIDENCE,False,41.749243,-87.549214,4,0


### Convert date to epoch timestamp
Docs: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#from-timestamps-to-epoch

In [21]:
# express each date as whole seconds elapsed since 1970-01-01 (UNIX epoch):
# subtract the epoch start to get a timedelta, then floor-divide by one second
epoch_col = (
    df['date']
    .sub(pd.Timestamp("1970-01-01"))
    .floordiv(pd.Timedelta('1s'))
)
# overwrite the datetime column with its integer epoch representation
df['date'] = epoch_col
epoch_col

0       1286064000
1       1130716800
2       1065484800
3       1232582400
4       1593734400
           ...    
9995    1331856000
9996    1172620800
9997    1253664000
9998    1266019200
9999    1365379200
Name: date, Length: 10000, dtype: int64

## Normalize date values
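Using `MinMaxScaler` from scikit-learn. If there are too many outliers it will not perform as well: every value is mapped into the range 0 to 1, so a few extreme values set the minimum and maximum and squeeze the remaining values into a narrow band.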

### Example of scaling values

In [22]:
def scale_df(df):
    """Min-max scale every column of ``df``.

    Each column is rescaled independently with scikit-learn's
    ``MinMaxScaler`` using its default settings, so a column's smallest
    value maps to 0 and its largest to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. Its columns must hold values the scaler can fit
        (numeric data).

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column labels as ``df`` holding the scaled
        values; ``df`` itself is not modified. The frame is rebuilt from the
        scaler's output, so under scikit-learn's default (array) output the
        original row index is not carried over.
    """
    # create a scaler object
    scaler = MinMaxScaler()

    # fit and transform the frame that was passed in; the function must not
    # depend on any notebook-level variable
    scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

    return scaled

In [24]:
# small two-column demo frame: a column that only takes the values 0.0 and 0.1
# (five of each) next to the integers 1 through 10
test_df = pd.DataFrame({
    'boolean': [0.0] * 5 + [0.1] * 5,
    'numbers': list(range(1, 11)),
})
# show the scaled version of the demo frame
scale_df(test_df)

Unnamed: 0,boolean,numbers
0,0.0,0.0
1,0.0,0.111111
2,0.0,0.222222
3,0.0,0.333333
4,0.0,0.444444
5,1.0,0.555556
6,1.0,0.666667
7,1.0,0.777778
8,1.0,0.888889
9,1.0,1.0
