In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA

In [2]:
# load dataset and create copy
csv_file = '../../datasets/chicago_trimmed_with_police_data.csv'
df = pd.read_csv(csv_file)

In [3]:
df.head()

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude,year,population,officers,civilians
0,2010-10-03 11:30:00+00:00,BATTERY,SIMPLE,SIDEWALK,False,41.745809,-87.547926,2010,2833649,12515.0,803.0
1,2010-08-02 04:30:00+00:00,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,True,41.751936,-87.550992,2010,2833649,12515.0,803.0
2,2010-09-17 10:10:00+00:00,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,41.731268,-87.542923,2010,2833649,12515.0,803.0
3,2010-12-31 04:50:00+00:00,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,41.740451,-87.544181,2010,2833649,12515.0,803.0
4,2010-09-08 06:55:00+00:00,BATTERY,DOMESTIC BATTERY SIMPLE,PARKING LOT/GARAGE(NON.RESID.),False,41.736469,-87.54582,2010,2833649,12515.0,803.0


In [4]:
# check to see if all dates are the same length
df['date'].str.len().unique()

array([25])

In [5]:
# remove time
df['date'] = df['date'].str.slice(stop=10)

# convert str to datetime object
df['date'] = pd.to_datetime(df['date'])

In [6]:
df['date_day_of_week'] = df['date'].dt.dayofweek

df.head()

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude,year,population,officers,civilians,date_day_of_week
0,2010-10-03,BATTERY,SIMPLE,SIDEWALK,False,41.745809,-87.547926,2010,2833649,12515.0,803.0,6
1,2010-08-02,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,True,41.751936,-87.550992,2010,2833649,12515.0,803.0,0
2,2010-09-17,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,41.731268,-87.542923,2010,2833649,12515.0,803.0,4
3,2010-12-31,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,41.740451,-87.544181,2010,2833649,12515.0,803.0,4
4,2010-09-08,BATTERY,DOMESTIC BATTERY SIMPLE,PARKING LOT/GARAGE(NON.RESID.),False,41.736469,-87.54582,2010,2833649,12515.0,803.0,2


In [7]:
# if first cond is true take the first value from choice list
# if second cond is true take the second value from choice list
# else take default
day = df['date'].dt.day
df['time_of_month'] = np.select(condlist=[day < 10, day < 20], choicelist=[0,1], default=2)
df.head()

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude,year,population,officers,civilians,date_day_of_week,time_of_month
0,2010-10-03,BATTERY,SIMPLE,SIDEWALK,False,41.745809,-87.547926,2010,2833649,12515.0,803.0,6,0
1,2010-08-02,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,True,41.751936,-87.550992,2010,2833649,12515.0,803.0,0,0
2,2010-09-17,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,41.731268,-87.542923,2010,2833649,12515.0,803.0,4,1
3,2010-12-31,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,41.740451,-87.544181,2010,2833649,12515.0,803.0,4,2
4,2010-09-08,BATTERY,DOMESTIC BATTERY SIMPLE,PARKING LOT/GARAGE(NON.RESID.),False,41.736469,-87.54582,2010,2833649,12515.0,803.0,2,0


### Convert date to epoch timestamp
Docs: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#from-timestamps-to-epoch

In [8]:
# convert to UNIX/EPOCH
epoch_col = (df['date'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
df['date'] = epoch_col
epoch_col

0          1286064000
1          1280707200
2          1284681600
3          1293753600
4          1283904000
              ...    
6976325     978307200
6976326     978307200
6976327     983491200
6976328     998956800
6976329    1006819200
Name: date, Length: 6976330, dtype: int64

## Normalize date values
Using MinMaxScaler from scikitlearn. If there is too many outliers it will not perform as well because values is only between 0 and 1 

### Example of scaling values

In [9]:
def scale_df(df):
    # create a scaler object
    scaler = MinMaxScaler()

    # fit and transform the data
    scaled = pd.DataFrame(scaler.fit_transform(test_df), columns=test_df.columns)
    
    return scaled

In [10]:
# create test dataframe
test_df = pd.DataFrame({
    'boolean':[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0], 
    'numbers':[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
})
scale_df(test_df)

Unnamed: 0,boolean,numbers
0,0.0,0.0
1,0.0,0.111111
2,0.0,0.222222
3,0.0,0.333333
4,0.0,0.444444
5,1.0,0.555556
6,1.0,0.666667
7,1.0,0.777778
8,1.0,0.888889
9,1.0,1.0


## PCA

In [16]:

def apply_pca(df, variance=0.99):
    pca = PCA(variance)
    pca.fit(df)
    df_pca = pca_transform(df)
    return df_pca

In [17]:
df_pca = apply_pca(df)
df_pca

ValueError: could not convert string to float: 'BATTERY'

## Removing year from the police part of dataset

In [11]:
df = df.drop(['year'], axis=1)

In [12]:
df.head()

Unnamed: 0,date,primary_type,description,location_description,arrest,latitude,longitude,population,officers,civilians,date_day_of_week,time_of_month
0,1286064000,BATTERY,SIMPLE,SIDEWALK,False,41.745809,-87.547926,2833649,12515.0,803.0,6,0
1,1280707200,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,True,41.751936,-87.550992,2833649,12515.0,803.0,0,0
2,1284681600,BATTERY,DOMESTIC BATTERY SIMPLE,RESIDENCE,False,41.731268,-87.542923,2833649,12515.0,803.0,4,1
3,1293753600,BATTERY,DOMESTIC BATTERY SIMPLE,APARTMENT,False,41.740451,-87.544181,2833649,12515.0,803.0,4,2
4,1283904000,BATTERY,DOMESTIC BATTERY SIMPLE,PARKING LOT/GARAGE(NON.RESID.),False,41.736469,-87.54582,2833649,12515.0,803.0,2,0
