In [75]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import  cross_validate
from sklearn.base import BaseEstimator
from sklearn.metrics import (make_scorer, mean_absolute_error)

In [76]:
from sklearn import __version__
__version__


'0.21.1'

In [77]:
from _compute_median import _read_all_data

In [78]:
data = _read_all_data()

In [79]:
data.head(4)

Unnamed: 0,address_type,agency,agency_name,bbl,borough,city,closed_date,community_board,complaint_type,created_date,...,taxi_pick_up_location,unique_key,x_coordinate_state_plane,y_coordinate_state_plane,bridge_highway_direction,bridge_highway_name,bridge_highway_segment,landmark,road_ramp,vehicle_type
0,ADDRESS,DSNY,Department of Sanitation,4035140000.0,QUEENS,Ridgewood,NaT,05 QUEENS,Electronics Waste,2019-06-08 00:00:00,...,,42895926,1012550.0,195539.0,,,,,,
1,ADDRESS,NYPD,New York City Police Department,3052260000.0,BROOKLYN,BROOKLYN,NaT,14 BROOKLYN,Noise - Commercial,2019-06-08 00:00:09,...,,42903971,997657.0,171220.0,,,,,,
2,ADDRESS,NYPD,New York City Police Department,2023950000.0,BRONX,BRONX,2019-06-08 01:16:40,03 BRONX,Noise - Residential,2019-06-08 00:00:10,...,,42903748,1009513.0,242412.0,,,,,,
3,ADDRESS,NYPD,New York City Police Department,4003320000.0,QUEENS,ASTORIA,NaT,01 QUEENS,Noise - Residential,2019-06-08 00:00:10,...,,42902798,1002120.0,216630.0,,,,,,


In [80]:
data.columns

Index(['address_type', 'agency', 'agency_name', 'bbl', 'borough', 'city',
       'closed_date', 'community_board', 'complaint_type', 'created_date',
       'cross_street_1', 'cross_street_2', 'descriptor', 'due_date',
       'facility_type', 'incident_address', 'incident_zip',
       'intersection_street_1', 'intersection_street_2', 'latitude',
       'location', 'location_type', 'longitude', 'open_data_channel_type',
       'park_borough', 'park_facility_name', 'resolution_action_updated_date',
       'resolution_description', 'status', 'street_name',
       'taxi_company_borough', 'taxi_pick_up_location', 'unique_key',
       'x_coordinate_state_plane', 'y_coordinate_state_plane',
       'bridge_highway_direction', 'bridge_highway_name',
       'bridge_highway_segment', 'landmark', 'road_ramp', 'vehicle_type'],
      dtype='object')

In [81]:
features = ['complaint_type', 'latitude','longitude', 'created_date']

In [82]:
data['time_to_action'] = (data['resolution_action_updated_date'] - data['created_date']) # / pd.np.timedelta64(1, 'M')

In [83]:
data_ = data.loc[data.complaint_type.str.contains('Noise'), features + ['time_to_action']]
data_ = data_[data_.notnull().all(1)]

In [84]:
data_['time_to_action'] = (data_['time_to_action'].dt.seconds / 3600).astype(int)

In [85]:
y = data_['time_to_action']
X = data_.drop('time_to_action', axis=1)

In [86]:
len(X)

17626

# Little cleaning

In [87]:
X['complaint_type'].unique()

array(['Noise - Commercial', 'Noise - Residential',
       'Noise - Street/Sidewalk', 'Noise - Vehicle', 'Noise - Park',
       'Noise', 'Noise - House of Worship', 'Collection Truck Noise'],
      dtype=object)

In [88]:
proper_names = {
    'Noise - Commercial':'commercial', 
    'Noise - Residential':'residential',
    'Noise - Street/Sidewalk':'street',
    'Noise - Vehicle':'vehicle', 
    'Noise - Park':'park',
    'Noise':'other', 
    'Noise - House of Worship':'worship', 
    'Collection Truck Noise':'truck'
}

In [89]:
X['complaint_type'] = X['complaint_type'].map(proper_names)

In [90]:
X.head(5)

Unnamed: 0,complaint_type,latitude,longitude,created_date
1,commercial,40.636626,-73.951694,2019-06-08 00:00:09
2,residential,40.832004,-73.908709,2019-06-08 00:00:10
3,residential,40.761258,-73.935493,2019-06-08 00:00:10
8,residential,40.707498,-73.949284,2019-06-08 00:01:01
9,commercial,40.707686,-73.931992,2019-06-08 00:01:31


## Feature Generation

In [91]:
# class TimeTransformer(BaseEstimator):
#     cols = None
    
#     def __init__(self, cols=None):
#         self.cols = cols
    
#     def fit(self, X=None, y=None, groups=None):
        
#         if self.cols is None:
#             self.cols = X.select_dtypes(include=pd.np.datetime64).columns
#         return self
    
#     def transform(self, X, y=None, groups=None, cols=None):
        
#         for col in self.cols:
#             dates = X[col]
#             X = X.drop(col, axis=1)
#             X[f'{col}_dow'] = dates.dt.dayofweek
#             X[f'{col}_doy'] = dates.dt.dayofyear
#             X[f'{col}_tod'] = dates.dt.second

#         return X

from ml import TimeTransformer

In [92]:
t = TimeTransformer()

In [93]:
t.fit(X).transform(X).head(3)

Unnamed: 0,complaint_type,latitude,longitude,created_date_dow,created_date_doy,created_date_tod
1,commercial,40.636626,-73.951694,5,159,9
2,residential,40.832004,-73.908709,5,159,10
3,residential,40.761258,-73.935493,5,159,10


In [94]:
cats = X['complaint_type'].unique().tolist()

In [95]:
ct = ColumnTransformer(
        transformers=[
            ('ordinal', OrdinalEncoder(categories=[cats,]), ['complaint_type']),
            ('time', TimeTransformer(cols=['created_date']), ['created_date',])
        ], remainder='passthrough')

In [96]:
model = RandomForestRegressor(n_estimators=100)

In [97]:
pipe = Pipeline(steps=[('preprocessor', ct),
                        ('model', model)])

## Cross-validate

In [98]:
cv = cross_validate(pipe, X, y, cv=5, scoring=make_scorer(mean_absolute_error),
                    verbose=1, n_jobs=3)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:    9.3s finished


In [99]:
pd.DataFrame(cv)

Unnamed: 0,fit_time,score_time,test_score
0,3.798304,0.06316,2.568024
1,3.818128,0.057942,3.049546
2,3.827793,0.065426,2.80146
3,3.648316,0.054539,2.683843
4,3.691697,0.061576,2.775557


In [100]:
pd.DataFrame(cv)['test_score'].mean()

2.7756860646509987

## Train and store Model

In [101]:
pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('ordinal',
                                                  OrdinalEncoder(categories=[['commercial',
                                                                              'residential',
                                                                              'street',
                                                                              'vehicle',
                                                                              'park',
                                                                              'other',
                                                                              'worship',
                                                                     

In [102]:
pipe.predict(X.head(1))[0]

0.93

In [103]:
# from joblib import dump, load
import pickle

In [104]:
with open('./model.pkl', 'wb') as f:
    pickle.dump(pipe, f)


## Testing

In [105]:
singleton = pd.DataFrame([{'complaint_type':'dummy', 
                           'latitude':1.1111, 
                           'longitude':1.1111,
                           'created_date':pd.to_datetime('2019-01-01')}])

In [106]:
BODY = {
    'complaint_type': 'residential',
    'lat': "40.636626",
    'lon': "-73.951694",
    "date": "2019-06-08 00:00:09"
}

In [107]:
mapping = {
    'lon': 'longitude',
    'lat': 'latitude',
    'date': 'created_date'
}

dtypes = {
    'lon': float,
    'lat': float,
    'date': pd.to_datetime
}

In [108]:
singleton.loc[0, 'complaint_type'] = BODY['complaint_type']

for k, col in mapping.items():
    singleton.loc[0, col] = dtypes[k](BODY.get(k, pd.np.nan))

In [109]:
singleton

Unnamed: 0,complaint_type,created_date,latitude,longitude
0,residential,2019-06-08 00:00:09,40.636626,-73.951694


In [110]:
singleton.dtypes

complaint_type            object
created_date      datetime64[ns]
latitude                 float64
longitude                float64
dtype: object

In [111]:
X.dtypes

complaint_type            object
latitude                 float64
longitude                float64
created_date      datetime64[ns]
dtype: object

In [112]:
pipe.predict(singleton[['complaint_type', 'latitude', 'longitude','created_date']])[0]

0.82