In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import  cross_validate
from sklearn.base import BaseEstimator
from sklearn.metrics import (make_scorer, mean_absolute_error)

In [2]:
from _compute_median import _read_all_data

In [3]:
data = _read_all_data()

In [4]:
data.head(4)

Unnamed: 0,address_type,agency,agency_name,bbl,borough,city,closed_date,community_board,complaint_type,created_date,...,taxi_pick_up_location,unique_key,x_coordinate_state_plane,y_coordinate_state_plane,bridge_highway_direction,bridge_highway_name,bridge_highway_segment,landmark,road_ramp,vehicle_type
0,ADDRESS,DSNY,Department of Sanitation,4035140000.0,QUEENS,Ridgewood,NaT,05 QUEENS,Electronics Waste,2019-06-08 00:00:00,...,,42895926,1012550.0,195539.0,,,,,,
1,ADDRESS,NYPD,New York City Police Department,3052260000.0,BROOKLYN,BROOKLYN,NaT,14 BROOKLYN,Noise - Commercial,2019-06-08 00:00:09,...,,42903971,997657.0,171220.0,,,,,,
2,ADDRESS,NYPD,New York City Police Department,2023950000.0,BRONX,BRONX,2019-06-08 01:16:40,03 BRONX,Noise - Residential,2019-06-08 00:00:10,...,,42903748,1009513.0,242412.0,,,,,,
3,ADDRESS,NYPD,New York City Police Department,4003320000.0,QUEENS,ASTORIA,NaT,01 QUEENS,Noise - Residential,2019-06-08 00:00:10,...,,42902798,1002120.0,216630.0,,,,,,


In [5]:
data.columns

Index(['address_type', 'agency', 'agency_name', 'bbl', 'borough', 'city',
       'closed_date', 'community_board', 'complaint_type', 'created_date',
       'cross_street_1', 'cross_street_2', 'descriptor', 'due_date',
       'facility_type', 'incident_address', 'incident_zip',
       'intersection_street_1', 'intersection_street_2', 'latitude',
       'location', 'location_type', 'longitude', 'open_data_channel_type',
       'park_borough', 'park_facility_name', 'resolution_action_updated_date',
       'resolution_description', 'status', 'street_name',
       'taxi_company_borough', 'taxi_pick_up_location', 'unique_key',
       'x_coordinate_state_plane', 'y_coordinate_state_plane',
       'bridge_highway_direction', 'bridge_highway_name',
       'bridge_highway_segment', 'landmark', 'road_ramp', 'vehicle_type'],
      dtype='object')

In [6]:
features = ['complaint_type', 'latitude','longitude', 'created_date']

In [7]:
data['time_to_action'] = (data['resolution_action_updated_date'] - data['created_date']) # / pd.np.timedelta64(1, 'M')

In [8]:
data_ = data.loc[data.complaint_type.str.contains('Noise'), features + ['time_to_action']]
data_ = data_[data_.notnull().all(1)]

In [9]:
data_['time_to_action'] = (data_['time_to_action'].dt.seconds / 3600).astype(int)

In [10]:
y = data_['time_to_action']
X = data_.drop('time_to_action', axis=1)

In [11]:
len(X)

17626

# Little cleaning

In [12]:
X['complaint_type'].unique()

array(['Noise - Commercial', 'Noise - Residential',
       'Noise - Street/Sidewalk', 'Noise - Vehicle', 'Noise - Park',
       'Noise', 'Noise - House of Worship', 'Collection Truck Noise'],
      dtype=object)

In [13]:
proper_names = {
    'Noise - Commercial':'commercial', 
    'Noise - Residential':'residential',
    'Noise - Street/Sidewalk':'street',
    'Noise - Vehicle':'vehicle', 
    'Noise - Park':'park',
    'Noise':'other', 
    'Noise - House of Worship':'worship', 
    'Collection Truck Noise':'truck'
}

In [14]:
X['complaint_type'] = X['complaint_type'].map(proper_names)

In [15]:
X.head(5)

Unnamed: 0,complaint_type,latitude,longitude,created_date
1,commercial,40.636626,-73.951694,2019-06-08 00:00:09
2,residential,40.832004,-73.908709,2019-06-08 00:00:10
3,residential,40.761258,-73.935493,2019-06-08 00:00:10
8,residential,40.707498,-73.949284,2019-06-08 00:01:01
9,commercial,40.707686,-73.931992,2019-06-08 00:01:31


## Feature Generation

In [16]:
X.select_dtypes(include=pd.np.datetime64).columns

Index(['created_date'], dtype='object')

In [17]:
# class TimeTransformer(BaseEstimator):
#     cols = None
    
#     def __init__(self, cols=None):
#         self.cols = cols
    
#     def fit(self, X=None, y=None, groups=None):
        
#         if self.cols is None:
#             self.cols = X.select_dtypes(include=pd.np.datetime64).columns
#         return self
    
#     def transform(self, X, y=None, groups=None, cols=None):
        
#         for col in self.cols:
#             dates = X[col]
#             X = X.drop(col, axis=1)
#             X[f'{col}_dow'] = dates.dt.dayofweek
#             X[f'{col}_doy'] = dates.dt.dayofyear
#             X[f'{col}_tod'] = dates.dt.second

#         return X

from ml import TimeTransformer

In [18]:
t = TimeTransformer()

In [19]:
t.fit(X).transform(X).head(3)

Unnamed: 0,complaint_type,latitude,longitude,created_date_dow,created_date_doy,created_date_tod
1,commercial,40.636626,-73.951694,5,159,9
2,residential,40.832004,-73.908709,5,159,10
3,residential,40.761258,-73.935493,5,159,10


In [20]:
cats = X['complaint_type'].unique().tolist()

In [39]:
ct = ColumnTransformer(
        transformers=[
            ('ordinal', OrdinalEncoder(categories=[cats,]), ['complaint_type']),
            ('time', TimeTransformer(cols=['created_date']), ['created_date',])
        ], remainder='passthrough')

In [40]:
model = RandomForestRegressor(n_estimators=100)

In [41]:
pipe = Pipeline(steps=[('preprocessor', ct),
                        ('model', model)])

## Cross-validate

In [42]:
cv = cross_validate(pipe, X, y, cv=5, scoring=make_scorer(mean_absolute_error),
                    verbose=1, n_jobs=3)

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:    9.5s finished


In [43]:
pd.DataFrame(cv)

Unnamed: 0,fit_time,score_time,test_score
0,4.048088,0.066592,2.560216
1,4.063643,0.063388,3.083713
2,4.051275,0.064837,2.8011
3,3.679384,0.055425,2.681519
4,3.748595,0.059142,2.781879


In [44]:
pd.DataFrame(cv)['test_score'].mean()

2.7816854239715223

## Train and store Model

In [45]:
pipe.fit(X, y)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('ordinal',
                                                  OrdinalEncoder(categories=[['commercial',
                                                                              'residential',
                                                                              'street',
                                                                              'vehicle',
                                                                              'park',
                                                                              'other',
                                                                              'worship',
                                                                     

In [46]:
pipe.predict(X.head(1))[0]

0.93

In [53]:
pipe.steps[0][1].transformers[0][1]

OrdinalEncoder(categories=[['commercial', 'residential', 'street', 'vehicle',
                            'park', 'other', 'worship', 'truck']],
               dtype=<class 'numpy.float64'>)

In [48]:
from joblib import dump, load

In [49]:
# with open('./model.pkl', 'wb') as f:
#     pickle.dump(pipe, f)
dump(pipe, 'model.joblib') 

['model.joblib']