In [1]:
%load_ext autoreload
%autoreload 2

In [20]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import learning_curve, GridSearchCV

In [3]:
from ml_logic.data_import import get_divvy_data, get_weather_data
from ml_logic.cleaning import merge_divvy_weather, cleaning_divvy, weather_cleaning
from ml_logic.cleaning import station_stats, features_target
from ml_logic.preprocessor import preprocess_features, target_process
from ml_logic.main import preprocess_test, preprocess

In [4]:
year=os.environ.get('DIVVY_YEAR')
quarter=os.environ.get('DIVVY_QUARTER')

In [5]:
year

'2021'

In [7]:
weather_df=get_weather_data()
weather_df.head()

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1356998400,2013-01-01 00:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-2.87,10000.0,-7.38,-7.9,...,,,,,,100,804,Clouds,overcast clouds,04n
1,1357002000,2013-01-01 01:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-3.12,10000.0,-7.45,-7.35,...,,,,,,100,804,Clouds,overcast clouds,04n
2,1357005600,2013-01-01 02:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-3.12,10000.0,-7.45,-6.83,...,,,,,,100,804,Clouds,overcast clouds,04n
3,1357009200,2013-01-01 03:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-2.87,10000.0,-7.72,-7.9,...,,,,,,100,804,Clouds,overcast clouds,04n
4,1357012800,2013-01-01 04:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-4.17,10000.0,-9.32,-10.57,...,,,,,,100,804,Clouds,overcast clouds,04n


In [8]:
trips_df=get_divvy_data(year,quarter)
trips_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,9DC7B962304CBFD8,electric_bike,2021-09-28 16:07:10,2021-09-28 16:09:54,,,,,41.89,-87.68,41.89,-87.67,casual
1,F930E2C6872D6B32,electric_bike,2021-09-28 14:24:51,2021-09-28 14:40:05,,,,,41.94,-87.64,41.98,-87.67,casual
2,6EF72137900BB910,electric_bike,2021-09-28 00:20:16,2021-09-28 00:23:57,,,,,41.81,-87.72,41.8,-87.72,casual
3,78D1DE133B3DBF55,electric_bike,2021-09-28 14:51:17,2021-09-28 15:00:06,,,,,41.8,-87.72,41.81,-87.72,casual
4,E03D4ACDCAEF6E00,electric_bike,2021-09-28 09:53:12,2021-09-28 10:03:44,,,,,41.88,-87.74,41.88,-87.71,casual


In [9]:
station_name = os.environ.get("DIVVY_STATION_NAME")
station_name

'Canal St & Adams St'

In [10]:
clean_divvy_df = cleaning_divvy(trips_df,station_name)
clean_weather_df = weather_cleaning(weather_df)
merged_df = merge_divvy_weather(clean_divvy_df, clean_weather_df)

In [11]:
clean_weather_df.head()

Unnamed: 0,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all,hourly_data
0,2013-01-01 00:00:00,-2.87,1018,68,4.12,300,100,2013-01-01 00:00:00
1,2013-01-01 01:00:00,-3.12,1019,69,3.1,310,100,2013-01-01 01:00:00
2,2013-01-01 02:00:00,-3.12,1019,69,2.6,290,100,2013-01-01 02:00:00
3,2013-01-01 03:00:00,-2.87,1019,66,4.12,360,100,2013-01-01 03:00:00
4,2013-01-01 04:00:00,-4.17,1020,64,5.7,330,100,2013-01-01 04:00:00


In [12]:
clean_divvy_df

Unnamed: 0_level_0,nb_departures,nb_arrivals,ratio
hourly_data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-07-01 01:00:00,5.0,2.0,2.5
2021-07-01 02:00:00,1.0,5.0,0.2
2021-07-01 05:00:00,1.0,1.0,1.0
2021-07-01 06:00:00,2.0,2.0,1.0
2021-07-01 07:00:00,5.0,2.0,2.5
...,...,...,...
2021-09-27 10:00:00,0.0,3.0,0.0
2021-09-29 06:00:00,0.0,4.0,0.0
2021-09-30 10:00:00,0.0,5.0,0.0
2021-09-30 15:00:00,0.0,5.0,0.0


In [13]:
merged_df

Unnamed: 0,hourly_data,nb_departures,nb_arrivals,ratio,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all
0,2021-07-01 01:00:00,5.0,2.0,2.5,2021-07-01 01:00:00,23.42,1002,82,2.57,190,75
1,2021-07-01 02:00:00,1.0,5.0,0.2,2021-07-01 02:00:00,22.00,1002,81,0.00,0,75
2,2021-07-01 05:00:00,1.0,1.0,1.0,2021-07-01 05:00:00,20.95,1001,85,0.00,0,75
3,2021-07-01 06:00:00,2.0,2.0,1.0,2021-07-01 06:00:00,20.94,1006,85,3.13,38,100
4,2021-07-01 07:00:00,5.0,2.0,2.5,2021-07-01 07:00:00,20.48,1006,84,2.06,320,100
...,...,...,...,...,...,...,...,...,...,...,...
1660,2021-09-27 10:00:00,0.0,3.0,0.0,2021-09-27 10:00:00,19.41,1010,67,3.60,210,20
1661,2021-09-29 06:00:00,0.0,4.0,0.0,2021-09-29 06:00:00,16.77,1007,86,2.57,80,0
1662,2021-09-30 10:00:00,0.0,5.0,0.0,2021-09-30 10:00:00,18.26,1010,74,0.89,199,97
1663,2021-09-30 15:00:00,0.0,5.0,0.0,2021-09-30 15:00:00,21.92,1022,54,2.06,110,75


### Selecting the two targets

In [14]:
target_departures="nb_departures"
target_arrivals="nb_arrivals"

### Building X_departures and X_arrivals and the corresponding targets

In [15]:
X_dep, y_dep=features_target(merged_df,target_departures)
X_arr, y_arr=features_target(merged_df,target_arrivals)

In [16]:
preprocessor_dep,X_dep_processed=preprocess_features(X_dep)
preprocessor_arr,X_arr_processed=preprocess_features(X_arr)

In [17]:
preprocessor_dep

In [18]:
preprocessor_arr

In [21]:
pkl.dump(preprocessor_dep,open('preprocessors/preprocessor_dep.pickle','wb'))
pkl.dump(preprocessor_arr,open('preprocessors/preprocessor_arr.pickle','wb'))

In [22]:
loaded_pp=pkl.load(open('preprocessors/preprocessor_dep.pickle','rb'))

In [23]:
loaded_pp

In [24]:
loaded_pp=pkl.load(open('preprocessors/preprocessor_arr.pickle','rb'))

In [13]:
X_dep_processed.shape

(1843, 27)

### Building X_tests and their corresponding targets

In [25]:
X_test_dep_pro,y_test_dep_pro=preprocess_test(preprocessor_dep,target_departures)
X_test_arr_pro,y_test_arr_pro=preprocess_test(preprocessor_arr,target_arrivals)

Test Raw data imported
Test Data cleaned and merged
Test features and target dataframes created
nb_departures picked as target
Preprocessing of test set is done
Test Raw data imported
Test Data cleaned and merged
Test features and target dataframes created
nb_arrivals picked as target
Preprocessing of test set is done


### Separating X (departures/arrivals) into training and validation splits

In [26]:
X_dep_train,X_dep_val,y_dep_train,y_dep_val=train_test_split(X_dep_processed,
                                                             y_dep,
                                                             test_size=0.2,
                                                             random_state=1)
X_arr_train,X_arr_val,y_arr_train,y_arr_val=train_test_split(X_arr_processed,
                                                             y_arr,
                                                             test_size=0.2,
                                                             random_state=1)

## LINEAR REGRESSION

In [33]:
model_lr=LinearRegression()
cv_results=cross_validate(model_lr,X_dep_train,y_dep_train,
                          cv=5,scoring=('r2','neg_mean_absolute_error'))

In [34]:
cv_results

{'fit_time': array([0.00283694, 0.00123644, 0.00116301, 0.00224423, 0.00112391]),
 'score_time': array([0.000875  , 0.00065279, 0.00060391, 0.00060487, 0.00059605]),
 'test_r2': array([0.33967909, 0.2973448 , 0.30822434, 0.36486714, 0.27413846]),
 'test_neg_mean_absolute_error': array([-2.72428496, -3.15933197, -2.95667373, -3.09356462, -3.13549971])}

In [35]:
r2_lr=round(cv_results['test_r2'].mean(),5)
mse_lr=round(cv_results['test_neg_mean_absolute_error'].mean(),5)
r2_lr,mse_lr

(0.31685, -3.01387)

## ELASTIC NET

In [27]:
model_en=ElasticNet()
grid={'alpha':[0.001,0.01,0.1,1], 
      'l1_ratio':[0.1,0.3,0.5,0.7,0.9]}
search=GridSearchCV(model_en, 
                    grid, 
                    scoring='neg_mean_absolute_error',
                    cv=5,
                    n_jobs=-1)
search.fit(X_dep_train,y_dep_train)

In [28]:
search.best_score_

-3.2201354094220136

In [29]:
search.best_estimator_

In [30]:
search.score(X_test_dep_pro,y_test_dep_pro)

-2.66902946184735

In [34]:
model_en2.coef_.shape

(27,)

In [35]:
model_en2.intercept_

3.7713070521021126

In [31]:
search.predict(X_test_dep_pro)

array([3.3251759 , 1.91447074, 2.21154524, ..., 3.7558994 , 4.35687372,
       3.38739273])

In [75]:
y_test_dep_pro

0       1.0
1       2.0
2       2.0
3       1.0
4       1.0
       ... 
1553    0.0
1554    0.0
1555    0.0
1556    0.0
1557    0.0
Name: nb_departures, Length: 1558, dtype: float64

In [33]:
model_en2=search.best_estimator_
params=model_en2.get_params
type(params)

method

In [40]:
pkl.dump(model_en2,open('models/elasticnet_departures.pickle','wb'))

In [126]:
loaded_model=pkl.load(open('models/elasticnet_departures.pickle','rb'))

In [111]:
savedmodel

b"\x80\x04\x95\xf1\x02\x00\x00\x00\x00\x00\x00\x8c(sklearn.linear_model._coordinate_descent\x94\x8c\nElasticNet\x94\x93\x94)\x81\x94}\x94(\x8c\x05alpha\x94G?\x84z\xe1G\xae\x14{\x8c\x08l1_ratio\x94G?\xb9\x99\x99\x99\x99\x99\x9a\x8c\rfit_intercept\x94\x88\x8c\tnormalize\x94\x8c\ndeprecated\x94\x8c\nprecompute\x94\x89\x8c\x08max_iter\x94M\xe8\x03\x8c\x06copy_X\x94\x88\x8c\x03tol\x94G?\x1a6\xe2\xeb\x1cC-\x8c\nwarm_start\x94\x89\x8c\x08positive\x94\x89\x8c\x0crandom_state\x94N\x8c\tselection\x94\x8c\x06cyclic\x94\x8c\x0en_features_in_\x94K\x1b\x8c\x07n_iter_\x94KB\x8c\x05coef_\x94\x8c\x15numpy.core.multiarray\x94\x8c\x0c_reconstruct\x94\x93\x94\x8c\x05numpy\x94\x8c\x07ndarray\x94\x93\x94K\x00\x85\x94C\x01b\x94\x87\x94R\x94(K\x01K\x1b\x85\x94h\x19\x8c\x05dtype\x94\x93\x94\x8c\x02f8\x94\x89\x88\x87\x94R\x94(K\x03\x8c\x01<\x94NNNJ\xff\xff\xff\xffJ\xff\xff\xff\xffK\x00t\x94b\x89C\xd8Te\xf3\x90O\x80\xa5\xbf\xac\xbc\xf3\x01\xf5\xff\xdf\xbf\xeaP\tY\xdc\x9a\xc9\xbfKj\x9f\xf7\x01\xc8\xae?\xefz\x88\x

In [112]:
model_5=pkl.loads(savedmodel)
model_5

In [113]:
model_5.coef_

array([-0.04199456, -0.49999738, -0.20003847,  0.06011969,  0.17876349,
        0.82849554, -0.2143444 ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        , -0.        ,  1.00545278, -0.44649741,
       -0.43675502,  0.        ,  0.        ,  0.        ,  1.6059971 ,
       -3.91844636,  0.01795492,  0.04694117, -0.26194761, -0.34984194,
       -0.08647541,  0.19291028])

In [114]:
model_5.predict(X_dep_processed)

array([2.27039977, 1.30637488, 1.01918442, ..., 0.78112077, 5.65673808,
       3.74884361])

In [115]:
model_en2.predict(X_dep_processed)

array([2.27039977, 1.30637488, 1.01918442, ..., 0.78112077, 5.65673808,
       3.74884361])

In [116]:
type(savedmodel)

bytes

In [36]:
model_arr=ElasticNet()
grid={'alpha':[0.001,0.01,0.1,1], 
      'l1_ratio':[0.1,0.3,0.5,0.7,0.9]}
search=GridSearchCV(model_arr, 
                    grid, 
                    scoring='neg_mean_absolute_error',
                    cv=5,
                    n_jobs=-1)
search.fit(X_arr_train,y_arr_train)

  model = cd_fast.enet_coordinate_descent(


In [37]:
model_2arr=search.best_estimator_
model_2arr

In [39]:
pkl.dump(model_2arr,open('models/elasticnet_arrivals.pickle','wb'))

In [46]:
departuress=model_en2.predict(X_dep_processed)[800]
departuress

6.562701150956581

In [45]:
arrivals=model_2arr.predict(X_arr_processed)[800]
arrivals

5.49496315877833

In [48]:
from interface_ui.flow.flow import get_station_availability
from ml_logic.model import availability

In [49]:
availability=availability(arrivals=arrivals,departures=departuress)

In [50]:
availability

1

## K NEIGHBORS REGRESSOR

In [28]:
model_knr=KNeighborsRegressor()
cv_results_knr=cross_validate(model_knr,X_dep_train,y_dep_train,
                              cv=5,scoring=('r2','neg_mean_absolute_error'))
cv_results_knr

{'fit_time': array([0.00117826, 0.00085592, 0.00074911, 0.00071573, 0.00073695]),
 'score_time': array([0.00346684, 0.00253081, 0.00226903, 0.0025022 , 0.00258398]),
 'test_r2': array([0.18897639, 0.20357965, 0.19231076, 0.26308964, 0.12342413]),
 'test_neg_mean_absolute_error': array([-3.12135593, -3.42508475, -3.28271186, -3.30779661, -3.57414966])}

In [29]:
r2_knr=cv_results_knr['test_r2'].mean()
mse_knr=cv_results_knr['test_neg_mean_absolute_error'].mean()
r2_knr,mse_knr

(0.19427611554647956, -3.342219762481264)

## LSTM

### First, we need to add more stations to a given model

In [37]:
## retrieving train data for Streeter Dr & Illinois St
X_dep_pro_st2, y_dep_pro_st2, preproc_st2= preprocess(target_departures)

Raw data imported
Data cleaned and merged
features and target dataframes created
features preprocessed
nb_departures picked as target
Preprocessing of Training set is done


In [38]:
## retrieving test data for Streeter Dr & Illinois St
X_test_dep_pro_st2,y_test_dep_pro_st2=preprocess_test(preproc_st2,target_departures)

Test Raw data imported
Test Data cleaned and merged
Test features and target dataframes created
nb_departures picked as target
Preprocessing of test set is done


In [39]:
X_dep_pro_st2.shape,X_dep_processed.shape

((1843, 27), (1843, 27))

In [46]:
## retrieving train data for Canal St & Adams St
X_dep_pro_st3, y_dep_pro_st3, preproc_st3= preprocess(target_departures)

Raw data imported
Data cleaned and merged
features and target dataframes created
features preprocessed
nb_departures picked as target
Preprocessing of Training set is done


In [47]:
X_dep_pro_st3.shape

(1843, 27)

In [48]:
X_dep_pro_st3.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.965926,0.258819,0.001727,0.435903,1.173797,-0.936658,-0.145222,0.694638
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.866025,0.5,-0.054222,-1.479921,1.039771,-0.67979,0.244702,0.694638
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.258819,0.965926,-0.530908,-1.607642,1.173797,-1.974205,-1.607436,0.694638
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.258819,0.965926,-0.609237,-0.969035,1.24081,-0.397738,-1.237008,1.420961
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.866025,-0.712183,-0.969035,1.173797,-0.936658,1.511954,1.420961


In [49]:
X_dep_pro_st2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.965926,0.258819,0.001727,0.435903,1.173797,-0.936658,-0.145222,0.694638
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.866025,0.5,-0.054222,-1.479921,1.039771,-0.67979,0.244702,0.694638
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.258819,0.965926,-0.530908,-1.607642,1.173797,-1.974205,-1.607436,0.694638
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.258819,0.965926,-0.609237,-0.969035,1.24081,-0.397738,-1.237008,1.420961
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.5,0.866025,-0.712183,-0.969035,1.173797,-0.936658,1.511954,1.420961


In [60]:
X_dep_pro_st2.to_csv('raw_data/data/X_dep_proc_st1',index=False)

In [61]:
y_dep_pro_st2.to_csv('raw_data/data/y_dep_proc_st1',index=False)