In [1]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up before each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import pandas as pd
import numpy as np
import pickle as pkl
from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import learning_curve, GridSearchCV

In [6]:
from divvy.ml_logic.data_import import get_divvy_data, get_weather_data, get_station_data
from divvy.ml_logic.cleaning import cleaning_divvy_gen,cleaning_divvy_gen_agg,weather_cleaning
from divvy.ml_logic.cleaning import features_target, merge_divvy_weather, compute_geohash_stations
from divvy.ml_logic.preprocessor import preprocess_features, target_process
from divvy.ml_logic.main import preprocess_test, preprocess
from divvy.interface_ui.flow.flow import transform_user_inputs,get_station_availability
from divvy.interface_ui.flow.ui_utils import process_weather_inputs,get_coordinates
from divvy.ml_logic.model import initialize_model

In [7]:
# Dataset selectors are read from the environment; each is None when its variable is unset.
year,quarter=(os.environ.get(key) for key in ('DIVVY_YEAR','DIVVY_QUARTER'))

In [8]:
# Show the year value that was read from the environment.
year

'2021'

In [9]:
# Load the raw weather table through the project helper and preview its first rows.
weather_df=get_weather_data()
weather_df.head()

Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,1356998400,2013-01-01 00:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-2.87,10000.0,-7.38,-7.9,...,,,,,,100,804,Clouds,overcast clouds,04n
1,1357002000,2013-01-01 01:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-3.12,10000.0,-7.45,-7.35,...,,,,,,100,804,Clouds,overcast clouds,04n
2,1357005600,2013-01-01 02:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-3.12,10000.0,-7.45,-6.83,...,,,,,,100,804,Clouds,overcast clouds,04n
3,1357009200,2013-01-01 03:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-2.87,10000.0,-7.72,-7.9,...,,,,,,100,804,Clouds,overcast clouds,04n
4,1357012800,2013-01-01 04:00:00 +0000 UTC,-21600,Chicago,41.878114,-87.629798,-4.17,10000.0,-9.32,-10.57,...,,,,,,100,804,Clouds,overcast clouds,04n


In [10]:
# Load the raw trip records for the selected year/quarter and preview its first rows.
trips_df=get_divvy_data(year,quarter)
trips_df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,9DC7B962304CBFD8,electric_bike,2021-09-28 16:07:10,2021-09-28 16:09:54,,,,,41.89,-87.68,41.89,-87.67,casual
1,F930E2C6872D6B32,electric_bike,2021-09-28 14:24:51,2021-09-28 14:40:05,,,,,41.94,-87.64,41.98,-87.67,casual
2,6EF72137900BB910,electric_bike,2021-09-28 00:20:16,2021-09-28 00:23:57,,,,,41.81,-87.72,41.8,-87.72,casual
3,78D1DE133B3DBF55,electric_bike,2021-09-28 14:51:17,2021-09-28 15:00:06,,,,,41.8,-87.72,41.81,-87.72,casual
4,E03D4ACDCAEF6E00,electric_bike,2021-09-28 09:53:12,2021-09-28 10:03:44,,,,,41.88,-87.74,41.88,-87.71,casual


In [11]:
# Station of interest, taken from the environment (None when the variable is unset).
station_name=os.getenv("DIVVY_STATION_NAME")
station_name

'Canal St & Adams St'

In [12]:
# Aggregate the trip records, clean the weather table, then combine both into one frame.
# NOTE(review): the recorded output of this cell shows a pandas SettingWithCopyWarning coming
# from `df_stations_reduced.rename(..., inplace=True)` in the project code — worth fixing there
# (e.g. by assigning the result of a non-inplace rename).
clean_divvy_df = cleaning_divvy_gen_agg(trips_df)
clean_weather_df = weather_cleaning(weather_df)
merged_df = merge_divvy_weather(clean_divvy_df, clean_weather_df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stations_reduced.rename(columns={"name":"station_name"}, inplace=True)


In [13]:
# Preview the cleaned weather table.
clean_weather_df.head()

Unnamed: 0,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all,hourly_data
0,2013-01-01 00:00:00,-2.87,1018,68,4.12,300,100,2013-01-01 00:00:00
1,2013-01-01 01:00:00,-3.12,1019,69,3.1,310,100,2013-01-01 01:00:00
2,2013-01-01 02:00:00,-3.12,1019,69,2.6,290,100,2013-01-01 02:00:00
3,2013-01-01 03:00:00,-2.87,1019,66,4.12,360,100,2013-01-01 03:00:00
4,2013-01-01 04:00:00,-4.17,1020,64,5.7,330,100,2013-01-01 04:00:00


In [14]:
# Inspect the aggregated trips (one row per geohash and hour in the recorded output).
# NOTE(review): the output shows `ratio` = inf on rows where nb_arrivals is 0 — presumably
# departures/arrivals; confirm this column is excluded or sanitised before modelling.
clean_divvy_df

Unnamed: 0,geohash,hourly_data,nb_departures,nb_arrivals,ratio
0,dp3sy,2021-07-01 04:00:00,1.0,1.0,1.0
1,dp3sy,2021-07-01 17:00:00,1.0,0.0,inf
2,dp3sy,2021-07-01 18:00:00,1.0,2.0,0.5
3,dp3sy,2021-07-01 21:00:00,1.0,0.0,inf
4,dp3sy,2021-07-01 23:00:00,4.0,1.0,4.0
...,...,...,...,...,...
52541,dp3xk,2021-09-28 07:00:00,0.0,1.0,0.0
52542,dp3xk,2021-09-29 02:00:00,0.0,1.0,0.0
52543,dp3xk,2021-09-30 07:00:00,0.0,1.0,0.0
52544,dp3xk,2021-09-30 20:00:00,0.0,4.0,0.0


In [15]:
# Build the project's model object and display it.
# NOTE(review): `model` is not referenced by any later cell visible in this notebook.
model=initialize_model()
model

In [18]:
# Check which estimator class initialize_model() returned.
type(model)

sklearn.ensemble._stacking.StackingRegressor

In [17]:
# Inspect the merged trips + weather frame.
merged_df

Unnamed: 0,geohash,hourly_data,nb_departures,nb_arrivals,ratio,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all
0,dp3sy,2021-07-01 04:00:00,1.0,1.0,1.0,2021-07-01 04:00:00,21.29,1001,84,0.00,0,75
1,dp3sy,2021-07-01 17:00:00,1.0,0.0,inf,2021-07-01 17:00:00,23.18,1018,62,7.20,20,20
2,dp3sy,2021-07-01 18:00:00,1.0,2.0,0.5,2021-07-01 18:00:00,23.36,1018,61,8.23,30,20
3,dp3sy,2021-07-01 21:00:00,1.0,0.0,inf,2021-07-01 21:00:00,22.44,1018,55,8.75,30,20
4,dp3sy,2021-07-01 23:00:00,4.0,1.0,4.0,2021-07-01 23:00:00,20.64,1019,62,7.72,40,20
...,...,...,...,...,...,...,...,...,...,...,...,...
52541,dp3xk,2021-09-28 07:00:00,0.0,1.0,0.0,2021-09-28 07:00:00,17.46,1014,81,4.63,30,0
52542,dp3xk,2021-09-29 02:00:00,0.0,1.0,0.0,2021-09-29 02:00:00,17.47,1007,79,2.57,80,0
52543,dp3xk,2021-09-30 07:00:00,0.0,1.0,0.0,2021-09-30 07:00:00,18.86,1012,78,1.54,100,40
52544,dp3xk,2021-09-30 20:00:00,0.0,4.0,0.0,2021-09-30 20:00:00,24.70,1022,47,4.63,90,75


### Selecting the two targets

In [19]:
# Column names of the two regression targets.
target_departures,target_arrivals="nb_departures","nb_arrivals"

### Building X_departures and X_arrivals and the corresponding targets

In [20]:
# Split the merged frame into a feature set and a target, once per target column.
# NOTE(review): merged_df contains inf values in `ratio`; verify features_target drops or cleans it.
X_dep, y_dep=features_target(merged_df,target_departures)
X_arr, y_arr=features_target(merged_df,target_arrivals)

In [None]:
# preprocess_features returns a (preprocessor, processed features) pair; build one per target.
# NOTE(review): this runs on the full feature set before the train/validation split below —
# if the preprocessor is fitted here, validation rows influence the fit; confirm that is acceptable.
preprocessor_dep,X_dep_processed=preprocess_features(X_dep)
preprocessor_arr,X_arr_processed=preprocess_features(X_arr)

In [None]:
# Display the departures preprocessor.
preprocessor_dep

In [None]:
# Display the arrivals preprocessor.
preprocessor_arr

In [None]:
# Persist both preprocessors. The `with` blocks close (and therefore flush) each file
# deterministically instead of leaving the handle to the garbage collector.
with open('preprocessors/preprocessor_dep.pickle','wb') as pickle_file:
    pkl.dump(preprocessor_dep,pickle_file)
with open('preprocessors/preprocessor_arr.pickle','wb') as pickle_file:
    pkl.dump(preprocessor_arr,pickle_file)

In [None]:
# Reload the departures preprocessor from disk; the context manager closes the file handle.
# Only unpickle files this project wrote itself — pickle can execute arbitrary code on load.
with open('preprocessors/preprocessor_dep.pickle','rb') as pickle_file:
    loaded_pp=pkl.load(pickle_file)

In [None]:
# Display the preprocessor that was just read back from disk.
loaded_pp

In [None]:
# Reload the arrivals preprocessor (this rebinds `loaded_pp`); the context manager closes the file.
# Only unpickle files this project wrote itself — pickle can execute arbitrary code on load.
with open('preprocessors/preprocessor_arr.pickle','rb') as pickle_file:
    loaded_pp=pkl.load(pickle_file)

In [None]:
# Dimensions of the processed departures feature set.
X_dep_processed.shape

### Building X_tests and their corresponding targets

In [None]:
# Build the processed test features and targets, passing each target's own preprocessor.
# NOTE(review): which raw data preprocess_test reads is not visible here — confirm it is
# disjoint from the training period.
X_test_dep_pro,y_test_dep_pro=preprocess_test(preprocessor_dep,target_departures)
X_test_arr_pro,y_test_arr_pro=preprocess_test(preprocessor_arr,target_arrivals)

### Separating X (departures/arrivals) into training and validation splits

In [None]:
# Hold out 20% of each processed set for validation; the fixed seed keeps both splits reproducible.
X_dep_train,X_dep_val,y_dep_train,y_dep_val=train_test_split(
    X_dep_processed,y_dep,test_size=0.2,random_state=1
)
X_arr_train,X_arr_val,y_arr_train,y_arr_val=train_test_split(
    X_arr_processed,y_arr,test_size=0.2,random_state=1
)

## LINEAR REGRESSION

In [None]:
# Baseline: 5-fold cross-validation of a plain linear regression on the departures training
# split, scored with R² and negative mean absolute error.
model_lr=LinearRegression()
cv_results=cross_validate(
    model_lr,
    X_dep_train,
    y_dep_train,
    cv=5,
    scoring=('r2','neg_mean_absolute_error'),
)

In [None]:
# Raw per-fold results returned by cross_validate.
cv_results

In [None]:
# The scorer is 'neg_mean_absolute_error' (negated so that higher is better). Flip the sign to
# report the real mean absolute error under an accurate name; it is MAE, not MSE.
r2_lr=round(cv_results['test_r2'].mean(),5)
mae_lr=round(-cv_results['test_neg_mean_absolute_error'].mean(),5)
mse_lr=-mae_lr  # backward-compatible alias: same (negative MAE) value the old name held
r2_lr,mae_lr

## ELASTIC NET

In [None]:
# Tune ElasticNet for departures: 5-fold grid search over `alpha` and `l1_ratio`,
# ranked by negative mean absolute error and run on all available cores.
model_en=ElasticNet()
grid=dict(
    alpha=[0.001,0.01,0.1,1],
    l1_ratio=[0.1,0.3,0.5,0.7,0.9],
)
search=GridSearchCV(
    estimator=model_en,
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)
search.fit(X_dep_train,y_dep_train)

In [None]:
# Mean cross-validated score of the best parameter set; with the scoring chosen for this
# search it is a negative MAE (closer to 0 is better).
search.best_score_

In [None]:
# The estimator refitted with the best parameters found by the grid search.
search.best_estimator_

In [None]:
# Score the refitted best estimator on the departures test set. Because the search was built
# with scoring='neg_mean_absolute_error', this value is a negative MAE, not R².
search.score(X_test_dep_pro,y_test_dep_pro)

In [None]:
# `model_en2` is only assigned in a later cell, so read the coefficients from the fitted
# search directly; this keeps the cell valid under Restart & Run All.
search.best_estimator_.coef_.shape

In [None]:
# `model_en2` is only assigned in a later cell, so read the intercept from the fitted
# search directly; this keeps the cell valid under Restart & Run All.
search.best_estimator_.intercept_

In [None]:
# Departure predictions of the best estimator on the test features.
search.predict(X_test_dep_pro)

In [None]:
# Test targets, shown for visual comparison with the predictions.
y_test_dep_pro

In [None]:
# Keep a handle on the tuned departures model and inspect its hyper-parameters.
# get_params must be called: without the parentheses `params` was the bound method itself,
# not the parameter dictionary.
model_en2=search.best_estimator_
params=model_en2.get_params()
params

In [None]:
# Persist the tuned departures model; the context manager closes (and flushes) the file.
with open('models/elasticnet_departures.pickle','wb') as pickle_file:
    pkl.dump(model_en2,pickle_file)

In [None]:
# Read the departures model back from disk; the context manager closes the file handle.
# Only unpickle files this project wrote itself — pickle can execute arbitrary code on load.
with open('models/elasticnet_departures.pickle','rb') as pickle_file:
    loaded_model=pkl.load(pickle_file)

In [None]:
# No visible cell assigned `savedmodel` before this point, so a fresh kernel raised NameError here.
# Serialise the tuned departures model to an in-memory bytes object (what pkl.loads expects)
# and show its size rather than dumping the raw bytes.
savedmodel=pkl.dumps(model_en2)
len(savedmodel)


In [None]:
# Rebuild an estimator from in-memory pickle data. pkl.loads needs a bytes-like object, so
# `savedmodel` must already hold pickle bytes (e.g. from pkl.dumps) when this cell runs.
model_5=pkl.loads(savedmodel)
model_5

In [None]:
# Coefficients of the estimator restored from bytes.
model_5.coef_

In [None]:
# Predictions of the restored estimator on the full processed departures set.
model_5.predict(X_dep_processed)

In [None]:
# Same predictions from the in-memory tuned model, for comparison with the restored copy.
model_en2.predict(X_dep_processed)

In [None]:
# Check the type of the serialised model object (pkl.loads requires a bytes-like object).
type(savedmodel)

In [None]:
# Same ElasticNet grid search as for departures, now fitted on the arrivals training split.
# `grid` and `search` are rebound here, so from this point on they refer to the arrivals search.
model_arr=ElasticNet()
grid=dict(
    alpha=[0.001,0.01,0.1,1],
    l1_ratio=[0.1,0.3,0.5,0.7,0.9],
)
search=GridSearchCV(
    estimator=model_arr,
    param_grid=grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
)
search.fit(X_arr_train,y_arr_train)

In [None]:
# Keep a handle on the tuned arrivals model (best estimator of the most recent search) and display it.
model_2arr=search.best_estimator_
model_2arr

In [None]:
# Persist the tuned arrivals model; the context manager closes (and flushes) the file.
with open('models/elasticnet_arrivals.pickle','wb') as pickle_file:
    pkl.dump(model_2arr,pickle_file)

In [None]:
# Predicted departures for one sample row (position 800 of the processed training features).
# NOTE(review): 800 is an arbitrary index, and the double "s" in `departuress` looks like a typo;
# the name is referenced by the availability cell, so rename both together if changed.
departuress=model_en2.predict(X_dep_processed)[800]
departuress

In [None]:
# Predicted arrivals for the same sample row (position 800), using the arrivals model.
arrivals=model_2arr.predict(X_arr_processed)[800]
arrivals

In [None]:
# Use the same `divvy.`-prefixed package paths as every other project import in this notebook;
# the un-prefixed forms only resolve when the kernel happens to run from inside the package.
# NOTE(review): these imports belong in the import cell at the top of the notebook.
from divvy.interface_ui.flow.flow import get_station_availability
from divvy.ml_logic.model import availability

In [None]:
availability=availability(arrivals=arrivals,departures=departuress)

In [None]:
availability

## K NEIGHBORS REGRESSOR

In [None]:
# 5-fold cross-validation of a default k-nearest-neighbours regressor on the departures
# training split, scored with R² and negative mean absolute error.
model_knr=KNeighborsRegressor()
cv_results_knr=cross_validate(
    model_knr,
    X_dep_train,
    y_dep_train,
    cv=5,
    scoring=('r2','neg_mean_absolute_error'),
)
cv_results_knr

In [None]:
# The scorer is 'neg_mean_absolute_error' (negated so that higher is better). Flip the sign to
# report the real mean absolute error under an accurate name; it is MAE, not MSE.
r2_knr=cv_results_knr['test_r2'].mean()
mae_knr=-cv_results_knr['test_neg_mean_absolute_error'].mean()
mse_knr=-mae_knr  # backward-compatible alias: same (negative MAE) value the old name held
r2_knr,mae_knr

## LSTM

### First, we need to add more stations to a given model

In [None]:
# Retrieve processed training data for Streeter Dr & Illinois St.
# NOTE(review): preprocess() receives only the target name, so the station is presumably taken
# from the environment (DIVVY_STATION_NAME) — the value shown earlier in this notebook is
# 'Canal St & Adams St'; confirm the variable was changed before this cell was run.
X_dep_pro_st2, y_dep_pro_st2, preproc_st2= preprocess(target_departures)

In [None]:
# Retrieve processed test data for Streeter Dr & Illinois St, reusing that station's preprocessor.
X_test_dep_pro_st2,y_test_dep_pro_st2=preprocess_test(preproc_st2,target_departures)

In [None]:
# Compare the dimensions of the second station's features with the original processed set.
X_dep_pro_st2.shape,X_dep_processed.shape

In [None]:
# Retrieve processed training data for Canal St & Adams St.
# NOTE(review): this call is identical to the one used for the previous station; unless the
# station is switched outside the notebook (presumably via DIVVY_STATION_NAME) between the
# two cells, both return the same data — verify.
X_dep_pro_st3, y_dep_pro_st3, preproc_st3= preprocess(target_departures)

In [None]:
# Dimensions of the third station's processed features.
X_dep_pro_st3.shape

In [None]:
# Preview the third station's processed features.
X_dep_pro_st3.head()

In [None]:
# Preview the second station's processed features.
X_dep_pro_st2.head()

In [None]:
# Write the processed features to CSV without the index column.
# NOTE(review): the variable is suffixed `st2` but the file is named `..._st1` and has no
# `.csv` extension — confirm the intended file name.
X_dep_pro_st2.to_csv('raw_data/data/X_dep_proc_st1',index=False)

In [None]:
# Write the matching targets to CSV without the index column.
# NOTE(review): the variable is suffixed `st2` but the file is named `..._st1` and has no
# `.csv` extension — confirm the intended file name.
y_dep_pro_st2.to_csv('raw_data/data/y_dep_proc_st1',index=False)