In [1]:
pwd

'/home/gdolle/code/G-Dolle/DIVVY_BIKE/notebooks'

In [2]:
cd '/home/gdolle/code/G-Dolle/DIVVY_BIKE'

/home/gdolle/code/G-Dolle/DIVVY_BIKE


In [3]:
import pandas as pd
import numpy as np
import requests
import urllib.parse
import os
from sklearn.neighbors import NearestNeighbors
from datetime import datetime
from ml_logic.cleaning import weather_cleaning

In [5]:
def chicago_weather_forecast():
    '''Return a 5-day weather forecast for the city of Chicago'''

    api_key= os.environ.get("WEATHER_API_KEY")

    
    BASE_URI="https://weather.lewagon.com"
    url=urllib.parse.urljoin(BASE_URI, "/data/2.5/forecast")
    forecasts=requests.get(url, params={'lat': 41.87, 'lon': -87.62, 'units': 'metric', 'appid':api_key}).json()['list']
    for_list = list(forecasts)
    return for_list

In [6]:
forecasts = chicago_weather_forecast()

In [7]:
type(forecasts)

list

In [9]:
len(forecasts)

40

In [5]:
def convert_chicago_forecast_todf(forecasts:list):
    """
    Return a pre-preprocessed dataframe for the 5-day
    weather forecast of Chicago

    """

    def rename_keys(dico,new_keys):
        '''
        Allows to replace keys' names in a dictionary with new ones
        '''
        tmp = dict( zip( list(dico.keys()), new_keys) )
        result = {tmp[oldK]: value for oldK, value in dico.items()}

        return result

    def slice_cleaning(forecasts,slice):
        '''
        Separates the different dictionaries within the json file returned by the
        weather API and concatenates these into a single dictionary
        '''
        one_obs = forecasts[slice]
        main = one_obs["main"]
        weather = one_obs["weather"][0]
        clouds = one_obs["clouds"]
        wind = one_obs["wind"]

        dt_txt = one_obs["dt_txt"]
        visibility = one_obs["visibility"]


        new_keys_weather = ["weather_id","weather_main","weather_description","weather_icon"]
        new_keys_cloud =["clouds_all"]
        new_keys_wind =['wind_speed', 'wind_deg','wind_gust']

        weather_clean = rename_keys(weather, new_keys_weather)
        clouds_clean = rename_keys(clouds, new_keys_cloud)
        wind_clean = rename_keys(wind, new_keys_wind)

        dall = {}
        dall["dt_iso"] = dt_txt
        for d in [main, weather_clean, wind_clean ,clouds_clean]:
            dall.update(d)

        dall["visibility"] = visibility

        return dall

    # Storing these dictionaries into a list
    list_of_slices=[]

    for i in range(0,len(forecasts)):

        dall = slice_cleaning(forecasts,i)

        list_of_slices.append(dall)

    # Converting this list of dictionaries into a dataframe
    forecast_df = pd.DataFrame.from_dict(list_of_slices)

    return forecast_df

In [6]:
def clean_forecast(df):
    '''
    returns a cleaned weather forecast dataframe
    '''
    cleaned_df = weather_cleaning(df)

    return cleaned_df

def get_right_forecast(departure_date,departure_time,df):
    """
    Return the closest hourly weather forecast to the date and time inputs
    provided by the end-user
    """

    full_time_input= datetime.datetime.combine(departure_date,departure_time)
    df["user_input"] = pd.to_datetime(full_time_input)
    df["date_input"]=df["user_input"].dt.date
    df["date_weather"]=df["hourly_data"].dt.date

    df_reduc = df[df["date_weather"]==df["date_input"]]


    df_reduc["time_diff"] = df_reduc["user_input"] - df_reduc["hourly_data"]
    df_reduc=df_reduc[df_reduc["time_diff"]>pd.Timedelta(0)]
    cond = df_reduc["time_diff"].min()
    new_data = df_reduc[df_reduc["time_diff"]==cond]
    new_data.drop(columns=["user_input","date_input","date_weather","time_diff"], inplace=True)

    return new_data


# Workflow

## Current shape of predict dataframe

In [7]:
forecasts = chicago_weather_forecast()

In [8]:
forecast_df = convert_chicago_forecast_todf(forecasts)

In [9]:
cleaned_df = clean_forecast(forecast_df)

In [10]:
import datetime

departure_date = datetime.date(2022, 12, 14)
departure_time = datetime.time(12, 58, 0)

print(departure_date)
print(departure_time)

2022-12-14
12:58:00


In [11]:
final_df = get_right_forecast(departure_date,departure_time,cleaned_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduc["time_diff"] = df_reduc["user_input"] - df_reduc["hourly_data"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data.drop(columns=["user_input","date_input","date_weather","time_diff"], inplace=True)


In [12]:
final_df.shape

(1, 8)

In [13]:
final_df.head()

Unnamed: 0,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all,hourly_data
9,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00


## Get shape of training set before pipeline preprocessing

In [14]:
import os
import pandas as pd
import numpy as np
import math

from divvy.ml_logic.data_import import get_weather_data, get_divvy_data
from divvy.ml_logic.cleaning import compute_geohash_stations,weather_cleaning, cleaning_divvy_gen,cleaning_divvy_gen_agg, merge_divvy_weather, features_target
from divvy.ml_logic.preprocessor import transform_time_features, preprocess_features, target_process


In [15]:
# Import data
target_chosen="nb_arrivals"

quarter= os.environ.get("DIVVY_QUARTER")
year= os.environ.get("DIVVY_YEAR")

raw_divvy_df = get_divvy_data(year,quarter)
raw_weather_df = get_weather_data()

print("Raw data imported")

# Clean data & merge data

clean_divvy_df = cleaning_divvy_gen_agg(raw_divvy_df)
clean_weather_df = weather_cleaning(raw_weather_df)

merged_df = merge_divvy_weather(clean_divvy_df, clean_weather_df)

print("Data cleaned and merged")

# Create features and target dataframes

X, y = features_target(merged_df, target_chosen)

Raw data imported


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stations_reduced.rename(columns={"name":"station_name"}, inplace=True)


Data cleaned and merged


In [16]:
X.shape

(7204, 9)

In [17]:
X.head(5)

Unnamed: 0,geohash,hourly_data,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all
0,dp3s,2021-07-01 04:00:00,2021-07-01 04:00:00,21.29,1001,84,0.0,0,75
1,dp3s,2021-07-01 17:00:00,2021-07-01 17:00:00,23.18,1018,62,7.2,20,20
2,dp3s,2021-07-01 18:00:00,2021-07-01 18:00:00,23.36,1018,61,8.23,30,20
3,dp3s,2021-07-01 21:00:00,2021-07-01 21:00:00,22.44,1018,55,8.75,30,20
4,dp3s,2021-07-01 23:00:00,2021-07-01 23:00:00,20.64,1019,62,7.72,40,20


In [18]:
X.geohash.nunique()

5

In [20]:
geohash_df = X[["geohash"]]
geohash_df.shape

(7204, 1)

In [21]:
geohash_df = geohash_df.drop_duplicates()
geohash_df.shape

(5, 1)

In [22]:
geohash_df['key'] = 0
final_df['key'] = 0

predict_geohash = geohash_df.merge(final_df, on='key', how='outer')

In [23]:
predict_geohash

Unnamed: 0,geohash,key,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all,hourly_data
0,dp3s,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00
1,dp3t,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00
2,dp3v,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00
3,dp3w,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00
4,dp3x,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00


In [27]:
X.head(2)

Unnamed: 0,geohash,hourly_data,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all
0,dp3s,2021-07-01 04:00:00,2021-07-01 04:00:00,21.29,1001,84,0.0,0,75
1,dp3s,2021-07-01 17:00:00,2021-07-01 17:00:00,23.18,1018,62,7.2,20,20


In [24]:
preprocessor, X_processed_df = preprocess_features(X)

In [25]:
X_processed_df.shape

(7204, 32)

In [26]:
X_processed_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.594856,1.181886,-1.990979,-1.596945,0.66909,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.577405,-0.310341,1.643733,-1.401928,-0.918348,1.0,0.0,0.0,0.0,0.0


In [28]:
predict_geohash_transformed = preprocessor.transform(predict_geohash)

In [29]:
predict_geohash_transformed.shape

(5, 32)

In [32]:
pd.DataFrame(predict_geohash_transformed).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.189275,1.385372,3.314691,-0.436592,1.390653,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.189275,1.385372,3.314691,-0.436592,1.390653,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.189275,1.385372,3.314691,-0.436592,1.390653,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.189275,1.385372,3.314691,-0.436592,1.390653,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.189275,1.385372,3.314691,-0.436592,1.390653,0.0,0.0,0.0,0.0,1.0


# Updating the cleaning of the predict dataframe

In [37]:
def get_retained_geohash(y,q):
    
    raw_divvy_df = get_divvy_data(y,q)
    clean_divvy_df = cleaning_divvy_gen_agg(raw_divvy_df)


    geohash_df = clean_divvy_df[["geohash"]]
    geohash_df = geohash_df.drop_duplicates()

    return geohash_df

In [38]:
geohash_df = get_retained_geohash(year,quarter)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stations_reduced.rename(columns={"name":"station_name"}, inplace=True)


In [39]:
geohash_df.head()

Unnamed: 0,geohash
0,dp3s
191,dp3t
2368,dp3v
2645,dp3w
4854,dp3x


In [40]:
def get_right_forecast_new(departure_date,departure_time,df,geohash_df):
    """
    Return the closest hourly weather forecast to the date and time inputs
    provided by the end-user
    """

    full_time_input= datetime.datetime.combine(departure_date,departure_time)
    df["user_input"] = pd.to_datetime(full_time_input)
    df["date_input"]=df["user_input"].dt.date
    df["date_weather"]=df["hourly_data"].dt.date

    df_reduc = df[df["date_weather"]==df["date_input"]]


    df_reduc["time_diff"] = df_reduc["user_input"] - df_reduc["hourly_data"]
    df_reduc=df_reduc[df_reduc["time_diff"]>pd.Timedelta(0)]
    cond = df_reduc["time_diff"].min()
    new_data = df_reduc[df_reduc["time_diff"]==cond]
    new_data.drop(columns=["user_input","date_input","date_weather","time_diff"], inplace=True)

    geohash_df['key'] = 0
    new_data['key'] = 0

    predict_geohash = geohash_df.merge(new_data, on='key', how='outer')

    return predict_geohash

In [43]:
pred_test = get_right_forecast_new(departure_date,departure_time,cleaned_df,geohash_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduc["time_diff"] = df_reduc["user_input"] - df_reduc["hourly_data"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data.drop(columns=["user_input","date_input","date_weather","time_diff"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data['key'] = 0


In [44]:
pred_test.head()

Unnamed: 0,geohash,key,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all,hourly_data
0,dp3s,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00
1,dp3t,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00
2,dp3v,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00
3,dp3w,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00
4,dp3x,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00


# Worklfow for the predict dataframe

In [1]:
pwd

'/home/gdolle/code/G-Dolle/DIVVY_BIKE/notebooks'

In [2]:
cd '/home/gdolle/code/G-Dolle/DIVVY_BIKE'

/home/gdolle/code/G-Dolle/DIVVY_BIKE


In [3]:
from divvy.interface_ui.flow.ui_utils import chicago_weather_forecast,convert_chicago_forecast_todf

from divvy.interface_ui.flow.ui_utils import clean_forecast, get_retained_geohash, get_right_forecast

from divvy.interface_ui.flow.ui_utils import predict_set_cleaning

In [4]:
quarter= os.environ.get("DIVVY_QUARTER")
year= os.environ.get("DIVVY_YEAR")

In [5]:
import datetime

departure_date = datetime.date(2022, 12, 14)
departure_time = datetime.time(12, 58, 0)

In [6]:
'''
def predict_set_cleaning(y,q):
    
    forecasts = chicago_weather_forecast()
    forecast_df = convert_chicago_forecast_todf(forecasts)
    cleaned_df = clean_forecast(forecast_df)

    geohash_df = get_retained_geohash(y,q)

    predict_geohash = get_right_forecast(departure_date,
                                      departure_time,
                                      cleaned_df,
                                      geohash_df)
    
    predict_geohash = predict_geohash.drop(columns=["key"])
    return predict_geohash
'''


'\ndef predict_set_cleaning(y,q):\n    \n    forecasts = chicago_weather_forecast()\n    forecast_df = convert_chicago_forecast_todf(forecasts)\n    cleaned_df = clean_forecast(forecast_df)\n\n    geohash_df = get_retained_geohash(y,q)\n\n    predict_geohash = get_right_forecast(departure_date,\n                                      departure_time,\n                                      cleaned_df,\n                                      geohash_df)\n    \n    predict_geohash = predict_geohash.drop(columns=["key"])\n    return predict_geohash\n'

In [8]:
pred_test_new = predict_set_cleaning(year,quarter,departure_date, departure_time)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stations_reduced.rename(columns={"name":"station_name"}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_reduc["time_diff"] = df_reduc["user_input"] - df_reduc["hourly_data"]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_data.drop(columns=["user_input","date_input","date_weather","time_diff"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .lo

In [9]:
pred_test_new.head()

Unnamed: 0,geohash,key,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all,hourly_data
0,dp3s,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00
1,dp3t,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00
2,dp3v,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00
3,dp3w,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00
4,dp3x,0,2022-12-14 12:00:00,4.22,1012,87,10.51,119,100,2022-12-14 12:00:00
