In [2]:
!pwd

/home/gdolle/code/G-Dolle/DIVVY_BIKE/notebooks


In [3]:
cd '/home/gdolle/code/G-Dolle/DIVVY_BIKE'

/home/gdolle/code/G-Dolle/DIVVY_BIKE


In [4]:
!pwd

/home/gdolle/code/G-Dolle/DIVVY_BIKE


In [5]:
%load_ext autoreload
%autoreload 2

In [5]:
import os
import pandas as pd
import numpy as np
import math
from datetime import date, time, datetime
import seaborn as sns

from ml_logic.data_import import get_weather_data, get_divvy_data
from ml_logic.cleaning import weather_cleaning, merge_divvy_weather, features_target
from ml_logic.preprocessor import transform_time_features, preprocess_features, target_process

from ml_logic.main import preprocess

In [6]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler

# Computing the number of departures and arrivals per hour for each station

## Number of departures

In [7]:

# Period of Divvy data to load, read from the environment.
# Either name is None when its variable is not set.
quarter, year = (
    os.environ.get(env_key) for env_key in ("DIVVY_QUARTER", "DIVVY_YEAR")
)


In [8]:


df = get_divvy_data(year,quarter)
df.head(2)


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,9DC7B962304CBFD8,electric_bike,2021-09-28 16:07:10,2021-09-28 16:09:54,,,,,41.89,-87.68,41.89,-87.67,casual
1,F930E2C6872D6B32,electric_bike,2021-09-28 14:24:51,2021-09-28 14:40:05,,,,,41.94,-87.64,41.98,-87.67,casual


In [9]:
df.start_station_name.value_counts()

Streeter Dr & Grand Ave               43006
Michigan Ave & Oak St                 22777
DuSable Lake Shore Dr & North Blvd    20214
Wells St & Concord Ln                 19964
Millennium Park                       19375
                                      ...  
California Ave & 36th St                  2
351                                       2
Sacramento Ave & Pershing Rd              1
Kildare Ave & Chicago Ave                 1
Archer Ave & 43rd St                      1
Name: start_station_name, Length: 769, dtype: int64

In [10]:

# Parse the ride timestamps in place, then bucket each ride into the
# nearest full hour (round, not floor) for departures and arrivals.
for ts_col in ("started_at", "ended_at"):
    df[ts_col] = pd.to_datetime(df[ts_col])

df["hourly_data_started"] = df["started_at"].dt.round("60min")
df["hourly_data_ended"] = df["ended_at"].dt.round("60min")


In [11]:

# Departure events: one row per ride, keyed by the start station and the
# rounded start hour. The mapping both selects and renames the columns.
departure_columns = {
    "start_station_name": "station_name",
    "start_station_id": "station_id",
    "hourly_data_started": "hourly_data",
}
df_departures = df[list(departure_columns)].rename(columns=departure_columns)

df_departures.head(2)

Unnamed: 0,station_name,station_id,hourly_data
0,,,2021-09-28 16:00:00
1,,,2021-09-28 14:00:00


In [12]:
df_departures["nb_departures"]=1

In [13]:

# Departures per (station, hour): count() tallies the nb_departures marker
# column inside each group.
station_hour_keys = ["station_name", "station_id", "hourly_data"]
departures_by_station_hour = df_departures.groupby(by=station_hour_keys)
df_dep_agg = departures_by_station_hour.count().reset_index()
df_dep_agg.head(10)


Unnamed: 0,station_name,station_id,hourly_data,nb_departures
0,2112 W Peterson Ave,KA1504000155,2021-07-01 06:00:00,1
1,2112 W Peterson Ave,KA1504000155,2021-07-01 09:00:00,1
2,2112 W Peterson Ave,KA1504000155,2021-07-01 10:00:00,1
3,2112 W Peterson Ave,KA1504000155,2021-07-01 15:00:00,2
4,2112 W Peterson Ave,KA1504000155,2021-07-01 20:00:00,1
5,2112 W Peterson Ave,KA1504000155,2021-07-02 07:00:00,1
6,2112 W Peterson Ave,KA1504000155,2021-07-02 09:00:00,1
7,2112 W Peterson Ave,KA1504000155,2021-07-02 11:00:00,2
8,2112 W Peterson Ave,KA1504000155,2021-07-02 12:00:00,1
9,2112 W Peterson Ave,KA1504000155,2021-07-02 21:00:00,1


In [14]:
from ml_logic.preprocessor import compute_geohash_stations

In [15]:
station_df = compute_geohash_stations(precision= 5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stations_reduced.rename(columns={"name":"station_name"}, inplace=True)


In [16]:
df_dep_agg_geohash = df_dep_agg.merge(station_df, how="left", on="station_name")

In [17]:
df_dep_agg_geohash

Unnamed: 0,station_name,station_id,hourly_data,nb_departures,geohash
0,2112 W Peterson Ave,KA1504000155,2021-07-01 06:00:00,1,dp3wu
1,2112 W Peterson Ave,KA1504000155,2021-07-01 09:00:00,1,dp3wu
2,2112 W Peterson Ave,KA1504000155,2021-07-01 10:00:00,1,dp3wu
3,2112 W Peterson Ave,KA1504000155,2021-07-01 15:00:00,2,dp3wu
4,2112 W Peterson Ave,KA1504000155,2021-07-01 20:00:00,1,dp3wu
...,...,...,...,...,...
578304,Yates Blvd & 93rd St,20237,2021-09-23 17:00:00,1,dp3tr
578305,Yates Blvd & 93rd St,20237,2021-09-25 22:00:00,1,dp3tr
578306,Yates Blvd & 93rd St,20237,2021-09-26 22:00:00,1,dp3tr
578307,Yates Blvd & 93rd St,20237,2021-09-27 20:00:00,1,dp3tr


In [21]:
df_dep_agg_geohash = df_dep_agg_geohash.drop(columns=["station_name","station_id"])

In [29]:
df_dep_final=df_dep_agg_geohash.groupby(by=["geohash",
                                        'hourly_data']).mean().reset_index()

In [32]:
df_dep_final.shape

(46474, 3)

In [33]:
def _hourly_counts_by_geohash(df, name_col, id_col, time_col, station_df, count_col):
    """Count rides per station and hour, then average those counts per geohash and hour.

    Reads `name_col`, `id_col` and `time_col` from `df` without modifying it.
    Returns a frame with the columns `geohash`, `hourly_data` and `count_col`.
    """
    events = df[[name_col, id_col]].rename(
        columns={name_col: "station_name", id_col: "station_id"}
    )
    # Rides are bucketed to the nearest full hour (round, not floor).
    events["hourly_data"] = pd.to_datetime(df[time_col]).dt.round("60min")
    events[count_col] = 1

    # One row per (station, hour); count() tallies the marker column.
    per_station = (
        events.groupby(by=["station_name", "station_id", "hourly_data"])
        .count()
        .reset_index()
    )

    # Attach each station's geohash, then discard the station identity.
    per_station = per_station.merge(station_df, how="left", on="station_name")
    per_station = per_station.drop(columns=["station_name", "station_id"])

    # mean(): the value per geohash is the average of its stations' counts.
    return per_station.groupby(by=["geohash", "hourly_data"]).mean().reset_index()


def cleaning_divvy_gen_agg(df):
    """Aggregate raw Divvy rides into hourly departures/arrivals per geohash.

    Parameters
    ----------
    df : pd.DataFrame
        Raw rides with the columns `started_at`, `ended_at`,
        `start_station_name`, `start_station_id`, `end_station_name`
        and `end_station_id`. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        Columns `geohash`, `hourly_data`, `nb_departures`, `nb_arrivals`
        and `ratio` (departures / arrivals). Rows come from an outer merge,
        so a missing side is filled with 0. As a consequence `ratio` is
        `inf` when there are departures but no arrivals, and NaN for 0/0.
    """
    # Station-name -> geohash lookup, shared by both aggregations.
    station_df = compute_geohash_stations(precision=5)

    df_dep_final = _hourly_counts_by_geohash(
        df, "start_station_name", "start_station_id", "started_at",
        station_df, "nb_departures",
    )
    df_arr_final = _hourly_counts_by_geohash(
        df, "end_station_name", "end_station_id", "ended_at",
        station_df, "nb_arrivals",
    )

    merge_ratio = pd.merge(
        df_dep_final,
        df_arr_final,
        how="outer",
        on=["hourly_data", "geohash"],
    )

    # A (geohash, hour) pair present on one side only has no events on the other.
    merge_ratio["nb_departures"] = merge_ratio["nb_departures"].fillna(0)
    merge_ratio["nb_arrivals"] = merge_ratio["nb_arrivals"].fillna(0)

    merge_ratio["ratio"] = merge_ratio["nb_departures"] / merge_ratio["nb_arrivals"]

    return merge_ratio


In [34]:
test_df = cleaning_divvy_gen_agg(df)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stations_reduced.rename(columns={"name":"station_name"}, inplace=True)


In [35]:
test_df.head(5)

Unnamed: 0,geohash,hourly_data,nb_departures,nb_arrivals,ratio
0,dp3sy,2021-07-01 04:00:00,1.0,1.0,1.0
1,dp3sy,2021-07-01 17:00:00,1.0,0.0,inf
2,dp3sy,2021-07-01 18:00:00,1.0,2.0,0.5
3,dp3sy,2021-07-01 21:00:00,1.0,0.0,inf
4,dp3sy,2021-07-01 23:00:00,4.0,1.0,4.0


In [36]:
test_df.shape

(52546, 5)

# Merging this consolidated Divvy dataset with Weather data

## Importing and cleaning Weather data

In [37]:
from ml_logic.data_import import get_weather_data
from ml_logic.cleaning import weather_cleaning

In [38]:
raw_weather_df = get_weather_data()
clean_weather_df = weather_cleaning(raw_weather_df)


In [39]:
clean_weather_df.head(2)

Unnamed: 0,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all,hourly_data
0,2013-01-01 00:00:00,-2.87,1018,68,4.12,300,100,2013-01-01 00:00:00
1,2013-01-01 01:00:00,-3.12,1019,69,3.1,310,100,2013-01-01 01:00:00


## Merging Weather and Divvy data

In [40]:
merged_df = merge_divvy_weather(test_df, clean_weather_df)
merged_df.shape

(52546, 12)

In [41]:
merged_df.head(5)

Unnamed: 0,geohash,hourly_data,nb_departures,nb_arrivals,ratio,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all
0,dp3sy,2021-07-01 04:00:00,1.0,1.0,1.0,2021-07-01 04:00:00,21.29,1001,84,0.0,0,75
1,dp3sy,2021-07-01 17:00:00,1.0,0.0,inf,2021-07-01 17:00:00,23.18,1018,62,7.2,20,20
2,dp3sy,2021-07-01 18:00:00,1.0,2.0,0.5,2021-07-01 18:00:00,23.36,1018,61,8.23,30,20
3,dp3sy,2021-07-01 21:00:00,1.0,0.0,inf,2021-07-01 21:00:00,22.44,1018,55,8.75,30,20
4,dp3sy,2021-07-01 23:00:00,4.0,1.0,4.0,2021-07-01 23:00:00,20.64,1019,62,7.72,40,20


# Getting features and target dataset (target = nb_arrivals)

In [42]:
target ="nb_arrivals"

In [43]:
X,y = features_target(merged_df, target)

In [44]:
X.head(2)

Unnamed: 0,geohash,hourly_data,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all
0,dp3sy,2021-07-01 04:00:00,2021-07-01 04:00:00,21.29,1001,84,0.0,0,75
1,dp3sy,2021-07-01 17:00:00,2021-07-01 17:00:00,23.18,1018,62,7.2,20,20


In [54]:
X.geohash.nunique()

44

In [45]:
y.head(2)

0    1.0
1    0.0
Name: nb_arrivals, dtype: float64

In [50]:
import math
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
import pygeohash as gh
from ml_logic.data_import import get_station_data


def transform_time_features(X: pd.DataFrame) -> np.ndarray:
    """Turn the `hourly_data` column into numeric time features.

    Returns an array with one row per input row and four columns, in this
    order: hour_sin, hour_cos, day of week (0-6), month (1-12). Hour, day
    and month are taken after converting the timestamps from UTC to
    America/Chicago.
    """
    assert isinstance(X, pd.DataFrame)

    # NOTE(review): timestamps are interpreted as UTC before the conversion
    # to Chicago time; confirm the source data is really UTC. The format
    # string expects a trailing " UTC" on string inputs.
    stamps_utc = pd.to_datetime(
        X["hourly_data"],
        format="%Y-%m-%d %H:%M:%S UTC",
        utc=True,
    )
    local = stamps_utc.dt.tz_convert("America/Chicago").dt

    # Cyclical encoding so that 23:00 and 00:00 end up close together.
    hour_angle = 2 * math.pi / 24 * local.hour
    feature_columns = [
        np.sin(hour_angle),
        np.cos(hour_angle),
        local.weekday,
        local.month,
    ]
    return np.stack(feature_columns, axis=1)

def preprocess_features(X: pd.DataFrame) -> "tuple[ColumnTransformer, pd.DataFrame]":
    """Fit the feature preprocessor on `X` and return it with the transformed data.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain `hourly_data`, `geohash` and the six weather columns
        (`temp`, `pressure`, `humidity`, `wind_speed`, `wind_deg`,
        `clouds_all`). A `dt_iso` column is dropped when present.

    Returns
    -------
    (ColumnTransformer, pd.DataFrame)
        The fitted preprocessor (reusable to transform a test set) and the
        transformed features as a DataFrame with positional column labels.
    """

    def create_sklearn_preprocessor() -> ColumnTransformer:
        # OneHotEncoder documents `categories` as a list with one array per
        # encoded column; the list order follows the column selection [2, 3].
        time_categories = [
            np.arange(0, 7, 1),   # days of the week
            np.arange(1, 13, 1),  # months of the year
            # will need to add one cat for the year when generalizing
        ]

        # NOTE(review): `sparse=False` is kept as in the rest of the notebook;
        # newer scikit-learn releases call this parameter `sparse_output`.
        time_pipe = make_pipeline(
            FunctionTransformer(transform_time_features),
            make_column_transformer(
                (
                    OneHotEncoder(
                        categories=time_categories,
                        sparse=False,
                        handle_unknown="ignore",
                    ),
                    [2, 3],  # day of week and month from transform_time_features
                ),
                remainder="passthrough",  # keep hour_sin and hour_cos
            ),
        )

        weather_pipe = make_pipeline(StandardScaler())
        weather_features = ["temp", "pressure", "humidity", "wind_speed", "wind_deg", "clouds_all"]

        # Geohashes unseen at fit time are encoded as all zeros.
        cat_transformer = OneHotEncoder(sparse=False, handle_unknown="ignore")

        final_preprocessor = ColumnTransformer(
            [
                ("time_preproc", time_pipe, ["hourly_data"]),
                ("weather_scaler", weather_pipe, weather_features),
                ("geohash encoding", cat_transformer, ["geohash"]),
            ],
            n_jobs=-1,
        )
        return final_preprocessor

    preprocessor = create_sklearn_preprocessor()

    # dt_iso duplicates hourly_data; tolerate inputs where it is already gone.
    X = X.drop(columns=["dt_iso"], errors="ignore")

    X_processed = preprocessor.fit_transform(X)

    X_processed_df = pd.DataFrame(X_processed)

    return preprocessor, X_processed_df


In [51]:
preprocessor, X_processed_df = preprocess_features(X)

In [52]:
X_processed_df.shape

(52546, 71)

In [53]:
X_processed_df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,70
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
import os
import pandas as pd
import numpy as np
import math

from ml_logic.data_import import get_weather_data, get_divvy_data
from ml_logic.cleaning import compute_geohash_stations,weather_cleaning, cleaning_divvy_gen,cleaning_divvy_gen_agg, merge_divvy_weather, features_target
from ml_logic.preprocessor import transform_time_features, preprocess_features, target_process

In [10]:
from ml_logic.main import preprocess, preprocess_test

In [12]:
target_chosen = "nb_arrivals"
X_processed_df, y_processed_df, preprocessor = preprocess(target_chosen)


Raw data imported


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stations_reduced.rename(columns={"name":"station_name"}, inplace=True)


Data cleaned and merged
features and target dataframes created
features preprocessed
nb_arrivals picked as target
Preprocessing of Training set is done


In [13]:
X_processed_df.shape

(52546, 71)

In [14]:
X_processed_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,70
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
y_processed_df.head(2)

0    1.0
1    0.0
Name: nb_arrivals, dtype: float64

In [16]:
X_test_processed, y_test_processed=preprocess_test(preprocessor, target_chosen)

Test Raw data imported


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stations_reduced.rename(columns={"name":"station_name"}, inplace=True)


Test Data cleaned and merged
Test features and target dataframes created
nb_arrivals picked as target
Preprocessing of test set is done


In [20]:
pd.DataFrame(X_test_processed).shape

(44841, 71)

In [21]:
pd.DataFrame(X_test_processed).head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,61,62,63,64,65,66,67,68,69,70
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
y_test_processed.head(2)

0    0.0
1    0.0
Name: nb_arrivals, dtype: float64

In [56]:
# NOTE(review): rebinds y_processed_df (previously returned by preprocess())
# to the `y` produced by the features_target() cell. The result depends on
# which cell defined `y` last, so verify after a Restart & Run All.
y_processed_df = y

In [60]:
import tpot

# Genetic search over candidate pipelines, scored by negative MSE.
# random_state pins the stochastic search so that re-running the cell
# explores the same candidates and reports the same result.
model = tpot.TPOTRegressor(
    generations=10,
    population_size=5,
    scoring='neg_mean_squared_error',
    verbosity=2,
    n_jobs=-1,
    random_state=42,
)

model.fit(X_processed_df, y_processed_df)

# Encoding of station_name

In [76]:
from sklearn.preprocessing import OneHotEncoder

In [77]:
X.station_name.nunique()

847

In [81]:
ohe = OneHotEncoder(sparse = False)
ohe.fit(X[['station_name']])

In [82]:
X_encoded = ohe.transform(X[["station_name"]])

In [83]:
X[ohe.categories_[0]] = X_encoded

In [84]:
X = X.drop("station_name", axis=1)

In [85]:
X.shape

(344946, 856)

In [45]:
features_df_complete, target_df = features_target(merged_df,target)

In [46]:
features_df_complete.head(2)

Unnamed: 0,station_name,station_id,hourly_data,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all
0,2112 W Peterson Ave,KA1504000155,2022-01-01 21:00:00,2022-01-01 21:00:00,-0.75,1010,93,4.92,20,100
1,2112 W Peterson Ave,KA1504000155,2022-01-03 18:00:00,2022-01-03 18:00:00,-6.55,1030,49,3.13,253,0


In [47]:
features_df_complete.shape

(344946, 10)

In [60]:
import pygeohash as gh
from ml_logic.data_import import get_station_data


def compute_geohash_stations(precision: int = 5) -> pd.DataFrame:
    """
    Build a station-name -> geohash lookup table.

    Loads the station table with get_station_data() and encodes each
    station's (lat, lon) as a geohash (ex: "dr5rx") of length `precision`
    (5 by default).

    Returns a new DataFrame with two columns, `station_name` and `geohash`;
    the loaded station table itself is left unmodified.
    """
    df_stations = get_station_data()
    assert isinstance(df_stations, pd.DataFrame)

    geohashes = df_stations.apply(
        lambda station: gh.encode(station.lat, station.lon, precision=precision),
        axis=1,
    )

    # rename()/assign() return new frames, so nothing is written into a slice
    # of df_stations (the source of the former SettingWithCopyWarning).
    df_stations_reduced = (
        df_stations[["name"]]
        .rename(columns={"name": "station_name"})
        .assign(geohash=geohashes)
    )
    return df_stations_reduced


In [61]:
# Calling Geohash function and creating Stations Dataframe
df_stations_reduced=compute_geohash_stations(precision=5)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stations_reduced.rename(columns={"name":"station_name"}, inplace=True)


In [62]:
df_stations_reduced.geohash.nunique()

47

In [59]:
df_stations_reduced[df_stations_reduced["station_name"]=="2112 W Peterson Ave"]

Unnamed: 0,station_name,station_id,geohash
224,2112 W Peterson Ave,456,dp3wu


In [124]:
df_stations_reduced.head(5)

Unnamed: 0,station_name,geohash
0,Central Ave & Harrison St,dp3w4
1,Halsted St & 59th St,dp3tt
2,Damen Ave & 51st St,dp3tu
3,Halsted St & 21st St,dp3wj
4,Michigan Ave & Madison St,dp3wq


In [63]:
# Merge divvy data & station data
X_complete=features_df_complete.merge(df_stations_reduced,how="left",on=["station_name"])
X_complete.head(5)

Unnamed: 0,station_name,station_id,hourly_data,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all,geohash
0,2112 W Peterson Ave,KA1504000155,2022-01-01 21:00:00,2022-01-01 21:00:00,-0.75,1010,93,4.92,20,100,dp3wu
1,2112 W Peterson Ave,KA1504000155,2022-01-03 18:00:00,2022-01-03 18:00:00,-6.55,1030,49,3.13,253,0,dp3wu
2,2112 W Peterson Ave,KA1504000155,2022-01-04 11:00:00,2022-01-04 11:00:00,-5.02,1021,79,4.12,190,40,dp3wu
3,2112 W Peterson Ave,KA1504000155,2022-01-04 18:00:00,2022-01-04 18:00:00,-0.39,1017,70,4.02,204,99,dp3wu
4,2112 W Peterson Ave,KA1504000155,2022-01-10 09:00:00,2022-01-10 09:00:00,-10.37,1034,58,2.24,301,98,dp3wu


In [69]:
X_complete = X_complete.drop(columns=["station_name","station_id","dt_iso"])

In [70]:
# OneHotEncoder documents `categories` as a list with one array per encoded
# column; the list order follows the column selection [2, 3] below.
time_categories = [
    np.arange(0, 7, 1),   # days of the week
    np.arange(1, 13, 1),  # months of the year
    # will need to add one cat for the year when generalizing
]

time_pipe = make_pipeline(
    FunctionTransformer(transform_time_features),
    make_column_transformer(
        (
            OneHotEncoder(
                categories=time_categories,
                sparse=False,
                handle_unknown="ignore",
            ),
            [2, 3],  # day of week and month from transform_time_features
        ),
        remainder="passthrough",  # keep hour_sin and hour_cos
    ),
)

weather_pipe = make_pipeline(StandardScaler())
weather_features = ["temp", "pressure", "humidity", "wind_speed", "wind_deg", "clouds_all"]

# handle_unknown="ignore" matches the encoder built in preprocess_features:
# a geohash unseen during fit is encoded as all zeros instead of raising
# when a later dataset is transformed.
cat_transformer = OneHotEncoder(sparse=False, handle_unknown="ignore")

final_preprocessor = ColumnTransformer(
    [
        ("time_preproc", time_pipe, ["hourly_data"]),
        ("weather_scaler", weather_pipe, weather_features),
        ("geohash encoding", cat_transformer, ["geohash"]),
    ],
    n_jobs=-1,
)

In [71]:
X_processed = final_preprocessor.fit_transform(X_complete)

X_processed_df = pd.DataFrame(X_processed)

In [72]:
X_processed_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [125]:
import os
import pandas as pd
import numpy as np
import math

from ml_logic.data_import import get_weather_data, get_divvy_data
from ml_logic.cleaning import weather_cleaning, cleaning_divvy_gen, merge_divvy_weather, features_target
from ml_logic.preprocessor import transform_time_features, preprocess_features, target_process, compute_geohash_stations


from ml_logic.main import preprocess, preprocess_test

In [117]:
X_processed_df, y_processed_df, preprocessor, df_stations_reduced = preprocess("nb_arrivals")
X_test_processed, y_test_processed=preprocess_test(preprocessor, "nb_arrivals", df_stations_reduced)

Raw data imported
Data cleaned and merged
features and target dataframes created
Geohash of stations computed
Geohash of stations added to Features dataframe


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stations_reduced.rename(columns={"name":"station_name"}, inplace=True)


features preprocessed
nb_arrivals picked as target
Preprocessing of Training set is done


In [118]:
X_processed_df.shape

(344946, 74)

In [119]:
X_processed_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
X_test_processed, y_test_processed=preprocess_test(preprocessor, "nb_arrivals", df_stations_reduced)

Test Raw data imported
Test Data cleaned and merged
Test features and target dataframes created
nb_arrivals picked as target
Preprocessing of test set is done


In [121]:
X_test_processed.shape

(659944, 74)

In [123]:
pd.DataFrame(X_test_processed).head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,64,65,66,67,68,69,70,71,72,73
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [101]:
quarter= os.environ.get("DIVVY_QUARTER_TEST")
year= os.environ.get("DIVVY_YEAR_TEST")

raw_divvy_df = get_divvy_data(year,quarter)
raw_weather_df = get_weather_data()


In [102]:
clean_divvy_df = cleaning_divvy_gen(raw_divvy_df)
clean_weather_df = weather_cleaning(raw_weather_df)

merged_df_test = merge_divvy_weather(clean_divvy_df, clean_weather_df)

In [103]:
X_test, y_test = features_target(merged_df_test, "nb_arrivals")

In [104]:
X_complete=X_test.merge(df_stations_reduced,how="left",on=["station_name"])

In [105]:
X_complete.head(2)

Unnamed: 0,station_name,station_id,hourly_data,dt_iso,temp,pressure,humidity,wind_speed,wind_deg,clouds_all,geohash
0,111th St - Morgan Park Metra,682,2022-05-09 17:00:00,2022-05-09 17:00:00,23.11,1013,51,7.72,160,75,
1,111th St - Morgan Park Metra,682,2022-05-12 18:00:00,2022-05-12 18:00:00,32.62,1018,43,7.72,170,20,


In [107]:
X_complete.geohash.unique()

array([nan, 'dp3wu', 'dp3tx', 'dp3wj', 'dp3wm', 'dp3tj', 'dp3wn', 'dp3wh',
       'dp3ws', 'dp3wk', 'dp3sy', 'dp3tu', 'dp3tv', 'dp3tt', 'dp3wt',
       'dp3w7', 'dp3w6', 'dp3v0', 'dp3sz', 'dp3we', 'dp3tr', 'dp3xh',
       'dp3tp', 'dp3ty', 'dp3w3', 'dp3xj', 'dp3wv', 'dp3v2', 'dp3ts',
       'dp3tw', 'dp3w4', 'dp3wd', 'dp3w5', 'dp3xk', 'dp3tq', 'dp3wq',
       'dp3tn', 'dp3tm', 'dp3te', 'dp3w9', 'dp3th', 'dp3tg', 'dp3x7',
       'dp3tz', 'dp3wg', 'dp3t7', 'dp3tk'], dtype=object)

In [108]:
# Stations missing from the station table get no geohash from the left merge;
# label them with the placeholder "aaa". Assigning the column explicitly is
# reliable, unlike an inplace replace on the `X_complete.geohash` accessor,
# which may operate on a temporary copy.
X_complete["geohash"] = X_complete["geohash"].fillna("aaa")

In [109]:
X_complete.geohash.unique()

array(['aaa', 'dp3wu', 'dp3tx', 'dp3wj', 'dp3wm', 'dp3tj', 'dp3wn',
       'dp3wh', 'dp3ws', 'dp3wk', 'dp3sy', 'dp3tu', 'dp3tv', 'dp3tt',
       'dp3wt', 'dp3w7', 'dp3w6', 'dp3v0', 'dp3sz', 'dp3we', 'dp3tr',
       'dp3xh', 'dp3tp', 'dp3ty', 'dp3w3', 'dp3xj', 'dp3wv', 'dp3v2',
       'dp3ts', 'dp3tw', 'dp3w4', 'dp3wd', 'dp3w5', 'dp3xk', 'dp3tq',
       'dp3wq', 'dp3tn', 'dp3tm', 'dp3te', 'dp3w9', 'dp3th', 'dp3tg',
       'dp3x7', 'dp3tz', 'dp3wg', 'dp3t7', 'dp3tk'], dtype=object)

In [106]:
df_stations_reduced[df_stations_reduced["station_name"]=="111th St - Morgan Park Metra"]

Unnamed: 0,station_name,geohash


In [115]:
X_test_processed = preprocessor.transform(X_complete)

In [2]:
pwd

'/home/gdolle/code/G-Dolle/DIVVY_BIKE/notebooks'

In [3]:
cd '/home/gdolle/code/G-Dolle/DIVVY_BIKE'

/home/gdolle/code/G-Dolle/DIVVY_BIKE


In [4]:
import os
import pandas as pd
import numpy as np
import math

from ml_logic.data_import import get_weather_data, get_divvy_data
from ml_logic.cleaning import weather_cleaning, cleaning_divvy_gen, merge_divvy_weather, features_target
from ml_logic.preprocessor import transform_time_features, preprocess_features, target_process, compute_geohash_stations


from ml_logic.main import preprocess, preprocess_test

In [6]:
X_processed_df, y_processed_df, preprocessor, df_stations_reduced = preprocess("nb_arrivals")

Raw data imported
Data cleaned and merged
features and target dataframes created
Geohash of stations computed
Geohash of stations added to Features dataframe


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_stations_reduced.rename(columns={"name":"station_name"}, inplace=True)


features preprocessed
nb_arrivals picked as target
Preprocessing of Training set is done


In [7]:
X_processed_df.shape

(718651, 72)

In [8]:
X_test_processed, y_test_processed=preprocess_test(preprocessor, "nb_arrivals", df_stations_reduced)

Test Raw data imported
Test Data cleaned and merged
Test features and target dataframes created
nb_arrivals picked as target
Preprocessing of test set is done


In [9]:
X_test_processed.shape

(539439, 72)

In [10]:
!pip install tpot

Collecting tpot
  Using cached TPOT-0.11.7-py3-none-any.whl (87 kB)
Collecting deap>=1.2
  Using cached deap-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (139 kB)
Collecting update-checker>=0.16
  Using cached update_checker-0.18.0-py3-none-any.whl (7.0 kB)
Collecting tqdm>=4.36.1
  Using cached tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
Collecting xgboost>=1.1.0
  Downloading xgboost-1.7.2-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting stopit>=1.1.1
  Using cached stopit-1.1.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: stopit
  Building wheel for stopit (setup.py) ... [?25ldone
[?25h  Created wheel for stopit: filename=stopit-1.1.2-py3-none-any.whl size=11939 sha256=89d6b4480d56663d6cbaf830dc1646eecaa41806474668135fb3