In [152]:
import pandas as pd
import haversine as hs

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [153]:
def read_data(path: str, date_columns: list[int], header_col:int = 0) -> pd.DataFrame:
    return pd.read_csv(path, header=header_col, parse_dates=date_columns)

In [154]:
def preprocess_data(ride_df: pd.DataFrame, station_df: pd.DataFrame)-> pd.DataFrame:
    # merge to get the start station details
    start_station_df = ride_df.merge(station_df, left_on="emplacement_pk_start", right_on="pk", how="inner")
    # rename the lattitude and longitude of the start station
    start_station_df.rename(columns={"latitude":"st_lattitude", "longitude":"st_longitude"}, inplace=True)
    # filter out the unneccesary columns
    start_station_df = start_station_df[["emplacement_pk_start", "emplacement_pk_end", "duration_sec", "is_member", "st_lattitude", "st_longitude"]]
    
    # merge to get the end stations detailes
    end_station_df = start_station_df.copy().merge(stations_df, left_on="emplacement_pk_end", right_on="pk", how="inner")
    # rename the lattitude and longitude of the start station
    end_station_df.rename(columns={"latitude":"end_lattitude", "longitude":"end_longitude"}, inplace=True)
    # filter out the unneccesary columns
    end_station_df = end_station_df[["emplacement_pk_start", "emplacement_pk_end", "st_lattitude", "st_longitude", "end_lattitude", "end_longitude", "is_member", "duration_sec"]]
    
    # drop the rows with missing values
    end_station_df.dropna(inplace=True)
    
    # calculate the distance of the trip 
    end_station_df["distance_km"] = end_station_df.apply(lambda row: hs.haversine((row["st_lattitude"], row["st_longitude"]), (row["end_lattitude"], row["end_longitude"]), unit="km"), axis=1)
    
    # create a pair with start and end station
    end_station_df["ride_stations"] = end_station_df[["emplacement_pk_start", "emplacement_pk_end"]].astype(str).apply(lambda x: '_'.join(x), axis=1)
    # convert the duration to minute
    end_station_df["duration_minute"] = end_station_df["duration_sec"]/60
    
    # select the final columns
    processed_df = end_station_df[["ride_stations", "distance_km", "is_member", "duration_minute"]]
    # convert the categorical column to string
    preprocessed_df["is_member"] = preprocessed_df["is_member"].astype(str)
    
    return processed_df

In [155]:
def generate_features(input_df: pd.DataFrame, target_column: str) -> tuple[DictVectorizer]:
    
    train_columns = input_df.columns.to_list()
    train_columns.remove(target_column)
    # crate a data frame with train columns
    train_df = input_df[train_columns]
    
    # convert the data frame as a dictionary 
    train_dicts = train_df.to_dict(orient='records')
    # vectorize the training data
    dict_vectorizer = DictVectorizer()
    X = dict_vectorizer.fit_transform(train_dicts)
    y = input_df[target_column].values
    
    return (dict_vectorizer, X, y)

In [156]:
def build_model(X_train, y_train):
    # initialize model
    linear_regressor = LinearRegression()
    # perform training
    linear_regressor.fit(X_train, y_train)
    
    return linear_regressor

# Training

In [157]:
#Read data 
ride_path = "../data/2022-06-01/20220106_donnees_ouvertes.csv"
stations_path = "../data/2022-06-01/20220106_stations.csv"
ride_df = read_data(ride_path, [0, 2], 0)
stations_df = read_data(stations_path, [], 0)
print(f"Length of ride df: {len(ride_df)}")
print(f"Length of stations df: {len(stations_df)}")

Length of ride df: 1358198
Length of stations df: 735


In [158]:
# Preprocessed data
preprocessed_df = preprocess_data(ride_df, stations_df)
print(f"Length of preprocessed df: {len(preprocessed_df)}")
preprocessed_df.head(10)

Length of preprocessed  df: 1358196


Unnamed: 0,ride_stations,distance_km,is_member,duration_minute
0,9_1119,4.674871,1.0,16.183333
1,9_1119,4.674871,1.0,25.016667
2,75_1119,2.069513,1.0,11.466667
3,75_1119,2.069513,0.0,15.933333
4,75_1119,2.069513,0.0,15.983333
5,78_1119,1.955478,1.0,16.65
6,78_1119,1.955478,0.0,17.15
7,78_1119,1.955478,1.0,7.683333
8,78_1119,1.955478,1.0,17.116667
9,79_1119,1.228293,1.0,9.083333


In [159]:
# generate features
dict_vectorizer, X_train, y_train = generate_features(preprocessed_df, "duration_minute")

In [160]:
# build model
model = build_model(X_train, y_train)
y_pred = model.predict(X_train)
train_error = mean_squared_error(y_train, y_pred, squared=False)
print(f"Training error: {train_error}")

Training error: 13.483320353503942


# Validation

In [170]:
#Read data 
valid_ride_path = "../data/2022-07-01/20220107_donnees_ouvertes.csv"
valid_stations_path = "../data/2022-07-01/20220107_stations.csv"
valid_ride_df = read_data(valid_ride_path, [0, 2], 0)
valid_stations_df = read_data(valid_stations_path, [], 0)
print(f"Length of ride df: {len(valid_ride_df)}")
print(f"Length of stations df: {len(valid_stations_df)}")

Length of ride df: 1486082
Length of stations df: 735


In [171]:
# Preprocessed data
valid_preprocessed_df = preprocess_data(valid_ride_df, valid_stations_df)
print(f"Length of preprocessed  df: {len(valid_preprocessed_df)}")
valid_preprocessed_df.head(10)

Length of preprocessed  df: 1471959


Unnamed: 0,ride_stations,distance_km,is_member,duration_minute
0,9_394,11.70423,0,76.35
1,9_394,11.70423,1,52.35
2,9_394,11.70423,1,45.3
3,9_394,11.70423,1,82.4
4,10_394,7.194953,0,36.883333
5,10_394,7.194953,0,36.966667
6,10_394,7.194953,1,97.2
7,13_394,4.44399,1,95.983333
8,13_394,4.44399,0,25.233333
9,13_394,4.44399,1,19.5


In [172]:
# generate features
valid_df = valid_preprocessed_df[['ride_stations', 'distance_km', 'is_member']]
valid_dicts = valid_df.to_dict(orient='records')
X_valid  = dict_vectorizer.transform(valid_dicts)
y_valid = valid_preprocessed_df["duration_minute"].values

In [173]:
# build model
y_pred = model.predict(X_valid)
valid_error = mean_squared_error(y_valid, y_pred, squared=False)
print(f"Validation error: {valid_error}")

Validation error: 15.456442416672154
