## Prepare Data

In [19]:
import pandas as pd
import requests
import zipfile
import io

In [20]:
from sklearn.feature_extraction import DictVectorizer

In [21]:
# URL of the ZIP file
# year = '2024'
# month = '01'  # January
# url = f'https://s3.amazonaws.com/tripdata/{year}{month}-citibike-tripdata.csv.zip'

# # Step 1: Download the ZIP file
# response = requests.get(url)
# response.raise_for_status() 

In [22]:
# with zipfile.ZipFile(io.BytesIO(response.content)) as z:
#     file_names = z.namelist()
#     csv_file_name = [name for name in file_names if name.endswith('.csv')][0]
#     with z.open(csv_file_name) as csv_file:
#         df = pd.read_csv(csv_file)

In [23]:
# Merging the data files
# df1 = pd.read_csv("data\\202403-citibike-tripdata_1.csv")
# df2 = pd.read_csv("data\\202403-citibike-tripdata_2.csv")
# df3 = pd.read_csv("data\\202403-citibike-tripdata_3.csv")

# merged_df = pd.concat([df1, df2,df3], ignore_index=True)

# merged_df.to_csv("data/202403-citibike-tripdata.csv", index=False)

In [24]:
train_df = pd.read_csv("training_mage/data/JC-202401-citibike-tripdata.csv")

In [26]:
val_df = pd.read_csv("training_mage/data/JC-202402-citibike-tripdata.csv")

In [27]:
train_df = train_df[train_df.notna()]

In [28]:
val_df = val_df[val_df.notna()]

### Computing target variable: Duration

In [29]:
train_df["started_at"] = pd.to_datetime(train_df["started_at"])
train_df["ended_at"] = pd.to_datetime(train_df["ended_at"])

val_df["started_at"] = pd.to_datetime(val_df["started_at"])
val_df["ended_at"] = pd.to_datetime(val_df["ended_at"])

In [30]:
train_df["duration"] = (train_df["ended_at"] - train_df["started_at"]).dt.total_seconds() / 60
val_df["duration"] = (val_df["ended_at"] - val_df["started_at"]).dt.total_seconds() / 60

In [31]:
# Dropping outliers
train_df = train_df[
    (train_df['duration'] >= 0) & 
    (train_df['duration'] <= 60)
]


val_df = val_df[
    (val_df['duration'] >= 0) & 
    (val_df['duration'] <= 60)
]

In [32]:
categorical_features = [
        'start_station_id',
        'end_station_id'
    ]

train_df[categorical_features] = train_df[categorical_features].astype(str)

val_df[categorical_features] = val_df[categorical_features].astype(str)

In [33]:
train_df["start_end_id"] = train_df["start_station_id"] + "_" + train_df["end_station_id"]
val_df["start_end_id"] = val_df["start_station_id"] + "_" + val_df["end_station_id"]

In [34]:
categorical= ["start_end_id"]
target = ["duration"]

In [35]:
len(train_df["start_end_id"].unique())

4180

In [36]:
# One hot encoding
dv = DictVectorizer()
train_dicts = train_df[categorical].to_dict(orient='records')
val_dicts = val_df[categorical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)
y_train = train_df[target]
y_val = val_df[target]

In [37]:
len(dv.feature_names_)

4180

### Training the model

In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error

In [39]:
lr = LinearRegression()

lr.fit(X_train,y_train)

y_pred = lr.predict(X_train)

root_mean_squared_error(y_pred=y_pred,y_true=y_train)

4.329270779578182

In [40]:
y_val_pred = lr.predict(X_val)

root_mean_squared_error(y_pred=y_val_pred, y_true=y_val)

5.262920515863876