In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from category_encoders import OneHotEncoder , OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import  LinearRegression , Ridge , Lasso, RidgeCV
from sklearn.metrics import mean_absolute_error , r2_score
from sklearn.ensemble import RandomForestRegressor

In [2]:
# Read the MBTA bus reliability CSV (path is relative to the working directory)
# into a DataFrame.
file_path = 'MBTA_Bus_Reliability.csv'
df = pd.read_csv(filepath_or_buffer=file_path)


In [3]:
# Print the column dtypes and non-null counts to spot columns with missing values.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 659589 entries, 0 to 659588
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   service_date            659589 non-null  object 
 1   gtfs_route_id           659589 non-null  object 
 2   gtfs_route_desc         659589 non-null  object 
 3   route_category          659589 non-null  object 
 4   mode_type               659589 non-null  object 
 5   peak_offpeak_ind        659589 non-null  object 
 6   reliability_percentage  657117 non-null  float64
 7   unreliable_percentage   657117 non-null  float64
 8   average_temp            659589 non-null  float64
 9   precipitation           659589 non-null  float64
dtypes: float64(4), object(6)
memory usage: 50.3+ MB


In [4]:
# Discard every row that has at least one missing value, then count the
# remaining NaNs per column to confirm that none are left.
df = df.dropna(axis=0, how='any')
df.isnull().sum()

service_date              0
gtfs_route_id             0
gtfs_route_desc           0
route_category            0
mode_type                 0
peak_offpeak_ind          0
reliability_percentage    0
unreliable_percentage     0
average_temp              0
precipitation             0
dtype: int64

In [5]:
# Remove the columns that are not used as model inputs.
# NOTE(review): 'reliability_percentage' is presumably the complement of the
# target 'unreliable_percentage' and would leak it -- confirm against the data.
df = df.drop(
    labels=['service_date', 'route_category', 'mode_type', 'reliability_percentage'],
    axis='columns',
)

In [6]:
# Separate the feature matrix from the regression target and build the
# train/test partitions.
#
# train_size=0.5 together with test_size=0.1 means only 60% of the rows are
# used; the remaining 40% end up in neither partition.
# random_state pins the shuffle so that the split -- and every metric computed
# from it further down -- is identical after Restart Kernel -> Run All.
target = 'unreliable_percentage'
X = df.drop(columns=[target])
y = df[target]
X_train,X_test,y_train,y_test = train_test_split(
    X, y, train_size=0.5, test_size=0.1, random_state=42)

In [7]:
# Ordinary least-squares model: one-hot encoding -> imputation (SimpleImputer
# defaults to the column mean) -> LinearRegression, fitted on the training rows.
Lrg = make_pipeline(OneHotEncoder(), SimpleImputer(), LinearRegression())
Lrg.fit(X_train, y_train)

In [9]:
# Mean absolute error of the linear-regression pipeline on the training rows.
y_pred = Lrg.predict(X_train)
mae = mean_absolute_error(y_true=y_train, y_pred=y_pred)
print(mae)

9.644766985043567


In [10]:
# Naive baseline: predict the training-set mean of the target for every
# training row and report the resulting mean absolute error, which the fitted
# models should beat.
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * y_train.shape[0]
base_mae = mean_absolute_error(y_true=y_train, y_pred=y_pred_baseline)
print(base_mae)

12.538273511832271


In [11]:
# Score the linear-regression pipeline on both partitions.
# The printed label says "Accuracy", but the value is the R^2 score
# (coefficient of determination) returned by r2_score, not a classification
# accuracy.
reg_y_pred_train = Lrg.predict(X_train)
reg_acc_train = r2_score(y_true=y_train, y_pred=reg_y_pred_train)

reg_y_pred_test = Lrg.predict(X_test)
reg_acc_test = r2_score(y_true=y_test, y_pred=reg_y_pred_test)

print("Training Accuracy:", round(reg_acc_train, 4))
print("Test Accuracy:", round(reg_acc_test, 4))

Training Accuracy: 0.3376
Test Accuracy: 0.341


In [12]:
# Ridge model: same preprocessing as the linear pipeline, but the estimator is
# RidgeCV, which selects its regularisation strength by built-in
# cross-validation over its default alpha grid.
rge = make_pipeline(OneHotEncoder(), SimpleImputer(), RidgeCV())
rge.fit(X_train, y_train)

In [13]:
# Mean absolute error of the ridge pipeline on the training rows.
y_pred = rge.predict(X_train)
mae = mean_absolute_error(y_true=y_train, y_pred=y_pred)
print(mae)

9.644563361591265


In [None]:
# Same evaluation as for the linear model, but for the ridge pipeline.
# The printed label says "Accuracy", but the value is the R^2 score
# (coefficient of determination) returned by r2_score.
rge_y_pred_train = rge.predict(X_train)
rge_acc_train = r2_score(y_true=y_train, y_pred=rge_y_pred_train)

rge_y_pred_test = rge.predict(X_test)
rge_acc_test = r2_score(y_true=y_test, y_pred=rge_y_pred_test)

print("Training Accuracy:", round(rge_acc_train, 4))
print("Test Accuracy:", round(rge_acc_test, 4))

In [14]:
# Lasso model: same preprocessing as the other linear pipelines. The
# regularisation strength is left at Lasso's default (alpha=1.0); only the
# iteration cap is raised to 9000.
# NOTE(review): the higher max_iter is presumably there to let the solver
# converge -- confirm.
lso = make_pipeline(OneHotEncoder(), SimpleImputer(), Lasso(max_iter=9000))
lso.fit(X_train, y_train)

In [15]:
# Mean absolute error of the lasso pipeline on the training rows.
y_pred = lso.predict(X_train)
mae = mean_absolute_error(y_true=y_train, y_pred=y_pred)
print(mae)

12.28336721568134


In [16]:
# Score the lasso pipeline on both partitions.
# The printed label says "Accuracy", but the value is the R^2 score
# (coefficient of determination) returned by r2_score.
lso_y_pred_train = lso.predict(X_train)
lso_acc_train = r2_score(y_true=y_train, y_pred=lso_y_pred_train)

lso_y_pred_test = lso.predict(X_test)
lso_acc_test = r2_score(y_true=y_test, y_pred=lso_y_pred_test)

print("Training Accuracy:", round(lso_acc_train, 4))
print("Test Accuracy:", round(lso_acc_test, 4))

Training Accuracy: 0.034
Test Accuracy: 0.033


In [None]:
# Random-forest model: same preprocessing as the linear pipelines, followed by
# a forest whose trees are capped at depth 15.
#
# random_state fixes the bootstrap sampling and feature selection so the fitted
# forest (and its scores) are reproducible across kernel restarts.
# n_jobs=-1 trains the trees on all available cores; it changes only the
# wall-clock time, not the fitted model.
forest = make_pipeline(
    OneHotEncoder(),
    SimpleImputer(),
    RandomForestRegressor(max_depth=15, random_state=42, n_jobs=-1),
)
forest.fit(X_train , y_train)

In [None]:
# Score the random-forest pipeline on both partitions.
# The printed label says "Accuracy", but the value is the R^2 score
# (coefficient of determination) returned by r2_score.
for_y_pred_train = forest.predict(X_train)
for_acc_train = r2_score(y_true=y_train, y_pred=for_y_pred_train)

for_y_pred_test = forest.predict(X_test)
for_acc_test = r2_score(y_true=y_test, y_pred=for_y_pred_test)

print("Training Accuracy:", round(for_acc_train, 4))
print("Test Accuracy:", round(for_acc_test, 4))

In [None]:
# Largest target value in the training partition. Series.max() is vectorised
# and skips NaN, whereas the builtin max() iterates the Series element by
# element in Python.
y_train.max()
