In [18]:
import pandas as pd
import numpy as np
url = 'dataset1.csv'
sample_df = pd.read_csv(url)
sample_df

Unnamed: 0.1,Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,Unnamed: 11,Unnamed: 12
0,0,id1080784,2,29-02-2016 16:40,29-02-2016 16:47,1,-73.953918,40.778873,-73.963875,40.771164,N,400,,
1,1,id0889885,1,11-03-2016 23:35,11-03-2016 23:53,2,-73.988312,40.731743,-73.994751,40.694931,N,1100,,
2,2,id0857912,2,21-02-2016 17:59,21-02-2016 18:26,2,-73.997314,40.721458,-73.948029,40.774918,N,1635,,
3,3,id3744273,2,05-01-2016 9:44,05-01-2016 10:03,6,-73.961670,40.759720,-73.956779,40.780628,N,1141,,
4,4,id0232939,1,17-02-2016 6:42,17-02-2016 6:56,1,-74.017120,40.708469,-73.988182,40.740631,N,848,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,399995,id0338454,1,23-05-2016 19:53,23-05-2016 19:53,3,-73.997131,40.731449,-73.997131,40.731449,N,3,,
399996,399996,id2817401,2,09-06-2016 7:37,09-06-2016 7:51,1,-73.978630,40.783371,-73.973839,40.754471,N,835,,
399997,399997,id3780539,1,15-02-2016 10:21,15-02-2016 10:33,1,-73.971268,40.795601,-73.974106,40.762692,N,731,,
399998,399998,id3446645,1,28-03-2016 17:38,28-03-2016 17:48,1,-73.971672,40.763691,-73.979935,40.748798,N,642,,


In [19]:
#Pre-processing

sample_df["store_and_fwd_flag"].value_counts()
#Convert character variables to numeric
f = lambda x: 0 if x == 'N' else 1

sample_df["store_and_fwd_flag"] = sample_df["store_and_fwd_flag"].apply(lambda x: f(x))
#Check result
sample_df["store_and_fwd_flag"].value_counts()


#First, convert datetime strings into datetime
sample_df["dropoff_datetime"] = pd.to_datetime(sample_df["dropoff_datetime"], format='%d-%m-%Y %H:%M')
sample_df["pickup_datetime"] = pd.to_datetime(sample_df["pickup_datetime"], format='%d-%m-%Y %H:%M')

#Now construct other variables, like month, date, etc.
sample_df["pickup_month"] = sample_df["pickup_datetime"].dt.month
sample_df["pickup_day"] = sample_df["pickup_datetime"].dt.day
sample_df["pickup_weekday"] = sample_df["pickup_datetime"].dt.weekday #sample_df["pickup_weekday"] = sample_df["pickup_datetime"].dt.weekday_name
sample_df["pickup_hour"] = sample_df["pickup_datetime"].dt.hour
sample_df["pickup_minute"] = sample_df["pickup_datetime"].dt.minute

#Get latitude and longitude differences
sample_df["latitude_difference"] = sample_df["dropoff_latitude"] - sample_df["pickup_latitude"]
sample_df["longitude_difference"] = sample_df["dropoff_longitude"] - sample_df["pickup_longitude"]

#Convert duration to minutes for easier interpretation
sample_df["trip_duration"] = sample_df["trip_duration"].apply(lambda x: round(x/60))

#Convert trip distance from longitude and latitude differences to Manhattan distance.
sample_df["trip_distance"] = 0.621371 * 6371 * (abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(sample_df["latitude_difference"]) * np.pi / 180) / 2))),
                                  np.sqrt(1-(np.square(np.sin((abs(sample_df["latitude_difference"]) * np.pi / 180) / 2)))))) + \
                                     abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(sample_df["longitude_difference"]) * np.pi / 180) / 2))),
                                  np.sqrt(1-(np.square(np.sin((abs(sample_df["longitude_difference"]) * np.pi / 180) / 2)))))))

sample_df.head(10)

Unnamed: 0.1,Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,...,Unnamed: 11,Unnamed: 12,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference,trip_distance
0,0,id1080784,2,2016-02-29 16:40:00,2016-02-29 16:47:00,1,-73.953918,40.778873,-73.963875,40.771164,...,,,2,29,0,16,40,-0.007709,-0.009956,1.220593
1,1,id0889885,1,2016-03-11 23:35:00,2016-03-11 23:53:00,2,-73.988312,40.731743,-73.994751,40.694931,...,,,3,11,4,23,35,-0.036812,-0.006439,2.988357
2,2,id0857912,2,2016-02-21 17:59:00,2016-02-21 18:26:00,2,-73.997314,40.721458,-73.948029,40.774918,...,,,2,21,6,17,59,0.053459,0.049286,7.098995
3,3,id3744273,2,2016-01-05 09:44:00,2016-01-05 10:03:00,6,-73.96167,40.75972,-73.956779,40.780628,...,,,1,5,1,9,44,0.020908,0.00489,1.782524
4,4,id0232939,1,2016-02-17 06:42:00,2016-02-17 06:56:00,1,-74.01712,40.708469,-73.988182,40.740631,...,,,2,17,2,6,42,0.032162,0.028938,4.221601
5,5,id1918069,2,2016-02-14 18:31:00,2016-02-14 18:55:00,2,-73.993614,40.751884,-73.995422,40.723862,...,,,2,14,6,18,31,-0.028023,-0.001808,2.061117
6,6,id2429028,1,2016-04-20 20:30:00,2016-04-20 20:36:00,1,-73.96508,40.758915,-73.976807,40.764107,...,,,4,20,2,20,30,0.005192,-0.011726,1.168933
7,7,id1663798,2,2016-06-19 16:48:00,2016-06-19 17:06:00,1,-73.96389,40.765434,-73.872429,40.7742,...,,,6,19,6,16,48,0.008766,0.091461,6.925039
8,8,id2436943,2,2016-03-28 19:17:00,2016-03-28 19:48:00,2,-73.872887,40.774281,-73.979019,40.761879,...,,,3,28,0,19,17,-0.012402,-0.106133,8.189912
9,9,id2933909,1,2016-04-10 22:01:00,2016-04-10 22:25:00,1,-73.987823,40.740982,-73.999153,40.686451,...,,,4,10,6,22,1,-0.054531,-0.01133,4.550537


In [20]:
# Splitting
from sklearn.preprocessing import MinMaxScaler
X = sample_df.drop(["trip_duration", "id", "vendor_id", "pickup_datetime", "dropoff_datetime", "Unnamed: 11","Unnamed: 12"], axis=1)
y = sample_df["trip_duration"]

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=2019)

In [5]:
from sklearn.linear_model import LinearRegression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
y_pred = lr_model.predict(X_test)

In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 7.385460864711088
Mean Squared Error (MSE): 2713.8248792715635
Root Mean Squared Error (RMSE): 52.09438433527709
Root Mean Squared Logarithmic Error (RMSLE): 0.5551895579705758
R-squared (R2 score): 0.013824453390151481
Adjusted R-squared: 0.013701165008374483


  return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))


In [8]:
#Elastic Net Regression 
from sklearn.linear_model import ElasticNet

# Creating and fitting the Elastic Net Regression model
elastic_net_model = ElasticNet()
elastic_net_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = elastic_net_model.predict(X_test)

In [9]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 7.426174955577725
Mean Squared Error (MSE): 2714.958701649071
Root Mean Squared Error (RMSE): 52.10526558467072
Root Mean Squared Logarithmic Error (RMSLE): 0.5641186930819356
R-squared (R2 score): 0.013412434209607693
Adjusted R-squared: 0.013289094318565176


In [10]:
# Gradient Boosting Regression
from sklearn.ensemble import GradientBoostingRegressor

# Creating and fitting the Gradient Boosting Regression model
gb_model = GradientBoostingRegressor()
gb_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = gb_model.predict(X_test)

In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 6.86134679560485
Mean Squared Error (MSE): 3346.9842704911775
Root Mean Squared Error (RMSE): 57.85312671317928
Root Mean Squared Logarithmic Error (RMSLE): 0.4905793837325415
R-squared (R2 score): -0.21625904002036034
Adjusted R-squared: -0.2164110926740499


  return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))


In [12]:
from sklearn.linear_model import SGDRegressor

# Creating and fitting the Stochastic Gradient Descent Regression model
sgd_model = SGDRegressor()
sgd_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = sgd_model.predict(X_test)

In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    mask = (y_true > 0) & (y_pred > 0)
    y_true = y_true[mask]
    y_pred = y_pred[mask]
    
    if len(y_true) == 0:  # If all values are non-positive, handle this case
        return 0  # Return a default value or handle it as needed
    
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 3.999977711762561e+18
Mean Squared Error (MSE): 2.1348136546582396e+37
Root Mean Squared Error (RMSE): 4.620404370461788e+18
Root Mean Squared Logarithmic Error (RMSLE): nan
R-squared (R2 score): -7.757689299973756e+33
Adjusted R-squared: -7.758659140448315e+33


In [18]:
from sklearn.linear_model import BayesianRidge

# Creating and fitting the Bayesian Ridge Regression model
bayesian_model = BayesianRidge()
bayesian_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = bayesian_model.predict(X_test)

In [19]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 7.39888827209841
Mean Squared Error (MSE): 2718.3437967723025
Root Mean Squared Error (RMSE): 52.13773870021889
Root Mean Squared Logarithmic Error (RMSLE): 0.5567364084905849
R-squared (R2 score): 0.012182326084732797
Adjusted R-squared: 0.012058832409670095


In [20]:
from catboost import CatBoostRegressor

# Creating and fitting the CatBoostRegressor model
catboost_model = CatBoostRegressor()
catboost_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = catboost_model.predict(X_test)

Learning rate set to 0.095299
0:	learn: 85.2597864	total: 158ms	remaining: 2m 37s
1:	learn: 83.8828356	total: 175ms	remaining: 1m 27s
2:	learn: 82.5537762	total: 194ms	remaining: 1m 4s
3:	learn: 81.5271153	total: 212ms	remaining: 52.7s
4:	learn: 80.2934610	total: 226ms	remaining: 45s
5:	learn: 79.0981410	total: 241ms	remaining: 39.9s
6:	learn: 78.8430337	total: 258ms	remaining: 36.6s
7:	learn: 77.6939330	total: 274ms	remaining: 34s
8:	learn: 76.8131247	total: 289ms	remaining: 31.8s
9:	learn: 75.7445403	total: 299ms	remaining: 29.6s
10:	learn: 75.2849020	total: 312ms	remaining: 28s
11:	learn: 74.2649829	total: 324ms	remaining: 26.7s
12:	learn: 73.2897077	total: 334ms	remaining: 25.4s
13:	learn: 72.5319235	total: 348ms	remaining: 24.5s
14:	learn: 71.6183714	total: 360ms	remaining: 23.6s
15:	learn: 70.7302207	total: 373ms	remaining: 23s
16:	learn: 69.8730984	total: 386ms	remaining: 22.3s
17:	learn: 69.0556959	total: 398ms	remaining: 21.7s
18:	learn: 68.2599395	total: 411ms	remaining: 21.2

163:	learn: 49.6239983	total: 2.08s	remaining: 10.6s
164:	learn: 49.6153433	total: 2.09s	remaining: 10.6s
165:	learn: 49.6076363	total: 2.1s	remaining: 10.5s
166:	learn: 49.6031913	total: 2.11s	remaining: 10.5s
167:	learn: 49.5709738	total: 2.12s	remaining: 10.5s
168:	learn: 49.5661821	total: 2.13s	remaining: 10.5s
169:	learn: 49.5628548	total: 2.13s	remaining: 10.4s
170:	learn: 49.5502766	total: 2.14s	remaining: 10.4s
171:	learn: 49.5457798	total: 2.15s	remaining: 10.4s
172:	learn: 49.5404681	total: 2.16s	remaining: 10.3s
173:	learn: 49.5102647	total: 2.17s	remaining: 10.3s
174:	learn: 49.5027402	total: 2.18s	remaining: 10.3s
175:	learn: 49.4932884	total: 2.19s	remaining: 10.3s
176:	learn: 49.4832100	total: 2.2s	remaining: 10.2s
177:	learn: 49.4823799	total: 2.21s	remaining: 10.2s
178:	learn: 49.4705881	total: 2.22s	remaining: 10.2s
179:	learn: 49.4395505	total: 2.23s	remaining: 10.2s
180:	learn: 49.4295919	total: 2.24s	remaining: 10.1s
181:	learn: 49.4264229	total: 2.25s	remaining: 1

331:	learn: 48.1563643	total: 3.73s	remaining: 7.5s
332:	learn: 48.1413600	total: 3.74s	remaining: 7.49s
333:	learn: 48.1316160	total: 3.75s	remaining: 7.48s
334:	learn: 48.1290482	total: 3.76s	remaining: 7.46s
335:	learn: 48.1214575	total: 3.77s	remaining: 7.45s
336:	learn: 48.1179326	total: 3.78s	remaining: 7.44s
337:	learn: 48.1089052	total: 3.79s	remaining: 7.43s
338:	learn: 48.1073249	total: 3.8s	remaining: 7.42s
339:	learn: 48.0997854	total: 3.81s	remaining: 7.41s
340:	learn: 48.0835243	total: 3.83s	remaining: 7.4s
341:	learn: 48.0735630	total: 3.84s	remaining: 7.39s
342:	learn: 48.0693158	total: 3.85s	remaining: 7.38s
343:	learn: 48.0623806	total: 3.87s	remaining: 7.37s
344:	learn: 48.0587764	total: 3.88s	remaining: 7.36s
345:	learn: 48.0533595	total: 3.89s	remaining: 7.34s
346:	learn: 48.0468163	total: 3.9s	remaining: 7.33s
347:	learn: 48.0394062	total: 3.91s	remaining: 7.32s
348:	learn: 48.0351635	total: 3.92s	remaining: 7.32s
349:	learn: 48.0195834	total: 3.94s	remaining: 7.3

488:	learn: 47.0891396	total: 5.6s	remaining: 5.86s
489:	learn: 47.0818364	total: 5.62s	remaining: 5.84s
490:	learn: 47.0758681	total: 5.63s	remaining: 5.83s
491:	learn: 47.0680409	total: 5.64s	remaining: 5.82s
492:	learn: 47.0619410	total: 5.65s	remaining: 5.81s
493:	learn: 47.0558340	total: 5.67s	remaining: 5.8s
494:	learn: 47.0521098	total: 5.68s	remaining: 5.79s
495:	learn: 47.0473333	total: 5.69s	remaining: 5.78s
496:	learn: 47.0387250	total: 5.7s	remaining: 5.77s
497:	learn: 47.0351314	total: 5.71s	remaining: 5.76s
498:	learn: 47.0208200	total: 5.72s	remaining: 5.75s
499:	learn: 47.0141585	total: 5.74s	remaining: 5.74s
500:	learn: 47.0067118	total: 5.75s	remaining: 5.73s
501:	learn: 46.9964006	total: 5.76s	remaining: 5.72s
502:	learn: 46.9903974	total: 5.78s	remaining: 5.71s
503:	learn: 46.9773062	total: 5.79s	remaining: 5.7s
504:	learn: 46.9723988	total: 5.8s	remaining: 5.68s
505:	learn: 46.9603479	total: 5.81s	remaining: 5.67s
506:	learn: 46.9491677	total: 5.82s	remaining: 5.66

652:	learn: 46.1007695	total: 7.47s	remaining: 3.97s
653:	learn: 46.0970003	total: 7.49s	remaining: 3.96s
654:	learn: 46.0897892	total: 7.5s	remaining: 3.95s
655:	learn: 46.0794456	total: 7.51s	remaining: 3.94s
656:	learn: 46.0675906	total: 7.52s	remaining: 3.92s
657:	learn: 46.0653960	total: 7.53s	remaining: 3.91s
658:	learn: 46.0578564	total: 7.54s	remaining: 3.9s
659:	learn: 46.0530245	total: 7.55s	remaining: 3.89s
660:	learn: 46.0480225	total: 7.56s	remaining: 3.88s
661:	learn: 46.0432957	total: 7.57s	remaining: 3.86s
662:	learn: 46.0390562	total: 7.58s	remaining: 3.85s
663:	learn: 46.0350113	total: 7.59s	remaining: 3.84s
664:	learn: 46.0331275	total: 7.6s	remaining: 3.83s
665:	learn: 46.0226840	total: 7.61s	remaining: 3.81s
666:	learn: 46.0204522	total: 7.62s	remaining: 3.8s
667:	learn: 46.0070857	total: 7.63s	remaining: 3.79s
668:	learn: 46.0008698	total: 7.64s	remaining: 3.78s
669:	learn: 45.9953082	total: 7.65s	remaining: 3.77s
670:	learn: 45.9906451	total: 7.66s	remaining: 3.7

823:	learn: 45.2087044	total: 9.37s	remaining: 2s
824:	learn: 45.2001435	total: 9.38s	remaining: 1.99s
825:	learn: 45.1966516	total: 9.39s	remaining: 1.98s
826:	learn: 45.1910689	total: 9.4s	remaining: 1.97s
827:	learn: 45.1841242	total: 9.42s	remaining: 1.96s
828:	learn: 45.1788206	total: 9.43s	remaining: 1.94s
829:	learn: 45.1726010	total: 9.44s	remaining: 1.93s
830:	learn: 45.1675460	total: 9.45s	remaining: 1.92s
831:	learn: 45.1610206	total: 9.46s	remaining: 1.91s
832:	learn: 45.1562737	total: 9.48s	remaining: 1.9s
833:	learn: 45.1517672	total: 9.49s	remaining: 1.89s
834:	learn: 45.1475947	total: 9.5s	remaining: 1.88s
835:	learn: 45.1430729	total: 9.51s	remaining: 1.86s
836:	learn: 45.1374384	total: 9.52s	remaining: 1.85s
837:	learn: 45.1347683	total: 9.54s	remaining: 1.84s
838:	learn: 45.1277616	total: 9.55s	remaining: 1.83s
839:	learn: 45.1215364	total: 9.56s	remaining: 1.82s
840:	learn: 45.1195265	total: 9.57s	remaining: 1.81s
841:	learn: 45.1150375	total: 9.58s	remaining: 1.8s


988:	learn: 44.3646340	total: 11.3s	remaining: 125ms
989:	learn: 44.3613662	total: 11.3s	remaining: 114ms
990:	learn: 44.3573057	total: 11.3s	remaining: 102ms
991:	learn: 44.3490212	total: 11.3s	remaining: 91ms
992:	learn: 44.3477710	total: 11.3s	remaining: 79.6ms
993:	learn: 44.3461655	total: 11.3s	remaining: 68.3ms
994:	learn: 44.3444485	total: 11.3s	remaining: 56.9ms
995:	learn: 44.3395147	total: 11.3s	remaining: 45.5ms
996:	learn: 44.3342195	total: 11.3s	remaining: 34.1ms
997:	learn: 44.3294894	total: 11.4s	remaining: 22.8ms
998:	learn: 44.3247225	total: 11.4s	remaining: 11.4ms
999:	learn: 44.3182926	total: 11.4s	remaining: 0us


In [22]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    mask = (y_true > 0) & (y_pred > 0)
    y_true = y_true[mask]
    y_pred = y_pred[mask]
    
    if len(y_true) == 0:  # If all values are non-positive, handle this case
        return 0  # Return a default value or handle it as needed
    
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 6.672727148997322
Mean Squared Error (MSE): 2876.008047937266
Root Mean Squared Error (RMSE): 53.628425745468846
Root Mean Squared Logarithmic Error (RMSLE): 0.490507381886914
R-squared (R2 score): -0.0451112119991155
Adjusted R-squared: -0.045241868321458156


In [23]:
from xgboost import XGBRegressor

# Creating and fitting the XGBoost Regressor model
xgb_model = XGBRegressor()
xgb_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = xgb_model.predict(X_test)

In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    mask = (y_true > 0) & (y_pred > 0)
    y_true = y_true[mask]
    y_pred = y_pred[mask]
    
    if len(y_true) == 0:  # If all values are non-positive, handle this case
        return 0  # Return a default value or handle it as needed
    
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 7.583902464901764
Mean Squared Error (MSE): 3537.627026675266
Root Mean Squared Error (RMSE): 59.477954123147725
Root Mean Squared Logarithmic Error (RMSLE): 0.490507381886914
R-squared (R2 score): -0.2855366215338426
Adjusted R-squared: -0.28569733504000183


In [8]:
import re

# Define a function to sanitize feature names
def sanitize_feature_names(feature_names):
    sanitized_names = []
    for name in feature_names:
        # Replace non-alphanumeric characters with underscores
        sanitized = re.sub(r'\W+', '_', name)
        # Ensure the name doesn't start with a number
        if re.match(r'^\d', sanitized):
            sanitized = '_' + sanitized
        sanitized_names.append(sanitized)
    return sanitized_names

# Sanitize feature names
X_train.columns = sanitize_feature_names(X_train.columns)


In [9]:
from lightgbm import LGBMRegressor

# Creating and fitting the LGBM Regressor model
lgbm_model = LGBMRegressor()
lgbm_model.fit(X_train, y_train)

# Making predictions on the test set
y_pred = lgbm_model.predict(X_test)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008429 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2179
[LightGBM] [Info] Number of data points in the train set: 210000, number of used features: 15
[LightGBM] [Info] Start training from score 15.852305


In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    mask = (y_true > 0) & (y_pred > 0)
    y_true = y_true[mask]
    y_pred = y_pred[mask]
    
    if len(y_true) == 0:  # If all values are non-positive, handle this case
        return 0  # Return a default value or handle it as needed
    
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
rmsle_val = rmsle(y_test, y_pred)

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 6.990165794133198
Mean Squared Error (MSE): 2917.1599871615995
Root Mean Squared Error (RMSE): 54.01073955392204
Root Mean Squared Logarithmic Error (RMSLE): 0.47445235974788597
R-squared (R2 score): -0.060065395840743996
Adjusted R-squared: -0.06019792168533655


In [1]:
#Second Set of Exp

In [7]:
#Polynomial Regression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Creating polynomial features
poly_degree = 3  # Define the degree of the polynomial
poly = PolynomialFeatures(degree=poly_degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Creating and fitting the Polynomial Regression model
poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)
y_pred = poly_model.predict(X_test_poly)

In [8]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 8.832486569300418
Mean Squared Error (MSE): 28214.08833702033
Root Mean Squared Error (RMSE): 167.9704984127282
Root Mean Squared Logarithmic Error (RMSLE): 0.6121489749885518
R-squared (R2 score): -9.252704292153155
Adjusted R-squared: -9.253986051090864


  return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))


In [9]:
#Ridge Regression
from sklearn.linear_model import Ridge

ridge_model = Ridge(alpha=1.0)  # You can adjust the alpha (regularization strength) as needed
ridge_model.fit(X_train, y_train)
y_pred = ridge_model.predict(X_test)

In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 7.385367541776701
Mean Squared Error (MSE): 2713.8488996146802
Root Mean Squared Error (RMSE): 52.09461488114371
Root Mean Squared Logarithmic Error (RMSLE): 0.5551813772960797
R-squared (R2 score): 0.013815724648962635
Adjusted R-squared: 0.013692435175947426


  return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))


In [12]:
#Principal Components Regression
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline

# Assuming X_train and X_test are your feature matrices, and y_train is the target variable

# Step 1: Scale the data (recommended before applying PCA)
# (You can use other scaling methods like StandardScaler or MinMaxScaler as needed)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 2: Perform PCA and then fit the model
n_components = 10  # Number of components (you can change this number)
pca = PCA(n_components=n_components)
linear_regression = LinearRegression()

# Create a pipeline to chain PCA and Linear Regression
pipeline = make_pipeline(pca, linear_regression)

# Fit the model using the pipeline
pipeline.fit(X_train_scaled, y_train)

# Predict using the test data
y_pred = pipeline.predict(X_test_scaled)

In [13]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 8.426113521421964
Mean Squared Error (MSE): 2707.727131280174
Root Mean Squared Error (RMSE): 52.035825459775054
Root Mean Squared Logarithmic Error (RMSLE): 0.6521126076695455
R-squared (R2 score): 0.016040311165068655
Adjusted R-squared: 0.01591729980244938


  return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))


In [14]:
#Lasso Regression
from sklearn.linear_model import Lasso

# Alpha (regularization strength) needs to be set. This is a hyperparameter that can be tuned.
alpha = 0.1  # You can change the value of alpha as needed

lasso_model = Lasso(alpha=alpha)
lasso_model.fit(X_train, y_train)
y_pred = lasso_model.predict(X_test)

In [15]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 7.399792651247587
Mean Squared Error (MSE): 2718.42768674855
Root Mean Squared Error (RMSE): 52.13854319741347
Root Mean Squared Logarithmic Error (RMSLE): 0.5568195240599011
R-squared (R2 score): 0.012151841345716452
Adjusted R-squared: 0.012028343859553314


In [5]:
#Ordinal Regression
from mord import LogisticIT
import pandas as pd

num_classes = 10  # You can change this based on your data or required number of classes
y_train_ordinal = pd.qcut(y_train, q=num_classes, labels=False)

ordinal_model = LogisticIT()
ordinal_model.fit(X_train, y_train_ordinal)
y_pred = ordinal_model.predict(X_test)


In [6]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 11.664683333333333
Mean Squared Error (MSE): 2855.47295
Root Mean Squared Error (RMSE): 53.43662554840079
Root Mean Squared Logarithmic Error (RMSLE): 1.3323076577690345
R-squared (R2 score): -0.03764897241702214
Adjusted R-squared: -0.03777869583502991


In [7]:
#Poisson Regression
import statsmodels.api as sm

# Assuming X_train and y_train are your training data
poisson_model = sm.GLM(y_train, X_train, family=sm.families.Poisson())
poisson_results = poisson_model.fit()

# Make predictions on the test data
y_pred = poisson_results.predict(X_test)

In [11]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 1.6442646048939778e+40
Mean Squared Error (MSE): 3.244327309088579e+85
Root Mean Squared Error (RMSE): 5.695899673527071e+42
Root Mean Squared Logarithmic Error (RMSLE): 0.6610646463096236
R-squared (R2 score): -1.178954575094205e+82
Adjusted R-squared: -1.1791019640679548e+82


In [22]:
#Negative Binomial Regression
import statsmodels.api as sm

# Assuming X_train and y_train are your training features and target respectively

# Fit the Negative Binomial Regression model
nb_model = sm.GLM(y_train, X_train, family=sm.families.NegativeBinomial())
nb_result = nb_model.fit()

# Predict using the test set
y_pred = nb_result.predict(X_test)

In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 1.6442646048939778e+40
Mean Squared Error (MSE): 3.244327309088579e+85
Root Mean Squared Error (RMSE): 5.695899673527071e+42
Root Mean Squared Logarithmic Error (RMSLE): 0.6610646463096236
R-squared (R2 score): -1.178954575094205e+82
Adjusted R-squared: -1.1791019640679548e+82


In [25]:
#Stepwise regression
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

def stepwise_selection(X, y, 
                       initial_list=[], 
                       threshold_in=0.01, 
                       threshold_out = 0.05, 
                       verbose=True):
    included = list(initial_list)
    while True:
        changed=False
        # forward step
        excluded = list(set(X.columns)-set(included))
        new_pval = pd.Series(index=excluded)
        for new_column in excluded:
            model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included+[new_column]]))).fit()
            new_pval[new_column] = model.pvalues[new_column]
        best_pval = new_pval.min()
        if best_pval < threshold_in:
            best_feature = new_pval.idxmin()
            included.append(best_feature)
            changed=True
            if verbose:
                print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))

        # backward step
        model = sm.OLS(y, sm.add_constant(pd.DataFrame(X[included]))).fit()
        # use all coefs except intercept
        pvalues = model.pvalues.iloc[1:]
        worst_pval = pvalues.max() # null if pvalues is empty
        if worst_pval > threshold_out:
            changed=True
            worst_feature = pvalues.idxmax()
            included.remove(worst_feature)
            if verbose:
                print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
        if not changed:
            break
    return included

# Assuming X_train and y_train are your training data
selected_features = stepwise_selection(X_train, y_train)

# After obtaining selected features, fit the model using sklearn LinearRegression
lr_model = LinearRegression()
lr_model.fit(X_train[selected_features], y_train)
y_pred = lr_model.predict(X_test[selected_features])

  new_pval = pd.Series(index=excluded)


Add  trip_distance                  with p-value 0.0


  new_pval = pd.Series(index=excluded)


Add  passenger_count                with p-value 4.31624e-06


  new_pval = pd.Series(index=excluded)


Add  dropoff_latitude               with p-value 7.85613e-05


  new_pval = pd.Series(index=excluded)


Add  longitude_difference           with p-value 0.00192423


  new_pval = pd.Series(index=excluded)


In [26]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 7.398881793755651
Mean Squared Error (MSE): 2718.215486272104
Root Mean Squared Error (RMSE): 52.13650819025095
Root Mean Squared Logarithmic Error (RMSLE): 0.5567265497304519
R-squared (R2 score): 0.0122289527770576
Adjusted R-squared: 0.012105464931108756


  return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))


In [14]:
#Decision Tree Regression 
from sklearn.tree import DecisionTreeRegressor

# Create a DecisionTreeRegressor model
dt_model = DecisionTreeRegressor()

# Fit the model with training data
dt_model.fit(X_train, y_train)

# Predict using the trained model
y_pred = dt_model.predict(X_test)

In [15]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 9.211533333333334
Mean Squared Error (MSE): 14628.6503
Root Mean Squared Error (RMSE): 120.94895741592815
Root Mean Squared Logarithmic Error (RMSLE): 0.528846799741553
R-squared (R2 score): -4.315898352895608
Adjusted R-squared: -4.31656292879984


In [16]:
#Random Forest Regression
from sklearn.ensemble import RandomForestRegressor

# Create a RandomForestRegressor model
rf_model = RandomForestRegressor()
rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)

In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 7.423382000000002
Mean Squared Error (MSE): 3729.342195331667
Root Mean Squared Error (RMSE): 61.06834036824373
Root Mean Squared Logarithmic Error (RMSLE): 0.5110228990118234
R-squared (R2 score): -0.35520390651129974
Adjusted R-squared: -0.3553733295893573


In [18]:
#K-Nearest Neighbors Regression
from sklearn.neighbors import KNeighborsRegressor

# Create a K-Nearest Neighbors Regression model
knn_model = KNeighborsRegressor(n_neighbors=5)  # You can set the number of neighbors (K) as needed

# Fit the KNN model with the training data
knn_model.fit(X_train, y_train)

# Predict using the KNN model
y_pred = knn_model.predict(X_test)


In [19]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 11.387129999999999
Mean Squared Error (MSE): 3175.184586
Root Mean Squared Error (RMSE): 56.34877626000409
Root Mean Squared Logarithmic Error (RMSLE): 0.7804062247978638
R-squared (R2 score): -0.15382883346776843
Adjusted R-squared: -0.15397308130499665


In [20]:
#Robust Regression
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Split your data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a RANSACRegressor model
ransac_model = RANSACRegressor(base_estimator=LinearRegression(), random_state=0)

# Fit the RANSAC model to the training data
ransac_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = ransac_model.predict(X_test)

In [21]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Calculate Root Mean Squared Error (RMSE)
rmse = np.sqrt(mse)

# Calculate R-squared (R2 score)
r2 = r2_score(y_test, y_pred)

# Calculate Root Mean Squared Logarithmic Error (RMSLE)
def rmsle(y_true, y_pred):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
rmsle_val = rmsle(y_test, y_pred)

# Calculate Adjusted R-squared
n = X_test.shape[0]  # Number of samples
p = X_test.shape[1]  # Number of features
adjusted_r2 = 1 - (1 - r2) * ((n - 1) / (n - p - 1))

# Print the metrics
print("Mean Absolute Error (MAE):", mae)
print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("Root Mean Squared Logarithmic Error (RMSLE):", rmsle_val)
print("R-squared (R2 score):", r2)
print("Adjusted R-squared:", adjusted_r2)

Mean Absolute Error (MAE): 6.831701106324882
Mean Squared Error (MSE): 15299.536967902046
Root Mean Squared Error (RMSE): 123.69129705804708
Root Mean Squared Logarithmic Error (RMSLE): 0.48550542242178957
R-squared (R2 score): 0.00559415286833409
Adjusted R-squared: 0.0054076644743180635


  return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))
