In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings('ignore')
import joblib


In [3]:
df= pd.read_csv(r"C:\Users\ahijazi\Desktop\final_internship_data\final_internship_data.csv")

In [4]:
categorical_columns = ['User ID', 'User Name', 'Driver Name', 'Car Condition', 'Weather', 'Traffic Condition']
for column in categorical_columns:
    df[column].fillna(df[column].mode()[0], inplace=True)

# Fill missing values for numerical columns with mean
numerical_columns = ['fare_amount', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'hour', 'day', 'month', 'weekday', 'year', 'jfk_dist', 'ewr_dist', 'lga_dist', 'sol_dist', 'nyc_dist', 'distance', 'bearing']
for column in numerical_columns:
    df[column].fillna(df[column].mean(), inplace=True)

In [5]:
df.isnull().sum()

User ID              0
User Name            0
Driver Name          0
Car Condition        0
Weather              0
Traffic Condition    0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
hour                 0
day                  0
month                0
weekday              0
year                 0
jfk_dist             0
ewr_dist             0
lga_dist             0
sol_dist             0
nyc_dist             0
distance             0
bearing              0
dtype: int64

In [11]:
print(df.dtypes)

User ID               object
User Name             object
Driver Name           object
Car Condition         object
Weather               object
Traffic Condition     object
fare_amount          float64
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
hour                   int64
day                    int64
month                  int64
weekday                int64
year                   int64
jfk_dist             float64
ewr_dist             float64
lga_dist             float64
sol_dist             float64
nyc_dist             float64
distance             float64
bearing              float64
period                object
dtype: object


In [6]:
df.drop(columns=['pickup_datetime','key'],inplace=True)

In [7]:
df

Unnamed: 0,User ID,User Name,Driver Name,Car Condition,Weather,Traffic Condition,key,fare_amount,pickup_longitude,pickup_latitude,...,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing
0,KHVrEVlD,Kimberly Adams,Amy Butler,Very Good,windy,Congested Traffic,2009-06-15 17:26:21.0000001,4.5,-1.288826,0.710721,...,6,0,2009,20.265840,55.176046,14.342611,34.543548,27.572573,1.030764,-2.918897
1,lPxIuEri,Justin Tapia,Hannah Zimmerman,Excellent,cloudy,Flow Traffic,2010-01-05 16:52:16.0000002,16.9,-1.291824,0.710546,...,1,1,2010,44.667679,31.832358,23.130775,15.125872,8.755732,8.450134,-0.375217
2,gsVN8JLS,Elizabeth Lopez,Amanda Jackson,Bad,stormy,Congested Traffic,2011-08-18 00:35:00.00000049,5.7,-1.291242,0.711418,...,8,3,2011,43.597686,33.712082,19.865289,17.722624,9.847344,1.389525,2.599961
3,9I7kWFgd,Steven Wilson,Amy Horn,Very Good,stormy,Flow Traffic,2012-04-21 04:30:42.0000001,7.7,-1.291319,0.710927,...,4,5,2012,42.642965,32.556289,21.063132,15.738963,7.703421,2.799270,0.133905
4,8QN5ZaGN,Alexander Andrews,Cassandra Larson,Bad,stormy,Congested Traffic,2010-03-09 07:51:00.000000135,5.3,-1.290987,0.711536,...,3,1,2010,43.329953,39.406828,15.219339,23.732406,15.600745,1.999157,-0.502703
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,er7Luy6J,Morgan Smith,Gabriel Coleman,Bad,rainy,Dense Traffic,2015-05-07 18:45:12.0000004,7.0,-1.291173,0.711514,...,5,3,2015,42.565403,36.580043,17.157511,20.401068,12.280187,1.404709,-2.275706
499996,FWDZu9NA,Dillon Jackson,Larry Richards,Very Good,rainy,Flow Traffic,2010-09-13 12:11:34.0000004,13.7,-1.291595,0.710670,...,9,0,2010,41.814877,27.833416,26.463167,8.596656,0.994796,0.994184,2.556931
499997,QZIMcpLZ,Michael Rodriguez,Lonnie Santana,Bad,rainy,Flow Traffic,2014-08-25 00:22:20.0000001,25.0,-1.291262,0.710796,...,8,0,2014,31.780108,38.960930,19.708214,19.066488,11.663395,7.859324,-1.926251
499998,AD72Uwmn,Deborah Soto,Emily Rivera,Good,stormy,Congested Traffic,2015-01-12 12:17:32.0000001,6.5,-1.291101,0.711335,...,1,0,2015,42.292914,36.382813,17.227166,20.006433,11.819775,0.994470,-0.384817


In [9]:
df['period'] = df['hour'].apply(lambda x: 'AM' if x < 12 else 'PM')
df

Unnamed: 0,User ID,User Name,Driver Name,Car Condition,Weather,Traffic Condition,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,...,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing,period
0,KHVrEVlD,Kimberly Adams,Amy Butler,Very Good,windy,Congested Traffic,4.5,-1.288826,0.710721,-1.288779,...,0,2009,20.265840,55.176046,14.342611,34.543548,27.572573,1.030764,-2.918897,PM
1,lPxIuEri,Justin Tapia,Hannah Zimmerman,Excellent,cloudy,Flow Traffic,16.9,-1.291824,0.710546,-1.291182,...,1,2010,44.667679,31.832358,23.130775,15.125872,8.755732,8.450134,-0.375217,PM
2,gsVN8JLS,Elizabeth Lopez,Amanda Jackson,Bad,stormy,Congested Traffic,5.7,-1.291242,0.711418,-1.291391,...,3,2011,43.597686,33.712082,19.865289,17.722624,9.847344,1.389525,2.599961,AM
3,9I7kWFgd,Steven Wilson,Amy Horn,Very Good,stormy,Flow Traffic,7.7,-1.291319,0.710927,-1.291396,...,5,2012,42.642965,32.556289,21.063132,15.738963,7.703421,2.799270,0.133905,AM
4,8QN5ZaGN,Alexander Andrews,Cassandra Larson,Bad,stormy,Congested Traffic,5.3,-1.290987,0.711536,-1.290787,...,1,2010,43.329953,39.406828,15.219339,23.732406,15.600745,1.999157,-0.502703,AM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,er7Luy6J,Morgan Smith,Gabriel Coleman,Bad,rainy,Dense Traffic,7.0,-1.291173,0.711514,-1.290951,...,3,2015,42.565403,36.580043,17.157511,20.401068,12.280187,1.404709,-2.275706,PM
499996,FWDZu9NA,Dillon Jackson,Larry Richards,Very Good,rainy,Flow Traffic,13.7,-1.291595,0.710670,-1.291708,...,0,2010,41.814877,27.833416,26.463167,8.596656,0.994796,0.994184,2.556931,PM
499997,QZIMcpLZ,Michael Rodriguez,Lonnie Santana,Bad,rainy,Flow Traffic,25.0,-1.291262,0.710796,-1.289737,...,0,2014,31.780108,38.960930,19.708214,19.066488,11.663395,7.859324,-1.926251,AM
499998,AD72Uwmn,Deborah Soto,Emily Rivera,Good,stormy,Congested Traffic,6.5,-1.291101,0.711335,-1.291023,...,0,2015,42.292914,36.382813,17.227166,20.006433,11.819775,0.994470,-0.384817,PM


In [12]:
df.drop(columns=['User ID','User Name','Driver Name'],inplace=True)

In [13]:
df

Unnamed: 0,Car Condition,Weather,Traffic Condition,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,...,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing,period
0,Very Good,windy,Congested Traffic,4.5,-1.288826,0.710721,-1.288779,0.710563,1,17,...,0,2009,20.265840,55.176046,14.342611,34.543548,27.572573,1.030764,-2.918897,PM
1,Excellent,cloudy,Flow Traffic,16.9,-1.291824,0.710546,-1.291182,0.711780,1,16,...,1,2010,44.667679,31.832358,23.130775,15.125872,8.755732,8.450134,-0.375217,PM
2,Bad,stormy,Congested Traffic,5.7,-1.291242,0.711418,-1.291391,0.711231,2,0,...,3,2011,43.597686,33.712082,19.865289,17.722624,9.847344,1.389525,2.599961,AM
3,Very Good,stormy,Flow Traffic,7.7,-1.291319,0.710927,-1.291396,0.711363,1,4,...,5,2012,42.642965,32.556289,21.063132,15.738963,7.703421,2.799270,0.133905,AM
4,Bad,stormy,Congested Traffic,5.3,-1.290987,0.711536,-1.290787,0.711811,1,7,...,1,2010,43.329953,39.406828,15.219339,23.732406,15.600745,1.999157,-0.502703,AM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,Bad,rainy,Dense Traffic,7.0,-1.291173,0.711514,-1.290951,0.711371,1,18,...,3,2015,42.565403,36.580043,17.157511,20.401068,12.280187,1.404709,-2.275706,PM
499996,Very Good,rainy,Flow Traffic,13.7,-1.291595,0.710670,-1.291708,0.710540,1,12,...,0,2010,41.814877,27.833416,26.463167,8.596656,0.994796,0.994184,2.556931,PM
499997,Bad,rainy,Flow Traffic,25.0,-1.291262,0.710796,-1.289737,0.710366,1,0,...,0,2014,31.780108,38.960930,19.708214,19.066488,11.663395,7.859324,-1.926251,AM
499998,Good,stormy,Congested Traffic,6.5,-1.291101,0.711335,-1.291023,0.711480,6,12,...,0,2015,42.292914,36.382813,17.227166,20.006433,11.819775,0.994470,-0.384817,PM


In [15]:
df_complete=pd.get_dummies(df[['Car Condition','Traffic Condition','Weather','period']],sparse=False,prefix='category').astype(int)
df_complete

Unnamed: 0,category_Bad,category_Excellent,category_Good,category_Very Good,category_Congested Traffic,category_Dense Traffic,category_Flow Traffic,category_cloudy,category_rainy,category_stormy,category_sunny,category_windy,category_AM,category_PM
0,0,0,0,1,1,0,0,0,0,0,0,1,0,1
1,0,1,0,0,0,0,1,1,0,0,0,0,0,1
2,1,0,0,0,1,0,0,0,0,1,0,0,1,0
3,0,0,0,1,0,0,1,0,0,1,0,0,1,0
4,1,0,0,0,1,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,1,0,0,0,0,1,0,0,1,0,0,0,0,1
499996,0,0,0,1,0,0,1,0,1,0,0,0,0,1
499997,1,0,0,0,0,0,1,0,1,0,0,0,1,0
499998,0,0,1,0,1,0,0,0,0,1,0,0,0,1


In [16]:
df.drop(columns=['Car Condition','Traffic Condition','Weather','period'],inplace=True)

In [17]:
joindf=df.join(df_complete)

In [18]:
pd.set_option('display.max_columns', None)
joindf.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,weekday,year,jfk_dist,ewr_dist,lga_dist,sol_dist,nyc_dist,distance,bearing,category_Bad,category_Excellent,category_Good,category_Very Good,category_Congested Traffic,category_Dense Traffic,category_Flow Traffic,category_cloudy,category_rainy,category_stormy,category_sunny,category_windy,category_AM,category_PM
0,4.5,-1.288826,0.710721,-1.288779,0.710563,1,17,15,6,0,2009,20.26584,55.176046,14.342611,34.543548,27.572573,1.030764,-2.918897,0,0,0,1,1,0,0,0,0,0,0,1,0,1
1,16.9,-1.291824,0.710546,-1.291182,0.71178,1,16,5,1,1,2010,44.667679,31.832358,23.130775,15.125872,8.755732,8.450134,-0.375217,0,1,0,0,0,0,1,1,0,0,0,0,0,1
2,5.7,-1.291242,0.711418,-1.291391,0.711231,2,0,18,8,3,2011,43.597686,33.712082,19.865289,17.722624,9.847344,1.389525,2.599961,1,0,0,0,1,0,0,0,0,1,0,0,1,0
3,7.7,-1.291319,0.710927,-1.291396,0.711363,1,4,21,4,5,2012,42.642965,32.556289,21.063132,15.738963,7.703421,2.79927,0.133905,0,0,0,1,0,0,1,0,0,1,0,0,1,0
4,5.3,-1.290987,0.711536,-1.290787,0.711811,1,7,9,3,1,2010,43.329953,39.406828,15.219339,23.732406,15.600745,1.999157,-0.502703,1,0,0,0,1,0,0,0,0,1,0,0,1,0


In [19]:
X = joindf.drop('fare_amount', axis=1)
y = joindf['fare_amount']

In [26]:
from sklearn.linear_model import LassoCV

lasso = LassoCV(cv=5, random_state=42)
lasso.fit(X, y)

# Get selected feature names
selected_features = X.columns[lasso.coef_ != 0]
print(selected_features)

Index(['hour', 'day', 'month', 'year', 'jfk_dist', 'ewr_dist', 'lga_dist',
       'sol_dist', 'nyc_dist', 'distance', 'bearing'],
      dtype='object')


In [22]:
final_df=joindf[['hour', 'day', 'month', 'year', 'jfk_dist', 'ewr_dist', 'lga_dist',
       'sol_dist', 'nyc_dist', 'distance', 'bearing']]

In [23]:
X_train, X_test, y_train, y_test = train_test_split(final_df, y, test_size=0.2, random_state=40)

In [27]:
from sklearn.ensemble import RandomForestRegressor

# Initialize the model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model
rf.fit(X_train, y_train)


In [28]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split


# Predict and evaluate
from sklearn.metrics import mean_squared_error, r2_score

y_pred = rf.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")


Mean Squared Error: 21.419383632555363
R^2 Score: 0.7795946939300936


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV with RandomForestRegressor
grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=42), param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print("Best Parameters: ", best_params)

# Predict and evaluate with the best model
y_pred_best = best_model.predict(X_test)

mse_best = mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)

print(f"Mean Squared Error with Best Model: {mse_best}")
print(f"R^2 Score with Best Model: {r2_best}")


Fitting 3 folds for each of 36 candidates, totalling 108 fits
