In [10]:
# Imports
import pymongo
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from datetime import datetime
import joblib

In [11]:
import sys

# Directory holding db_config.py; it must be importable for the
# `from db_config import ...` cell that follows.
CONFIG_DIR = 'd:\\GitHub\\NextTrip\\flask_LocationModel\\config'

# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if CONFIG_DIR not in sys.path:
    sys.path.append(CONFIG_DIR)

In [12]:
from db_config import DB_URL
import re

In [13]:
# Read the config module as plain text. Note that this rebinds DB_URL to the
# raw file contents, replacing whatever value the earlier import gave it.
with open('/GitHub/NEXTTRIP/flask_LocationModel/config/db_config.py', 'r') as databaseUrlFile:
    DB_URL = databaseUrlFile.read()

# The connection string is taken to be the first double-quoted literal in the file.
match = re.search(r'"([^"]*)"', DB_URL)

if match is None:
    # Fail here with a clear message instead of leaving `database_url`
    # undefined, which would surface later as a NameError when connecting.
    raise ValueError("No match found: db_config.py contains no double-quoted database URL")

database_url = match.group(1)

In [14]:
# Open the MongoDB connection using the URL extracted from the config file.
client = pymongo.MongoClient(host=database_url)

In [20]:
# Names of the database and collection that hold the ride records.
DB_NAME = 'NextTripDB'
COLLECTION_NAME = 'rides_data'

db = client[DB_NAME]
collection = db[COLLECTION_NAME]

In [21]:
# Fetch every document from the collection (an empty filter matches all of
# them), materialise the cursor as a list, and load it into a DataFrame.
cursor = collection.find({})
data = list(cursor)
data_df = pd.DataFrame(data)

# Preview the first rows.
data_df.head()

Unnamed: 0,_id,Date/Time,Lat,Lon
0,650db48407e33ccdd2398bf9,4/1/2014 0:11:00,40.769,-73.9549
1,650db48407e33ccdd2398bfa,4/1/2014 0:17:00,40.7267,-74.0345
2,650db48407e33ccdd2398bfb,4/1/2014 0:21:00,40.7316,-73.9873
3,650db48407e33ccdd2398bfc,4/1/2014 0:28:00,40.7588,-73.9776
4,650db48407e33ccdd2398bfd,4/1/2014 0:33:00,40.7594,-73.9722


In [17]:
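# Alternative data source (currently disabled): load the rides from the local CSV instead of MongoDB.
# Note: this assigns `data`, not `data_df`, so it would not feed the cells below as written.
# data = pd.read_csv('/GitHub/NEXTTRIP/flask_LocationModel/data/uber-raw-data-apr14.csv')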

In [22]:
# Preprocessing: convert the 'Date/Time' column to pandas datetimes so the
# `.dt` accessor can be used on it afterwards.
parsed_timestamps = pd.to_datetime(data_df['Date/Time'])
data_df['Date/Time'] = parsed_timestamps

In [23]:
# Feature engineering: derive the two model inputs from the timestamp.
timestamps = data_df['Date/Time']
data_df['weekday'] = timestamps.dt.weekday  # Monday == 0 ... Sunday == 6
data_df['hour'] = timestamps.dt.hour

In [24]:
# Separate the inputs (X) from the two regression targets: latitude and
# longitude are each predicted by their own model.
feature_columns = ['weekday', 'hour']
X = data_df[feature_columns]
y_lat, y_lon = data_df['Lat'], data_df['Lon']

In [25]:
# Hold out 20% of the rows for evaluation. Passing X and both targets in one
# call keeps the three splits row-aligned; the fixed seed makes it repeatable.
(
    X_train, X_test,
    y_lat_train, y_lat_test,
    y_lon_train, y_lon_test,
) = train_test_split(
    X,
    y_lat,
    y_lon,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Model Selection and Training
# One random forest per coordinate, both fitted on the same training features.
# Random forests are stochastic; a fixed seed (the same value used for the
# train/test split) makes the fitted models and their metrics reproducible.
RANDOM_STATE = 42

model_lat = RandomForestRegressor(random_state=RANDOM_STATE)
model_lat.fit(X_train, y_lat_train)

model_lon = RandomForestRegressor(random_state=RANDOM_STATE)
model_lon.fit(X_train, y_lon_train)

In [27]:
# Model Evaluation
# Score both regressors on the held-out split.
y_lat_pred = model_lat.predict(X_test)
y_lon_pred = model_lon.predict(X_test)

mse_lat = mean_squared_error(y_lat_test, y_lat_pred)
mse_lon = mean_squared_error(y_lon_test, y_lon_pred)

# RMSE is the square root of the MSE computed above. Deriving it here avoids
# the `squared=False` argument, which scikit-learn deprecated (1.4) and later
# removed (1.6), and avoids computing the error a second time.
rmse_lat = mse_lat ** 0.5
rmse_lon = mse_lon ** 0.5

print('Latitude RMSE:', rmse_lat)
print('Longitude RMSE:', rmse_lon)

Latitude RMSE: 0.035844013391631055
Longitude RMSE: 0.05092007337037105


In [28]:
# Make Predictions
# Example input: derive the same two features the models were trained on.
new_datetime = datetime(year=2023, month=8, day=21, hour=1, minute=30)
new_weekday, new_hour = new_datetime.weekday(), new_datetime.hour
print('Weekday:', new_weekday)
print('Hour:', new_hour)

# Single-row frame with the training column names, in the training order.
new_X = pd.DataFrame([{'weekday': new_weekday, 'hour': new_hour}])
predicted_lat = model_lat.predict(new_X)
predicted_lon = model_lon.predict(new_X)

# predict() returns one value per input row; there is exactly one row here.
print('Predicted Latitude:', predicted_lat[0])
print('Predicted Longitude:', predicted_lon[0])

Weekday: 0
Hour: 1
Predicted Latitude: 40.733712068785316
Predicted Longitude: -73.97391767505452


In [29]:
# Persist both fitted models to disk with joblib.
lat_model_path = '/GitHub/NEXTTRIP/flask_LocationModel/models/lat_model.pkl'
lon_model_path = '/GitHub/NEXTTRIP/flask_LocationModel/models/lon_model.pkl'

joblib.dump(model_lat, lat_model_path)
joblib.dump(model_lon, lon_model_path)

['/GitHub/NEXTTRIP/flask_LocationModel/models/lon_model.pkl']

In [30]:
# Save the training column names (and their order) alongside the models.
rnd_columns = X_train.columns.tolist()
columns_path = '/GitHub/NEXTTRIP/flask_LocationModel/models/rnd_columns.pkl'
joblib.dump(rnd_columns, columns_path)

['/GitHub/NEXTTRIP/flask_LocationModel/models/rnd_columns.pkl']