In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from datetime import datetime
import joblib

In [7]:
# Read the raw Uber CSV export into a DataFrame
data = pd.read_csv(
    filepath_or_buffer='/GitHub/uberDataApp/flask-backend/data/uber-raw-data-apr14.csv',
)

In [8]:
# Convert the 'Date/Time' column to pandas datetimes (overwrites the column on `data`)
data['Date/Time'] = pd.to_datetime(
    arg=data['Date/Time'],
)

In [9]:
# Derive the two model features from the parsed timestamp:
#   weekday -> day of week as an integer
#   hour    -> hour of day as an integer
data = data.assign(
    weekday=data['Date/Time'].dt.weekday,
    hour=data['Date/Time'].dt.hour,
)

In [10]:
# Feature matrix (weekday, hour) and the two regression targets (latitude, longitude)
X = data.loc[:, ['weekday', 'hour']]
y_lat, y_lon = data['Lat'], data['Lon']

In [11]:
# Hold out 20% of the rows for evaluation; one call splits the features and both
# targets together so their rows stay aligned. The seed makes the split repeatable.
(
    X_train, X_test,
    y_lat_train, y_lat_test,
    y_lon_train, y_lon_test,
) = train_test_split(
    X,
    y_lat,
    y_lon,
    random_state=42,
    test_size=0.2,
)

In [12]:
# Model Selection and Training
# One random forest per coordinate, both fitted on the same (weekday, hour) features.
# random_state is pinned (same seed as the train/test split) because forest training is
# randomized: without a seed, every run yields different trees, different RMSE values
# and different pickled models, so the notebook would not be reproducible.
model_lat = RandomForestRegressor(random_state=42)
model_lat.fit(X_train, y_lat_train)

model_lon = RandomForestRegressor(random_state=42)
model_lon.fit(X_train, y_lon_train)

RandomForestRegressor()

In [13]:
# Model Evaluation
# Predict both coordinates for the held-out rows.
y_lat_pred = model_lat.predict(X_test)
y_lon_pred = model_lon.predict(X_test)

# Mean squared error per coordinate.
mse_lat = mean_squared_error(y_lat_test, y_lat_pred)
mse_lon = mean_squared_error(y_lon_test, y_lon_pred)

# RMSE is the square root of the MSE computed above. Deriving it here instead of calling
# mean_squared_error(..., squared=False) avoids the `squared` keyword, which was
# deprecated in scikit-learn 1.4 and removed in 1.6, so this cell runs on old and new
# versions alike and does not recompute the error a second time.
rmse_lat = mse_lat ** 0.5
rmse_lon = mse_lon ** 0.5

print('Latitude RMSE:', rmse_lat)
print('Longitude RMSE:', rmse_lon)

Latitude RMSE: 0.035843777987688175
Longitude RMSE: 0.05092111350724933


In [14]:
# Make Predictions
# Example input: build a single-row feature frame for one timestamp, using the same
# column names the models were trained on, then predict each coordinate.
new_datetime = datetime(year=2014, month=4, day=16, hour=1, minute=30)
new_weekday, new_hour = new_datetime.weekday(), new_datetime.hour

new_X = pd.DataFrame(
    [[new_weekday, new_hour]],
    columns=['weekday', 'hour'],
)
predicted_lat = model_lat.predict(new_X)
predicted_lon = model_lon.predict(new_X)

# predict() returns one value per input row; there is a single row here.
print('Predicted Latitude:', predicted_lat[0])
print('Predicted Longitude:', predicted_lon[0])

Predicted Latitude: 40.736726821714186
Predicted Longitude: -73.97932807375841


In [15]:
# Serialize both fitted regressors to disk with joblib (one pickle file per coordinate)
joblib.dump(
    value=model_lat,
    filename='/GitHub/uberDataApp/flask-backend/models/lat_model.pkl',
)
joblib.dump(
    value=model_lon,
    filename='/GitHub/uberDataApp/flask-backend/models/lon_model.pkl',
)

['/GitHub/uberDataApp/flask-backend/models/lon_model.pkl']

In [17]:
# Store the training feature names, in training order, alongside the models
rnd_columns = [*X_train.columns]
joblib.dump(
    value=rnd_columns,
    filename='/GitHub/uberDataApp/flask-backend/models/rnd_columns.pkl',
)

['/GitHub/uberDataApp/flask-backend/models/rnd_columns.pkl']