In [1]:
# Imports
import pymongo
import pandas as pd
import joblib

In [17]:
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [3]:
# import sys
# sys.path.append('d:\\GitHub\\NextTrip\\flask_LocationModel\\config')

In [4]:
import re

In [5]:
with open('/GitHub/NEXTTRIP/flask_LocationModel/config/db_config.py', 'r') as databaseUrlFile:
    DB_URL = databaseUrlFile.read()
DB_URL

match = re.search(r'"([^"]*)"', DB_URL)

if match:
    database_url = match.group(1)
else:
    print("No match found")

In [6]:
client = pymongo.MongoClient(database_url)

In [7]:
db = client['NextTripDB']
collection = db['rides_data']

In [8]:
# Retrieve the JSON data from MongoDB
data = list(collection.find({}))  
data_df = pd.DataFrame(data)
data_df.head()

Unnamed: 0,_id,Date/Time,Lat,Lon,__v
0,650db48407e33ccdd2398bfa,4/1/2014 0:17:00,40.7267,-74.0345,
1,650db48407e33ccdd2398bfe,4/1/2014 0:33:00,40.7383,-74.0403,
2,650db48407e33ccdd2398bff,4/1/2014 0:39:00,40.7223,-73.9887,
3,650db48407e33ccdd2398c06,4/1/2014 2:11:00,40.6463,-73.7896,
4,650db48407e33ccdd2398c09,4/1/2014 2:43:00,40.758,-73.9761,


In [9]:
# Load the dataset
# data = pd.read_csv('/GitHub/NEXTTRIP/flask_LocationModel/data/uber-raw-data-apr14.csv')

In [10]:
# Preprocess the data
data_df['Date/Time'] = pd.to_datetime(data_df['Date/Time'])

In [11]:
# Feature Engineering
data_df['weekday'] = data_df['Date/Time'].dt.weekday
data_df['hour'] = data_df['Date/Time'].dt.hour

In [12]:
dateTime_data = data_df[['weekday', 'hour']]
location_data = data_df[['Lat', 'Lon']]

In [13]:
num_clusters = 100

In [14]:
# Apply K-means clustering on the location data
kmeans = KMeans(n_clusters=num_clusters)
data_df['cluster'] = kmeans.fit_predict(location_data)

  super()._check_params_vs_input(X, default_n_init=10)


In [15]:
data_df.head()

Unnamed: 0,_id,Date/Time,Lat,Lon,__v,weekday,hour,cluster
0,650db48407e33ccdd2398bfa,2014-04-01 00:17:00,40.7267,-74.0345,,1,0,90
1,650db48407e33ccdd2398bfe,2014-04-01 00:33:00,40.7383,-74.0403,,1,0,39
2,650db48407e33ccdd2398bff,2014-04-01 00:39:00,40.7223,-73.9887,,1,0,74
3,650db48407e33ccdd2398c06,2014-04-01 02:11:00,40.6463,-73.7896,,1,2,82
4,650db48407e33ccdd2398c09,2014-04-01 02:43:00,40.758,-73.9761,,1,2,43


In [23]:
# train a model for each cluster
models= {}
rmses_lat = []
rmses_lon = []

for cluster_id in range(num_clusters):
    print('Training model for cluster: {}'.format(cluster_id))
    cluster_data = data_df[data_df['cluster'] == cluster_id]

    X = cluster_data[['weekday', 'hour']]
    y_lat = cluster_data['Lat']
    y_lon = cluster_data['Lon']

    if len(cluster_data) < 2:
        print(f"Skipping splitting data in cluster {cluster_id} due to insufficient data points.")

        model_lat = RandomForestRegressor()
        model_lat.fit(X, y_lat)

        model_lon = RandomForestRegressor()
        model_lon.fit(X, y_lon)

        y_lat_pred = model_lat.predict(X)
        y_lon_pred = model_lon.predict(X)

        rmse_lat = mean_squared_error(y_lat, y_lat_pred, squared=False)
        rmse_lon = mean_squared_error(y_lon, y_lon_pred, squared=False)

        rmses_lat.append(rmse_lat)
        rmses_lon.append(rmse_lon)

    else :

        X_train, X_test, y_lat_train, y_lat_test, y_lon_train, y_lon_test = train_test_split(X, y_lat, y_lon, test_size=0.2, random_state=42)

        model_lat = RandomForestRegressor()
        model_lat.fit(X_train, y_lat_train)

        model_lon = RandomForestRegressor()
        model_lon.fit(X_train, y_lon_train)

        y_lat_pred = model_lat.predict(X_test)
        y_lon_pred = model_lon.predict(X_test)

        mse_lat = mean_squared_error(y_lat_test, y_lat_pred)
        mse_lon = mean_squared_error(y_lon_test, y_lon_pred)

        rmse_lat = mean_squared_error(y_lat_test, y_lat_pred, squared=False)
        rmse_lon = mean_squared_error(y_lon_test, y_lon_pred, squared=False)

        rmses_lat.append(rmse_lat)
        rmses_lon.append(rmse_lon)

    joblib.dump(model_lat, f'/GitHub/NEXTTRIP/flask_LocationModel/models/clusters/cluster_{cluster_id}_lat_model.pkl')
    joblib.dump(model_lon, f'/GitHub/NEXTTRIP/flask_LocationModel/models/clusters/cluster_{cluster_id}_lon_model.pkl')
    
    


Training model for cluster: 0
Training model for cluster: 1
Training model for cluster: 2
Training model for cluster: 3
Training model for cluster: 4
Training model for cluster: 5
Training model for cluster: 6
Training model for cluster: 7
Training model for cluster: 8
Training model for cluster: 9
Training model for cluster: 10
Training model for cluster: 11
Training model for cluster: 12
Training model for cluster: 13
Training model for cluster: 14
Training model for cluster: 15
Training model for cluster: 16
Training model for cluster: 17
Training model for cluster: 18
Training model for cluster: 19
Training model for cluster: 20
Training model for cluster: 21
Training model for cluster: 22
Training model for cluster: 23
Training model for cluster: 24
Training model for cluster: 25
Training model for cluster: 26
Training model for cluster: 27
Training model for cluster: 28
Training model for cluster: 29
Training model for cluster: 30
Training model for cluster: 31
Training model for

In [24]:
rmses_lat_avg = sum(rmses_lat) / len(rmses_lat)
rmses_lon_avg = sum(rmses_lon) / len(rmses_lon)
print(f"Average RMSE for latitude: {rmses_lat_avg}")
print(f"Average RMSE for longitude: {rmses_lon_avg}")

Average RMSE for latitude: 0.022379436806737547
Average RMSE for longitude: 0.021696016348763895


In [25]:
# Persist the KMeans model
joblib.dump(kmeans, '/GitHub/NEXTTRIP/flask_LocationModel/models/kmeans_model.pkl')

['/GitHub/NEXTTRIP/flask_LocationModel/models/kmeans_model.pkl']

In [None]:
#Save model columns
# rnd_columns = list(X_train.columns)
# joblib.dump(rnd_columns, '/GitHub/NEXTTRIP/flask_LocationModel/models/rnd_columns.pkl')