In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import median_absolute_error

Step 1: Load and Inspect the Dataset
Load the dataset and display the first few rows to understand its structure.

In [2]:
file_path = '/home/antqua/code/AntQua/ET_Predictor/raw_data/scrubbed.csv'
data = pd.read_csv(file_path, low_memory=False)

data.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [3]:
print(data.shape)
print(data.columns)
print(data.dtypes)

(80332, 11)
Index(['datetime', 'city', 'state', 'country', 'shape', 'duration (seconds)',
       'duration (hours/min)', 'comments', 'date posted', 'latitude',
       'longitude '],
      dtype='object')
datetime                 object
city                     object
state                    object
country                  object
shape                    object
duration (seconds)       object
duration (hours/min)     object
comments                 object
date posted              object
latitude                 object
longitude               float64
dtype: object


The dataset has several columns, including datetime, city, state, country, shape, duration (seconds), duration (hours/min), comments, date posted, latitude, and longitude. Our focus will be on the datetime and duration (seconds) columns for features, and latitude and longitude for the target variables.

Step 2: Data Preparation

Convert the datetime column to numerical format and split it into separate features
We'll convert the datetime column to datetime objects and then extract the year, month, day, hour, and minute as separate features.

Handle Missing Values
We'll check for and handle any missing values in the relevant columns (datetime, duration (seconds), latitude, and longitude).

In [4]:
# Strip whitespace from all column names
data.columns = data.columns.str.strip()

# Convert latitude and longitude to numeric, forcing invalid parsing to NaN
data['latitude'] = pd.to_numeric(data['latitude'], errors='coerce')
data['longitude'] = pd.to_numeric(data['longitude'], errors='coerce')

# Convert duration (seconds) to numeric, forcing invalid parsing to NaN
data['duration (seconds)'] = pd.to_numeric(data['duration (seconds)'], errors='coerce')

# Convert datetime column to datetime object
data['datetime'] = pd.to_datetime(data['datetime'], errors='coerce')

# Drop rows where datetime conversion failed
data = data.dropna(subset=['datetime'])

# Drop rows with NaN 
data = data.dropna()

print(data.shape)
print(data.columns)
# Verify the conversion
print(data.dtypes)

(66040, 11)
Index(['datetime', 'city', 'state', 'country', 'shape', 'duration (seconds)',
       'duration (hours/min)', 'comments', 'date posted', 'latitude',
       'longitude'],
      dtype='object')
datetime                datetime64[ns]
city                            object
state                           object
country                         object
shape                           object
duration (seconds)             float64
duration (hours/min)            object
comments                        object
date posted                     object
latitude                       float64
longitude                      float64
dtype: object


Step 3: Feature Engineering

We'll use the extracted datetime features and the duration (seconds) for training the model.

In [5]:
column_names = data.columns
column_names

Index(['datetime', 'city', 'state', 'country', 'shape', 'duration (seconds)',
       'duration (hours/min)', 'comments', 'date posted', 'latitude',
       'longitude'],
      dtype='object')

In [6]:
us_data = data[data['country'] == 'us']

# Save the filtered data to a new CSV file
# us_only_file_path = '/mnt/data/us_only_scrubbed.csv'
# us_data.to_csv(us_only_file_path, index=False)

#import ace_tools as tools; tools.display_dataframe_to_user(name="US Only Scrubbed Dataset", dataframe=us_data)
#us_only_file_path

us_data.shape

(63099, 11)

In [7]:
# Extract year, month, day, hour, and minute from datetime
us_data['year'] = us_data['datetime'].dt.year
us_data['month'] = us_data['datetime'].dt.month
us_data['day'] = us_data['datetime'].dt.day
us_data['hour'] = us_data['datetime'].dt.hour
us_data['minute'] = us_data['datetime'].dt.minute

# Specify the columns to keep
columns_to_keep = ['datetime', 'duration (seconds)', 'latitude', 'longitude', 'year', 'month', 'day', 'hour', 'minute']

# Select only the specified columns
us_data = us_data.loc[:, columns_to_keep]

us_data.head(), us_data.dtypes, us_data.shape



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_data['year'] = us_data['datetime'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_data['month'] = us_data['datetime'].dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_data['day'] = us_data['datetime'].dt.day
A value is trying to be set on a copy of a slice from a DataFrame.


(             datetime  duration (seconds)   latitude   longitude  year  month  \
 0 1949-10-10 20:30:00              2700.0  29.883056  -97.941111  1949     10   
 3 1956-10-10 21:00:00                20.0  28.978333  -96.645833  1956     10   
 4 1960-10-10 20:00:00               900.0  21.418056 -157.803611  1960     10   
 5 1961-10-10 19:00:00               300.0  36.595000  -82.188889  1961     10   
 7 1965-10-10 23:45:00              1200.0  41.117500  -73.408333  1965     10   
 
    day  hour  minute  
 0   10    20      30  
 3   10    21       0  
 4   10    20       0  
 5   10    19       0  
 7   10    23      45  ,
 datetime              datetime64[ns]
 duration (seconds)           float64
 latitude                     float64
 longitude                    float64
 year                           int32
 month                          int32
 day                            int32
 hour                           int32
 minute                         int32
 dtype: object,
 (6

Step 4: Model Training

We'll scale the features and train both LinearRegression and RandomForestRegressor models for predicting latitude and longitude separately.

Keep just the enntries for the united states

In [8]:
# Define features and target variables
features = ['duration (seconds)', 'year', 'month', 'day', 'hour', 'minute']
X = us_data[features]
y_lat = us_data['latitude']
y_long = us_data['longitude']

# Split the data into training and test sets
X_train, X_test, y_lat_train, y_lat_test, y_long_train, y_long_test = train_test_split(X, y_lat, y_long, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train Linear Regression models
lin_reg_lat = LinearRegression()
lin_reg_lat.fit(X_train_scaled, y_lat_train)

lin_reg_long = LinearRegression()
lin_reg_long.fit(X_train_scaled, y_long_train)

# Train Random Forest Regressor models
rf_reg_lat = RandomForestRegressor(random_state=42)
rf_reg_lat.fit(X_train_scaled, y_lat_train)

rf_reg_long = RandomForestRegressor(random_state=42)
rf_reg_long.fit(X_train_scaled, y_long_train)


Step 5: Prediction

We'll use the trained models to predict the location for a future chosen date.

In [9]:
# Predict and evaluate the models
y_lat_pred_lin = lin_reg_lat.predict(X_test_scaled)
y_long_pred_lin = lin_reg_long.predict(X_test_scaled)

y_lat_pred_rf = rf_reg_lat.predict(X_test_scaled)
y_long_pred_rf = rf_reg_long.predict(X_test_scaled)

mae_lat_lin = median_absolute_error(y_lat_test, y_lat_pred_lin)
mae_long_lin = median_absolute_error(y_long_test, y_long_pred_lin)

mae_lat_rf = median_absolute_error(y_lat_test, y_lat_pred_rf)
mae_long_rf = median_absolute_error(y_long_test, y_long_pred_rf)

mae_results = {
    'Linear Regression Latitude MAE': mae_lat_lin,
    'Linear Regression Longitude MAE': mae_long_lin,
    'Random Forest Latitude MAE': mae_lat_rf,
    'Random Forest Longitude MAE': mae_long_rf,
}

mae_results

{'Linear Regression Latitude MAE': 3.8444349807795355,
 'Linear Regression Longitude MAE': 15.935530014488826,
 'Random Forest Latitude MAE': 3.683563900000003,
 'Random Forest Longitude MAE': 14.927904165500003}

In [10]:
# Example prediction for a future chosen date
future_date = pd.to_datetime('2025-01-01 00:00:00')
future_duration = 120  # example duration in seconds

future_features = pd.DataFrame({
    'duration (seconds)': [future_duration],
    'year': [future_date.year],
    'month': [future_date.month],
    'day': [future_date.day],
    'hour': [future_date.hour],
    'minute': [future_date.minute]
})

future_features_scaled = scaler.transform(future_features)

# Predict latitude and longitude using both models
predicted_lat_lin = lin_reg_lat.predict(future_features_scaled)
predicted_long_lin = lin_reg_long.predict(future_features_scaled)

predicted_lat_rf = rf_reg_lat.predict(future_features_scaled)
predicted_long_rf = rf_reg_long.predict(future_features_scaled)

predicted_location_lin = (predicted_lat_lin[0], predicted_long_lin[0])
predicted_location_rf = (predicted_lat_rf[0], predicted_long_rf[0])

print(f"Predicted location (Linear Regression): {predicted_location_lin}")
print(f"Predicted location (Random Forest): {predicted_location_rf}")

Predicted location (Linear Regression): (37.73284272832605, -96.60529444026037)
Predicted location (Random Forest): (37.39361446059999, -116.58863610220014)


Hyperparameter Tuning for Random Forest Regressor:

In [12]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters to tune for both latitude and longitude prediction
param_grid_rf = {
    'n_estimators': [50, 100, 150],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'min_samples_leaf': [1, 5, 10]  # Minimum number of samples required to be at a leaf node
}

# Initialize the Random Forest Regressor
rf_reg = RandomForestRegressor(random_state=42)

# Perform grid search with cross-validation for latitude prediction
grid_search_rf = GridSearchCV(estimator=rf_reg, param_grid=param_grid_rf, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search_rf.fit(X_train_scaled, y_lat_train)

# Print the best hyperparameters for latitude prediction
print("Best Hyperparameters for Random Forest Regressor (Latitude Prediction):", grid_search_rf.best_params_)

# Train Random Forest Regressor with best hyperparameters for latitude prediction
best_rf_reg_lat = grid_search_rf.best_estimator_
best_rf_reg_lat.fit(X_train_scaled, y_lat_train)

# Perform grid search with cross-validation for longitude prediction
grid_search_rf = GridSearchCV(estimator=rf_reg, param_grid=param_grid_rf, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
grid_search_rf.fit(X_train_scaled, y_long_train)

# Print the best hyperparameters for longitude prediction
print("Best Hyperparameters for Random Forest Regressor (Longitude Prediction):", grid_search_rf.best_params_)

# Train Random Forest Regressor with best hyperparameters for longitude prediction
best_rf_reg_long = grid_search_rf.best_estimator_
best_rf_reg_long.fit(X_train_scaled, y_long_train)


Best Hyperparameters for Random Forest Regressor (Latitude Prediction): {'max_depth': 30, 'min_samples_leaf': 10, 'n_estimators': 150}
Best Hyperparameters for Random Forest Regressor (Longitude Prediction): {'max_depth': 20, 'min_samples_leaf': 5, 'n_estimators': 150}


Hyperparameter Tuning for Linear Regression with Ridge Regression:

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Define the parameter grid
param_grid = {
    'alpha': [0.1, 1.0, 10.0],  # Regularization strength
    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']  # Solver options
}

# Define the Ridge Regression model
ridge_reg = Ridge()

# Perform Grid Search with cross-validation
grid_search_lat = GridSearchCV(estimator=ridge_reg, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5)
grid_search_long = GridSearchCV(estimator=ridge_reg, param_grid=param_grid, scoring='neg_mean_absolute_error', cv=5)

# Fit the models for latitude and longitude
grid_search_lat.fit(X_train_scaled, y_lat_train)
grid_search_long.fit(X_train_scaled, y_long_train)

# Get the best parameters
best_params_lat = grid_search_lat.best_params_
best_params_long = grid_search_long.best_params_

# Train Ridge Regression models with the best parameters
ridge_reg_lat = Ridge(**best_params_lat)
ridge_reg_long = Ridge(**best_params_long)

ridge_reg_lat.fit(X_train_scaled, y_lat_train)
ridge_reg_long.fit(X_train_scaled, y_long_train)


Evaluation Metrics for Random Forest Regressor (Latitude and Longitude Prediction):

In [21]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.metrics import root_mean_squared_error

# Predict latitude and longitude using the best Random Forest Regressor models
y_lat_pred_rf_tuned = best_rf_reg_lat.predict(X_test_scaled)
y_long_pred_rf_tuned = best_rf_reg_long.predict(X_test_scaled)

# Compute evaluation metrics for latitude prediction
mse_lat_rf = mean_squared_error(y_lat_test, y_lat_pred_rf_tuned)
rmse_lat_rf = root_mean_squared_error(y_lat_test, y_lat_pred_rf_tuned)  # RMSE
r2_lat_rf = r2_score(y_lat_test, y_lat_pred_rf_tuned)
mae_lat_rf = mean_absolute_error(y_lat_test, y_lat_pred_rf_tuned)

# Compute evaluation metrics for longitude prediction
mse_long_rf = mean_squared_error(y_long_test, y_long_pred_rf_tuned)
rmse_long_rf = root_mean_squared_error(y_long_test, y_long_pred_rf_tuned)  # RMSE
r2_long_rf = r2_score(y_long_test, y_long_pred_rf_tuned)
mae_long_rf = mean_absolute_error(y_long_test, y_long_pred_rf_tuned)

# Print evaluation metrics for Random Forest Regressor
print("Random Forest Regressor Evaluation Metrics (Latitude Prediction):")
print("Mean Squared Error (MSE) for Latitude:", mse_lat_rf)
print("Root Mean Squared Error (RMSE) for Latitude:", rmse_lat_rf)
print("Mean Absolute Error (MAE) for Latitude:", mae_lat_rf)
print("R-squared (R^2) Score for Latitude:", r2_lat_rf)
print()
print("Random Forest Regressor Evaluation Metrics (Longitude Prediction):")
print("Mean Squared Error (MSE) for Longitude:", mse_long_rf)
print("Root Mean Squared Error (RMSE) for Longitude:", rmse_long_rf)
print("Mean Absolute Error (MAE) for Longitude:", mae_long_rf)
print("R-squared (R^2) Score for Longitude:", r2_long_rf)



Random Forest Regressor Evaluation Metrics (Latitude Prediction):
Mean Squared Error (MSE) for Latitude: 30.3985254397244
Root Mean Squared Error (RMSE) for Latitude: 5.513485779407107
Mean Absolute Error (MAE) for Latitude: 4.344335268814692
R-squared (R^2) Score for Latitude: 0.02400273397957997

Random Forest Regressor Evaluation Metrics (Longitude Prediction):
Mean Squared Error (MSE) for Longitude: 325.44648778349824
Root Mean Squared Error (RMSE) for Longitude: 18.040135470209147
Mean Absolute Error (MAE) for Longitude: 15.393973535066895
R-squared (R^2) Score for Longitude: 0.018539929037229896


Evaluation Metrics for Ridge Regression (Latitude and Longitude Prediction):

In [23]:
# Predict latitude and longitude using the Ridge Regression models
y_lat_pred_ridge = ridge_reg_lat.predict(X_test_scaled)
y_long_pred_ridge = ridge_reg_long.predict(X_test_scaled)

# Compute evaluation metrics for latitude prediction
mse_lat_ridge = mean_squared_error(y_lat_test, y_lat_pred_ridge)
rmse_lat_ridge = root_mean_squared_error(y_lat_test, y_lat_pred_ridge)  # RMSE
r2_lat_ridge = r2_score(y_lat_test, y_lat_pred_ridge)
mae_lat_ridge = mean_absolute_error(y_lat_test, y_lat_pred_ridge)

# Compute evaluation metrics for longitude prediction
mse_long_ridge = mean_squared_error(y_long_test, y_long_pred_ridge)
rmse_long_ridge = root_mean_squared_error(y_long_test, y_long_pred_ridge)  # RMSE
r2_long_ridge = r2_score(y_long_test, y_long_pred_ridge)
mae_long_ridge = mean_absolute_error(y_long_test, y_long_pred_ridge)

# Print evaluation metrics for Ridge Regression
print("Ridge Regression Evaluation Metrics (Latitude Prediction):")
print("Mean Squared Error (MSE) for Latitude:", mse_lat_ridge)
print("Root Mean Squared Error (RMSE) for Latitude:", rmse_lat_ridge)
print("Mean Absolute Error (MAE) for Latitude:", mae_lat_ridge)
print("R-squared (R^2) Score for Latitude:", r2_lat_ridge)
print()
print("Ridge Regression Evaluation Metrics (Longitude Prediction):")
print("Mean Squared Error (MSE) for Longitude:", mse_long_ridge)
print("Root Mean Squared Error (RMSE) for Longitude:", rmse_long_ridge)
print("Mean Absolute Error (MAE) for Longitude:", mae_long_ridge)
print("R-squared (R^2) Score for Longitude:", r2_long_ridge)



Ridge Regression Evaluation Metrics (Latitude Prediction):
Mean Squared Error (MSE) for Latitude: 31.084594995702584
Root Mean Squared Error (RMSE) for Latitude: 5.575356042057098
Mean Absolute Error (MAE) for Latitude: 4.442363929437805
R-squared (R^2) Score for Latitude: 0.001975283593465993

Ridge Regression Evaluation Metrics (Longitude Prediction):
Mean Squared Error (MSE) for Longitude: 330.8707160299979
Root Mean Squared Error (RMSE) for Longitude: 18.189852006819567
Mean Absolute Error (MAE) for Longitude: 15.772510334437378
R-squared (R^2) Score for Longitude: 0.0021818989475972383
