In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import median_absolute_error

Step 1: Load and Inspect the Dataset
Load the dataset and display the first few rows to understand its structure.

In [11]:
# Location of the scrubbed sightings CSV.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# on a machine where exactly this path exists.
file_path = '/home/antqua/code/AntQua/ET_Predictor/raw_data/scrubbed.csv'

# low_memory=False has pandas parse the file in one pass instead of in chunks,
# which avoids per-chunk mixed-dtype inference.
data = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# Preview the first rows to see the column layout
data.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [13]:
# Structural overview of the raw frame: (rows, columns), column labels, inferred dtypes.
# The column labels are worth reading closely for stray whitespace before indexing by name.
print(data.shape, data.columns, data.dtypes, sep='\n')

(80332, 11)
Index(['datetime', 'city', 'state', 'country', 'shape', 'duration (seconds)',
       'duration (hours/min)', 'comments', 'date posted', 'latitude',
       'longitude '],
      dtype='object')
datetime                 object
city                     object
state                    object
country                  object
shape                    object
duration (seconds)       object
duration (hours/min)     object
comments                 object
date posted              object
latitude                 object
longitude               float64
dtype: object


The dataset has several columns, including datetime, city, state, country, shape, duration (seconds), duration (hours/min), comments, date posted, latitude, and longitude. Our focus will be on the datetime and duration (seconds) columns for features, and latitude and longitude for the target variables.

Step 2: Data Preparation

Convert the datetime column to numerical format and split it into separate features
We'll convert the datetime column to datetime objects in this step; the year, month, day, hour, and minute are then extracted as separate features in Step 3.

Handle Missing Values
We'll check for and handle any missing values in the relevant columns (datetime, duration (seconds), latitude, and longitude).

In [19]:
# Normalise the header first: remove leading/trailing whitespace from every column
# label so the columns can be addressed by their clean names below.
data.columns = data.columns.str.strip()

# Coerce the coordinate and duration columns to numbers; any value that cannot be
# parsed becomes NaN instead of raising.
for numeric_col in ('latitude', 'longitude', 'duration (seconds)'):
    data[numeric_col] = pd.to_numeric(data[numeric_col], errors='coerce')

# Parse the sighting timestamp; unparseable values become NaT.
data['datetime'] = pd.to_datetime(data['datetime'], errors='coerce')

# Keep only rows where every model-relevant column survived the coercion above.
# NOTE(review): the recorded outputs show 80332 rows before and 79634 after this
# cell, i.e. 698 rows are discarded here - worth checking which column's coercion
# is responsible for most of them.
data = data.dropna(subset=['datetime', 'duration (seconds)', 'latitude', 'longitude'])

# Show one sample row, the remaining shape and the cleaned column labels
print(data.head(1), data.shape, data.columns, sep='\n')
# Verify the conversion
print(data.dtypes)

             datetime        city state country     shape  duration (seconds)  \
0 1949-10-10 20:30:00  san marcos    tx      us  cylinder              2700.0   

  duration (hours/min)                                           comments  \
0           45 minutes  This event took place in early fall around 194...   

  date posted   latitude  longitude  
0   4/27/2004  29.883056 -97.941111  
(79634, 11)
Index(['datetime', 'city', 'state', 'country', 'shape', 'duration (seconds)',
       'duration (hours/min)', 'comments', 'date posted', 'latitude',
       'longitude'],
      dtype='object')
datetime                datetime64[ns]
city                            object
state                           object
country                         object
shape                           object
duration (seconds)             float64
duration (hours/min)            object
comments                        object
date posted                     object
latitude                       float64
longitude    

Step 3: Feature Engineering

We'll extract the year, month, day, hour, and minute from the datetime column and keep them, together with duration (seconds), as the input features for training the models.

In [24]:
# Columns retained for modelling: the timestamp, the duration, both coordinates and
# the calendar/clock components derived from the timestamp.
columns_to_keep = ['datetime', 'duration (seconds)', 'latitude', 'longitude', 'year', 'month', 'day', 'hour', 'minute']

# In one chain: filter out rows with missing model-relevant values, derive one integer
# column per timestamp component, then narrow the frame to the retained columns.
data = (
    data
    .dropna(subset=['datetime', 'duration (seconds)', 'latitude', 'longitude'])
    .assign(
        year=lambda frame: frame['datetime'].dt.year,
        month=lambda frame: frame['datetime'].dt.month,
        day=lambda frame: frame['datetime'].dt.day,
        hour=lambda frame: frame['datetime'].dt.hour,
        minute=lambda frame: frame['datetime'].dt.minute,
    )
    [columns_to_keep]
)

# Show the first rows followed by the resulting dtypes
print(data.head(), data.dtypes, sep='\n')


             datetime  duration (seconds)   latitude   longitude  year  month  \
0 1949-10-10 20:30:00              2700.0  29.883056  -97.941111  1949     10   
1 1949-10-10 21:00:00              7200.0  29.384210  -98.581082  1949     10   
2 1955-10-10 17:00:00                20.0  53.200000   -2.916667  1955     10   
3 1956-10-10 21:00:00                20.0  28.978333  -96.645833  1956     10   
4 1960-10-10 20:00:00               900.0  21.418056 -157.803611  1960     10   

   day  hour  minute  
0   10    20      30  
1   10    21       0  
2   10    17       0  
3   10    21       0  
4   10    20       0  
datetime              datetime64[ns]
duration (seconds)           float64
latitude                     float64
longitude                    float64
year                           int32
month                          int32
day                            int32
hour                           int32
minute                         int32
dtype: object


Step 4: Model Training

We'll scale the features and train both LinearRegression and RandomForestRegressor models for predicting latitude and longitude separately.

In [25]:
# Model inputs and the two regression targets
features = ['duration (seconds)', 'year', 'month', 'day', 'hour', 'minute']
X, y_lat, y_long = data[features], data['latitude'], data['longitude']

# A single split call keeps the features and both targets row-aligned;
# 20% of the rows are held out for testing and the seed makes the split repeatable.
(X_train, X_test,
 y_lat_train, y_lat_test,
 y_long_train, y_long_test) = train_test_split(X, y_lat, y_long, test_size=0.2, random_state=42)

# Fit the scaler on the training rows only, then apply that same transform to the
# test rows so no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# One estimator per (model family, target) pair
lin_reg_lat, lin_reg_long = LinearRegression(), LinearRegression()
rf_reg_lat = RandomForestRegressor(random_state=42)
rf_reg_long = RandomForestRegressor(random_state=42)

# Linear regression: latitude, then longitude
lin_reg_lat.fit(X_train_scaled, y_lat_train)
lin_reg_long.fit(X_train_scaled, y_long_train)

# Random forest: latitude, then longitude
rf_reg_lat.fit(X_train_scaled, y_lat_train)
rf_reg_long.fit(X_train_scaled, y_long_train)


Step 5: Prediction

We'll first evaluate the trained models on the held-out test set using the median absolute error, and then use them to predict the location for a chosen future date.

In [28]:
# Predict latitude and longitude for the held-out test rows with every fitted model
y_lat_pred_lin = lin_reg_lat.predict(X_test_scaled)
y_long_pred_lin = lin_reg_long.predict(X_test_scaled)

y_lat_pred_rf = rf_reg_lat.predict(X_test_scaled)
y_long_pred_rf = rf_reg_long.predict(X_test_scaled)

# median_absolute_error returns the *median* of |y_true - y_pred|, which is not the
# mean absolute error that the abbreviation "MAE" conventionally denotes. The report
# keys therefore say "MedAE" so the numbers are not misread as mean errors.
# The mae_* variable names are kept as they were so other cells that reference them
# continue to work.
mae_lat_lin = median_absolute_error(y_lat_test, y_lat_pred_lin)
mae_long_lin = median_absolute_error(y_long_test, y_long_pred_lin)

mae_lat_rf = median_absolute_error(y_lat_test, y_lat_pred_rf)
mae_long_rf = median_absolute_error(y_long_test, y_long_pred_rf)

# Summary of the four scores, keyed by model, target and metric
mae_results = {
    'Linear Regression Latitude MedAE': mae_lat_lin,
    'Linear Regression Longitude MedAE': mae_long_lin,
    'Random Forest Latitude MedAE': mae_lat_rf,
    'Random Forest Longitude MedAE': mae_long_rf,
}

mae_results

{'Linear Regression Latitude MAE': 4.279800930829566,
 'Linear Regression Longitude MAE': 12.092890712536345,
 'Random Forest Latitude MAE': 4.4448742869999975,
 'Random Forest Longitude MAE': 15.68461854966678}

In [29]:
# Hypothetical sighting to locate: a chosen timestamp plus an assumed duration
future_date = pd.to_datetime('2025-01-01 00:00:00')
future_duration = 120  # example duration in seconds

# Single-row frame with the same column names, in the same order, as the training features.
# NOTE(review): if 2025 is later than every year seen during training this is an
# extrapolation; confirm the models are expected to be meaningful there.
future_features = pd.DataFrame(
    [[
        future_duration,
        future_date.year,
        future_date.month,
        future_date.day,
        future_date.hour,
        future_date.minute,
    ]],
    columns=['duration (seconds)', 'year', 'month', 'day', 'hour', 'minute'],
)

# Apply the already-fitted scaler (transform only, no refitting)
future_features_scaled = scaler.transform(future_features)

# Each model family yields a (latitude, longitude) pair of one-element prediction arrays
predicted_lat_lin, predicted_long_lin = (
    lin_reg_lat.predict(future_features_scaled),
    lin_reg_long.predict(future_features_scaled),
)
predicted_lat_rf, predicted_long_rf = (
    rf_reg_lat.predict(future_features_scaled),
    rf_reg_long.predict(future_features_scaled),
)

# Unwrap the single prediction from each array into (lat, long) tuples
predicted_location_lin = predicted_lat_lin[0], predicted_long_lin[0]
predicted_location_rf = predicted_lat_rf[0], predicted_long_rf[0]

print(f"Predicted location (Linear Regression): {predicted_location_lin}")
print(f"Predicted location (Random Forest): {predicted_location_rf}")

Predicted location (Linear Regression): (37.13876627269018, -86.11649353746462)
Predicted location (Random Forest): (37.040061110000025, -115.68422593400015)
