In [1]:
import os
from collections import Counter

import joblib
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Data cleaning function
def clean_data(value):
    try:
        return float(value)
    except ValueError:
        cleaned_value = ''.join(char for char in value if char.isdigit() or char == '.')
        try:
            return float(cleaned_value)
        except ValueError:
            return float('nan')

In [3]:
# Load and preprocess the data
file_path = '/home/antqua/code/AntQua/ET_Predictor/raw_data/scrubbed.csv'
df = pd.read_csv(file_path, low_memory=False)

df.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [4]:
df['latitude'] = df['latitude'].apply(clean_data)
df['longitude '] = df['longitude '].apply(clean_data)
df.columns = df.columns.str.strip()
df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')
df['duration (seconds)'] = pd.to_numeric(df['duration (seconds)'], errors='coerce')
df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce')
df = df.dropna()
df.dtypes

datetime                datetime64[ns]
city                            object
state                           object
country                         object
shape                           object
duration (seconds)             float64
duration (hours/min)            object
comments                        object
date posted                     object
latitude                       float64
longitude                      float64
dtype: object

In [5]:
# Extract features and target variables
df['year'] = df['datetime'].dt.year
df['month'] = df['datetime'].dt.month
df['day'] = df['datetime'].dt.day
df['hour'] = df['datetime'].dt.hour
df['minute'] = df['datetime'].dt.minute

features = ['year', 'month', 'day', 'hour', 'minute']
X = df[features]
y_lat = df['latitude']
y_long = df['longitude']

In [6]:
# Split the data into training and test sets
X_train, X_test, y_lat_train, y_lat_test, y_long_train, y_long_test = train_test_split(X, y_lat, y_long, test_size=0.2, random_state=42)

In [7]:
# Define and train the pipelines for latitude and longitude prediction
lat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(random_state=42))
])

long_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('regressor', RandomForestRegressor(random_state=42))
])

lat_pipeline.fit(X_train, y_lat_train)
long_pipeline.fit(X_train, y_long_train)

In [8]:
# Save the pipelines
joblib.dump(lat_pipeline, 'lat_pipeline.pkl')
joblib.dump(long_pipeline, 'long_pipeline.pkl')

['long_pipeline.pkl']

In [9]:
# Train the KMeans model for shape and duration prediction
location_data = df[['latitude', 'longitude']]
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(location_data)
df['cluster'] = kmeans.labels_

In [10]:
# Save the KMeans model
joblib.dump(kmeans, 'kmeans.pkl')

['kmeans.pkl']

In [11]:
# Extract the cluster information
cluster_info = {}
for cluster_label in range(kmeans.n_clusters):
    cluster_data = df[df['cluster'] == cluster_label]
    shape_counter = Counter(cluster_data['shape'])
    most_common_shape = shape_counter.most_common(1)[0][0]
    average_duration = cluster_data['duration (seconds)'].mean()
    cluster_info[cluster_label] = {
        'most_common_shape': most_common_shape,
        'average_duration': average_duration
    }

In [12]:
# Save the cluster information
joblib.dump(cluster_info, 'cluster_info.pkl')

['cluster_info.pkl']