In [1]:
!pip install pandas numpy scikit-learn tensorflow joblib streamlit plotly matplotlib

Defaulting to user installation because normal site-packages is not writeable
Collecting plotly
  Downloading plotly-6.4.0-py3-none-any.whl.metadata (8.5 kB)
Downloading plotly-6.4.0-py3-none-any.whl (9.9 MB)
   ---------------------------------------- 0.0/9.9 MB ? eta -:--:--
   ---- ----------------------------------- 1.0/9.9 MB 8.5 MB/s eta 0:00:02
   --------- ------------------------------ 2.4/9.9 MB 7.5 MB/s eta 0:00:02
   -------------- ------------------------- 3.7/9.9 MB 7.0 MB/s eta 0:00:01
   --------------------- ------------------ 5.2/9.9 MB 6.8 MB/s eta 0:00:01
   -------------------------- ------------- 6.6/9.9 MB 6.9 MB/s eta 0:00:01
   ------------------------------- -------- 7.9/9.9 MB 6.9 MB/s eta 0:00:01
   -------------------------------------- - 9.4/9.9 MB 6.8 MB/s eta 0:00:01
   ---------------------------------------- 9.9/9.9 MB 6.6 MB/s  0:00:01
Installing collected packages: plotly
Successfully installed plotly-6.4.0


In [7]:
!pip install -q pandas numpy scikit-learn tensorflow joblib plotly pyarrow

import pandas as pd
import numpy as np
import joblib
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering

from sklearn.ensemble import IsolationForest

import plotly.express as px

# ---------------- Core helpers ----------------
# Columns the input CSV must provide; load_data rejects files lacking any of them.
REQUIRED_COLUMNS = [
    "name", "id", "nametype", "recclass", "mass", "fall",
    "year", "reclat", "reclong", "GeoLocation", "date",
]


def load_data(path: str) -> pd.DataFrame:
    """Read a CSV file and verify that it has every required column.

    Args:
        path: Location of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        The parsed DataFrame, unmodified.

    Raises:
        ValueError: If one or more entries of ``REQUIRED_COLUMNS`` are absent.
            The message lists them in ``REQUIRED_COLUMNS`` order.
    """
    frame = pd.read_csv(path)

    absent = []
    for column in REQUIRED_COLUMNS:
        if column not in frame.columns:
            absent.append(column)

    if absent:
        raise ValueError(f"Missing required columns: {absent}")
    return frame

def clean_and_engineer(df: pd.DataFrame):
    """Clean the raw frame and derive the modelling columns.

    The input is copied, never modified. Steps, in order:

    1. ``mass``, ``year``, ``reclat`` and ``reclong`` are coerced to numeric;
       unparseable values become NaN.
    2. Rows missing ``year``, ``reclat`` or ``reclong`` are dropped.
    3. Remaining missing ``mass`` values are filled with the median mass of
       the surviving rows.
    4. ``mass_log1p`` is ``log1p`` of the mass with negatives clipped to 0.
    5. ``impact`` is 1 where ``fall`` equals "fell" (ignoring case and
       surrounding whitespace), otherwise 0.

    Args:
        df: Raw frame containing at least the columns named above and ``fall``.

    Returns:
        A ``(cleaned_frame, feature_names)`` tuple, where ``feature_names`` is
        the list of columns intended as model inputs.
    """
    numeric_columns = ("mass", "year", "reclat", "reclong")
    essential_columns = ["year", "reclat", "reclong"]

    work = df.copy()
    for name in numeric_columns:
        work[name] = pd.to_numeric(work[name], errors="coerce")

    # Rows without a year or coordinates cannot be used at all.
    work = work.dropna(subset=essential_columns)

    # Impute mass only when needed, using the median of the rows that remain.
    mass = work["mass"]
    if mass.isna().any():
        work["mass"] = mass.fillna(mass.median())
    work["mass_log1p"] = np.log1p(work["mass"].clip(lower=0))

    # Binary target (not used for clustering, but kept for convenience).
    fall_normalised = work["fall"].astype(str).str.strip().str.lower()
    work["impact"] = (fall_normalised == "fell").astype(int)

    return work, ["year", "reclat", "reclong", "mass_log1p"]


In [8]:
# ---------------- Clustering configuration ----------------
# Everything a reader might tune lives here instead of being scattered through
# the cell as magic numbers / repeated path literals.
CSV_PATH = "meteor_data.csv"  # change if needed
CLUSTER_SEED = 42             # fixed seed so KMeans labels repeat across runs
N_CLUSTERS = 5                # shared by KMeans and Agglomerative
DBSCAN_EPS = 0.8              # neighbourhood radius, in standardized feature units
DBSCAN_MIN_SAMPLES = 10
CLUSTERS_PATH = "artifacts/clusters.parquet"
KMEANS_PATH = "models/kmeans.pkl"
CLUSTER_SCALER_PATH = "artifacts/cluster_scaler.pkl"

# ---------------- Load, clean, scale ----------------
df_raw = load_data(CSV_PATH)
df, features = clean_and_engineer(df_raw)
X = df[features].values.astype(float)

# Standardize so that year, coordinates and log-mass contribute on one scale.
scaler = StandardScaler()
Xs = scaler.fit_transform(X)

# --- KMeans ---
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=CLUSTER_SEED, n_init=10)
df["cluster_kmeans"] = kmeans.fit_predict(Xs)

# --- DBSCAN ---
db = DBSCAN(eps=DBSCAN_EPS, min_samples=DBSCAN_MIN_SAMPLES).fit(Xs)
df["cluster_dbscan"] = db.labels_   # -1 means noise

# --- Agglomerative ---
ag = AgglomerativeClustering(n_clusters=N_CLUSTERS)
df["cluster_agglom"] = ag.fit_predict(Xs)

# ---------------- Save artifacts ----------------
# Create each output's parent directory from the path itself so the folders
# always match wherever the paths above point.
for saved_path in (CLUSTERS_PATH, KMEANS_PATH, CLUSTER_SCALER_PATH):
    Path(saved_path).parent.mkdir(parents=True, exist_ok=True)

df_out = df[["id","year","reclat","reclong","mass","mass_log1p","impact",
             "cluster_kmeans","cluster_dbscan","cluster_agglom"]].copy()

df_out.to_parquet(CLUSTERS_PATH, index=False)
joblib.dump(kmeans, KMEANS_PATH)
joblib.dump(scaler, CLUSTER_SCALER_PATH)

# Report exactly the paths that were written (same text as before, but the
# list can no longer drift from the save calls).
print("✅ Clustering done. Saved:")
for saved_path in (CLUSTERS_PATH, KMEANS_PATH, CLUSTER_SCALER_PATH):
    print(f"- {saved_path}")

# Quick look
df_out.head()


✅ Clustering done. Saved:
- artifacts/clusters.parquet
- models/kmeans.pkl
- artifacts/cluster_scaler.pkl


Unnamed: 0,id,year,reclat,reclong,mass,mass_log1p,impact,cluster_kmeans,cluster_dbscan,cluster_agglom
0,1,1880.0,50.775,6.08333,21.0,3.091042,1,2,0,0
1,2,1951.0,56.18333,10.23333,720.0,6.580639,1,4,0,0
2,6,1952.0,54.21667,-113.0,107000.0,11.580593,1,2,0,0
3,10,1976.0,16.88333,-99.9,1914.0,7.557473,1,4,0,2
4,370,1902.0,-33.16667,-64.95,780.0,6.660575,1,2,0,0


In [9]:
# Map every record, coloured by its KMeans cluster.
# - px.scatter_mapbox is deprecated in favour of px.scatter_map (MapLibre),
#   which takes `map_style` instead of the `mapbox_style` layout key.
# - Cluster ids are nominal labels, so they are cast to strings to get a
#   discrete legend rather than a continuous colour bar.
fig = px.scatter_map(
    df_out.assign(cluster_kmeans=df_out["cluster_kmeans"].astype(str)),
    lat="reclat",
    lon="reclong",
    color="cluster_kmeans",
    hover_name="id",
    zoom=1,
    height=500,
    map_style="open-street-map",
)
fig.show()


  fig = px.scatter_mapbox(


In [10]:
# Toggle for the optional autoencoder stage of the anomaly-detection cell.
USE_AUTOENCODER = True  # set to False if you want only IsolationForest

# NOTE(review): these TensorFlow imports execute regardless of USE_AUTOENCODER,
# so TensorFlow must be importable even for the IsolationForest-only path.
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense

def build_autoencoder(input_dim: int):
    """Build and compile a small symmetric dense autoencoder.

    The network is ``input -> 16 -> 8 -> 4 -> 8 -> 16 -> input_dim``; every
    hidden layer uses ReLU and the output layer is linear. The model is
    compiled with the Adam optimizer and mean-squared-error loss.

    Args:
        input_dim: Number of features per sample.

    Returns:
        The compiled (untrained) Keras ``Model``.
    """
    hidden_widths = (16, 8, 4, 8, 16)  # encoder, 4-unit bottleneck, decoder

    inputs = Input(shape=(input_dim,))
    hidden = inputs
    for width in hidden_widths:
        hidden = Dense(width, activation="relu")(hidden)
    outputs = Dense(input_dim, activation="linear")(hidden)

    autoencoder = Model(inputs, outputs)
    autoencoder.compile(optimizer="adam", loss="mse")
    return autoencoder

# ---------------- Anomaly-detection configuration ----------------
ANOMALY_SEED = 42            # shared by IsolationForest and the autoencoder
ISO_CONTAMINATION = 0.02     # contamination setting passed to IsolationForest
AE_EPOCHS = 50
AE_BATCH_SIZE = 32
AE_VALIDATION_SPLIT = 0.2
ISO_MODEL_PATH = "models/isolation_forest.pkl"
ANOMALY_SCALER_PATH = "artifacts/anomaly_scaler.pkl"
AE_MODEL_PATH = "models/autoencoder.keras"
ANOMALY_SCORES_PATH = "artifacts/anomaly_scores.parquet"

# Use same features & scaler style, but fit a fresh scaler for anomalies
sc_anom = StandardScaler()
Xs_anom = sc_anom.fit_transform(df[features].values.astype(float))

# --- Isolation Forest ---
iso = IsolationForest(contamination=ISO_CONTAMINATION, random_state=ANOMALY_SEED)
iso.fit(Xs_anom)
iso_scores = -iso.score_samples(Xs_anom)  # higher => more anomalous

# --- Autoencoder (optional) ---
if USE_AUTOENCODER:
    # Seed the Python, NumPy and TensorFlow RNGs before the model is built so
    # weight initialization and batch shuffling repeat across runs, matching
    # the fixed seeds already used for KMeans and IsolationForest.
    from tensorflow.keras.utils import set_random_seed
    set_random_seed(ANOMALY_SEED)

    ae = build_autoencoder(Xs_anom.shape[1])
    ae.fit(
        Xs_anom, Xs_anom,
        epochs=AE_EPOCHS,
        batch_size=AE_BATCH_SIZE,
        validation_split=AE_VALIDATION_SPLIT,
        verbose=0,
    )
    recon = ae.predict(Xs_anom, verbose=0)
    # Per-row mean squared reconstruction error; larger => harder to reconstruct.
    recon_err = ((Xs_anom - recon)**2).mean(axis=1)
else:
    ae = None
    # Placeholder column so downstream code always finds "ae_recon_error".
    recon_err = np.zeros(len(Xs_anom))

# Save artifacts
# Make sure the output folders exist so this cell does not depend on an
# earlier cell having created them.
for saved_path in (ISO_MODEL_PATH, ANOMALY_SCALER_PATH, AE_MODEL_PATH, ANOMALY_SCORES_PATH):
    Path(saved_path).parent.mkdir(parents=True, exist_ok=True)

joblib.dump(iso, ISO_MODEL_PATH)
joblib.dump(sc_anom, ANOMALY_SCALER_PATH)
if ae is not None:
    ae.save(AE_MODEL_PATH)

anom = df[["id","year","reclat","reclong","mass"]].copy()
anom["iso_anomaly_score"] = iso_scores
anom["ae_recon_error"] = recon_err
anom.to_parquet(ANOMALY_SCORES_PATH, index=False)

# Report every file written above (the scaler was previously saved but not listed).
print("✅ Anomaly detection done. Saved:")
print(f"- {ISO_MODEL_PATH}")
print(f"- {ANOMALY_SCALER_PATH}")
if ae is not None:
    print(f"- {AE_MODEL_PATH}")
print(f"- {ANOMALY_SCORES_PATH}")

# Show top anomalies
anom.sort_values(["iso_anomaly_score","ae_recon_error"], ascending=False).head(20)


✅ Anomaly detection done. Saved:
- models/isolation_forest.pkl
- models/autoencoder.keras
- artifacts/anomaly_scores.parquet


Unnamed: 0,id,year,reclat,reclong,mass,iso_anomaly_score,ae_recon_error
17215,12356,1749.0,54.9,91.8,700000.0,0.751667,0.082241
5308,5262,1818.0,76.13333,-64.93333,58200000.0,0.739378,0.237998
31750,18884,1797.0,-7.56667,110.83333,500000.0,0.733266,0.028839
5137,5130,1810.0,52.5,30.33333,823000.0,0.731278,0.058646
37401,23694,1873.0,58.61667,98.93333,217000.0,0.727721,0.136667
16434,12025,1822.0,-24.20333,-68.80667,920000.0,0.726622,0.394236
5023,5015,1784.0,-10.11667,-39.2,5360000.0,0.724716,0.25071
972,24004,1807.0,54.5,35.2,65500.0,0.722426,0.063041
985,24037,1824.0,51.73333,102.53333,2000.0,0.722325,0.060403
37905,24166,1854.0,54.76667,113.98333,18000.0,0.721175,0.097161


In [11]:
# Plot each record's IsolationForest score against its autoencoder
# reconstruction error so both anomaly signals can be compared at a glance.
fig = px.scatter(
    data_frame=anom,
    x="iso_anomaly_score",
    y="ae_recon_error",
    title="Anomaly Space",
    hover_data=["id", "year", "mass"],
)
fig.show()


In [15]:
# Launch app.py with Streamlit on port 8501.
# NOTE(review): `streamlit run` appears to keep serving in the foreground, so
# this cell likely will not finish until the kernel is interrupted — confirm
# before relying on Restart & Run All.
!streamlit run app.py --server.port 8501



  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8501
  Network URL: http://192.168.0.104:8501



  fig = px.scatter_mapbox(
2025-11-10 22:00:06.703 Please replace `use_container_width` with `width`.

`use_container_width` will be removed after 2025-12-31.

For `use_container_width=True`, use `width='stretch'`. For `use_container_width=False`, use `width='content'`.
2025-11-10 22:00:06.848 Please replace `use_container_width` with `width`.

`use_container_width` will be removed after 2025-12-31.

For `use_container_width=True`, use `width='stretch'`. For `use_container_width=False`, use `width='content'`.

*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/

2025-11-10 22:00:50.319 Please replace `use_container_width` with `width`.

`use_container_width` will be removed after 2025-12-31.

For `use_container_width=True`, use `width='stretch'`. For `use_container_width=False`, use `width='content'`.
2025-11-10 22:00:50.423 Please replace `use_container_width` with `width`.

`use_container_width` will be removed after