# Asteroid Hazard Analysis — Data Preparation Notebook

This notebook processes and merges two datasets:

- **All known asteroids** (`mpcorb_extended.json`)
- **Potentially Hazardous Asteroids (PHA)** (`pha_extended.json`)

# 1. Importing Libraries & Configurations

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Raise the display limit to 200 columns so the wide frames previewed below are shown in full.
pd.set_option('display.max_columns', 200)

# 2. Loading the Datasets

Data available from the [IAU Minor Planet Center](https://www.minorplanetcenter.net/data).

In [5]:
# Both raw MPC exports live in the same Drive folder. Keeping the folder in one
# constant means the location only has to be changed in a single place.
RAW_DATA_DIR = '/content/drive/MyDrive/Colab Notebooks/Personal/asteroid_project/data/raw'

# Full catalogue of known asteroids.
asteroids = pd.read_json(f'{RAW_DATA_DIR}/mpcorb_extended.json')

# Potentially Hazardous Asteroids (PHA) subset.
hazardous_asteroids = pd.read_json(f'{RAW_DATA_DIR}/pha_extended.json')

# 3. Initial Inspection

In [6]:
# Preview the first rows of the full asteroid catalogue.
asteroids.head()

Unnamed: 0,Critical_list_numbered_object_flag,H,G,Num_obs,rms,U,Arc_years,Perturbers,Perturbers_2,Number,Name,Principal_desig,Other_desigs,Epoch,M,Peri,Node,i,e,n,a,Ref,Num_opps,Computer,Hex_flags,Last_obs,Tp,Orbital_period,Perihelion_dist,Aphelion_dist,Semilatus_rectum,Synodic_period,Orbit_type,NEO_flag,One_km_NEO_flag,PHA_flag,One_opposition_object_flag,Arc_length
0,1.0,3.35,0.15,7369.0,0.69,0,1801-2025,M-v,30k,(1),Ceres,A801 AA,"[A899 OF, 1943 XB]",2461000.5,231.53975,73.29974,80.24963,10.58789,0.079576,0.214297,2.765616,MPO950947,126,MPCORBFIT,4000,2025-10-03,2461600.0,4.599258,2.545538,2.985693,1.374051,1.277835,MBA,,,,,
1,1.0,4.11,0.15,9021.0,0.64,0,1804-2025,M-c,28k,(2),Pallas,A802 FA,,2461000.5,211.52977,310.9334,172.88859,34.92833,0.230643,0.213797,2.769926,E2025-V46,124,MPCORBFIT,4000,2025-11-05,2461695.0,4.610014,2.131062,3.40879,1.311288,1.277007,MBA,,,,,
2,1.0,5.19,0.15,7565.0,0.67,0,1804-2025,M-v,3Ek,(3),Juno,A804 RA,,2461000.5,217.59095,247.88367,169.81989,12.98604,0.255826,0.225799,2.670879,MPO937415,118,MPCORBFIT,4000,2025-08-08,2461631.0,4.364971,1.987599,3.354159,1.248039,1.297179,MBA,,,,,
3,1.0,3.25,0.15,7543.0,0.69,0,1821-2025,M-p,18k,(4),Vesta,A807 FA,,2461000.5,26.80968,151.53712,103.70232,7.14406,0.090168,0.271588,2.361541,MPO925791,112,MPCLINUX,4000,2025-06-24,2460902.0,3.629053,2.148607,2.574476,1.171171,1.380365,MBA,,,,,
4,1.0,6.97,0.15,3339.0,0.79,0,1845-2025,M-v,3Ek,(5),Astraea,A845 XA,[1969 SE],2461000.5,133.8676,359.34517,141.44862,5.35925,0.187509,0.238269,2.576865,MPO950947,89,MPCORBFIT,4000,2025-10-06,2460439.0,4.136542,2.09368,3.060049,1.243132,1.318823,MBA,,,,,


In [7]:
# (rows, columns) of the full asteroid catalogue.
asteroids.shape

(1477090, 38)

In [8]:
# Column names of the full asteroid catalogue.
asteroids.columns

Index(['Critical_list_numbered_object_flag', 'H', 'G', 'Num_obs', 'rms', 'U',
       'Arc_years', 'Perturbers', 'Perturbers_2', 'Number', 'Name',
       'Principal_desig', 'Other_desigs', 'Epoch', 'M', 'Peri', 'Node', 'i',
       'e', 'n', 'a', 'Ref', 'Num_opps', 'Computer', 'Hex_flags', 'Last_obs',
       'Tp', 'Orbital_period', 'Perihelion_dist', 'Aphelion_dist',
       'Semilatus_rectum', 'Synodic_period', 'Orbit_type', 'NEO_flag',
       'One_km_NEO_flag', 'PHA_flag', 'One_opposition_object_flag',
       'Arc_length'],
      dtype='object')

In [9]:
# Preview the first rows of the PHA dataset.
hazardous_asteroids.head()

Unnamed: 0,NEO_flag,One_km_NEO_flag,PHA_flag,H,G,Num_obs,rms,U,Arc_years,Perturbers,Perturbers_2,Number,Name,Principal_desig,Epoch,M,Peri,Node,i,e,n,a,Ref,Num_opps,Computer,Hex_flags,Last_obs,Tp,Orbital_period,Perihelion_dist,Aphelion_dist,Semilatus_rectum,Synodic_period,Orbit_type,Other_desigs,One_opposition_object_flag,Arc_length
0,1.0,1.0,1.0,16.53,0.15,1842,0.63,0,1949-2025,M-v,3Ek,(1566),Icarus,1949 MA,2461000.5,153.07893,31.43821,87.95243,22.8032,0.827006,0.880548,1.078038,MPO950970,43,MPCLINUX,9803,2025-08-18,2460827.0,1.119312,0.186494,1.969581,0.170363,9.381419,Apollo,,,
1,1.0,1.0,1.0,15.26,0.15,8261,0.51,0,1951-2025,M-v,3Ek,(1620),Geographos,1951 RA,2461000.5,212.92045,277.01839,337.14079,13.3358,0.335512,0.708833,1.245777,E2025-V79,41,MPCORBFIT,9803,2025-11-10,2461208.0,1.390466,0.827804,1.66375,0.552771,3.561042,Apollo,[1983 CY3],,
2,1.0,1.0,1.0,16.08,0.09,3129,0.72,0,1930-2025,M-v,3Ek,(1862),Apollo,1932 HA,2461000.5,113.88454,286.05088,35.54163,6.3514,0.559918,0.552483,1.470924,E2025-UO9,37,MPCORBFIT,9803,2025-10-27,2460794.0,1.783961,0.647327,2.29452,0.504889,2.275574,Apollo,,,
3,1.0,1.0,1.0,15.28,0.15,2264,0.64,0,1973-2025,M-v,3Ek,(1981),Midas,1973 EA,2461000.5,65.34754,267.84424,356.79391,39.82339,0.650496,0.416332,1.776272,E2025-V79,28,MPCORBFIT,9803,2025-11-10,2460844.0,2.367358,0.620815,2.931728,0.512326,1.731337,Apollo,,,
4,1.0,,1.0,18.69,0.15,158,0.9,0,1936-2025,M-v,3Ek,(2101),Adonis,1936 CA,2461000.5,24.47917,43.7116,349.40534,1.31997,0.764127,0.384185,1.874025,MPO937442,12,MPCLINUX,8803,2025-07-19,2460937.0,2.565448,0.442031,3.306019,0.3899,1.638795,Apollo,,,


In [10]:
# (rows, columns) of the PHA dataset.
hazardous_asteroids.shape

(2505, 37)

In [11]:
# Column names of the PHA dataset.
hazardous_asteroids.columns

Index(['NEO_flag', 'One_km_NEO_flag', 'PHA_flag', 'H', 'G', 'Num_obs', 'rms',
       'U', 'Arc_years', 'Perturbers', 'Perturbers_2', 'Number', 'Name',
       'Principal_desig', 'Epoch', 'M', 'Peri', 'Node', 'i', 'e', 'n', 'a',
       'Ref', 'Num_opps', 'Computer', 'Hex_flags', 'Last_obs', 'Tp',
       'Orbital_period', 'Perihelion_dist', 'Aphelion_dist',
       'Semilatus_rectum', 'Synodic_period', 'Orbit_type', 'Other_desigs',
       'One_opposition_object_flag', 'Arc_length'],
      dtype='object')

# 4. Checking Overlap Between Datasets
We verify how many hazard-listed asteroids appear in the full asteroid dataset, matching on `Principal_desig`.

In [12]:
# Flag every catalogue row whose principal designation also appears in the PHA list,
# then count the matched (True) and unmatched (False) rows.
pha_designations = hazardous_asteroids['Principal_desig']
asteroids['Principal_desig'].isin(pha_designations).value_counts()

Unnamed: 0_level_0,count
Principal_desig,Unnamed: 1_level_1
False,1474585
True,2505


# 5. Column Comparison Between the Two Datasets

In [13]:
# Compare the two schemas in both directions. Each list is printed in the
# owning frame's column order.
asteroids_cols = asteroids.columns
hazardous_asteroids_col = hazardous_asteroids.columns

print('The columns in Asteroids but not in Hazardous Asteroids:')
for col in asteroids_cols:
    if col not in hazardous_asteroids_col:
        print(col)

# Visual separator between the two lists.
print('\n' + '#' * 80 + '\n')

print('The columns that are in Hazardous Asteroids but not in Asteroids:')
for col in hazardous_asteroids_col:
    if col not in asteroids_cols:
        print(col)

The columns in Asteroids but not in Hazerous Astorids:
Critical_list_numbered_object_flag

################################################################################

The columns that are in Hazerous Astorids but not in Asteroids:


# 6. Removing Unmatched Columns

In [14]:
# Drop the one column that the PHA export does not have. Reassigning the result
# (rather than using inplace=True) together with errors='ignore' keeps this cell
# safe to re-run: a second execution is a no-op instead of raising KeyError.
asteroids = asteroids.drop(columns='Critical_list_numbered_object_flag', errors='ignore')

In [15]:
# Re-run the two-way schema comparison after dropping the unmatched column.
# Both lists are expected to come back empty.
asteroids_cols = asteroids.columns
hazardous_asteroids_col = hazardous_asteroids.columns

print('The columns in Asteroids but not in Hazardous Asteroids:')
for col in asteroids_cols:
    if col not in hazardous_asteroids_col:
        print(col)

# Visual separator between the two lists.
print('\n' + '#' * 80 + '\n')

print('The columns that are in Hazardous Asteroids but not in Asteroids:')
for col in hazardous_asteroids_col:
    if col not in asteroids_cols:
        print(col)

The columns in Asteroids but not in Hazerous Astorids:

################################################################################

The columns that are in Hazerous Astorids but not in Asteroids:


# 7. Aligning Column Order

In [16]:
asteroids_cols = asteroids.columns
hazardous_asteroids_col = hazardous_asteroids.columns

# Build one report line per column (in the main catalogue's order) showing where
# the same column sits in each frame, then print the lines.
position_report = [
    f'{col}: index_in_asteroids={i}, index_in_hazardous={hazardous_asteroids_col.get_loc(col)}'
    for i, col in enumerate(asteroids_cols)
]
for report_line in position_report:
    print(report_line)

H: index_in_asteroids=0, index_in_hazardous=3
G: index_in_asteroids=1, index_in_hazardous=4
Num_obs: index_in_asteroids=2, index_in_hazardous=5
rms: index_in_asteroids=3, index_in_hazardous=6
U: index_in_asteroids=4, index_in_hazardous=7
Arc_years: index_in_asteroids=5, index_in_hazardous=8
Perturbers: index_in_asteroids=6, index_in_hazardous=9
Perturbers_2: index_in_asteroids=7, index_in_hazardous=10
Number: index_in_asteroids=8, index_in_hazardous=11
Name: index_in_asteroids=9, index_in_hazardous=12
Principal_desig: index_in_asteroids=10, index_in_hazardous=13
Other_desigs: index_in_asteroids=11, index_in_hazardous=34
Epoch: index_in_asteroids=12, index_in_hazardous=14
M: index_in_asteroids=13, index_in_hazardous=15
Peri: index_in_asteroids=14, index_in_hazardous=16
Node: index_in_asteroids=15, index_in_hazardous=17
i: index_in_asteroids=16, index_in_hazardous=18
e: index_in_asteroids=17, index_in_hazardous=19
n: index_in_asteroids=18, index_in_hazardous=20
a: index_in_asteroids=19, 

# 8. Reorder the Hazardous Dataset Columns

In [17]:
# Reorder the PHA columns to match the main catalogue's column order.
# The explicit .copy() makes the result an independent frame, so assigning the
# label column to it later does not trigger pandas' SettingWithCopyWarning.
hazardous_asteroids = hazardous_asteroids[asteroids.columns].copy()

In [18]:
# Confirm the PHA frame now follows the main catalogue's column order.
hazardous_asteroids.head()

Unnamed: 0,H,G,Num_obs,rms,U,Arc_years,Perturbers,Perturbers_2,Number,Name,Principal_desig,Other_desigs,Epoch,M,Peri,Node,i,e,n,a,Ref,Num_opps,Computer,Hex_flags,Last_obs,Tp,Orbital_period,Perihelion_dist,Aphelion_dist,Semilatus_rectum,Synodic_period,Orbit_type,NEO_flag,One_km_NEO_flag,PHA_flag,One_opposition_object_flag,Arc_length
0,16.53,0.15,1842,0.63,0,1949-2025,M-v,3Ek,(1566),Icarus,1949 MA,,2461000.5,153.07893,31.43821,87.95243,22.8032,0.827006,0.880548,1.078038,MPO950970,43,MPCLINUX,9803,2025-08-18,2460827.0,1.119312,0.186494,1.969581,0.170363,9.381419,Apollo,1.0,1.0,1.0,,
1,15.26,0.15,8261,0.51,0,1951-2025,M-v,3Ek,(1620),Geographos,1951 RA,[1983 CY3],2461000.5,212.92045,277.01839,337.14079,13.3358,0.335512,0.708833,1.245777,E2025-V79,41,MPCORBFIT,9803,2025-11-10,2461208.0,1.390466,0.827804,1.66375,0.552771,3.561042,Apollo,1.0,1.0,1.0,,
2,16.08,0.09,3129,0.72,0,1930-2025,M-v,3Ek,(1862),Apollo,1932 HA,,2461000.5,113.88454,286.05088,35.54163,6.3514,0.559918,0.552483,1.470924,E2025-UO9,37,MPCORBFIT,9803,2025-10-27,2460794.0,1.783961,0.647327,2.29452,0.504889,2.275574,Apollo,1.0,1.0,1.0,,
3,15.28,0.15,2264,0.64,0,1973-2025,M-v,3Ek,(1981),Midas,1973 EA,,2461000.5,65.34754,267.84424,356.79391,39.82339,0.650496,0.416332,1.776272,E2025-V79,28,MPCORBFIT,9803,2025-11-10,2460844.0,2.367358,0.620815,2.931728,0.512326,1.731337,Apollo,1.0,1.0,1.0,,
4,18.69,0.15,158,0.9,0,1936-2025,M-v,3Ek,(2101),Adonis,1936 CA,,2461000.5,24.47917,43.7116,349.40534,1.31997,0.764127,0.384185,1.874025,MPO937442,12,MPCLINUX,8803,2025-07-19,2460937.0,2.565448,0.442031,3.306019,0.3899,1.638795,Apollo,1.0,,1.0,,


# 9. Remove Hazardous Asteroids from Main Dataset

In [19]:
# Keep only the catalogue rows whose principal designation is NOT in the PHA list,
# so the two frames are disjoint before they are labelled and combined.
# .copy() detaches the filtered rows from the original frame, so assigning the
# label column to it later does not trigger pandas' SettingWithCopyWarning.
is_pha = asteroids['Principal_desig'].isin(hazardous_asteroids['Principal_desig'])
asteroids = asteroids[~is_pha].copy()

In [20]:
# Row count after removing PHAs: should equal the original row count minus the matched PHAs.
asteroids.shape

(1474585, 37)

# 10. Adding a Hazard Label

In [21]:
# Label every remaining catalogue row as non-hazardous (0). A scalar is broadcast
# to all rows, so the cell no longer depends on a hard-coded row count and keeps
# working when the catalogue is refreshed.
# NOTE: the column name keeps its existing spelling because later cells and the
# exported CSV refer to it.
asteroids['hazerdous'] = 0

In [22]:
# Check that the label column was added to the main catalogue.
asteroids.head()

Unnamed: 0,H,G,Num_obs,rms,U,Arc_years,Perturbers,Perturbers_2,Number,Name,Principal_desig,Other_desigs,Epoch,M,Peri,Node,i,e,n,a,Ref,Num_opps,Computer,Hex_flags,Last_obs,Tp,Orbital_period,Perihelion_dist,Aphelion_dist,Semilatus_rectum,Synodic_period,Orbit_type,NEO_flag,One_km_NEO_flag,PHA_flag,One_opposition_object_flag,Arc_length,hazerdous
0,3.35,0.15,7369.0,0.69,0,1801-2025,M-v,30k,(1),Ceres,A801 AA,"[A899 OF, 1943 XB]",2461000.5,231.53975,73.29974,80.24963,10.58789,0.079576,0.214297,2.765616,MPO950947,126,MPCORBFIT,4000,2025-10-03,2461600.0,4.599258,2.545538,2.985693,1.374051,1.277835,MBA,,,,,,0
1,4.11,0.15,9021.0,0.64,0,1804-2025,M-c,28k,(2),Pallas,A802 FA,,2461000.5,211.52977,310.9334,172.88859,34.92833,0.230643,0.213797,2.769926,E2025-V46,124,MPCORBFIT,4000,2025-11-05,2461695.0,4.610014,2.131062,3.40879,1.311288,1.277007,MBA,,,,,,0
2,5.19,0.15,7565.0,0.67,0,1804-2025,M-v,3Ek,(3),Juno,A804 RA,,2461000.5,217.59095,247.88367,169.81989,12.98604,0.255826,0.225799,2.670879,MPO937415,118,MPCORBFIT,4000,2025-08-08,2461631.0,4.364971,1.987599,3.354159,1.248039,1.297179,MBA,,,,,,0
3,3.25,0.15,7543.0,0.69,0,1821-2025,M-p,18k,(4),Vesta,A807 FA,,2461000.5,26.80968,151.53712,103.70232,7.14406,0.090168,0.271588,2.361541,MPO925791,112,MPCLINUX,4000,2025-06-24,2460902.0,3.629053,2.148607,2.574476,1.171171,1.380365,MBA,,,,,,0
4,6.97,0.15,3339.0,0.79,0,1845-2025,M-v,3Ek,(5),Astraea,A845 XA,[1969 SE],2461000.5,133.8676,359.34517,141.44862,5.35925,0.187509,0.238269,2.576865,MPO950947,89,MPCORBFIT,4000,2025-10-06,2460439.0,4.136542,2.09368,3.060049,1.243132,1.318823,MBA,,,,,,0


In [23]:
# Label every PHA row as hazardous (1). A scalar is broadcast to all rows, so the
# cell no longer depends on a hard-coded row count and keeps working when the
# PHA list is refreshed.
# NOTE: the column name keeps its existing spelling because later cells and the
# exported CSV refer to it.
hazardous_asteroids['hazerdous'] = 1

In [24]:
# Check that the label column was added to the PHA frame.
hazardous_asteroids.head()

Unnamed: 0,H,G,Num_obs,rms,U,Arc_years,Perturbers,Perturbers_2,Number,Name,Principal_desig,Other_desigs,Epoch,M,Peri,Node,i,e,n,a,Ref,Num_opps,Computer,Hex_flags,Last_obs,Tp,Orbital_period,Perihelion_dist,Aphelion_dist,Semilatus_rectum,Synodic_period,Orbit_type,NEO_flag,One_km_NEO_flag,PHA_flag,One_opposition_object_flag,Arc_length,hazerdous
0,16.53,0.15,1842,0.63,0,1949-2025,M-v,3Ek,(1566),Icarus,1949 MA,,2461000.5,153.07893,31.43821,87.95243,22.8032,0.827006,0.880548,1.078038,MPO950970,43,MPCLINUX,9803,2025-08-18,2460827.0,1.119312,0.186494,1.969581,0.170363,9.381419,Apollo,1.0,1.0,1.0,,,1
1,15.26,0.15,8261,0.51,0,1951-2025,M-v,3Ek,(1620),Geographos,1951 RA,[1983 CY3],2461000.5,212.92045,277.01839,337.14079,13.3358,0.335512,0.708833,1.245777,E2025-V79,41,MPCORBFIT,9803,2025-11-10,2461208.0,1.390466,0.827804,1.66375,0.552771,3.561042,Apollo,1.0,1.0,1.0,,,1
2,16.08,0.09,3129,0.72,0,1930-2025,M-v,3Ek,(1862),Apollo,1932 HA,,2461000.5,113.88454,286.05088,35.54163,6.3514,0.559918,0.552483,1.470924,E2025-UO9,37,MPCORBFIT,9803,2025-10-27,2460794.0,1.783961,0.647327,2.29452,0.504889,2.275574,Apollo,1.0,1.0,1.0,,,1
3,15.28,0.15,2264,0.64,0,1973-2025,M-v,3Ek,(1981),Midas,1973 EA,,2461000.5,65.34754,267.84424,356.79391,39.82339,0.650496,0.416332,1.776272,E2025-V79,28,MPCORBFIT,9803,2025-11-10,2460844.0,2.367358,0.620815,2.931728,0.512326,1.731337,Apollo,1.0,1.0,1.0,,,1
4,18.69,0.15,158,0.9,0,1936-2025,M-v,3Ek,(2101),Adonis,1936 CA,,2461000.5,24.47917,43.7116,349.40534,1.31997,0.764127,0.384185,1.874025,MPO937442,12,MPCLINUX,8803,2025-07-19,2460937.0,2.565448,0.442031,3.306019,0.3899,1.638795,Apollo,1.0,,1.0,,,1


# 11. Combine the Two Datasets

In [25]:
# Stack the two labelled frames, shuffle all rows with a fixed seed so the order
# is reproducible, and replace the shuffled index with a fresh 0..n-1 range.
df = (
    pd.concat([asteroids, hazardous_asteroids])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [26]:
# (rows, columns) of the combined dataset: rows from both frames plus the label column.
df.shape

(1477090, 38)

# 12. Final Class Balance Check

In [27]:
# Count rows per label value to see how imbalanced the two classes are.
df['hazerdous'].value_counts()

Unnamed: 0_level_0,count
hazerdous,Unnamed: 1_level_1
0,1474585
1,2505


In [28]:
# Preview the shuffled, combined dataset.
df.head()

Unnamed: 0,H,G,Num_obs,rms,U,Arc_years,Perturbers,Perturbers_2,Number,Name,Principal_desig,Other_desigs,Epoch,M,Peri,Node,i,e,n,a,Ref,Num_opps,Computer,Hex_flags,Last_obs,Tp,Orbital_period,Perihelion_dist,Aphelion_dist,Semilatus_rectum,Synodic_period,Orbit_type,NEO_flag,One_km_NEO_flag,PHA_flag,One_opposition_object_flag,Arc_length,hazerdous
0,18.31,0.15,101.0,0.9,0,2003-2025,M-v,3Ek,(440131),,2003 SR329,,2461000.5,98.01667,21.2949,20.45607,2.17419,0.22058,0.236456,2.590018,MPO891325,7,MPCLINUX,0,2025-01-02,2460586.0,4.168255,2.018711,3.161326,1.231999,1.315631,MBA,,,,,,0
1,17.14,0.15,250.0,0.89,0,2005-2024,M-v,3Ek,(596642),,2005 YC185,"[2010 CY86, 2012 UW121, 2014 FL44]",2461000.5,245.12922,329.93421,318.53094,4.99459,0.125079,0.255371,2.46049,MPO892149,13,MPCLINUX,0,2024-12-20,2461450.0,3.859512,2.152733,2.768246,1.210998,1.34971,MBA,,,,,,0
2,17.12,0.15,213.0,0.99,0,2008-2025,M-v,3Ek,(484515),,2008 EL51,,2461000.5,7.73489,333.54655,190.29557,15.46781,0.250532,0.222208,2.699581,MPO905048,9,MPCLINUX,0,2025-01-26,2460966.0,4.435521,2.023249,3.375913,1.265069,1.291077,MBA,,,,,,0
3,18.15,0.15,206.0,0.95,0,2002-2025,M-v,3Ek,(660520),,2002 AC173,,2461000.5,227.02952,5.59939,113.34616,2.06068,0.13808,0.273783,2.3489,MPO954653,12,MPCORBFIT,0,2025-08-16,2461486.0,3.599953,2.024565,2.673236,1.152058,1.384622,MBA,,,,,,0
4,15.36,0.15,1119.0,0.81,0,2001-2025,M-v,3Ek,(188731),,2005 US54,,2461000.5,296.32977,319.59282,28.96358,14.85383,0.166574,0.230973,2.630845,MPO927334,18,MPCLINUX,0,2025-06-14,2461276.0,4.267199,2.192614,3.069076,1.278924,1.306072,MBA,,,,,,0


# 13. Exporting the processed data set

In [34]:
# Write the combined, labelled dataset to the processed-data folder.
# index=False leaves the row index out of the CSV.
processed_csv_path = '/content/drive/MyDrive/Colab Notebooks/Personal/asteroid_project/data/processed/asteroids.csv'
df.to_csv(processed_csv_path, index=False)