### Library Imports

In [1]:
# Imports
import pandas as pd
import numpy as np
import pickle
import datetime
from shapely import geometry
import geopandas
import shapely.wkt
import collections

pd.options.display.max_colwidth = 100000

### Read in Data

In [2]:
# Read FIR Codes
df = pd.read_csv('data/NA_FIR_Codes.csv')

# Read Vertices
df2 = pd.read_csv('data/vertices_20220621.csv')

# Read Spaceports
df3 = pd.read_csv ('data/spaceports_20201027.csv')

# Read pickle file with topics and augmented text
df4 = pd.read_pickle("data/allData.pkl")

# Read Polygon File
df5 = pd.read_csv('data/polygon_20201027.csv')

# Read in launch data
df6 = pd.read_csv('data/launches_20201027.csv', parse_dates=['LAUNCH_DATE'])

# Annotated Data
df7 = pd.read_csv('data/HumanAnnotatedMatches_SVO_DB_20200127_pipes_noquotes.csv', encoding='UTF-8', on_bad_lines='skip', engine="python", delimiter='|')

# Read Basemap Shapefile
states = geopandas.read_file('data/bound_p.shx')

### Helper Functions

#### Facet Filter Function
Applies all our filters to each row of a dataframe

In [3]:
# Processing function
def getNotams(time, launch_id):
    # Date filter
    filterA = df4[((df4['POSSIBLE_START_DATE'] - datetime.timedelta(hours = 8)) <= time) & 
                  ((df4['POSSIBLE_END_DATE'] + datetime.timedelta(hours = 8)) >= time)]
    
    # Altitude filter
    filterB = filterA[filterA['MAX_ALT'] >= 50 | filterA['MAX_ALT'].isna()]
    
    # Classification Filter
    filterC = filterB[(filterB['CLASSIFICATION'] != 'MIL') & (filterB['CLASSIFICATION'] != 'LMIL')]
    
    # FIR Code filter
    filterD = filterC[filterC['LOCATION_CODE'].isin(df['FIR']) | filterC['LOCATION_CODE'].isna()]
    
    # Topic filter
    filterE = filterD[(filterD['TOPIC'] == 0) | (filterD['TOPIC'] == 7)]
    
    # NLP Stacking
    filterF = filterE[filterE['XGB'] + filterE['XGB_NEW'] + filterE['LinReg_NEW'] + filterE['SVM_NEW'] >= 3]
    
    # Get IDs for Candidate NOTAMs
    id_list, rec_ids = getIds(filterF, df5)

    # Get Polygons for Candidate NOTAMs
    P = getPolygons(id_list)
    test2 = pd.DataFrame(P, index=['boundary']).rename_axis('polygon_id', axis=1).transpose().reset_index()

    # Return if no polygons found
    if id_list == []:
        return 
    
    # Get all spaceport to polygon interactions
    interactions_df = find_interactions(sp_df_2, test2, ['SPACEPORT_REC_ID'], ['polygon_id'], 'buffer', 'boundary')
    interactions_df['NOTAM_REC_ID'] = [rec_ids[x] for x in interactions_df['polygon_id']]
    
    # Get polygon intersections from data pre-topic filter
    filterG = filterF[(filterF['NOTAM_REC_ID'].isin(interactions_df[interactions_df['SPACEPORT_REC_ID'] == launch_id]['NOTAM_REC_ID'].unique()))]
    
    # If intersections exist return them
    if len(filterG['NOTAM_REC_ID'] > 0):
        return filterG['NOTAM_REC_ID'].tolist()

#### Get Polygon Ids and Polygon Geometries

In [4]:
# Creates WKT Polygons
def getPolygons(id_list):
    all_polys = {}

    for poly_id in id_list:
        tmp = df2[df2['POLYGON_ID'] == poly_id]

        long = tmp['LONGITUDE'].astype(str).values
        lat = tmp['LATITUDE'].astype(str).values

        pointList = list(zip(tmp['LONGITUDE'], tmp['LATITUDE']))

        if len(pointList) == 1:
            poly = geometry.Point([pointList[0][0], pointList[0][1]])
        elif len(pointList) == 2:
            poly = geometry.LineString([[p[0], p[1]] for p in pointList])
        else:
            poly = geometry.Polygon([[p[0], p[1]] for p in pointList])

        all_polys[poly_id] = poly
        
    return all_polys

# Gets Poylgon ID from NOTAM ID
def getIds(rec_ids, poly_df):
    id_list = []
    rec_id = {}
    for row in rec_ids['NOTAM_REC_ID']:
        for item in poly_df[poly_df['NOTAM_REC_ID'] == row]['POLYGON_ID'].values:
            id_list.append(item)
            rec_id[item] = row
            
    return id_list, rec_id

#### Geocircle and Conversion Functions

In [5]:
#This is a utility function for converting distance values using different units
def conv_dist(distance_value, units_value):
    # Determine the conversion factor for the specified units (meters are required for this projection)
    if units_value == "mi":
        unit_factor = 1609.344
    elif units_value == "km":
        unit_factor = 1000.0
    elif units_value == "ft":
        unit_factor = 0.3048
    elif units_value == "nm":
        unit_factor = 1852
    elif units_value == "m":
        unit_factor = 1
    else:  # Bad units
        unit_factor = 0

    return distance_value * unit_factor

#This function creates geospatial circle(s) based on center, radius and unit values in the dataset
def gen_geocircle(input_df, key_col, center_col, radius_col, units_col):

    # Convert point data to geopandas dataframe
    working_cols = [key_col] + [center_col] + [radius_col] + [units_col]
    return_cols = [key_col] + ["buffer"]
    
    pointsdf = input_df[working_cols]
    gdf_pts = geopandas.GeoDataFrame(pointsdf, geometry=center_col)
    
    # Add CRS (start with WGS84 to match lat/lon values)
    gdf_pts.set_crs(epsg=4326, inplace=True)

    #Prepare projection (North America Lambert Conformal Conic)
    # This projection is equidistant for measuring between points.
    # Units are in meters
    projout = '+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m no_defs'

    # Convert to Lambert projection
    gdf_pts = gdf_pts.to_crs(projout)
    gdf_pts[units_col] = gdf_pts[units_col].str.lower()
    gdf_pts["dist"] = 0

    for pt_index,pt_row in gdf_pts.iterrows():
        dvalue = conv_dist(pt_row[2], pt_row[3])
        gdf_pts.loc[pt_index,'dist'] = dvalue

    gdf_pts["buffer"] = gdf_pts[center_col].buffer(gdf_pts['dist'])

    gdf_circle = gdf_pts[return_cols]

    gdf_circle = geopandas.GeoDataFrame(gdf_circle, geometry='buffer')

    gdf_circle = gdf_circle.to_crs(epsg=4326)
    
    return gdf_circle

#### Interactions Function

In [6]:
#This function determines takes two sets of geospatial objects and determines which ones interact
def find_interactions(geom_set1_df, geom_set2_df,
                      set1_key_cols, set2_key_cols,
                      set1_geometry_col, set2_geometry_col
                      ):

    set1_working_cols = set1_key_cols + [set1_geometry_col]
    set2_working_cols = set2_key_cols + [set2_geometry_col]
    
    # Convert point data to geopandas dataframe

    gdf1 = geom_set1_df[set1_working_cols]
    gdf1 = geopandas.GeoDataFrame(gdf1, geometry=set1_geometry_col)

    # Add CRS (start with WGS84 to match lat/lon values)
    gdf1.set_crs(epsg=4326, inplace=True)
    gdf1_type = gdf1.loc[0, set1_geometry_col].geom_type

    #Convert polygon data to geopandas dataframe
    gdf2 = geom_set2_df[set2_working_cols]
    gdf2 = geopandas.GeoDataFrame(gdf2, geometry=set2_geometry_col)

    # Add CRS (start with WGS84 to match lat/lon values)
    gdf2.set_crs(epsg=4326, inplace=True)
    gdf2_type = gdf2.loc[0, set2_geometry_col].geom_type

    #If datasets are mixed (one polygon and one linestring), ensure polygons are gdf1
    if (gdf1_type == 'LineString' or gdf1_type == 'Point') and gdf2_type == 'Polygon':
        gdf_temp = gdf1
        gdf1 = gdf2
        gdf2 = gdf_temp
        gdf2_type = gdf1_type
        gdf1_type = 'Polygon'
        keys_temp = set1_key_cols
        set1_key_cols = set2_key_cols
        set2_key_cols = keys_temp
        geom_temp = set1_geometry_col
        set1_geometry_col = set2_geometry_col
        set2_geometry_col = geom_temp
        
    # Convert to new equidistant projection

    #Prepare projection (North America Lambert Conformal Conic)
    # This projection is equidistant for measuring between points.
    # Units are in meters
    projout = '+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m no_defs'

    # Convert to Lambert projection
    gdf1 = gdf1.to_crs(projout)

    # Convert to Lambert projection
    gdf2 = gdf2.to_crs(projout)

    results = pd.DataFrame(columns=(set1_key_cols + set2_key_cols + ['Interaction']))
    
    df_index = 0
    
    for gdf1_index,gdf1_row in gdf1.iterrows():
        for gdf2_index,gdf2_row in gdf2.iterrows():
            interaction = gdf1.loc[gdf1_index, set1_geometry_col].intersects(gdf2.loc[gdf2_index, set2_geometry_col])
            if interaction:
                interact_str = ['Intersect']
                results.loc[df_index] = np.concatenate((gdf1.loc[gdf1_index, set1_key_cols].values,
                                            gdf2.loc[gdf2_index, set2_key_cols].values,
                                            interact_str),axis=None)
                df_index += 1
            
    return results

In [7]:
#Convert the latitude/longtitude values to geospatial points
spaceport_df = df3

spaceport_df['FacilityLocation'] = [geometry.Point(xy) for xy in zip(spaceport_df['LONGITUDE'], spaceport_df['LATITUDE'])]

spaceport_df['radius'] = 50
spaceport_df['units'] = 'nm'

spaceport_df = spaceport_df[spaceport_df['SPACEPORT_REC_ID'] > 1]

sp_df_2 = gen_geocircle(spaceport_df, 'SPACEPORT_REC_ID', 'FacilityLocation', 'radius', 'units')
sp_df_2.reset_index(inplace=True)

### Apply Facet Filter to All NOTAMs

In [8]:
# Make predictions for launches
df6['DISCOVERED'] = df6.apply(lambda x: getNotams(x['LAUNCH_DATE'], x['SPACEPORT_REC_ID']), axis=1)

### Print All Matches

In [9]:
# Final list of found launches
df6[~df6['DISCOVERED'].isnull().values]

Unnamed: 0,LAUNCHES_REC_ID,LAUNCH_DATE,VEHICLE_NAME,PAD_NAME,PAYLOAD,AGENCY,PURPOSE,ORBIT_TYPE,OUTCOME,SPACE_LAUNCH_REPORT_FLAG,GUNTER_FLAG,SPACE_FLIGHT_NOW_FLAG,NOONAN_FLAG,AST_FLAG,WIKIPEDIA_FLAG,JSR_FLAG,SPACEPORT_REC_ID,DISCOVERED
200,201,2015-10-07 11:02:00,Black Brant IX,,,NASA,,,Successful,False,False,False,False,False,False,True,9.0,[25525]
233,234,2016-03-01 02:52:00,Terrier Imp Malemute,,,NASA,,,Successful,False,False,False,False,False,False,True,9.0,[25525]
235,236,2016-03-07 00:00:00,Terrier Improved Orion,,,NASA,,,Successful,False,False,False,False,False,False,True,9.0,[25525]
242,243,2016-05-06 05:21:00,Falcon 9 v1.2,SLC-40,JCSat 14,SpaceX,Communications,Geosynchronous,Operational,True,True,True,True,True,True,True,2.0,"[21915, 21916, 21917]"
247,248,2016-05-26 21:40:00,Falcon 9 v1.2,SLC-40,Thiacom 8,SpaceX,Communications,Geosynchronous,Scrubbed,False,False,False,True,False,False,False,2.0,"[1006, 1076, 1121, 1140, 1155, 5669, 5719, 6132, 6133, 6256]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,511,2020-03-18 12:16:39,Falcon 9 v1.2,LC-39A,Starlink 1 F5,SpaceX,Communications,Low Earth,Operational,True,True,True,False,True,True,False,8.0,"[1596831, 1596863, 1596864, 1596865, 1597003, 1597348, 1597349, 1599064, 1601738, 1601739]"
511,512,2020-03-26 20:18:00,Atlas 5-551,SLC-41,AEHF 6,US Military,Communications,Supersynchronous,,True,True,True,False,False,True,False,2.0,"[1607204, 1607205, 1608021, 1608022, 1608023, 1608074, 1612521, 1612522]"
513,514,2020-04-22 19:30:30,Falcon 9 v1.2,LC-39A,Starlink 1 F6,SpaceX,Communications,Low Earth,Operational,True,True,True,False,True,True,False,8.0,"[1624825, 1629217, 1629218]"
514,515,2020-05-17 13:14:00,Atlas 5-501,SLC-41,USSF 7 (X37B OTV6),US Military,Testing,Low Earth,Operational,True,True,True,False,False,True,False,2.0,"[1634274, 1634275, 1634276, 1634442, 1634443, 1634444, 1634445, 1634797, 1634798, 1634799, 1634800, 1634871, 1634872, 1634873, 1634874, 1634887, 1634888, 1634889, 1634899, 1634900]"


### Check for Duplicates

In [10]:
all_matched = []
for row in df6[~df6['DISCOVERED'].isnull().values]['DISCOVERED']:
    all_matched.append(row)
    
all_matched = [element for sublist in all_matched for element in sublist]

dups = [item for item, count in collections.Counter(all_matched).items() if count > 1]
print(dups)

[25525, 4722, 360970, 360983, 510253, 510263, 607593, 607594, 765453, 765797, 830963, 830964, 843822, 841265, 1243795, 1243803, 1392897, 1392902]


### Print Sample Matched NOTAMs

In [11]:
# Get Matched IDs for each Launch
matched_dict = {}
for launch_id in df7['LAUNCHES_REC_ID'].unique():
    matched_dict[launch_id] = df7[df7['LAUNCHES_REC_ID'] == launch_id]['NOTAM_REC_ID'].to_numpy()

In [12]:
# Human Matched Launchs
matched_dict.keys()

dict_keys([391, 431, 347, 475, 308, 442, 284, 389, 325, 365, 395, 316, 452, 456, 387, 261, 511, 410, 419, 263, 324, 379, 430, 517, 468, 514, 449, 474, 312, 407, 512, 305, 260, 307, 270, 413, 341, 243, 404, 339, 262, 510, 420, 499, 254, 399, 383, 279, 500, 355, 286, 437, 402, 353, 373, 359, 445, 301, 332, 400, 300])

In [13]:
# Predictions
df6[df6['LAUNCHES_REC_ID'] == 500]['DISCOVERED'].to_numpy()

array([list([1555934, 1555979, 1555998, 1556017, 1556025, 1556457, 1556516, 1556542, 1556548, 1565240, 1565251, 1567932, 1568298])],
      dtype=object)

In [14]:
# Actual
matched_dict[500]

array([1555934, 1567932, 1555979, 1555998, 1556017, 1556025])

In [15]:
df4[df4['NOTAM_REC_ID'] == 1556457]['TEXT']

Name: TEXT, dtype: object

In [16]:
total = 0
total_correct = 0
new_items = 0

for key in df6[~df6['DISCOVERED'].isnull().values]['LAUNCHES_REC_ID'].unique():
    try:
        match = matched_dict[key]
    except:
        match = []
    
    predict = df6[df6['LAUNCHES_REC_ID'] == key]['DISCOVERED']
    
    #print('Launch ID', key)
    # Match=Predict
    #print(set(match).intersection(predict.values[0]))
    
    # Things I missed
    #print(set(match).difference(predict.values[0]))
    
    # New Things I found
    #print(set(predict.values[0]).difference(match))
    
    total += len(match)
    total_correct += len(set(match).intersection(predict.values[0]))
    new_items += len(set(predict.values[0]).difference(match))
    
print(f"Total Correct: {total_correct/total :.3%}")
print(f"Total Missed: {(total-total_correct)/total :.3%}")
print(f"New Things: {(new_items)/total :.3%}")

Total Correct: 81.019%
Total Missed: 18.981%
New Things: 250.231%


### Save Results

In [17]:
df6[~df6['DISCOVERED'].isnull().values][['LAUNCHES_REC_ID', 'DISCOVERED']].to_csv('data/results_NoKeywords.csv',index=False)