### Library Imports

In [1]:
# Imports
import pandas as pd
import numpy as np
import pickle
import datetime
from shapely import geometry
import geopandas
import shapely.wkt

pd.options.display.max_colwidth = 100000

### Read in Data

In [2]:
# Read FIR Codes
df = pd.read_csv('data/NA_FIR_Codes.csv')

# Read Vertices
df2 = pd.read_csv('data/vertices_20220621.csv')

# Read Spaceports
df3 = pd.read_csv ('data/spaceports_20201027.csv')

# Read pickle file with topics and augmented text
df4 = pd.read_pickle("data/allData.pkl")

# Read Polygon File
df5 = pd.read_csv('data/polygon_20201027.csv')

# Read in launch data
df6 = pd.read_csv('data/launches_20201027.csv', parse_dates=['LAUNCH_DATE'])

# Annotated Data
df7 = pd.read_csv('data/HumanAnnotatedMatches_SVO_DB_20200127_pipes_noquotes.csv', encoding='UTF-8', on_bad_lines='skip', engine="python", delimiter='|')

# Read Basemap Shapefile
states = geopandas.read_file('data/bound_p.shx')

### Helper Functions

#### Facet Filter Function
Applies all our filters to each row of a dataframe

In [3]:
# Processing function
def getNotams(time, launch_id):
    # Date filter
    filterA = df4[((df4['POSSIBLE_START_DATE'] - datetime.timedelta(hours = 8)) <= time) & 
                  ((df4['POSSIBLE_END_DATE'] + datetime.timedelta(hours = 8)) >= time)]
    
    # Altitude filter
    filterB = filterA[filterA['MAX_ALT'] >= 50 | filterA['MAX_ALT'].isna()]
    
    # FIR Code filter
    filterC = filterB[filterB['LOCATION_CODE'].isin(df['FIR']) | filterB['LOCATION_CODE'].isna()]
    
    # Keyword filter
    filterD = filterC[(filterC['TEXT'].str.contains(r'(?:\s|^)rocket(?:\s|$)') == True) | 
                      (filterC['TEXT'].str.contains(r'(?:\s|^)space(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)launch(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)missile(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)canaveral(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)kennedy(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)nasa(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)antares(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)orion(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)atlas(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)zenit(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)falcon(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)dragon(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)spaceship(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)minuteman(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)trident(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)unlimited(?:\s|$)') == True)]
    
    # Topic filter
    filterE = filterD[(filterD['TOPIC'] == 0) | (filterD['TOPIC'] == 7)]
    
    # Get IDs for Candidate NOTAMs
    id_list, rec_ids = getIds(filterE, df5)

    # Get Polygons for Candidate NOTAMs
    P = getPolygons(id_list)
    test2 = pd.DataFrame(P, index=['boundary']).rename_axis('polygon_id', axis=1).transpose().reset_index()

    # Return if no polygons found
    if id_list == []:
        return 
    
    # Get all spaceport to polygon interactions
    interactions_df = find_interactions(sp_df_2, test2, ['SPACEPORT_REC_ID'], ['polygon_id'], 'buffer', 'boundary')
    interactions_df['NOTAM_REC_ID'] = [rec_ids[x] for x in interactions_df['polygon_id']]
    
    # Get polygon intersections from data pre-topic filter
    filterF = filterE[(filterE['NOTAM_REC_ID'].isin(interactions_df[interactions_df['SPACEPORT_REC_ID'] == launch_id]['NOTAM_REC_ID'].unique()))]
    
    # If intersections exist return them
    if len(filterF['NOTAM_REC_ID'] > 0):
        return filterF['NOTAM_REC_ID'].tolist()

#### Get Polygon Ids and Polygon Geometries

In [4]:
# Creates WKT Polygons
def getPolygons(id_list):
    all_polys = {}

    for poly_id in id_list:
        tmp = df2[df2['POLYGON_ID'] == poly_id]

        long = tmp['LONGITUDE'].astype(str).values
        lat = tmp['LATITUDE'].astype(str).values

        pointList = list(zip(tmp['LONGITUDE'], tmp['LATITUDE']))

        if len(pointList) == 1:
            poly = geometry.Point([pointList[0][0], pointList[0][1]])
        elif len(pointList) == 2:
            poly = geometry.LineString([[p[0], p[1]] for p in pointList])
        else:
            poly = geometry.Polygon([[p[0], p[1]] for p in pointList])

        all_polys[poly_id] = poly
        
    return all_polys

# Gets Poylgon ID from NOTAM ID
def getIds(rec_ids, poly_df):
    id_list = []
    rec_id = {}
    for row in rec_ids['NOTAM_REC_ID']:
        for item in poly_df[poly_df['NOTAM_REC_ID'] == row]['POLYGON_ID'].values:
            id_list.append(item)
            rec_id[item] = row
            
    return id_list, rec_id

#### Geocircle and Conversion Functions

In [5]:
#This is a utility function for converting distance values using different units
def conv_dist(distance_value, units_value):
    # Determine the conversion factor for the specified units (meters are required for this projection)
    if units_value == "mi":
        unit_factor = 1609.344
    elif units_value == "km":
        unit_factor = 1000.0
    elif units_value == "ft":
        unit_factor = 0.3048
    elif units_value == "nm":
        unit_factor = 1852
    elif units_value == "m":
        unit_factor = 1
    else:  # Bad units
        unit_factor = 0

    return distance_value * unit_factor

#This function creates geospatial circle(s) based on center, radius and unit values in the dataset
def gen_geocircle(input_df, key_col, center_col, radius_col, units_col):

    # Convert point data to geopandas dataframe
    working_cols = [key_col] + [center_col] + [radius_col] + [units_col]
    return_cols = [key_col] + ["buffer"]
    
    pointsdf = input_df[working_cols]
    gdf_pts = geopandas.GeoDataFrame(pointsdf, geometry=center_col)
    
    # Add CRS (start with WGS84 to match lat/lon values)
    gdf_pts.set_crs(epsg=4326, inplace=True)

    #Prepare projection (North America Lambert Conformal Conic)
    # This projection is equidistant for measuring between points.
    # Units are in meters
    projout = '+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m no_defs'

    # Convert to Lambert projection
    gdf_pts = gdf_pts.to_crs(projout)
    gdf_pts[units_col] = gdf_pts[units_col].str.lower()
    gdf_pts["dist"] = 0

    for pt_index,pt_row in gdf_pts.iterrows():
        dvalue = conv_dist(pt_row[2], pt_row[3])

        gdf_pts.loc[pt_index,'dist'] = dvalue

    gdf_pts["buffer"] = gdf_pts[center_col].buffer(gdf_pts['dist'])

    gdf_circle = gdf_pts[return_cols]

    gdf_circle = geopandas.GeoDataFrame(gdf_circle, geometry='buffer')

    gdf_circle = gdf_circle.to_crs(epsg=4326)

    return gdf_circle

#### Interactions Function

In [6]:
#This function determines takes two sets of geospatial objects and determines which ones interact
def find_interactions(geom_set1_df, geom_set2_df,
                      set1_key_cols, set2_key_cols,
                      set1_geometry_col, set2_geometry_col
                      ):

    set1_working_cols = set1_key_cols + [set1_geometry_col]
    set2_working_cols = set2_key_cols + [set2_geometry_col]
    
    # Convert point data to geopandas dataframe

    gdf1 = geom_set1_df[set1_working_cols]
    gdf1 = geopandas.GeoDataFrame(gdf1, geometry=set1_geometry_col)

    # Add CRS (start with WGS84 to match lat/lon values)
    gdf1.set_crs(epsg=4326, inplace=True)
    gdf1_type = gdf1.loc[0, set1_geometry_col].geom_type

    #Convert polygon data to geopandas dataframe
    gdf2 = geom_set2_df[set2_working_cols]
    gdf2 = geopandas.GeoDataFrame(gdf2, geometry=set2_geometry_col)

    # Add CRS (start with WGS84 to match lat/lon values)
    gdf2.set_crs(epsg=4326, inplace=True)
    gdf2_type = gdf2.loc[0, set2_geometry_col].geom_type

    #If datasets are mixed (one polygon and one linestring), ensure polygons are gdf1
    if (gdf1_type == 'LineString' or gdf1_type == 'Point') and gdf2_type == 'Polygon':
        gdf_temp = gdf1
        gdf1 = gdf2
        gdf2 = gdf_temp
        gdf2_type = gdf1_type
        gdf1_type = 'Polygon'
        keys_temp = set1_key_cols
        set1_key_cols = set2_key_cols
        set2_key_cols = keys_temp
        geom_temp = set1_geometry_col
        set1_geometry_col = set2_geometry_col
        set2_geometry_col = geom_temp
        
    # Convert to new equidistant projection

    #Prepare projection (North America Lambert Conformal Conic)
    # This projection is equidistant for measuring between points.
    # Units are in meters
    projout = '+proj=lcc +lat_1=20 +lat_2=60 +lat_0=40 +lon_0=-96 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m no_defs'

    # Convert to Lambert projection
    gdf1 = gdf1.to_crs(projout)

    # Convert to Lambert projection
    gdf2 = gdf2.to_crs(projout)

    results = pd.DataFrame(columns=(set1_key_cols + set2_key_cols + ['Interaction']))
    
    df_index = 0
    
    for gdf1_index,gdf1_row in gdf1.iterrows():
        for gdf2_index,gdf2_row in gdf2.iterrows():
            interaction = gdf1.loc[gdf1_index, set1_geometry_col].intersects(gdf2.loc[gdf2_index, set2_geometry_col])
            if interaction:
                interact_str = ['Intersect']
                results.loc[df_index] = np.concatenate((gdf1.loc[gdf1_index, set1_key_cols].values,
                                            gdf2.loc[gdf2_index, set2_key_cols].values,
                                            interact_str),axis=None)
                df_index += 1
            
    return results

In [7]:
#Convert the latitude/longtitude values to geospatial points
spaceport_df = df3

spaceport_df['FacilityLocation'] = [geometry.Point(xy) for xy in zip(spaceport_df['LONGITUDE'], spaceport_df['LATITUDE'])]

spaceport_df['radius'] = 50
spaceport_df['units'] = 'nm'

spaceport_df = spaceport_df[spaceport_df['SPACEPORT_REC_ID'] > 1]

sp_df_2 = gen_geocircle(spaceport_df, 'SPACEPORT_REC_ID', 'FacilityLocation', 'radius', 'units')
sp_df_2.reset_index(inplace=True)

### Apply Facet Filter to All NOTAMs

In [8]:
# Make predictions for launches
df6['DISCOVERED'] = df6.apply(lambda x: getNotams(x['LAUNCH_DATE'], x['SPACEPORT_REC_ID']), axis=1)

### Making Alternative Dataset

In [9]:
good_matches = np.unique(np.hstack(df6[~df6['DISCOVERED'].isnull().values]['DISCOVERED'].values))

In [10]:
# Altitude filter
bad_filterA = df4[df4['MAX_ALT'] < 50000]
    
# FIR Code filter
bad_filterB = bad_filterA[~bad_filterA['LOCATION_CODE'].isin(df['FIR'])]
    
# Topic filter
bad_filterC = bad_filterB[(bad_filterB['TOPIC'] != 0) & (bad_filterB['TOPIC'] != 7)]['NOTAM_REC_ID'].values
    
bad_matches = np.unique(np.hstack((bad_filterC)))

In [11]:
bad_matches.shape

(777414,)

In [12]:
good_matches.shape

(767,)

In [13]:
np.savetxt('data/handmade_good_matches.csv', good_matches, delimiter=",")
np.savetxt('data/handmade_bad_matches.csv', bad_matches, delimiter=",")