In [1]:
# Imports
import pickle
import pandas  as pd
import numpy   as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from datetime import datetime
pd.options.display.max_colwidth = 100000

In [2]:
# Read pickle file with topics and augmented text
df4 = pd.read_pickle("data/allData.pkl")

## XGBoost Pre-filtering

Sources: 
* [DataCampt Tutorial](https://www.datacamp.com/tutorial/xgboost-in-python)
* [ML Mastery Tutorial](https://machinelearningmastery.com/data-preparation-gradient-boosting-xgboost-python/)

In [7]:
# Add binary column
df4['y'] = df4['TOPIC'].apply(lambda x: True if x in [0,7] else False)

In [59]:
# Keep only columns that have relevant information
cols_X = ['NOTAM_TYPE', 'SIMPLE_TEXT', 'LOC_ID_ACCOUNTABLE_ORG',
          'TEXT', 'Q_CODE', 'Q_CODE_INTERPRETATION', 
          'A_CODE', 'B_CODE', 'C_CODE', 'D_CODE',
          'E_CODE', 'F_CODE', 'G_CODE', 'CLASSIFICATION',
          'MIN_ALT', 'MAX_ALT', 'MIN_ALT_REF_TYPE', 'MAX_ALT_REF_TYPE',
          'POSSIBLE_START_DATE', 'POSSIBLE_END_DATE', 'ISSUE_DATE',
          'CANCELED_DATE', 'AFFECTED_FIR', 'DESIGNATOR',
          'LOCATION_NAME', 'ACCOUNT_ID', 'LOCATION_CODE']

In [61]:
# Prepare data for model
X = df4[cols_X]
y = df4['y']

In [62]:
# Create optimized data matrix
data_dmatrix = xgb.DMatrix(data=X,label=y)

ValueError: DataFrame.dtypes for data must be int, float, bool or category.  When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`. Invalid columns:NOTAM_TYPE, SIMPLE_TEXT, LOC_ID_ACCOUNTABLE_ORG, TEXT, Q_CODE, Q_CODE_INTERPRETATION, A_CODE, B_CODE, C_CODE, D_CODE, E_CODE, F_CODE, G_CODE, CLASSIFICATION, MIN_ALT_REF_TYPE, MAX_ALT_REF_TYPE, POSSIBLE_START_DATE, POSSIBLE_END_DATE, ISSUE_DATE, CANCELED_DATE, AFFECTED_FIR, DESIGNATOR, LOCATION_NAME, ACCOUNT_ID, LOCATION_CODE

In [4]:
df4.head()

Unnamed: 0,NOTAM_REC_ID,FNS_ID,FILENAME,NOTAM_ID,NOTAM_TYPE,RELATED_NOTAM_ID,SIMPLE_TEXT,LOC_ID_ACCOUNTABLE_ORG,NOTAM_NUMBER,RELATED_NOTAM_NUMBER,...,ISSUE_DATE,CANCELED_DATE,AFFECTED_FIR,DESIGNATOR,DESIGNATOR_NAME,LOCATION_NAME,ACCOUNT_ID,LOCATION_CODE,LAUNCHES_REC_ID,TOPIC
0,1,FNS_ID_43130705,E:\Working\Sherlock_NOTAM_Data\Temporary\notam.20160524T1918Z_tmp.xml,0604/16,,,!SUAC 05/604 ZAU AIRSPACE VOLK SOUTH MOA ACT 500FT UP TO BUT NOT INCLUDING FL180 1605241730-1605241945,SUAC,05/604,,...,2016-05-24 04:35:00,2016-05-24 12:10:00,,,,ZAU ARTCC,SUAC,ZAU,,3
1,2,FNS_ID_43130704,E:\Working\Sherlock_NOTAM_Data\Temporary\notam.20160524T1918Z_tmp.xml,0603/16,,,!SUAC 05/603 ZAU AIRSPACE VOLK EAST MOA ACT 8000FT UP TO BUT NOT INCLUDING FL180 1605241730-1605241945,SUAC,05/603,,...,2016-05-24 04:35:00,2016-05-24 12:10:00,,,,ZAU ARTCC,SUAC,ZAU,,3
2,3,FNS_ID_43130706,E:\Working\Sherlock_NOTAM_Data\Temporary\notam.20160524T1918Z_tmp.xml,0605/16,,,!SUAC 05/605 ZMP AIRSPACE FALLS 1 MOA ACT 500FT UP TO BUT NOT INCLUDING FL180 1605241730-1605241945,SUAC,05/605,,...,2016-05-24 04:35:00,2016-05-24 12:10:00,,,,ZMP ARTCC,SUAC,ZMP,,3
3,4,FNS_ID_43130708,E:\Working\Sherlock_NOTAM_Data\Temporary\notam.20160524T1918Z_tmp.xml,0607/16,,,!SUAC 05/607 ZMP AIRSPACE FALLS 2 MOA ACT 500FT UP TO BUT NOT INCLUDING FL180 1605241730-1605241945,SUAC,05/607,,...,2016-05-24 04:35:00,2016-05-24 12:10:00,,,,ZMP ARTCC,SUAC,ZMP,,3
4,5,FNS_ID_43130703,E:\Working\Sherlock_NOTAM_Data\Temporary\notam.20160524T1918Z_tmp.xml,0602/16,,,!SUAC 05/602 ZMP AIRSPACE VOLK WEST MOA ACT 100FT UP TO BUT NOT INCLUDING FL180 1605241730-1605241945,SUAC,05/602,,...,2016-05-24 04:35:00,2016-05-24 12:10:00,,,,ZMP ARTCC,SUAC,ZMP,,3


* Why the time filter?
* Why the altitude filter?

---

In [32]:
# One off testing and tuning

#time = '10/05/2017  08:00:00 AM' #349
#time = '2/20/2016  7:40:00 PM' #230
#time = '4/22/2020  7:30:30 PM' #514
#time = '4/2/2018  8:30:38 PM' #391
time = '11/16/2017  2:52:00 AM' #364
#time = '12/5/2018  6:16:16 PM' #438
#time = '10/17/2018  4:15:00 AM' #431 has JAXA launch in it as well (t=6)
#time = '9/24/2017  5:49:47 AM' #347
#time = '8/6/2019  11:23:00 PM' #475 has a russian launch as well (t=6)
#time = '3/26/2020  8:18:00 PM' #512
#time = '3/18/2017  11:44:00 PM' #305
#time = '8/16/2016  11:31:00 PM' #262
#time = '5/26/2016  9:40:00 PM' #248

date_time_obj = datetime.strptime(time, '%m/%d/%Y %H:%M:%S %p')

In [33]:
# Date filter
filterA = df4[(df4['POSSIBLE_START_DATE'] <= time) & (df4['POSSIBLE_END_DATE'] >= time)]

In [34]:
# Altitude filter
filterB = filterA[filterA['MAX_ALT'] >= 50000 | filterA['MAX_ALT'].isna()]

In [35]:
# Keyword filter
filterC = filterB[(filterB['TEXT'].str.contains(r'(?:\s|^)rocket(?:\s|$)') == True) | 
                      (filterB['TEXT'].str.contains(r'(?:\s|^)space(?:\s|$)') == True) |
                      (filterB['TEXT'].str.contains(r'(?:\s|^)launch(?:\s|$)') == True) |
                      (filterB['TEXT'].str.contains(r'(?:\s|^)airspace restricted(?:\s|$)') == True) |
                      (filterB['TEXT'].str.contains(r'(?:\s|^)missile(?:\s|$)') == True) |
                      (filterB['TEXT'].str.contains(r'(?:\s|^)canaveral(?:\s|$)') == True) |
                      (filterB['TEXT'].str.contains(r'(?:\s|^)kennedy(?:\s|$)') == True) |
                      (filterB['TEXT'].str.contains(r'(?:\s|^)nasa(?:\s|$)') == True) |
                      (filterB['TEXT'].str.contains(r'(?:\s|^)unlimited(?:\s|$)') == True)]

In [36]:
# Read FIR Codes
df = pd.read_csv('data/NA_FIR_Codes.csv')

# Read US Airport Codes
#df2 = pd.read_csv('data/AirportData_Clean_20210629_Geocoded.csv')

In [37]:
# FIR Code filter
filterD = filterC[filterC['LOCATION_CODE'].isin(df['FIR']) | filterC['LOCATION_CODE'].isna()]

# US Airport Code filter
#filterD = filterC[filterC['LOCATION_CODE'].isin(df['FIR']) | filterC['LOCATION_CODE'].isna() | filterC['LOCATION_CODE'].isin(df2['locid'])]

In [38]:
# Print out
filterD[['TOPIC','NOTAM_REC_ID', 'TEXT']]

Unnamed: 0,TOPIC,NOTAM_REC_ID,TEXT
636305,7,636707,cape canaveral flight level unmanned free balloons will be released from within restricted area (followed by identification) at north or northern latitude west or western longitude there will be three balloons released minutes apart the balloons will climb at appox feet performance minute to above flight level launch times have not been determined altostratus of this time when times are known absolute (temperature) separate notice to airmen will be sent referencing this notice to airmen
649180,0,649582,airspace restricted area (followed by identification) absolute (temperature) active surface foot
649181,0,649583,airspace restricted area (followed by identification) blue active foot foot
654029,0,654431,airspace restricted area (followed by identification) active surface foot
655574,0,655976,airspace restricted area (followed by identification) celsius active flight level up to but not including flight level
...,...,...,...
679153,0,679555,airspace restricted area (followed by identification) danger area active foot agl foot
679154,0,679556,airspace restricted area (followed by identification) danger area active surface foot
679155,0,679557,airspace restricted area (followed by identification) celsius active surface foot
679403,0,679805,airspace restricted area (followed by identification) absolute (temperature) active surface foot


In [39]:
# Topic filter
filterE = filterD[(filterD['TOPIC'] == 0) | (filterD['TOPIC'] == 7)]

In [12]:
# Read in launch data
df2 = pd.read_csv('data/launches_20201027.csv', parse_dates=['LAUNCH_DATE'])

In [13]:
# Processing function
def getNotams(time):
    # Date filter
    filterA = df4[(df4['POSSIBLE_START_DATE'] <= time) & (df4['POSSIBLE_END_DATE'] >= time)]
    
    # Altitude filter
    filterB = filterA[filterA['MAX_ALT'] >= 50000 | filterA['MAX_ALT'].isna()]
    
    # FIR Code filter
    filterC = filterB[filterB['LOCATION_CODE'].isin(df['FIR']) | filterB['LOCATION_CODE'].isna()]
    
    # Keyword filter
    filterD = filterC[(filterC['TEXT'].str.contains(r'(?:\s|^)rocket(?:\s|$)') == True) | 
                      (filterC['TEXT'].str.contains(r'(?:\s|^)space(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)launch(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)airspace restricted(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)missile(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)canaveral(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)kennedy(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)nasa(?:\s|$)') == True) |
                      (filterC['TEXT'].str.contains(r'(?:\s|^)unlimited(?:\s|$)') == True) ]
    
    # Topic filter
    filterE = filterD[(filterD['TOPIC'] == 0) | (filterD['TOPIC'] == 7)]
    
    if len(filterE['NOTAM_REC_ID'] > 0):
        return filterE['NOTAM_REC_ID'].tolist()
    else:
        return

In [14]:
# Make predictions for launches
df2['DISCOVERED'] = df2['LAUNCH_DATE'].apply(getNotams)

In [15]:
# Final list of found launches
df2[~df2['DISCOVERED'].isnull().values]

Unnamed: 0,LAUNCHES_REC_ID,LAUNCH_DATE,VEHICLE_NAME,PAD_NAME,PAYLOAD,AGENCY,PURPOSE,ORBIT_TYPE,OUTCOME,SPACE_LAUNCH_REPORT_FLAG,GUNTER_FLAG,SPACE_FLIGHT_NOW_FLAG,NOONAN_FLAG,AST_FLAG,WIKIPEDIA_FLAG,JSR_FLAG,SPACEPORT_REC_ID,DISCOVERED
242,243,2016-05-06 05:21:00,Falcon 9 v1.2,SLC-40,JCSat 14,SpaceX,Communications,Geosynchronous,Operational,True,True,True,True,True,True,True,2.0,"[21892, 21894, 21896, 21898, 21906, 21908, 21912, 21913, 21916, 21917, 21927, 21928, 21929, 2193..."
243,244,2016-05-11 18:30:00,Dragon,,CRS 8,SpaceX,Re-entry,,,False,False,False,True,True,False,False,,"[19924, 19925, 19942, 19943, 19945, 19984, 19986, 19988, 21287, 23008, 23019, 23020, 23022, 2302..."
244,245,2016-05-17 18:57:00,MRBM Target,,,Missile Defense Agency,,,Successful,False,False,False,False,False,False,True,13.0,"[21222, 21232, 21234, 21236, 21257, 21260, 21270, 21276, 21284, 21285, 21286, 21287, 21288, 2129..."
245,246,2016-05-24 12:00:00,SM-3-IB,,,US Navy,,,Successful,False,False,False,False,False,False,True,13.0,"[19505, 19506, 19508, 19511, 19513, 19514, 19515, 19516, 19518, 19519, 19521, 19523, 19524, 1952..."
246,247,2016-05-25 12:00:00,SM-3-IB,,,US Navy,,,Successful,False,False,False,False,False,False,True,13.0,"[264, 265, 266, 354, 923, 1243, 1244, 1245, 1246, 1415, 1416, 1417, 1418, 1419, 1420, 1421, 1422..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
513,514,2020-04-22 19:30:30,Falcon 9 v1.2,LC-39A,Starlink 1 F6,SpaceX,Communications,Low Earth,Operational,True,True,True,False,True,True,False,8.0,"[1625137, 1625138, 1625139, 1625140, 1625141, 1625461, 1625462, 1625463, 1625464, 1625807, 16258..."
514,515,2020-05-17 13:14:00,Atlas 5-501,SLC-41,USSF 7 (X37B OTV6),US Military,Testing,Low Earth,Operational,True,True,True,False,False,True,False,2.0,"[1633617, 1633618, 1633689, 1633690, 1633691, 1633692, 1633699, 1633773, 1633774, 1633775, 16337..."
515,516,2020-05-25 18:50:00,LauncherOne,,Starshine 4/Flight 1,Virgin Orbit,Testing,Low Earth,Failure,False,True,True,False,True,True,False,11.0,[1641222]
516,517,2020-05-30 19:22:45,Falcon 9 v1.2,LC-39A,DM-2/Crew Dragon 2,SpaceX,Testing,Low Earth,Operational,True,True,True,False,False,True,False,8.0,"[1645868, 1645869, 1645870, 1645871, 1645879, 1646067, 1646068, 1646069, 1646070, 1646369, 16470..."
