In [58]:
import numpy as np
import pickle
import pandas as pd

In [2]:
# Annotated matches: pipe-delimited export parsed with the Python engine.
matches = pd.read_csv(
    'data/HumanAnnotatedMatches_SVO_DB_20200127_pipes_noquotes.csv',
    delimiter='|',
    engine="python",
    encoding='UTF-8',
    on_bad_lines='skip',  # malformed rows are dropped instead of raising
)

# Annotated poor matches: default comma-separated file in a Windows code page.
non_matches = pd.read_csv(
    'data/HumanAnnotatedMatches_poormatches_SVO_DB_20201027.csv',
    encoding='windows-1252',
)

# Full NOTAM table.
# NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
all_notams = pd.read_pickle("data/allData.pkl")

In [3]:
# Record ids of the annotated matches and of the annotated poor matches.
good_ids, bad_ids = (
    annotated['NOTAM_REC_ID'] for annotated in (matches, non_matches)
)

In [59]:
# Show the column labels of the full NOTAM table loaded from the pickle.
all_notams.columns

Index(['NOTAM_REC_ID', 'FNS_ID', 'FILENAME', 'NOTAM_ID', 'NOTAM_TYPE',
       'RELATED_NOTAM_ID', 'SIMPLE_TEXT', 'LOC_ID_ACCOUNTABLE_ORG',
       'NOTAM_NUMBER', 'RELATED_NOTAM_NUMBER', 'TEXT', 'Q_CODE',
       'Q_CODE_INTERPRETATION', 'A_CODE', 'B_CODE', 'C_CODE', 'D_CODE',
       'E_CODE', 'F_CODE', 'G_CODE', 'CLASSIFICATION', 'POSSIBLE_NOTAM_ID',
       'MIN_ALT', 'MAX_ALT', 'MIN_ALT_REF_TYPE', 'MAX_ALT_REF_TYPE',
       'POSSIBLE_START_DATE', 'POSSIBLE_END_DATE', 'ISSUE_DATE',
       'CANCELED_DATE', 'AFFECTED_FIR', 'DESIGNATOR', 'DESIGNATOR_NAME',
       'LOCATION_NAME', 'ACCOUNT_ID', 'LOCATION_CODE', 'LAUNCHES_REC_ID',
       'TOPIC'],
      dtype='object')

In [4]:
# Columns of all_notams that are carried forward as model inputs.
features = [
    'NOTAM_TYPE',
    'TEXT',
    'CLASSIFICATION',
    'MIN_ALT',
    'MAX_ALT',
    'LOCATION_CODE',
    'TOPIC',
]

In [5]:
# Select the annotated rows and the feature columns in a single .loc call and
# take an explicit copy. Columns of these frames are overwritten and added
# further down (label encoding, LABEL), and assigning into the result of
# chained indexing (df[mask][cols]) is what triggers SettingWithCopyWarning.
good_notams = all_notams.loc[all_notams['NOTAM_REC_ID'].isin(good_ids), features].copy()

bad_notams = all_notams.loc[all_notams['NOTAM_REC_ID'].isin(bad_ids), features].copy()

In [6]:
# Confirm that only the selected feature columns remain in the subset.
bad_notams.columns

Index(['NOTAM_TYPE', 'TEXT', 'CLASSIFICATION', 'MIN_ALT', 'MAX_ALT',
       'LOCATION_CODE', 'TOPIC'],
      dtype='object')

In [7]:
from sklearn import preprocessing

# One integer encoder per categorical column. Each encoder is fitted on the
# full NOTAM table so both annotated subsets share the same code for a value.
le = preprocessing.LabelEncoder()   # NOTAM_TYPE
le2 = preprocessing.LabelEncoder()  # CLASSIFICATION
le3 = preprocessing.LabelEncoder()  # LOCATION_CODE

for encoder, column in ((le, 'NOTAM_TYPE'),
                        (le2, 'CLASSIFICATION'),
                        (le3, 'LOCATION_CODE')):
    encoder.fit(all_notams[column])
    # Replace the raw values with their integer codes, positives first.
    for subset in (good_notams, bad_notams):
        subset[column] = encoder.transform(subset[column])

In [9]:
# Inspect the poor-match subset after the categorical columns were encoded.
bad_notams

Unnamed: 0,NOTAM_TYPE,TEXT,CLASSIFICATION,MIN_ALT,MAX_ALT,LOCATION_CODE,TOPIC
16,3,airspace abel east military operations area ac...,0,50.0,129.99,14277,0
235,1,hnk vhf omnidirectional radio range intensity ...,2,0.0,999.00,12687,2
640,1,microburst windshear detection system not avai...,2,0.0,999.00,2376,4
719,1,automatic dependent surveillance contract auto...,2,0.0,999.00,5165,2
1162,1,surface movement radar shutdown for maintenance,2,0.0,999.00,13606,5
...,...,...,...,...,...,...,...
1646257,3,airspace restricted area (followed by identifi...,0,0.0,130.00,14319,0
1647145,1,due to military stationary airspace reservatio...,2,0.0,250.00,14385,6
1647290,1,rffs level except and june level june level,2,0.0,999.00,12838,3
1648646,3,airspace restricted area (followed by identifi...,0,0.0,150.00,14310,0


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF with English stop words removed; terms must occur
# in at least five documents to enter the vocabulary.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
    norm='l2',
    encoding='latin-1',
)

# Vocabulary and IDF weights come from every NOTAM text, not only the
# annotated subsets.
tfidf.fit(all_notams['TEXT'])

# Sparse TF-IDF matrices for the two annotated subsets.
good_encodes, bad_encodes = (
    tfidf.transform(subset['TEXT']) for subset in (good_notams, bad_notams)
)

In [31]:
# Attach the binary target: 1 for annotated matches, 0 for poor matches.
for subset, target in ((good_notams, 1), (bad_notams, 0)):
    subset['LABEL'] = target

In [37]:
from scipy.sparse import hstack, vstack


def _assemble_features(frame, text_matrix):
    """Build the dense design matrix for one annotated subset.

    Parameters
    ----------
    frame : pandas.DataFrame
        Subset holding the NOTAM_TYPE, CLASSIFICATION, MIN_ALT, MAX_ALT,
        LOCATION_CODE and TOPIC columns.
    text_matrix : sparse matrix
        TF-IDF encoding of the same rows, in the same row order.

    Returns
    -------
    numpy.ndarray
        Dense 2-D array with columns ordered as NOTAM_TYPE, the TF-IDF
        columns, CLASSIFICATION, MIN_ALT, MAX_ALT, LOCATION_CODE, TOPIC.
    """
    def as_column(name):
        # (n,) values -> (n, 1) block so it can be stacked next to the text matrix.
        return frame[name].to_numpy().reshape(-1, 1)

    blocks = (
        as_column('NOTAM_TYPE'),
        text_matrix,
        as_column('CLASSIFICATION'),
        as_column('MIN_ALT'),
        as_column('MAX_ALT'),
        as_column('LOCATION_CODE'),
        as_column('TOPIC'),
    )
    # .toarray() is used instead of the legacy `.A` shorthand because it is
    # available on both sparse matrices and sparse arrays.
    return hstack(blocks).toarray()


positive = _assemble_features(good_notams, good_encodes)

negative = _assemble_features(bad_notams, bad_encodes)

In [56]:
from sklearn.model_selection import train_test_split

# Stack positives above negatives; the labels are stacked in the same order so
# row i of X still corresponds to row i of y.
X = vstack([positive, negative])
label_columns = [subset['LABEL'].array[:, None] for subset in (good_notams, bad_notams)]
y = vstack(label_columns)

# Hold out a third of the rows; the fixed seed keeps the split reproducible.
# y is densified to an (n, 1) column before splitting.
split = train_test_split(X, y.toarray(), test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split

In [57]:
# accuracy_score and f1_score were used without ever being imported, so this
# cell raised NameError on a fresh kernel; import them alongside the model.
from sklearn.metrics import accuracy_score, f1_score
from xgboost import XGBClassifier

# Fit a gradient-boosted classifier with default hyper-parameters and score it
# on the held-out rows.
model = XGBClassifier()
model.fit(X_train, y_train)
yhat = model.predict(X_test)

print('Accuracy:', accuracy_score(y_test, yhat))
# NOTE(review): for a single-label problem, micro-averaged F1 is the same
# number as accuracy (the recorded output shows identical values). Consider
# average='binary' if a separate view of the positive class is wanted.
print('F1:', f1_score(y_test, yhat, average='micro'))

Accuracy: 0.9140893470790378
F1: 0.9140893470790378
