In [1]:
import pickle

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack, vstack
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedShuffleSplit
from xgboost import XGBClassifier

In [2]:
# Positive examples: annotated matches stored as a pipe-delimited file.
# Rows that cannot be parsed are skipped rather than raising.
matches = pd.read_csv(
    'data/HumanAnnotatedMatches_SVO_DB_20200127_pipes_noquotes.csv',
    delimiter='|',
    engine="python",
    on_bad_lines='skip',
    encoding='UTF-8',
)

# Negative examples: annotated poor matches, default comma delimiter, Windows-1252 text.
non_matches = pd.read_csv(
    'data/HumanAnnotatedMatches_poormatches_SVO_DB_20201027.csv',
    encoding='windows-1252',
)

# Full NOTAM table. Unpickling can execute arbitrary code, so only load a trusted file here.
all_notams = pd.read_pickle("data/allData.pkl")

In [3]:
# Record IDs of the NOTAMs annotated as matches (positive class)
good_ids = matches.loc[:, 'NOTAM_REC_ID']

# Record IDs of the NOTAMs annotated as poor matches (negative class)
bad_ids = non_matches.loc[:, 'NOTAM_REC_ID']

In [4]:
# Check Cols: display every column name of the full NOTAM table loaded from data/allData.pkl
all_notams.columns

Index(['NOTAM_REC_ID', 'FNS_ID', 'FILENAME', 'NOTAM_ID', 'NOTAM_TYPE',
       'RELATED_NOTAM_ID', 'SIMPLE_TEXT', 'LOC_ID_ACCOUNTABLE_ORG',
       'NOTAM_NUMBER', 'RELATED_NOTAM_NUMBER', 'TEXT', 'Q_CODE',
       'Q_CODE_INTERPRETATION', 'A_CODE', 'B_CODE', 'C_CODE', 'D_CODE',
       'E_CODE', 'F_CODE', 'G_CODE', 'CLASSIFICATION', 'POSSIBLE_NOTAM_ID',
       'MIN_ALT', 'MAX_ALT', 'MIN_ALT_REF_TYPE', 'MAX_ALT_REF_TYPE',
       'POSSIBLE_START_DATE', 'POSSIBLE_END_DATE', 'ISSUE_DATE',
       'CANCELED_DATE', 'AFFECTED_FIR', 'DESIGNATOR', 'DESIGNATOR_NAME',
       'LOCATION_NAME', 'ACCOUNT_ID', 'LOCATION_CODE', 'LAUNCHES_REC_ID',
       'TOPIC', 'NOTAM_TYPE_ENCODE', 'CLASSIFICATION_ENCODE',
       'LOCATION_CODE_ENCODE', 'XGB'],
      dtype='object')

In [5]:
# Columns carried into the labelled subsets; TEXT is embedded later,
# the remaining columns are used as tabular features.
features = [
    'NOTAM_TYPE',
    'TEXT',
    'CLASSIFICATION',
    'MIN_ALT',
    'MAX_ALT',
    'LOCATION_CODE',
    'TOPIC',
]

In [6]:
# Get Positive NOTAM Data
# A single .loc selection plus an explicit .copy() gives an independent frame, so the
# column assignments made on it in later cells (label encoding, LABEL) do not raise
# SettingWithCopyWarning and can never touch all_notams.
good_notams = all_notams.loc[all_notams['NOTAM_REC_ID'].isin(good_ids), features].copy()

# Get Negative NOTAM Data (same selection, using the poor-match IDs)
bad_notams = all_notams.loc[all_notams['NOTAM_REC_ID'].isin(bad_ids), features].copy()

In [7]:
# Check Cols: confirm the negative subset holds only the selected feature columns
bad_notams.columns

Index(['NOTAM_TYPE', 'TEXT', 'CLASSIFICATION', 'MIN_ALT', 'MAX_ALT',
       'LOCATION_CODE', 'TOPIC'],
      dtype='object')

In [8]:
# One label encoder per categorical column. Each encoder is fitted on the full table
# and then reused on the labelled subsets, so all three frames share the same codes.
le = preprocessing.LabelEncoder()    # NOTAM_TYPE
le2 = preprocessing.LabelEncoder()   # CLASSIFICATION
le3 = preprocessing.LabelEncoder()   # LOCATION_CODE

encoding_plan = (
    (le, 'NOTAM_TYPE', 'NOTAM_TYPE_ENCODE'),
    (le2, 'CLASSIFICATION', 'CLASSIFICATION_ENCODE'),
    (le3, 'LOCATION_CODE', 'LOCATION_CODE_ENCODE'),
)

for encoder, raw_col, encoded_col in encoding_plan:
    # The full table keeps its raw column and stores the codes in a separate *_ENCODE column.
    all_notams[encoded_col] = encoder.fit_transform(all_notams[raw_col])

    # The labelled subsets are overwritten in place with the codes.
    # NOTE: re-running this cell would feed already-encoded values back into transform().
    good_notams[raw_col] = encoder.transform(good_notams[raw_col])
    bad_notams[raw_col] = encoder.transform(bad_notams[raw_col])

In [9]:
# Check Data: preview the first rows of the negative subset after label encoding
bad_notams.head()

Unnamed: 0,NOTAM_TYPE,TEXT,CLASSIFICATION,MIN_ALT,MAX_ALT,LOCATION_CODE,TOPIC
16,3,airspace abel east military operations area ac...,0,50.0,129.99,14277,0
235,1,hnk vhf omnidirectional radio range intensity ...,2,0.0,999.0,12687,2
640,1,microburst windshear detection system not avai...,2,0.0,999.0,2376,4
719,1,automatic dependent surveillance contract auto...,2,0.0,999.0,5165,2
1162,1,surface movement radar shutdown for maintenance,2,0.0,999.0,13606,5


In [11]:
from sentence_transformers import SentenceTransformer

# Out-of-the-Box Model
# Alternatives tried previously: 'bert-large-nli-mean-tokens', 'paraphrase-MiniLM-L6-v2'
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every NOTAM text exactly once; this is the expensive step of the notebook.
all_encodes = model.encode(all_notams['TEXT'].values)

# good_notams / bad_notams are row subsets of all_notams selected by NOTAM_REC_ID, so
# their embeddings are rows of all_encodes. Slicing with the same membership masks
# avoids running the model two more times over texts that were already encoded.
good_mask = all_notams['NOTAM_REC_ID'].isin(good_ids).to_numpy()
bad_mask = all_notams['NOTAM_REC_ID'].isin(bad_ids).to_numpy()

# Positive Data embeddings
good_encodes = all_encodes[good_mask]

# Negative Data embeddings
bad_encodes = all_encodes[bad_mask]

# Guard against the subsets and the masks drifting apart.
assert len(good_encodes) == len(good_notams), "positive embeddings do not line up with good_notams"
assert len(bad_encodes) == len(bad_notams), "negative embeddings do not line up with bad_notams"

KeyboardInterrupt: 

In [5]:
import numpy as np

# Scratch example: a 1-D integer array holding 1..5, used to try out column reshaping
a = np.arange(1, 6)

In [7]:
# View the 1-D array as a single column (shape (5, 1)); np.newaxis is an alias for None
a[:, np.newaxis]

array([[1],
       [2],
       [3],
       [4],
       [5]])

In [None]:
# Set Classification Labels: 1 for the positive subset, 0 for the negative subset
for labelled_frame, class_label in ((good_notams, 1), (bad_notams, 0)):
    labelled_frame['LABEL'] = class_label

In [None]:
def _stack_features(frame, text_encodes):
    """Build the dense feature matrix for one labelled subset.

    Columns are, in order: NOTAM_TYPE, the text-embedding columns, CLASSIFICATION,
    MIN_ALT, MAX_ALT, LOCATION_CODE, TOPIC.

    Parameters
    ----------
    frame : pandas.DataFrame
        Subset whose categorical columns have already been label encoded.
    text_encodes : array-like, one row per row of `frame`
        Sentence embeddings of the TEXT column.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per row of `frame`.
    """
    # Every block is dense, so plain NumPy stacking is used. Routing dense blocks
    # through scipy.sparse.hstack only to densify again adds a round trip and is not
    # reliably supported for all-dense input.
    return np.column_stack((
        frame['NOTAM_TYPE'].to_numpy(),
        text_encodes,
        frame['CLASSIFICATION'].to_numpy(),
        frame['MIN_ALT'].to_numpy(),
        frame['MAX_ALT'].to_numpy(),
        frame['LOCATION_CODE'].to_numpy(),
        frame['TOPIC'].to_numpy(),
    ))


# Combine Features (Positive)
positive = _stack_features(good_notams, good_encodes)

# Combine Features (Negative)
negative = _stack_features(bad_notams, bad_encodes)

In [None]:
# Initialize the cross-validation splitter: 5 stratified random splits,
# each holding out 30% of the rows (repeated shuffle splits, not K-fold).
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=232323)

# Combine Positive and Negative Data
# The blocks are dense, so they are stacked with NumPy first. Passing dense blocks
# straight to scipy.sparse.vstack depends on how NumPy coerces them into an object
# array and breaks when both blocks have the same number of rows.
# X and y stay sparse because later cells call .toarray() / .tocsr() on them.
X = csr_matrix(np.vstack((positive, negative)))
y = csr_matrix(
    np.concatenate((
        good_notams['LABEL'].to_numpy(),
        bad_notams['LABEL'].to_numpy(),
    ))[:, None]
)

In [None]:
# Cross Validation
# Densify once, outside the loop: the conversions do not depend on the fold.
# Dense input is used on purpose. XGBoost's tree booster treats entries that are absent
# from a sparse matrix as missing, whereas the final prediction runs on the dense `pred`
# matrix where an encoded 0 is a real value; dense folds measure the same representation.
X_dense = X.toarray()
# 1-D label vector, the shape scikit-learn style estimators and metrics expect.
y_flat = y.toarray().ravel()

for train_index, test_index in sss.split(X_dense, y_flat):
    X_train, X_test = X_dense[train_index], X_dense[test_index]
    y_train, y_test = y_flat[train_index], y_flat[test_index]

    # Fresh, default-parameter model for every split
    test_model = XGBClassifier()
    test_model.fit(X_train, y_train)
    yhat = test_model.predict(X_test)

    print('Accuracy:', accuracy_score(y_test, yhat))

In [15]:
# Combine Features (All Data)
# Same column order as the training matrices: NOTAM_TYPE, text embeddings,
# CLASSIFICATION, MIN_ALT, MAX_ALT, LOCATION_CODE, TOPIC. The full table keeps its
# encoded categoricals in the *_ENCODE columns.
# Every block is dense, so NumPy stacking is used instead of a round trip through
# scipy.sparse.hstack, which is not reliably supported for all-dense input.
pred = np.column_stack((
    all_notams['NOTAM_TYPE_ENCODE'].to_numpy(),
    all_encodes,
    all_notams['CLASSIFICATION_ENCODE'].to_numpy(),
    all_notams['MIN_ALT'].to_numpy(),
    all_notams['MAX_ALT'].to_numpy(),
    all_notams['LOCATION_CODE_ENCODE'].to_numpy(),
    all_notams['TOPIC'].to_numpy(),
))

In [16]:
# Final Model Training
# NOTE: `model` previously named the sentence-embedding model; from here on it is the classifier.
model = XGBClassifier()

# Fit on the dense matrix with a 1-D label vector. `pred` below is dense, and XGBoost's
# tree booster treats entries absent from a sparse matrix as missing; training on the
# sparse X would make an encoded 0 "missing" during training but a real 0 at prediction.
model.fit(X.toarray(), y.toarray().ravel())

# Predict All NOTAMs
all_notams['XGB_bert'] = model.predict(pred)

In [17]:
# Save Results
# Writes back to the same pickle that was loaded at the top of the notebook, so the
# stored table now also carries the XGB_bert prediction column.
output_path = "data/allData.pkl"
all_notams.to_pickle(output_path)