In [1]:
import numpy as np
import pickle
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack, vstack
from xgboost import XGBClassifier
from sklearn import svm
from sklearn.linear_model import LogisticRegression

In [2]:
# Load Matched Data (Positive)
#matches = pd.read_csv('data/HumanAnnotatedMatches_SVO_DB_20200127_pipes_noquotes.csv', encoding='UTF-8', on_bad_lines='skip', engine="python", delimiter='|' )
#good_ids = matches['NOTAM_REC_ID']

# Load Matched Data (Negative)
#non_matches = pd.read_csv('data/HumanAnnotatedMatches_poormatches_SVO_DB_20201027.csv', encoding='windows-1252')
#bad_ids = non_matches['NOTAM_REC_ID']

# Load All Data
all_notams = pd.read_pickle("data/allData.pkl")

In [3]:
good_ids = np.ravel(pd.read_csv('data/handmade_good_matches.csv', header=None))
bad_ids = np.ravel(pd.read_csv('data/handmade_bad_matches.csv', header=None))

In [4]:
sub_bad_ids = np.random.choice(bad_ids, 5 * len(good_ids))

In [5]:
# Set Features
features = ['NOTAM_TYPE', 'TEXT', 'CLASSIFICATION', 'MIN_ALT', 'MAX_ALT', 'LOCATION_CODE']

In [6]:
# Get Positive NOTAM Data
good_notams = all_notams[all_notams['NOTAM_REC_ID'].isin(good_ids)][features]

# Get Negaitve NOTAM Data
bad_notams = all_notams[all_notams['NOTAM_REC_ID'].isin(sub_bad_ids)][features]

In [7]:
# Check Cols
bad_notams.columns

Index(['NOTAM_TYPE', 'TEXT', 'CLASSIFICATION', 'MIN_ALT', 'MAX_ALT',
       'LOCATION_CODE'],
      dtype='object')

In [8]:
# Initialize Label Encoders
le = preprocessing.LabelEncoder()
le2 = preprocessing.LabelEncoder()
le3 = preprocessing.LabelEncoder()

# Type Encoder
all_notams['NOTAM_TYPE_ENCODE'] = le.fit_transform(all_notams['NOTAM_TYPE'])
good_notams['NOTAM_TYPE'] = le.transform(good_notams['NOTAM_TYPE'])
bad_notams['NOTAM_TYPE'] = le.transform(bad_notams['NOTAM_TYPE'])

# Classification Encoder
all_notams['CLASSIFICATION_ENCODE'] = le2.fit_transform(all_notams['CLASSIFICATION'])
good_notams['CLASSIFICATION'] = le2.transform(good_notams['CLASSIFICATION'])
bad_notams['CLASSIFICATION'] = le2.transform(bad_notams['CLASSIFICATION'])

# Location Code
all_notams['LOCATION_CODE_ENCODE'] = le3.fit_transform(all_notams['LOCATION_CODE'])
good_notams['LOCATION_CODE'] = le3.transform(good_notams['LOCATION_CODE'])
bad_notams['LOCATION_CODE'] = le3.transform(bad_notams['LOCATION_CODE'])

In [9]:
# Check Data
bad_notams.head()

Unnamed: 0,NOTAM_TYPE,TEXT,CLASSIFICATION,MIN_ALT,MAX_ALT,LOCATION_CODE
709,1,standard instrument departure sanitary diego i...,1,0.0,999.0,8092
981,0,danger area notamc danger area q ybbb qrrxx iv...,2,50.0,600.0,13941
1398,1,navigation instrument landing system runway lo...,0,0.0,999.0,6641
1503,1,runway locator foot district remaining sign at...,0,0.0,999.0,6641
3009,2,fire and rescue category will be altostratus p...,2,0.0,999.0,4028


In [10]:
# Message Embedding
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english', max_features=10000)

# Fit and Transform All Data
all_encodes = tfidf.fit_transform(all_notams['TEXT'])

# Transform Positive Data
good_encodes = tfidf.transform(good_notams['TEXT'])

# Transform Negative Data
bad_encodes = tfidf.transform(bad_notams['TEXT'])

In [11]:
# Set Classification Labels
good_notams['LABEL'] = 1
bad_notams['LABEL'] = 0

In [12]:
# Combine Features (Positive)
positive = hstack((good_notams['NOTAM_TYPE'].array[:,None], 
                   good_encodes,
                   good_notams['CLASSIFICATION'].array[:,None],
                   good_notams['MIN_ALT'].array[:,None],
                   good_notams['MAX_ALT'].array[:,None],
                   good_notams['LOCATION_CODE'].array[:,None])).A

# Combine Features (Negative)
negative = hstack((bad_notams['NOTAM_TYPE'].array[:,None], 
                   bad_encodes,
                   bad_notams['CLASSIFICATION'].array[:,None],
                   bad_notams['MIN_ALT'].array[:,None],
                   bad_notams['MAX_ALT'].array[:,None],
                   bad_notams['LOCATION_CODE'].array[:,None])).A

In [13]:
# Intitialize K-Fold Split for Cross Validation
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.025, train_size=0.05, random_state=232323)

# Combine Postive and Negative Data
X = vstack((positive, negative))
y = vstack((good_notams['LABEL'].array[:,None], bad_notams['LABEL'].array[:,None]))

In [14]:
# Combine Features (All Data)
pred = hstack((all_notams['NOTAM_TYPE_ENCODE'].array[:,None], 
                   all_encodes,
                   all_notams['CLASSIFICATION_ENCODE'].array[:,None],
                   all_notams['MIN_ALT'].array[:,None],
                   all_notams['MAX_ALT'].array[:,None],
                   all_notams['LOCATION_CODE_ENCODE'].array[:,None])).A

In [15]:
# Cross Validation XGBoost
for train_index, test_index in sss.split(X.toarray(), y.toarray()):
    X_train, X_test = X.tocsr()[train_index], X.tocsr()[test_index]
    y_train, y_test = y.tocsr()[train_index].toarray(), y.tocsr()[test_index].toarray()
   
    test_model = XGBClassifier()
    test_model.fit(X_train, y_train)
    yhat = test_model.predict(X_test)

    print('Accuracy:', accuracy_score(y_test, yhat))
    
# Final Model Training
model = XGBClassifier()
model.fit(X, y.toarray())

# Predict All NOTAMs
all_notams['XGB_NEW'] = model.predict(pred)

Accuracy: 1.0
Accuracy: 0.984
Accuracy: 0.984
Accuracy: 0.984
Accuracy: 1.0


In [16]:
# Set NAN to 0 for models that cannot deal with NULL
pred[np.isnan(pred)] = 0

In [17]:
# Cross Validation Linear Regression
for train_index, test_index in sss.split(X.toarray(), y.toarray()):
    X_train, X_test = X.tocsr()[train_index], X.tocsr()[test_index]
    y_train, y_test = y.tocsr()[train_index].toarray(), y.tocsr()[test_index].toarray()
    
    X_train.data[np.isnan(X_train.data)] = 0
    X_test.data[np.isnan(X_test.data)] = 0
   
    test_model = LogisticRegression(random_state=0, max_iter=1000, penalty='none')
    test_model.fit(X_train, np.ravel(y_train))
    yhat = test_model.predict(X_test)

    print('Accuracy:', accuracy_score(np.ravel(y_test), yhat))

X.data[np.isnan(X.data)] = 0

# Final Model Training
model = LogisticRegression(random_state=0, max_iter=1000, penalty='none')
model.fit(X, np.ravel(y.toarray()))

# Predict All NOTAMs
all_notams['LinReg_NEW'] = model.predict(pred)

Accuracy: 1.0
Accuracy: 1.0
Accuracy: 0.984
Accuracy: 0.968
Accuracy: 0.984


In [18]:
# Cross Validation SVM
for train_index, test_index in sss.split(X.toarray(), y.toarray()):
    X_train, X_test = X.tocsr()[train_index], X.tocsr()[test_index]
    y_train, y_test = y.tocsr()[train_index].toarray(), y.tocsr()[test_index].toarray()
    
    X_train.data[np.isnan(X_train.data)] = 0
    X_test.data[np.isnan(X_test.data)] = 0
   
    test_model = svm.SVC(random_state=0, C=0.1)
    test_model.fit(X_train, np.ravel(y_train))
    yhat = test_model.predict(X_test)

    print('Accuracy:', accuracy_score(y_test, yhat))
    
X.data[np.isnan(X.data)] = 0  

# Final Model Training
model = svm.SVC(random_state=0, C=0.1)
model.fit(X, np.ravel(y.toarray()))

# Predict All NOTAMs
all_notams['SVM_NEW'] = model.predict(pred)

Accuracy: 0.832
Accuracy: 0.832
Accuracy: 0.832
Accuracy: 0.832
Accuracy: 0.832


In [19]:
# Save Results
all_notams.to_pickle("data/allData.pkl")