In [29]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import csv
import re

In [30]:
msg = 'NAV ILS RWY 27 LOC/GP OUT OF SERVICE'

In [31]:
abbrev_dict = {}

with open('data/Contractions (Clean).csv', mode='r') as infile:
    reader = csv.reader(infile)
    next(reader)
    abbrev_dict = {rows[0]:rows[1] for rows in reader}
    
def expandNotam(msg):
    new_string = re.sub(r"[^a-zA-Z]"," ",str(msg))
    new_msg = ''
    for wrd in new_string.split():        
        try:
            new_msg = new_msg + ' ' + abbrev_dict[wrd]
        except:
            new_msg = new_msg + ' ' + wrd
    return new_msg.lower().strip()

In [32]:
expandNotam(msg)

'navigation instrument landing system runway locally glide path out of service'

In [33]:
df4 = pd.read_csv('data/notams.csv', 
                  on_bad_lines='skip', 
                  encoding='utf-16', 
                  parse_dates=['POSSIBLE_START_DATE', 'POSSIBLE_END_DATE', 'ISSUE_DATE', 'CANCELED_DATE'])

df4['MIN_ALT'] = df4['MIN_ALT'].replace('MSL', 0)
df4['MAX_ALT'] = df4['MAX_ALT'].replace('MSL', 0)
df4['MIN_ALT'] = pd.to_numeric(df4['MIN_ALT'], errors='coerce')
df4['MAX_ALT'] = pd.to_numeric(df4['MAX_ALT'], errors='coerce')

  df4 = pd.read_csv('data/notams.csv',


In [34]:
for x in df4['TEXT'].head(20):
    print(x)

AIRSPACE VOLK SOUTH MOA ACT 500FT UP TO BUT NOT INCLUDING FL180
AIRSPACE VOLK EAST MOA ACT 8000FT UP TO BUT NOT INCLUDING FL180
AIRSPACE FALLS 1 MOA ACT 500FT UP TO BUT NOT INCLUDING FL180
AIRSPACE FALLS 2 MOA ACT 500FT UP TO BUT NOT INCLUDING FL180
AIRSPACE VOLK WEST MOA ACT 100FT UP TO BUT NOT INCLUDING FL180
ILS SV 109.5 MHZ RWY 03 U/S
CYQY ILS 07 U/S
RWY 17/35 CLSD EXC XNG
AIRSPACE JACKAL MOA ACT 11000FT UP TO BUT NOT INCLUDING FL180
AIRSPACE JACKAL LOW MOA ACT 100FT-10999FT
AIRSPACE COLUMBUS 3 MOA ACT 8000FT UP TO BUT NOT INCLUDING FL180
AIRSPACE COLUMBUS 4 MOA ACT 10000FT UP TO BUT NOT INCLUDING FL180
AIRSPACE AR112L(E) ACT FL190-FL230
AIRSPACE AR112L(E) ACT FL190-FL230
AIRSPACE AR101(S) ACT FL260-FL290
AIRSPACE AR101(S) ACT FL260-FL290
AIRSPACE ABEL EAST MOA ACT 5000FT-12999FT
AIRSPACE AR20(SW) ACT FL240-FL280
OBST - CRANE IN AREA RADIUS 20M PSN 494110N0180622E.  HGT 30M AGL/283M AMSL. DAY MARKING
CYSB CRANE 1000 FT BEYOND THR 04 AND 1200 FT RIGHT RCL. 150 FT AGL 1293 MSL. NOT L

In [35]:
df4['TEXT'] = df4['TEXT'].apply(expandNotam)

In [36]:
for x in df4['TEXT'].head(20):
    print(x)

airspace volk south military operations area active foot up to but not including flight level
airspace volk east military operations area active foot up to but not including flight level
airspace falls military operations area active foot up to but not including flight level
airspace falls military operations area active foot up to but not including flight level
airspace volk west military operations area active foot up to but not including flight level
instrument landing system sv mhz runway intensity unknown (weather reports only) south or southern latitude
cyqy instrument landing system intensity unknown (weather reports only) south or southern latitude
runway closed except crossing
airspace jackal military operations area active foot up to but not including flight level
airspace jackal low military operations area active foot foot
airspace columbus military operations area active foot up to but not including flight level
airspace columbus military operations area active foot up to 

In [37]:
corpus = df4['TEXT'].values

count_vect = CountVectorizer(stop_words=stopwords.words('english'), lowercase=True)
x_counts = count_vect.fit_transform(corpus)
x_counts.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [38]:
tfidf_transformer = TfidfTransformer()
x_tfidf = tfidf_transformer.fit_transform(x_counts)

In [39]:
dimension = 7
sample_size = 50000

sample = x_tfidf[np.random.choice(x_tfidf.shape[0], sample_size, replace=False, ), :]

lda = LDA(n_components = dimension)
lda_array = lda.fit_transform(sample)

In [40]:
topic_vec = lda.transform(x_tfidf)

df4['TOPIC'] = np.argmax(topic_vec, axis = 1)

In [41]:
import pickle

df4.to_pickle("data/allData.pkl")

In [42]:
def ntopwlst(model, features, ntopwords):
    '''create a list of the top topc words'''
    output = {}
    for topic_idx, topic in enumerate(model.components_):
        tmp = []
        tmp += [features[i] for i in topic.argsort()[:-ntopwords - 1:-1]] 
        output[topic_idx] = tmp
    return output

ntopwords = 8
features = count_vect.get_feature_names_out()
topwds = ntopwlst(lda, features, ntopwords)
topwds

{0: ['active',
  'time',
  'airspace',
  'area',
  'level',
  'foot',
  'restricted',
  'flight'],
 1: ['foot', 'data', 'center', 'west', 'mile', 'nautical', 'agl', 'obstruct'],
 2: ['taxiway',
  'temperature',
  'absolute',
  'blue',
  'east',
  'eastern',
  'monitoring',
  'receiver'],
 3: ['runway',
  'system',
  'navigation',
  'service',
  'landing',
  'instrument',
  'intensity',
  'unknown'],
 4: ['range',
  'omnidirectional',
  'vhf',
  'radio',
  'measuring',
  'distance',
  'equipment',
  'navigation'],
 5: ['aerodrome',
  'information',
  'aeronautical',
  'area',
  'available',
  'publication',
  'control',
  'aircraft'],
 6: ['longitude',
  'eastern',
  'east',
  'northern',
  'north',
  'latitude',
  'western',
  'west']}

In [15]:
sample_size = 60000
test_size = 5000
sample = x_tfidf[np.random.choice(x_tfidf.shape[0], sample_size, replace=False), :]
test = x_tfidf[np.random.choice(x_tfidf.shape[0], test_size, replace=False), :]

for dimension in range(1,25):

    lda = LDA(n_components = dimension)
    lda_array = lda.fit_transform(sample)

    print("Number of Topics", dimension, ":", lda.perplexity(test))

Number of Topics 1 : 153899.6478158038
Number of Topics 2 : 247577.16023022588
Number of Topics 3 : 329083.8305234153
Number of Topics 4 : 458125.7639955059
Number of Topics 5 : 602297.4883216338
Number of Topics 6 : 720805.2679703971
Number of Topics 7 : 802917.9499750045
Number of Topics 8 : 821049.810728881
Number of Topics 9 : 1323281.7831801735
Number of Topics 10 : 1681283.08297869
Number of Topics 11 : 1769210.0481887187
Number of Topics 12 : 2305851.448527696
Number of Topics 13 : 2641919.3607822857
Number of Topics 14 : 2423484.94348105
Number of Topics 15 : 3051007.8765897225
Number of Topics 16 : 3616690.8947409424
Number of Topics 17 : 4054752.7126122285
Number of Topics 18 : 4875565.078997995
Number of Topics 19 : 5077153.110289168
Number of Topics 20 : 4191133.9823292303
Number of Topics 21 : 6905168.172083181
Number of Topics 22 : 7161821.9239946855
Number of Topics 23 : 9284455.495756088
Number of Topics 24 : 8359949.458433262
