In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import csv
import re

In [2]:
msg = 'NAV ILS RWY 27 LOC/GP OUT OF SERVICE'

In [3]:
abbrev_dict = {}

with open('data/Contractions (Clean).csv', mode='r') as infile:
    reader = csv.reader(infile)
    next(reader)
    abbrev_dict = {rows[0]:rows[1] for rows in reader}
    
def expandNotam(msg):
    new_string = re.sub(r'[^a-zA-Z]',' ',str(msg))
    new_msg = ''
    for wrd in new_string.split():        
        try:
            new_msg = new_msg + ' ' + abbrev_dict[wrd]
        except:
            new_msg = new_msg + ' ' + wrd
    return new_msg.lower().strip()

In [4]:
expandNotam(msg)

'navigation instrument landing system runway locally glide path out of service'

In [5]:
df4 = pd.read_csv('data/notam_20201027_pipes_noquotes.csv', 
                  on_bad_lines='skip', 
                  encoding='utf-16', 
                  parse_dates=['POSSIBLE_START_DATE', 'POSSIBLE_END_DATE', 'ISSUE_DATE', 'CANCELED_DATE'],
                  quoting=csv.QUOTE_NONE, 
                  engine="python", 
                  delimiter="|")

df4['MIN_ALT'] = df4['MIN_ALT'].replace('MSL', 0)
df4['MAX_ALT'] = df4['MAX_ALT'].replace('MSL', 0)
df4['MIN_ALT'] = pd.to_numeric(df4['MIN_ALT'], errors='coerce')
df4['MAX_ALT'] = pd.to_numeric(df4['MAX_ALT'], errors='coerce')

In [6]:
for x in df4['TEXT'].head(20):
    print(x)

AIRSPACE VOLK SOUTH MOA ACT 500FT UP TO BUT NOT INCLUDING FL180
AIRSPACE VOLK EAST MOA ACT 8000FT UP TO BUT NOT INCLUDING FL180
AIRSPACE FALLS 1 MOA ACT 500FT UP TO BUT NOT INCLUDING FL180
AIRSPACE FALLS 2 MOA ACT 500FT UP TO BUT NOT INCLUDING FL180
AIRSPACE VOLK WEST MOA ACT 100FT UP TO BUT NOT INCLUDING FL180
ILS SV 109.5 MHZ RWY 03 U/S
CYQY ILS 07 U/S
RWY 17/35 CLSD EXC XNG
AIRSPACE JACKAL MOA ACT 11000FT UP TO BUT NOT INCLUDING FL180
AIRSPACE JACKAL LOW MOA ACT 100FT-10999FT
AIRSPACE COLUMBUS 3 MOA ACT 8000FT UP TO BUT NOT INCLUDING FL180
AIRSPACE COLUMBUS 4 MOA ACT 10000FT UP TO BUT NOT INCLUDING FL180
AIRSPACE AR112L(E) ACT FL190-FL230
AIRSPACE AR112L(E) ACT FL190-FL230
AIRSPACE AR101(S) ACT FL260-FL290
AIRSPACE AR101(S) ACT FL260-FL290
AIRSPACE ABEL EAST MOA ACT 5000FT-12999FT
AIRSPACE AR20(SW) ACT FL240-FL280
OBST - CRANE IN AREA RADIUS 20M PSN 494110N0180622E.  HGT 30M AGL/283M AMSL. DAY MARKING
CYSB CRANE 1000 FT BEYOND THR 04 AND 1200 FT RIGHT RCL. 150 FT AGL 1293 MSL. NOT L

In [7]:
df4['TEXT'] = df4['TEXT'].apply(expandNotam)

In [8]:
for x in df4['TEXT'].head(20):
    print(x)

airspace volk south military operations area active foot up to but not including flight level
airspace volk east military operations area active foot up to but not including flight level
airspace falls military operations area active foot up to but not including flight level
airspace falls military operations area active foot up to but not including flight level
airspace volk west military operations area active foot up to but not including flight level
instrument landing system sv mhz runway intensity unknown (weather reports only) south or southern latitude
cyqy instrument landing system intensity unknown (weather reports only) south or southern latitude
runway closed except crossing
airspace jackal military operations area active foot up to but not including flight level
airspace jackal low military operations area active foot foot
airspace columbus military operations area active foot up to but not including flight level
airspace columbus military operations area active foot up to 

In [9]:
corpus = df4['TEXT'].values

count_vect = CountVectorizer(stop_words=stopwords.words('english'), lowercase=True)
x_counts = count_vect.fit_transform(corpus)
x_counts.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [10]:
tfidf_transformer = TfidfTransformer()
x_tfidf = tfidf_transformer.fit_transform(x_counts)

In [11]:
dimension = 8
sample_size = 100000

np.random.seed(232323)
sample = x_tfidf[np.random.choice(x_tfidf.shape[0], sample_size, replace=False), :]

lda = LDA(n_components = dimension)
lda_array = lda.fit_transform(sample)

In [12]:
topic_vec = lda.transform(x_tfidf)

df4['TOPIC'] = np.argmax(topic_vec, axis = 1)

In [13]:
import pickle

df4.to_pickle("data/allData.pkl")

In [14]:
import pyLDAvis
import pyLDAvis.sklearn
import warnings

warnings.filterwarnings('ignore')
pyLDAvis.enable_notebook()

In [15]:
pyLDAvis.sklearn.prepare(lda, x_tfidf, count_vect)

In [16]:
pyLDAvis.sklearn.prepare(lda, x_tfidf, count_vect, mds='mmds')

In [17]:
pyLDAvis.sklearn.prepare(lda, x_tfidf, count_vect, mds='tsne')

  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  from imp i