In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import csv
import re

In [None]:
# Sample NOTAM text used to sanity-check expandNotam.
msg = "NAV ILS RWY 27 LOC/GP OUT OF SERVICE"

In [None]:
# Lookup table mapping each NOTAM contraction (CSV column 0) to its expansion
# (CSV column 1). It is read by expandNotam.
abbrev_dict = {}

# newline='' is what the csv module documentation asks for, so that the reader
# (not the file object) interprets line endings, including newlines embedded
# in quoted fields.
with open('data/Contractions (Clean).csv', mode='r', newline='') as infile:
    reader = csv.reader(infile)
    next(reader, None)  # skip the header row; tolerate a completely empty file
    # csv.reader yields [] for blank lines, so rows with fewer than two
    # columns are skipped instead of raising IndexError on rows[1].
    abbrev_dict = {rows[0]: rows[1] for rows in reader if len(rows) >= 2}
    
def expandNotam(msg, abbreviations=None):
    """Expand NOTAM contractions in ``msg`` and return normalized text.

    Every character that is not an ASCII letter (digits, punctuation, ...) is
    replaced by a space, the result is split on whitespace, and each token
    that is a key of the abbreviation table is replaced by its expansion.
    Lookups use the token's original case, so they are case-sensitive. The
    re-joined text is then lower-cased and stripped.

    Parameters
    ----------
    msg : object
        NOTAM text. It is passed through ``str()`` first, so a non-string
        value is processed via its string form (e.g. ``None`` -> ``'none'``).
    abbreviations : mapping of str to str, optional
        Contraction -> expansion table. When omitted, the module-level
        ``abbrev_dict`` is used.

    Returns
    -------
    str
        Lower-case, space-separated text with known contractions expanded.
    """
    if abbreviations is None:
        abbreviations = abbrev_dict
    letters_only = re.sub(r'[^a-zA-Z]', ' ', str(msg))
    # dict.get replaces the former bare ``except:``: an unknown token is kept
    # as-is, while unrelated errors are no longer silently swallowed.
    expanded = [abbreviations.get(wrd, wrd) for wrd in letters_only.split()]
    return ' '.join(expanded).lower().strip()

In [None]:
# Sanity check: expand the sample NOTAM; the returned string is the cell output.
expandNotam(msg)

In [None]:
# Load the pipe-delimited NOTAM export (UTF-16, quoting disabled). Lines the
# parser cannot handle are skipped, and the four date columns are parsed.
df4 = pd.read_csv(
    'data/notam_20201027_pipes_noquotes.csv',
    delimiter="|",
    engine="python",
    quoting=csv.QUOTE_NONE,
    encoding='utf-16',
    on_bad_lines='skip',
    parse_dates=['POSSIBLE_START_DATE', 'POSSIBLE_END_DATE', 'ISSUE_DATE', 'CANCELED_DATE'],
)

# Altitude columns: the literal 'MSL' is treated as 0, then everything is
# coerced to numeric (values that cannot be parsed become NaN).
for alt_col in ('MIN_ALT', 'MAX_ALT'):
    df4[alt_col] = pd.to_numeric(df4[alt_col].replace('MSL', 0), errors='coerce')

In [None]:
# Print the first 20 TEXT values, one per line.
first_texts = df4['TEXT'].head(20)
for notam_text in first_texts:
    print(notam_text)

In [None]:
# Run expandNotam over every TEXT value. The column is overwritten in place,
# so the original text is no longer available in df4 after this cell.
df4['TEXT'] = df4['TEXT'].map(expandNotam)

In [None]:
# Print the first 20 TEXT values again, one per line, for comparison.
first_texts = df4['TEXT'].head(20)
for notam_text in first_texts:
    print(notam_text)

In [None]:
# Bag-of-words term counts over the TEXT column, with NLTK's English stop
# words removed.
corpus = df4['TEXT'].values

count_vect = CountVectorizer(stop_words=stopwords.words('english'), lowercase=True)
x_counts = count_vect.fit_transform(corpus)
# Display the sparse matrix summary (shape / stored elements) instead of
# calling .todense(): materializing a dense documents x vocabulary matrix can
# exhaust memory and the dense output was only ever shown truncated.
x_counts

In [None]:
# Re-weight the raw term counts with TF-IDF: learn the IDF weights from
# x_counts, then apply them to the same matrix.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(x_counts)
x_tfidf = tfidf_transformer.transform(x_counts)

In [None]:
dimension = 8          # number of LDA topics (n_components)
sample_size = 100000   # maximum number of documents used to fit the model

np.random.seed(232323)
# Cap the draw at the corpus size: np.random.choice(..., replace=False) raises
# ValueError when asked for more rows than exist. With at least sample_size
# rows available, the draw is the same as before.
n_fit_rows = min(sample_size, x_tfidf.shape[0])
sample = x_tfidf[np.random.choice(x_tfidf.shape[0], n_fit_rows, replace=False), :]

# No random_state is passed here; per scikit-learn's convention that means the
# model uses NumPy's global RNG, which was seeded above.
lda = LDA(n_components = dimension)
lda_array = lda.fit_transform(sample)

In [None]:
# Score every document against the fitted topics, then label each row with
# the index of its highest-weight topic.
topic_vec = lda.transform(x_tfidf)

df4['TOPIC'] = topic_vec.argmax(axis=1)

In [None]:
# NOTE(review): `pickle` is not referenced directly in the visible cells;
# the save below goes through the DataFrame's own to_pickle method.
import pickle

# Persist the full frame (including the TEXT and TOPIC columns written by the
# earlier cells) to disk. Pickle files should only be loaded from trusted sources.
df4.to_pickle("data/allData.pkl")

In [None]:
import pyLDAvis
import pyLDAvis.sklearn
import warnings

# Silence every Python warning from this point on (global filter, not limited
# to pyLDAvis), then switch pyLDAvis to notebook display mode.
warnings.filterwarnings('ignore')
pyLDAvis.enable_notebook()

In [None]:
# Build the topic visualization for the fitted LDA model using pyLDAvis'
# default layout; the prepared object is the cell output.
pyLDAvis.sklearn.prepare(lda, x_tfidf, count_vect)

In [None]:
# Same visualization with mds='mmds' (metric multidimensional scaling per the
# pyLDAvis docs — confirm against the installed version).
pyLDAvis.sklearn.prepare(lda, x_tfidf, count_vect, mds='mmds')

In [None]:
# Same visualization with mds='tsne' (t-SNE layout per the pyLDAvis docs —
# confirm against the installed version).
pyLDAvis.sklearn.prepare(lda, x_tfidf, count_vect, mds='tsne')