In [1]:
# Import general libraries
import pandas as pd
import numpy as np

import datetime
import warnings; warnings.simplefilter('ignore')

# Seed NumPy's legacy global random generator so reruns draw the same numbers
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [2]:
# Import NLP libraries
import re
import gensim

from gensim.models.word2vec import Word2Vec
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [3]:
# Import Plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Raise pandas' display limits (columns, rows, characters per cell) for inspection
display_limits = {
    'display.max_columns': 50000,
    'display.max_rows': 100000,
    'display.max_colwidth': 500,
}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

In [5]:
# Read the processed data from ../data; the first CSV column becomes the index
data = pd.read_csv('../data/processed_data.csv', index_col=0)

In [6]:
# Keep ATA codes as strings, zero-padded on the left to at least 4 characters
ata_codes_as_text = data['ata4_code'].astype(str)
data['ata4_code'] = ata_codes_as_text.apply(str.zfill, args=(4,))

In [7]:
# Sanity check: (rows, columns) of the loaded frame
data.shape

(10062, 8)

In [8]:
# Preview the last 20 rows of the frame
data.tail(20)

Unnamed: 0,fleet,problem_log,corrective_action,delay_code,station_delay_minutes,d&c_category,ata4_code,ata4_desc
40208,B787-9,ceiling panel near door 4r is coming down,panel secured chkd nml,TA,58,Delay>15mins,2520,passenger compartment
40234,B787-9,mid galley coffee maker pos 509 leaking from the vent,adjusted coffee maker okay for service,TL,27,Delay>15mins,2500,equipment furnishings general
40235,B787-9,coffee maker 812 flowing constantly,removed and replaced coffee maker 812 ops and leak check ok rotable parts change was indicated tt 1431005 sn off 06210coffeemaker a off 78 2500 9 9001 6210 on 78 2500 9 9001 9805,TF,62,Delay>15mins,2500,equipment furnishings general
40236,B787-9,seat monitors at seats row 2 3 and 4 de inop,reboot ife system ops checks good,TA,27,Delay>15mins,2500,equipment furnishings general
40237,B787-9,lav missing an ashtray lav g has missing ashtray,replaced missing ashtray,TF,19,Delay>15mins,2500,equipment furnishings general
40238,B787-9,2 overhead bins 16 17d bull nose trim pulled away bin structure ok,secured loose trim,TF,20,Delay>15mins,2500,equipment furnishings general
40267,B787-9,f a bunk room door will not lock,replaced lock per amm b787 a 25 50 22 00a 520a a op check ok,TF,8,Delay 6-15mins,2500,equipment furnishings general
40268,B787-9,lav 3r not flushing,found lavatory not blacked toilet flush is normal,TF,17,Delay>15mins,2500,equipment furnishings general
40299,B787-9,fwd galley oven no 105 inop c b did not pop,replaced timmer ops check ok,TF,7,Delay 6-15mins,2500,equipment furnishings general
40336,B787-9,3l window button pushed into wall,secured window shade button at seat 3l and ops check good,TL,24,Delay>15mins,2500,equipment furnishings general


### Predict action for a specific log

In [9]:
import spacy
from spacy import displacy

import en_core_web_sm

In [10]:
# Sample corrective-action text to run through the NLP pipeline
text = 'removed and replaced coffee maker 812 ops and leak check ok rotable parts change was indicated'

# Load the en_core_web_sm spaCy pipeline and parse the sample text
nlp = en_core_web_sm.load()
doc = nlp(text)

In [11]:
# Tabulate each token's text, part-of-speech tag and dependency label.
# The frame is built once from a list of records instead of assigning rows
# one at a time into a preallocated frame.
token_df = pd.DataFrame(
    [(token.text, token.pos_, token.dep_) for token in doc],
    columns=['text', 'pos', 'dep'],
)
token_df

Unnamed: 0,text,pos,dep
0,removed,VERB,amod
1,and,CCONJ,cc
2,replaced,VERB,conj
3,coffee,NOUN,compound
4,maker,NOUN,dobj
5,812,NUM,nummod
6,ops,NOUN,nmod
7,and,CCONJ,cc
8,leak,NOUN,compound
9,check,NOUN,conj


In [12]:
# Tabulate the entities in doc.ents: text, character offsets and entity label.
# Building from records in one call lets pandas infer the offset columns'
# dtype rather than leaving them as object.
ent_df = pd.DataFrame(
    [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
    columns=['text', 'start_char', 'end_char', 'ent.label_'],
)
ent_df

Unnamed: 0,text,start_char,end_char,ent.label_
0,812,34,37,CARDINAL


In [13]:
# Syntactic summary of the parsed text: noun chunks, then lemmas of VERB tokens
noun_phrases = [chunk.text for chunk in doc.noun_chunks]
print("Noun phrases:", noun_phrases)

verb_lemmas = [token.lemma_ for token in doc if token.pos_ == "VERB"]
print("Verbs:", verb_lemmas)

Noun phrases: ['coffee maker']
Verbs: ['remove', 'replace', 'be', 'indicate']
