### Prep Work

In [1]:
# Import general libraries
import pandas as pd
import numpy as np

import datetime
import warnings; warnings.simplefilter('ignore')

np.random.seed(42)

In [2]:
# Import NLP libraries
import re
import gensim

from gensim.models.word2vec import Word2Vec
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

In [3]:
# Import Plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Increase size of dataframe
pd.set_option('display.max_columns', 50000)
pd.set_option('display.max_rows', 100000)
pd.options.display.max_colwidth = 500

In [5]:
# Read processed data
data = pd.read_csv('../data/processed_data.csv', index_col=0)


In [6]:
# ATA code should be an object with leading zeros
data['ata4_code'] = data['ata4_code'].astype(str).map(lambda x: x.zfill(4))

In [7]:
data.shape

(10062, 8)

In [8]:
data.tail(20)

Unnamed: 0,fleet,problem_log,corrective_action,delay_code,station_delay_minutes,d&c_category,ata4_code,ata4_desc
40208,B787-9,ceiling panel near door 4r is coming down,panel secured chkd nml,TA,58,Delay>15mins,2520,passenger compartment
40234,B787-9,mid galley coffee maker pos 509 leaking from the vent,adjusted coffee maker okay for service,TL,27,Delay>15mins,2500,equipment furnishings general
40235,B787-9,coffee maker 812 flowing constantly,removed and replaced coffee maker 812 ops and leak check ok rotable parts change was indicated tt 1431005 sn off 06210coffeemaker a off 78 2500 9 9001 6210 on 78 2500 9 9001 9805,TF,62,Delay>15mins,2500,equipment furnishings general
40236,B787-9,seat monitors at seats row 2 3 and 4 de inop,reboot ife system ops checks good,TA,27,Delay>15mins,2500,equipment furnishings general
40237,B787-9,lav missing an ashtray lav g has missing ashtray,replaced missing ashtray,TF,19,Delay>15mins,2500,equipment furnishings general
40238,B787-9,2 overhead bins 16 17d bull nose trim pulled away bin structure ok,secured loose trim,TF,20,Delay>15mins,2500,equipment furnishings general
40267,B787-9,f a bunk room door will not lock,replaced lock per amm b787 a 25 50 22 00a 520a a op check ok,TF,8,Delay 6-15mins,2500,equipment furnishings general
40268,B787-9,lav 3r not flushing,found lavatory not blacked toilet flush is normal,TF,17,Delay>15mins,2500,equipment furnishings general
40299,B787-9,fwd galley oven no 105 inop c b did not pop,replaced timmer ops check ok,TF,7,Delay 6-15mins,2500,equipment furnishings general
40336,B787-9,3l window button pushed into wall,secured window shade button at seat 3l and ops check good,TL,24,Delay>15mins,2500,equipment furnishings general


### Predict action for a specific log

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# String to check
to_check = ['coffee maker 812 flowing constantly']


In [11]:
# Matt Parker helped with this code

#Instantiate TFIDFVectorizer
tf_log = TfidfVectorizer(ngram_range=(2,8), analyzer='char_wb')

# Fit the model
tf_fit = tf_log.fit_transform(to_check)
tf_logs = tf_log.transform(data['problem_log'])

# Create a Dataframe containing cosine similarities
matching_logs = pd.DataFrame(cosine_similarity(tf_fit, tf_logs),
                             columns=data['problem_log' ]).T
matching_logs.columns=['cos_similarity']

In [12]:
# Merge with the original dataframe
matching_logs = pd.merge(matching_logs,data[['problem_log','corrective_action']],on='problem_log', how='left')

In [13]:
# Find corresponding action for highest cosine similarity
matching_logs.sort_values(by='cos_similarity', ascending=False).head(3)

Unnamed: 0,problem_log,cos_similarity,corrective_action
10588,coffee maker 812 flowing constantly,1.0,removed and replaced coffee maker 812 ops and leak check ok rotable parts change was indicated tt 1431005 sn off 06210coffeemaker a off 78 2500 9 9001 6210 on 78 2500 9 9001 9805
9686,fwd coffee maker will not stop brewing water is constantly coming out,0.853548,removed and replaced coffeemaker at position 212ops check normalrotable parts change was indicated tt 1465458 sn off 0688coffeemaker in off 31 2534 9 0001 0688 on 31 2534 9 0001 0010
4157,coffee maker 509 mid galley constantly overflows water,0.841361,rotable parts change was indicated tt 1673152 sn off 07610coffeemaker a off 78 2500 9 9001 7610 on 78 2500 9 9001 3907


#### Save matching logs to csv file

In [14]:
matching_logs.to_csv('../data/matching_logs.csv')

In [15]:
# NOT USING THESE FOR NOW
# regex_seat = []
# for i in data['problem_log']:
#     regex_seat_new = re.findall('[SEAT]{4,5}.?[0-9&]{1,}',  i)
#     regex_seat.append(regex_seat_new)

# data['regex_seat'] = regex_seat