In [69]:
import pandas as pd
from openai import OpenAI
import json



In [70]:
# Create the OpenAI API client; no arguments are passed, so library defaults apply.
# NOTE(review): presumably credentials are read from the environment (e.g. OPENAI_API_KEY) — confirm.
client = OpenAI()

In [71]:
# Load the flight-manual records; the file is decoded explicitly as ISO-8859-1.
df_flight_manuals = pd.read_csv('../data/flight_manuals.csv', encoding='ISO-8859-1')


In [72]:
def clean_column_names(df):
    """Return normalized versions of the column labels of ``df``.

    Each label is lower-cased and every space is replaced by an underscore.
    ``df`` itself is not modified; the caller decides whether to assign the
    returned list back to ``df.columns``.

    Args:
        df: Object exposing an iterable ``columns`` attribute of strings.

    Returns:
        list[str]: Normalized labels, in the original column order.
    """
    return [label.lower().replace(' ', '_') for label in df.columns]

In [73]:
# Replace the frame's column labels with their normalized (lower-case, underscored) form.
df_flight_manuals.columns = clean_column_names(df_flight_manuals)

In [74]:
# Report the number of missing values per column before cleaning.
print(df_flight_manuals.isnull().sum())

# Clean in a single chain and rebind the name instead of mutating the frame
# with inplace=True: drop rows containing any missing value, drop exact
# duplicate rows, then renumber the index so it is contiguous again.
df_flight_manuals = (
    df_flight_manuals
    .dropna()
    .drop_duplicates()
    .reset_index(drop=True)
)

manual_section    0
scenario          0
instructions      0
dtype: int64


In [75]:
# Show the frame's (rows, columns) after cleaning.
df_flight_manuals.shape

(300, 3)

In [8]:
# Convert the frame into a list of dicts, one per row, keyed by column name.
flight_manuals_documents = df_flight_manuals.to_dict(orient='records')


In [9]:
import hashlib

In [19]:
def generate_document_id(doc, instructions_prefix_len=10, id_length=8):
    """Derive a short, deterministic hex identifier for a flight-manual record.

    The id is the leading ``id_length`` hex characters of the MD5 digest of
    ``manual_section + scenario + instructions[:instructions_prefix_len]``.
    With the default arguments the result is identical to the ids produced
    before these parameters existed, so previously saved ids stay valid.

    Note that with the default prefix length, records that share the section,
    the scenario and the first 10 characters of their instructions receive
    the same id. Pass ``instructions_prefix_len=None`` to hash the complete
    instructions text instead.

    Args:
        doc: Mapping with string values under the keys ``'manual_section'``,
            ``'scenario'`` and ``'instructions'``.
        instructions_prefix_len: Number of leading characters of the
            instructions that take part in the hash; ``None`` uses them all.
        id_length: Number of hex characters kept from the digest.

    Returns:
        str: The truncated hexadecimal digest.

    Raises:
        KeyError: If one of the three required keys is missing from ``doc``.
    """
    instructions = doc['instructions']
    if instructions_prefix_len is not None:
        instructions = instructions[:instructions_prefix_len]

    combined = doc['manual_section'] + doc['scenario'] + instructions
    digest = hashlib.md5(combined.encode()).hexdigest()
    return digest[:id_length]

In [20]:
# create id for each row
for doc in flight_manuals_documents:
    doc['id'] = generate_document_id(doc)

In [21]:
# Spot-check one record to confirm the 'id' key was added.
flight_manuals_documents[3]

{'manual_section': 'AOM Section 3',
 'scenario': 'Icing Conditions',
 'instructions': 'Activate anti-ice systems, monitor fuel temperature, consider altitude change.',
 'id': '998902cf'}

In [24]:
# Persist the records (now including their ids) as pretty-printed JSON.
with open('../data/flight_manuals_documents-with-ids.json', 'wt') as f_out:
    serialized = json.dumps(flight_manuals_documents, indent=2)
    f_out.write(serialized)

In [25]:
# Prompt used to generate ground-truth questions for a single flight-manual record.
# {manual_section}, {scenario} and {instructions} are filled in with str.format;
# the doubled braces around the JSON example render as literal braces.
prompt_template = """
You emulate a user or captain of our AviMate Pilot cockpit assistant application.
Formulate 5 questions this user might ask based on a provided record.
Make the questions specific to this record.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record.

The record:

manual_section: {manual_section}
scenario : {scenario}
instructions: {instructions}


Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [26]:
# Render the template for the first record as a sanity check and print the result.
# The record is unpacked as keyword arguments, so its keys feed the template's placeholders.
prompt = prompt_template.format(**flight_manuals_documents[0])
print(prompt)


You emulate a user or captain of our AviMate Pilot cockpit assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

manual_section: AOM Section 10
scenario : Hydraulic Failure
instructions: Switch to backup hydraulic system, contact ATC for nearest airport diversion.


Provide the output in parsable JSON without using code blocks:

{"questions": ["question1", "question2", ..., "question5"]}


In [27]:
def llm(prompt):
    """Send ``prompt`` to the chat model as a single user message.

    Args:
        prompt: Text placed in the ``content`` of the one user message.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [28]:
# Ask the model for questions about the sample prompt and print the raw reply text.
questions = llm(prompt)
print(questions)

{"questions": ["What steps should I follow in the event of a hydraulic failure?", "Which hydraulic system should I switch to during a hydraulic failure?", "How do I contact ATC for a diversion due to hydraulic failure?", "What is the best way to identify the nearest airport for diversion?", "Are there any specific checklists or procedures for handling hydraulic failure?"]}


In [29]:
# Check that the raw reply parses as JSON; raises json.JSONDecodeError if it does not.
json.loads(questions)


{'questions': ['What steps should I follow in the event of a hydraulic failure?',
  'Which hydraulic system should I switch to during a hydraulic failure?',
  'How do I contact ATC for a diversion due to hydraulic failure?',
  'What is the best way to identify the nearest airport for diversion?',
  'Are there any specific checklists or procedures for handling hydraulic failure?']}

In [30]:
def generate_questions(doc):
    """Ask the model for ground-truth questions about one flight-manual record.

    Args:
        doc: Mapping unpacked into ``prompt_template.format``; it must supply
            every placeholder used by the template.

    Returns:
        The model's raw reply text. The prompt asks for a JSON object, but the
        reply is returned unparsed; parsing is left to the caller.
    """
    prompt = prompt_template.format(**doc)
    # Delegate to the shared helper rather than repeating the identical
    # chat-completion call (same model, same single user message).
    return llm(prompt)

In [32]:
from tqdm.auto import tqdm
import pickle


In [34]:
# Accumulator for generated questions, keyed by document id.
# NOTE(review): presumably kept in its own cell so that re-running the generation loop keeps earlier progress — confirm.
results = {}


In [35]:
# Generate questions once per distinct document id. Ids already present in
# `results` are skipped, so the cell can be re-run to resume after a failure.
for doc in tqdm(flight_manuals_documents):
    doc_id = doc['id']
    if doc_id not in results:
        # The reply is expected to be a JSON object with a 'questions' list.
        results[doc_id] = json.loads(generate_questions(doc))['questions']

  0%|          | 0/286 [00:00<?, ?it/s]

In [55]:
# Display the generated questions keyed by document id.
results

{'6f874e56': ['What steps should I take following a hydraulic failure in the cockpit?',
  'How do I switch to the backup hydraulic system during an emergency?',
  'What information do I need to provide to ATC when contacting them about a diversion?',
  'Is there a specific procedure I should follow to identify the nearest airport after a hydraulic failure?',
  'Can you confirm what section of the AOM provides guidance on hydraulic failures?'],
 '4fb290f8': ['What steps should I follow to switch to the backup navigation system?',
  "How do I ensure I'm maintaining safe altitude while flying manually?",
  'What specific preparations should I make for a manual landing?',
  'Are there any particular checks I should complete before switching to the backup system?',
  'What emergency protocols should I be aware of during this navigation system failure scenario?'],
 '9568af42': ['What steps should I take for manual gear extension in a landing gear malfunction scenario?',
  'How do I prepare f

In [36]:
# Flatten {doc_id: [question, ...]} into a list of (doc_id, question) pairs,
# preserving the order of ids and of the questions within each id.
final_results = [
    (doc_id, q)
    for doc_id, questions in results.items()
    for q in questions
]


In [39]:
# Peek at the first (document id, question) pair.
final_results[0]

('6f874e56',
 'What steps should I take following a hydraulic failure in the cockpit?')

In [40]:
# Build a two-column frame (id, question) from the flattened pairs.
df_results = pd.DataFrame(final_results, columns=['id', 'question'])


In [41]:
# Write the ground-truth questions to CSV, omitting the frame's index column.
df_results.to_csv('../data/ground-truth-data-flight-manuals.csv', index=False)


In [44]:
from collections import defaultdict

In [46]:
# Mapping from key to list; missing keys start out as an empty list.
hashes = defaultdict(list)
hashes

defaultdict(list, {})

In [47]:
# Bucket every document under its id so that ids shared by several documents become visible.
for doc in flight_manuals_documents:
    hashes[doc['id']].append(doc)

In [52]:
# Compare the number of distinct ids with the number of documents;
# a smaller first value means several documents share an id.
len(hashes), len(flight_manuals_documents)

(60, 286)

In [53]:
# Print each id that is shared by more than one document, with the number of documents sharing it.
for k, values in hashes.items():
    if len(values) <= 1:
        continue
    print(k, len(values))

6f874e56 5
4fb290f8 5
9568af42 8
998902cf 2
5adcfc37 5
e4bfc37e 5
2a2af0e9 3
33153f2b 3
88bb7920 8
6806b7b2 5
6c666f77 10
b3930114 5
58c923ff 5
1c1d54d7 6
de81c65e 6
ba61db15 4
ce9c85cb 9
d044fcb4 4
8067612f 5
c864430b 3
a881bd7c 4
37233acc 7
d543665e 4
5ac7c529 2
1013c799 2
9314dd83 3
d018d472 5
6d588250 5
f9f3c374 6
564b2dc8 5
65ee1297 3
527ce75a 8
0684cd8c 5
fb69630c 10
c95db9e4 3
9459129c 4
1fe2ca38 5
28b1837e 8
adb5cc7f 3
5f62756c 8
d26b56ff 4
736b9bc8 3
33307320 3
6e20e527 5
4f540448 5
3ab6904a 3
5a6bd9e3 6
b512f467 4
a12c4585 3
3753d3ea 9
9af5bf17 5
2f2be4e5 4
2ba83365 5
cc8c44b1 3
f3299990 5
de5b5ee6 2
645b9cc1 4
1ddfa77d 5


In [54]:
# Inspect every document grouped under this id.
hashes['6f874e56']


[{'manual_section': 'AOM Section 10',
  'scenario': 'Hydraulic Failure',
  'instructions': 'Switch to backup hydraulic system, contact ATC for nearest airport diversion.',
  'id': '6f874e56'},
 {'manual_section': 'AOM Section 10',
  'scenario': 'Hydraulic Failure',
  'instructions': 'Switch to backup hydraulic system, contact ATC for nearest airport diversion.',
  'id': '6f874e56'},
 {'manual_section': 'AOM Section 10',
  'scenario': 'Hydraulic Failure',
  'instructions': 'Switch to backup hydraulic system, contact ATC for nearest airport diversion.',
  'id': '6f874e56'},
 {'manual_section': 'AOM Section 10',
  'scenario': 'Hydraulic Failure',
  'instructions': 'Switch to backup hydraulic system, contact ATC for nearest airport diversion.',
  'id': '6f874e56'},
 {'manual_section': 'AOM Section 10',
  'scenario': 'Hydraulic Failure',
  'instructions': 'Switch to backup hydraulic system, contact ATC for nearest airport diversion.',
  'id': '6f874e56'}]