In [69]:
import pandas as pd
from openai import OpenAI
import json



In [70]:
client = OpenAI()

In [71]:
df_flight_manuals = pd.read_csv('../data/flight_manuals.csv', encoding='ISO-8859-1')


In [72]:
def clean_column_names(df):
    clean_column_names = []
    for col in df.columns:
        clean_column_names.append(col.lower().replace(' ','_'))
     
    return clean_column_names

In [73]:
df_flight_manuals.columns = clean_column_names(df_flight_manuals)

In [74]:
# Check for missing values
print(df_flight_manuals.isnull().sum())

# Drop rows with missing values (if any)
df_flight_manuals.dropna(inplace=True)

# drop duplicates
df_flight_manuals.drop_duplicates(inplace=True)

# Reset index after dropping rows
df_flight_manuals.reset_index(drop=True, inplace=True)

manual_section    0
scenario          0
instructions      0
dtype: int64


In [75]:
df_flight_manuals.shape

(300, 3)

In [79]:
flight_manuals_documents = df_flight_manuals.to_dict(orient='records')


In [80]:
import hashlib

In [81]:
def generate_document_id(doc):
    combined = f"{doc['manual_section'] + doc['scenario'] + doc['instructions'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [82]:
# create id for each row
for doc in flight_manuals_documents:
    doc['id'] = generate_document_id(doc)

In [83]:
flight_manuals_documents[3]

{'manual_section': 'AOM Section 7',
 'scenario': 'Fuel Leak',
 'instructions': 'Monitor fuel levels closely, communicate fuel status to ATC.',
 'id': 'd2530459'}

In [84]:
with open('../data/flight_manuals_documents-with-ids.json', 'wt') as f_out:
    json.dump(flight_manuals_documents, f_out, indent=2)

In [85]:
prompt_template = """
You emulate a user or captain of our AviMate Pilot cockpit assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

manual_section: {manual_section}
scenario : {scenario}
instructions: {instructions}


Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

In [86]:
prompt = prompt_template.format(**flight_manuals_documents[0])
print(prompt)


You emulate a user or captain of our AviMate Pilot cockpit assistant application.
Formulate 5 questions this user might ask based on a provided exercise.
Make the questions specific to this exercise.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

manual_section: AOM Section 3
scenario : Electrical Failure
instructions: Advise cabin crew of situation, maintain visual contact with other aircraft.


Provide the output in parsable JSON without using code blocks:

{"questions": ["question1", "question2", ..., "question5"]}


In [87]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [89]:
questions = llm(prompt)
print(questions)

{"questions": ["What steps should I take to inform the cabin crew about the electrical failure?", "How do I maintain visual contact with other aircraft during this scenario?", "Are there specific protocols for communicating with other aircraft in case of an electrical failure?", "What actions should I prioritize when dealing with an electrical failure situation?", "Is there any immediate checklist I need to follow for electrical failure management?"]}


In [90]:
json.loads(questions)


{'questions': ['What steps should I take to inform the cabin crew about the electrical failure?',
  'How do I maintain visual contact with other aircraft during this scenario?',
  'Are there specific protocols for communicating with other aircraft in case of an electrical failure?',
  'What actions should I prioritize when dealing with an electrical failure situation?',
  'Is there any immediate checklist I need to follow for electrical failure management?']}

In [91]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [92]:
from tqdm.auto import tqdm
import pickle


In [93]:
results = {}


In [94]:
for doc in tqdm(flight_manuals_documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    questions = json.loads(questions_raw)
    results[doc_id] = questions['questions']

  0%|          | 0/300 [00:00<?, ?it/s]

In [95]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))


In [96]:
final_results[0]

('61162cbd',
 'What specific instructions should I give to the cabin crew during an electrical failure?')

In [97]:
df_results = pd.DataFrame(final_results, columns=['id', 'question'])


In [98]:
df_results.to_csv('../data/ground-truth-data-flight-manuals.csv', index=False)


In [99]:
from collections import defaultdict

In [100]:
hashes = defaultdict(list)
hashes

defaultdict(list, {})

In [101]:
for doc in flight_manuals_documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [102]:
len(hashes), len(flight_manuals_documents)

(300, 300)

In [103]:
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

In [54]:
hashes['6f874e56']


[{'manual_section': 'AOM Section 10',
  'scenario': 'Hydraulic Failure',
  'instructions': 'Switch to backup hydraulic system, contact ATC for nearest airport diversion.',
  'id': '6f874e56'},
 {'manual_section': 'AOM Section 10',
  'scenario': 'Hydraulic Failure',
  'instructions': 'Switch to backup hydraulic system, contact ATC for nearest airport diversion.',
  'id': '6f874e56'},
 {'manual_section': 'AOM Section 10',
  'scenario': 'Hydraulic Failure',
  'instructions': 'Switch to backup hydraulic system, contact ATC for nearest airport diversion.',
  'id': '6f874e56'},
 {'manual_section': 'AOM Section 10',
  'scenario': 'Hydraulic Failure',
  'instructions': 'Switch to backup hydraulic system, contact ATC for nearest airport diversion.',
  'id': '6f874e56'},
 {'manual_section': 'AOM Section 10',
  'scenario': 'Hydraulic Failure',
  'instructions': 'Switch to backup hydraulic system, contact ATC for nearest airport diversion.',
  'id': '6f874e56'}]