In [2]:
import pandas as pd
from tqdm.auto import tqdm
import re
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, BulkIndexError
from openai import OpenAI
from sentence_transformers import SentenceTransformer

In [153]:
df_atc_communications = pd.read_csv('../data/atc_communications_realistic.csv', encoding='ISO-8859-1')


In [154]:
df_atc_communications.head()

Unnamed: 0,Time_UTC,call_sign,Frequency,Message,Altitude,Phase,Event_Flag
0,00:05:00,UAE201,121.9,"UAE201, Dubai Ground, pushback and engine star...",0,Pre-flight,
1,00:10:00,UAE201,121.8,"UAE201, taxi to runway 12R via taxiways Alpha ...",0,Taxi,
2,00:20:00,UAE201,118.75,"UAE201, Dubai Tower, wind 110 degrees at 5 kno...",0,Takeoff,
3,00:25:00,UAE201,124.5,"UAE201, contact Dubai Departure on 124.5. Good...",3000,Departure,
4,00:30:00,UAE201,124.5,"UAE201, climb and maintain flight level 150.",10000,Climb,


In [155]:
df_atc_communications.shape

(27, 7)

### Data Preprocessing

In [156]:
def clean_column_names(df):
    clean_column_names = []
    for col in df.columns:
        clean_column_names.append(col.lower().replace(' ','_'))
     
    return clean_column_names

In [157]:
df_atc_communications.columns = clean_column_names(df_atc_communications)

In [158]:
df_atc_communications.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   time_utc    27 non-null     object
 1   call_sign   27 non-null     object
 2   frequency   27 non-null     object
 3   message     27 non-null     object
 4   altitude    27 non-null     int64 
 5   phase       27 non-null     object
 6   event_flag  7 non-null      object
dtypes: int64(1), object(6)
memory usage: 1.6+ KB


In [159]:
# Check for missing values
print(df_atc_communications.isnull().sum())

df_atc_communications.reset_index(drop=True, inplace=True)

time_utc       0
call_sign      0
frequency      0
message        0
altitude       0
phase          0
event_flag    20
dtype: int64


In [160]:
df_atc_communications['event_flag'] = df_atc_communications['event_flag'].fillna('No Event')

In [161]:
df_atc_communications.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27 entries, 0 to 26
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   time_utc    27 non-null     object
 1   call_sign   27 non-null     object
 2   frequency   27 non-null     object
 3   message     27 non-null     object
 4   altitude    27 non-null     int64 
 5   phase       27 non-null     object
 6   event_flag  27 non-null     object
dtypes: int64(1), object(6)
memory usage: 1.6+ KB


In [162]:
df_atc_communications['text'] = df_atc_communications['frequency'] + ". " + df_atc_communications['message'] + ". " + df_atc_communications['phase'] + ". " + df_atc_communications['event_flag']


In [163]:
df_atc_communications.head()

Unnamed: 0,time_utc,call_sign,frequency,message,altitude,phase,event_flag,text
0,00:05:00,UAE201,121.9,"UAE201, Dubai Ground, pushback and engine star...",0,Pre-flight,No Event,"121.900. UAE201, Dubai Ground, pushback and en..."
1,00:10:00,UAE201,121.8,"UAE201, taxi to runway 12R via taxiways Alpha ...",0,Taxi,No Event,"121.800. UAE201, taxi to runway 12R via taxiwa..."
2,00:20:00,UAE201,118.75,"UAE201, Dubai Tower, wind 110 degrees at 5 kno...",0,Takeoff,No Event,"118.750. UAE201, Dubai Tower, wind 110 degrees..."
3,00:25:00,UAE201,124.5,"UAE201, contact Dubai Departure on 124.5. Good...",3000,Departure,No Event,"124.500. UAE201, contact Dubai Departure on 12..."
4,00:30:00,UAE201,124.5,"UAE201, climb and maintain flight level 150.",10000,Climb,No Event,"124.500. UAE201, climb and maintain flight lev..."


In [164]:
df_atc_communications[df_atc_communications['event_flag'] == "Emergency"]

Unnamed: 0,time_utc,call_sign,frequency,message,altitude,phase,event_flag,text
21,16:00:00,UAE201,125.600,"UAE201, we have an indication of engine failur...",36000,Cruise,Emergency,"125.600. UAE201, we have an indication of engi..."
22,16:05:00,UAE201,125.600,"UAE201, roger, descend and maintain 10,000 fee...",10000,Descent,Emergency,"125.600. UAE201, roger, descend and maintain 1..."
23,17:00:00,UAE201,119.100,"UAE201, cleared for emergency landing runway 2...",3000,Approach,Emergency,"119.100. UAE201, cleared for emergency landing..."
24,17:10:00,UAE201,118.500,"UAE201, Jakarta Tower, runway 25R cleared to l...",0,Landing,Emergency,"118.500. UAE201, Jakarta Tower, runway 25R cle..."
25,17:20:00,UAE201,121.700,"UAE201, taxi to stand, follow the escort vehicle.",0,Taxi to Gate,Emergency,"121.700. UAE201, taxi to stand, follow the esc..."
26,17:30:00,UAE201,---,-- Aircraft grounded for inspection --,0,Ground,Emergency,---. -- Aircraft grounded for inspection --. G...


In [165]:
es_client = Elasticsearch('http://localhost:9200')

In [166]:
index_name = 'atc_communications'

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "time_utc": {"type": "text"},
            "call_sign": {"type": "keyword"},
            "frequency": {"type": "text"},
            "message": {"type": "text"},
            "phase": {"type": "keyword"},
            "event_flag": {"type": "keyword"},
            "text": {"type": "text"},
            "source":{"type":"text"},
            "embedding": {"type": "dense_vector", "dims": 384, "index": True, "similarity": "cosine"}  # Adjust dims based on embedding size
        }
    }
}


In [167]:
# Delete the index if it already exists
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

# Create the index
es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'atc_communications'})

In [168]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [169]:
len(model.encode("This is a simple sentence"))

384

In [170]:
df_atc_communications['source'] = 'atc_communication'


In [171]:
def get_embedding(text):
    return model.encode(text).tolist()

# Generate embeddings for all texts
df_atc_communications['embedding'] = df_atc_communications['text'].apply(get_embedding)


 ### Indexing Data with Embeddings

In [172]:
# Prepare Data for Indexing

records = df_atc_communications.to_dict(orient='records')


In [173]:
records[0]

{'time_utc': '00:05:00',
 'call_sign': 'UAE201',
 'frequency': '121.900',
 'message': 'UAE201, Dubai Ground, pushback and engine start approved. Expect runway 12R for departure.',
 'altitude': 0,
 'phase': 'Pre-flight',
 'event_flag': 'No Event',
 'text': '121.900. UAE201, Dubai Ground, pushback and engine start approved. Expect runway 12R for departure.. Pre-flight. No Event',
 'source': 'atc_communication',
 'embedding': [0.05607745051383972,
  0.01026538759469986,
  -0.0053070527501404285,
  0.016052762046456337,
  0.017317432910203934,
  0.012573706917464733,
  -0.08177617937326431,
  -0.007833302021026611,
  -0.02114078402519226,
  -0.016542963683605194,
  0.04867243766784668,
  -0.029997212812304497,
  -0.020496241748332977,
  -0.03377648815512657,
  -0.08672980964183807,
  -0.027755580842494965,
  0.032688774168491364,
  -0.1455686092376709,
  -0.018805749714374542,
  0.013807186856865883,
  0.07517226785421371,
  0.004148000851273537,
  0.07723048329353333,
  0.0456048361957073

In [174]:
def generate_actions():
    for record in records:
        yield {
            "_index": index_name,
            "_source": record
            
        }

In [175]:
def bulk_index_data(es_client, actions):
    try:
        bulk(es_client, generate_actions())
        print("Bulk indexing completed successfully!")

    except BulkIndexError as e:
        print(f"{len(e.errors)} documents failed to index")
        failed_documents = e.errors
        for i, error in enumerate(failed_documents, 1):
            action = error['index']
            error_info = action.get('error', {})
            document_id = action.get('_id', "N/A")
            status = action.get('status', "Unknown Status")

            print(f"\nFailed Document {i}:")
            print(f"ID: {document_id}")
            print(f"Status: {status}")
            print(f"Error Type: {error_info.get('type')}")
            print(f"Reason: {error_info.get('reason')}")
            document_source = action.get('data', {})
            print(f"Document Content: {document_source}")

    

In [176]:
bulk_index_data(es_client, generate_actions())


Bulk indexing completed successfully!


#### Implementing User Query Functions

In [177]:
client = OpenAI()

In [178]:
def get_user_query(query):
    # For demonstration, you can hardcode a query or accept input
    return query


In [179]:
def get_query_embedding(query):
    return model.encode(query).tolist()


In [180]:
conversation_history = []


In [197]:
def elastic_search(query_vector, index_name, es_client, top_k=10):
    # Define the script query
    script_query = {
        "script_score": {
            "query": {
                "match_all": {}
            },
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }
    
    try:
        # Execute the search query
        response = es_client.search(
            index=index_name,
            body={
                "size": top_k,
                "query": script_query,
                "_source": ["time_utc", "frequency", "message", "phase","event_flag","text"]
            }
        )
        
        # Collect the search results
        result_docs = [hit['_source'] for hit in response['hits']['hits']]
        
        return result_docs  # Return the results for further processing if needed
        
    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [195]:
def retrieve_documents(query):
    query_vector = get_query_embedding(query)
    # Search in ATC communications
    retrieved_docs = elastic_search(query_vector, index_name, es_client, top_k=10)
    return retrieved_docs

In [196]:
retrieve_documents('all communications related to the emergency')

An error occurred: BadRequestError(400, 'search_phase_execution_exception', 'Text fields are not optimised for operations that require per-document field data like aggregations and sorting, so these operations are disabled by default. Please use a keyword field instead. Alternatively, set fielddata=true on [time_utc] in order to load field data by uninverting the inverted index. Note that this can use significant memory.')


In [184]:
def format_retrieved_documents(retrieved_docs):
    formatted_docs = ""
    for doc in retrieved_docs:
        formatted_docs += f"Time: {doc['time_utc']}\n"
        formatted_docs += f"Frequency: {doc['frequency']}\n"
        formatted_docs += f"Message: {doc['message']}\n\n"
        formatted_docs += f"Phase: {doc['phase']}\n\n"
        formatted_docs += f"Event: {doc['event_flag']}\n\n"

    return formatted_docs

In [185]:

def build_prompt(question, retrieved_docs):

    prompt_template = """
    You are an assistant helping pilots with atc communications etc. Based on the following ATC communication information, answer the question:

    ATC communication information:
    {retrieved_docs}

    Question:
    {question}

    Answer:
    """.strip()

    # Format the retrieved documents
    formatted_docs = format_retrieved_documents(retrieved_docs)
    
    # Construct the prompt
    prompt = prompt_template.format(retrieved_docs=formatted_docs, question=question)
    
    return prompt

In [186]:
def build_messages(question, retrieved_docs, conversation_history):
    # System prompt without retrieved documents
    system_prompt = """
    You are an assistant helping pilots with giving informations such as flight manuals, ATC communications etc. Use the provided ATC communication information to answer the user's questions.
    """.strip()

    # Build the messages list
    messages = []

    # Add the system prompt
    messages.append({'role': 'system', 'content': system_prompt})

    # Add the retrieved documents as a system message
    formatted_docs = format_retrieved_documents(retrieved_docs)
    messages.append({'role': 'system', 'content': f"ATC communication information:\n{formatted_docs}"})

    # Add recent conversation history (limit to last N messages to stay within token limits)
    messages.extend(conversation_history[-10:])  # Adjust N as needed

    # Add the user's latest question
    messages.append({'role': 'user', 'content': question})

    return messages


In [187]:
def generate_answer(messages):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=messages,
        max_tokens=2500,
        temperature=0
    )
    answer = response.choices[0].message.content
    return answer


In [188]:
def rag(query):
    query = get_user_query(query)
    retrieved_docs = retrieve_documents(query)
    messages = build_messages(query, retrieved_docs, conversation_history)
    answer = generate_answer(messages)
    # Update the conversation history
    conversation_history.append({'role': 'user', 'content': query})
    conversation_history.append({'role': 'assistant', 'content': answer})

    return answer

In [189]:
answer = rag("summarize communications related to the emergency")
print(answer)

Here is a summary of the communications related to the emergency for flight UAE201:

1. **16:00:00 (Frequency: 125.600)** - UAE201 reported an indication of engine failure and requested an immediate descent.
   
2. **16:05:00 (Frequency: 125.600)** - ATC responded, instructing UAE201 to descend and maintain 10,000 feet, and to turn left heading 270 degrees for a vector back to Jakarta.

3. **17:00:00 (Frequency: 119.100)** - UAE201 was cleared for an emergency landing on runway 25R, with emergency services standing by.

4. **17:10:00 (Frequency: 118.500)** - UAE201 was cleared to land on runway 25R.

5. **17:20:00 (Frequency: 121.700)** - After landing, UAE201 was instructed to taxi to the stand and follow the escort vehicle.

6. **17:30:00** - The aircraft was grounded for inspection.

This sequence outlines the critical communications during the emergency situation, from the initial engine failure indication to the aircraft being grounded for inspection after landing.


In [190]:
answer = rag("What was the ATC message at 16:00 UTC?")
print(answer)

At 16:00:00 UTC, the ATC message was: "UAE201, we have an indication of engine failure, requesting immediate descent." This communication indicated an emergency situation for flight UAE201.


In [191]:
answer = rag('Did ATC report any turbulence today?')
print(answer)

Yes, ATC reported moderate turbulence for flight UAE201 at 04:00:00 UTC, advising the flight to climb to flight level 370.


In [192]:
answer = rag('is there Landing Gear Malfunction?')
print(answer)

There is no specific communication indicating a landing gear malfunction for flight UAE201 in the provided ATC communications. The emergency reported was related to an engine failure, and all landing clearances were given without mention of any landing gear issues.


In [193]:
answer = rag("summarize last 3 atc communications")
print(answer)

Here is a summary of the last three ATC communications for flight UAE201:

1. **Time: 00:10:00 (Frequency: 121.800)** - UAE201 was instructed to taxi to runway 12R via taxiways Alpha and Bravo, and to hold short of runway 12R.

2. **Time: 09:45:00 (Frequency: 119.100)** - UAE201 was cleared to turn left heading 180 degrees, descend to 3,000 feet, and was cleared for the ILS approach to runway 25R.

3. **Time: 08:00:00 (Frequency: 126.900)** - UAE201 was instructed to descend and maintain flight level 330.

These communications cover taxi instructions, approach clearance, and descent instructions for the flight.


In [111]:
df_flight_manuals = pd.read_csv('../data/flight_manuals.csv', encoding='ISO-8859-1')


In [112]:
df_flight_manuals.columns = clean_column_names(df_flight_manuals)

In [113]:
df_flight_manuals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   manual_section  300 non-null    object
 1   scenario        300 non-null    object
 2   instructions    300 non-null    object
dtypes: object(3)
memory usage: 7.2+ KB


In [114]:
df_flight_manuals['text'] = df_flight_manuals['scenario'] + ". " + df_flight_manuals['instructions']


In [121]:
df_flight_manuals['source'] = 'flight_manual'


In [124]:
index_name_flight_manuals = 'flight_manuals'

index_settings_flight_manuals = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "manual_section": {"type": "keyword"},
            "scenario": {"type": "text"},
            "instructions": {"type": "text"},
            "text": {"type": "text"},
            "source":{"type": "text"},
            "embedding": {"type": "dense_vector", "dims": 384, "index": True, "similarity": "cosine"}  # Adjust dims based on embedding size
        }
    }
}

In [125]:
# Delete the index if it already exists
if es_client.indices.exists(index=index_name_flight_manuals):
    es_client.indices.delete(index=index_name_flight_manuals)

# Create the index
es_client.indices.create(index=index_name_flight_manuals, body=index_settings_flight_manuals)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'flight_manuals'})

In [126]:
df_flight_manuals['embedding'] = df_flight_manuals['text'].apply(get_embedding)


In [127]:
flight_manuals_records = df_flight_manuals.to_dict(orient='records')


In [None]:
def elastic_search_combine(query_vector, index_name, es_client, top_k=10, source_fields=None):
    # Define the script query
    script_query = {
        "script_score": {
            "query": {
                "match_all": {}
            },
            "script": {
                "source": "cosineSimilarity(params.query_vector, 'embedding') + 1.0",
                "params": {"query_vector": query_vector}
            }
        }
    }
    
    try:
        # Execute the search query
        response = es_client.search(
            index=index_name,
            body={
                "size": top_k,
                "query": script_query,
                "_source": source_fields or ["*"]
            }
        )
        
        # Collect the search results
        result_docs = [hit['_source'] for hit in response['hits']['hits']]
        
        return result_docs  # Return the results for further processing if needed
        
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [129]:
def retrieve_documents_all(query):
    query_vector = get_query_embedding(query)
    # Search in ATC communications
    atc_docs = elastic_search_combine(query_vector, index_name, es_client, top_k=10)
    # serach in flight manual
    flight_manual_docs = elastic_search_combine(query_vector, index_name_flight_manuals, es_client, top_k=10)
    # Combine results
    retrieved_docs = (atc_docs or []) + (flight_manual_docs or [])

    return retrieved_docs

In [131]:
def format_retrieved_documents_all(retrieved_docs):
    formatted_docs = ""

    for doc in retrieved_docs:
        if doc.get('source') == "atc_communication":
            formatted_docs = f"Time: {doc['time_utc']}, Message: {doc['message']}, Phase: {doc['phase']}, Event Flag: {doc['event_flag']}\n"

        elif doc.get('source') == "flight_manual":
            formatted_docs = f"Manual Section: {doc['manual_section']}, Scenario: {doc['scenario']}, Instructions: {doc['instructions']}\n"

        else:
            # Handle other sources if any
            formatted_docs = f"{doc}\n"          

        return formatted_docs

In [132]:
def build_messages_combine(question, retrieved_docs, conversation_history):
    # System prompt without retrieved documents
    system_prompt = """
    You are an assistant helping pilots with giving informations such as flight manuals, ATC communications etc. Use the provided ATC communication information and flight manual to answer the user's questions.
    """.strip()

    # Build the messages list
    messages = []

    # Add the system prompt
    messages.append({'role': 'system', 'content': system_prompt})

    # Add the retrieved documents as a system message
    formatted_docs = format_retrieved_documents_all(retrieved_docs)
    messages.append({'role': 'system', 'content': f"ATC communication information:\n{formatted_docs}"})

    # Add recent conversation history (limit to last N messages to stay within token limits)
    messages.extend(conversation_history[-10:])  # Adjust N as needed

    # Add the user's latest question
    messages.append({'role': 'user', 'content': question})

    return messages

In [136]:
def generate_answer_all(messages):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=messages,
        max_tokens=2500,
        temperature=0
    )
    answer = response.choices[0].message.content
    return answer

In [134]:
def rag_combine(query):
    query = get_user_query(query)
    retrieved_docs = retrieve_documents_all(query)
    messages = build_messages_combine(query, retrieved_docs, conversation_history)
    answer = generate_answer_all(messages)
    # Update the conversation history
    conversation_history.append({'role': 'user', 'content': query})
    conversation_history.append({'role': 'assistant', 'content': answer})

    return answer

In [141]:
question1 = "summarize communications related to the emergency"
answer1 = rag_combine(question1)
print("Assistant:", answer1)

Assistant: Here is a summary of the communications related to the emergency for flight UAE201:

1. **Engine Failure Report**: At 16:00:00 UTC, flight UAE201 reported an indication of engine failure and requested an immediate descent.

2. **ATC Response**: ATC instructed the flight to descend to 10,000 feet and provided a left turn heading to return to Jakarta.

3. **Emergency Landing Clearance**: ATC cleared flight UAE201 for an emergency landing on runway 25R and confirmed that emergency services were standing by.

These communications indicate a clear sequence of events from the initial report of engine failure to the coordination for an emergency landing.


In [142]:
question2 = "What is manual procedure to handle this failure?"
answer2 = rag_combine(question2)
print("Assistant:", answer2)

Assistant: To handle an engine failure, pilots typically follow the emergency procedures outlined in the aircraft's flight manual. While the specific procedures can vary by aircraft type, here are general steps that are commonly included in engine failure procedures:

1. **Maintain Control of the Aircraft**:
   - Fly the aircraft at the appropriate airspeed, typically the best glide speed if the engine is lost.

2. **Identify the Failed Engine**:
   - Confirm which engine has failed (if applicable) using engine instruments and indications.

3. **Declare an Emergency**:
   - Communicate with ATC to declare an emergency and request assistance.

4. **Perform Engine Failure Checklist**:
   - Follow the specific checklist for engine failure, which may include:
     - Reducing power on the operating engine.
     - Feathering the propeller (if applicable).
     - Adjusting the mixture and fuel flow as necessary.

5. **Prepare for Landing**:
   - Plan for a landing at the nearest suitable airp

In [143]:
question3 = "build some sentences to communicate with the passengers?"
answer3 = rag_combine(question3)
print("Assistant:", answer3)

Assistant: Here are some example sentences you can use to communicate with passengers during an engine failure situation:

1. **Initial Announcement**:
   - "Ladies and gentlemen, this is your captain speaking. We have encountered a technical issue with one of our engines, and we are currently following our emergency procedures."

2. **Reassurance**:
   - "I want to assure you that we are trained to handle this situation, and we are in communication with air traffic control to ensure a safe landing."

3. **Instructions**:
   - "Please remain seated with your seatbelts fastened. We will be preparing for an emergency landing shortly."

4. **Preparation for Landing**:
   - "As we approach the airport, I ask that you follow the instructions of the cabin crew and prepare for landing. Please ensure your seatbacks and tray tables are in their upright positions."

5. **Final Reassurance**:
   - "Thank you for your understanding and cooperation. We will keep you updated as we proceed, and we ap