# What

The pytest suite relies on a set of data files to test the functionality of the engine and viewer.

Previously, I created these data subsets manually and somewhat arbitrarily.

It is better to create them all in one place, so that it is clear what data is used and needed.

## Modules

In [None]:
import pandas as pd 
import os
import shutil

This notebook is meant to be run on the assumption that there is an output folder containing the expected files.

Rather than hooking this up to the config file, the folder paths are simply written out below; it is a quick and simple hack that keeps things transparent.

In [None]:
# Folder holding the full engine output that the test subsets are cut from
# (path is relative to this notebook).
output_folder = "../../output/"
# Folder under tests/ that receives the reduced data sets created below.
test_output_folder = "../../tests/data/output/"

## Report titles

In [None]:
# Load the report titles pickle.
# NOTE(review): this reads from the *test* data folder and nothing in this
# notebook writes report_titles.pkl - confirm whether a copy step is missing.
titles_path = os.path.join(test_output_folder, "report_titles.pkl")
titles = pd.read_pickle(titles_path)

# Getting report PDFs

This is used by `test_PDFParser.py`.

In [None]:
# Create the test pdfs
#
# Rebuilds <test_output_folder>/report_pdfs from scratch and copies the PDF of
# every report id listed in `test_pdfs` from <output_folder>/report_pdfs.

# Report ids whose PDFs are copied into the test data folder.
test_pdfs = [
    "ATSB_r_2021_010",
    "ATSB_r_2021_004",
    "ATSB_a_2007_030",
    "ATSB_a_2002_646",
    "TSB_a_2022_O0118",
    "TSB_m_2021_A0041",
    "TSB_a_2011_F0012",
    "TAIC_r_2014_103",
    "TAIC_r_2004_121",
    "TAIC_a_2019_006",
]

source_report_pdfs = os.path.join(output_folder, "report_pdfs")
test_report_dfs = os.path.join(test_output_folder, "report_pdfs")

# Remove any previous copy first so PDFs dropped from the list do not linger.
shutil.rmtree(test_report_dfs, ignore_errors=True)
# makedirs rather than mkdir: it also creates a missing parent test folder,
# and exist_ok tolerates a folder that the best-effort rmtree could not remove.
os.makedirs(test_report_dfs, exist_ok=True)

for report_id in test_pdfs:
    pdf_name = f"{report_id}.pdf"
    # shutil.copy raises FileNotFoundError if the source PDF is missing, which
    # is the desired loud failure when the output folder is incomplete.
    shutil.copy(
        os.path.join(source_report_pdfs, pdf_name),
        os.path.join(test_report_dfs, pdf_name),
    )

## Creating the extracted reports data

This is used by:
- `test_RecommendationSafetyIssueLinking.py`
- `test_RecommendationResponseClassification.py`
- `test_Embedding.py`


In [None]:
# Load the full extracted reports and write a reproducible 50-row sample into
# the test data folder.
extracted_reports = pd.read_pickle(os.path.join(output_folder, "extracted_reports.pkl"))

# The destination is built from test_output_folder instead of a second
# hard-coded copy of the path, and ignore_index=True matches the
# "Extracted reports" cell further down that writes the very same file, so both
# cells now produce an identical fixture (same rows, index reset to 0..49).
extracted_reports.sample(n=50, random_state=42, ignore_index=True).to_pickle(
    os.path.join(test_output_folder, "extracted_reports.pkl")
)

# Creating report text dataset

This is used by `test_ReportExtracting.py`.

In [None]:
# Load the parsed reports and key them by report id so that individual
# reports can be selected with .loc below.
parsed_reports_path = os.path.join(output_folder, "parsed_reports.pkl")
report_text = pd.read_pickle(parsed_reports_path)
report_text = report_text.set_index("report_id")

report_text

In [None]:
# The report ids are collected in named groups and concatenated in the same
# order as before, because .loc returns the rows in the order of `ids`.

taic_base_ids = [
    "TAIC_m_2016_204",
    "TAIC_m_2020_202",
    "TAIC_r_2014_102",
    "TAIC_a_2014_004",
    "TAIC_m_2010_204",
    "TAIC_a_2010_001",
    "TAIC_r_2022_101",
    "TAIC_a_2010_009",
    "TAIC_r_2019_106",
]

# Kept because these were part of the previous extracted set, which is used by
# a lot of tests.
previous_extracted_ids = [
    "TAIC_m_2016_205",
    "TAIC_r_2002_122",
    "TAIC_r_2005_107",
    "TAIC_r_2004_113",
    "TAIC_a_2018_006",
    "TAIC_r_2001_104",
    "TAIC_r_2009_101",
    "TAIC_r_2012_102",
]

atsb_ids = [
    "ATSB_m_2000_157",
    "ATSB_a_2023_011",
    "ATSB_a_2007_012",
    "ATSB_m_2001_170",
    "ATSB_r_2021_002",
]

tsb_ids = [
    "TSB_r_2020_V0230",
]

# Included to be used for content_page reading
content_page_ids = [
    "TAIC_r_2019_102",
    "ATSB_a_2017_117",
    "ATSB_a_2014_073",
    "TSB_m_2002_C0018",
    "TSB_a_2005_C0187",
    "ATSB_m_2017_003",
    "ATSB_a_2021_018",
    "TSB_a_2004_H0001",
    "TSB_a_2020_P0013",
]

# Included for testing recommendation extraction
recommendation_ids = [
    "ATSB_a_2014_096",
    "ATSB_m_2013_011",
]

ids = (
    taic_base_ids
    + previous_extracted_ids
    + atsb_ids
    + tsb_ids
    + content_page_ids
    + recommendation_ids
)

# .loc with a list raises KeyError if any id is missing from report_text.
filtered_report_text = report_text.loc[ids]

filtered_report_text

In [None]:
# Smaller selection of the filtered reports, grouped by agency prefix and
# concatenated in the original order.
taic_testing_ids = [
    "TAIC_m_2016_204",
    "TAIC_r_2002_122",
    "TAIC_a_2010_001",
    "TAIC_m_2020_202",
    "TAIC_r_2019_106",
    "TAIC_a_2018_006",
    "TAIC_m_2010_204",
]
atsb_testing_ids = [
    "ATSB_m_2000_157",
    "ATSB_a_2007_012",
    "ATSB_a_2023_011",
    "ATSB_m_2001_170",
    "ATSB_r_2021_002",
]
tsb_testing_ids = ["TSB_r_2020_V0230"]

testing_ids = taic_testing_ids + atsb_testing_ids + tsb_testing_ids
testing_report_text = filtered_report_text.loc[testing_ids]

In [None]:
# Write the filtered parsed reports into the test data folder.
parsed_reports_fixture = os.path.join(test_output_folder, "parsed_reports.pkl")
filtered_report_text.to_pickle(parsed_reports_fixture)

# Extracted reports

In [None]:
# Load the full extracted reports from the engine output folder and show them.
extracted_reports_path = os.path.join(output_folder, "extracted_reports.pkl")
extracted_reports = pd.read_pickle(extracted_reports_path)

extracted_reports

In [None]:
# Display the extracted reports file as it currently exists in the test data folder.
current_extracted_fixture = os.path.join(test_output_folder, "extracted_reports.pkl")
pd.read_pickle(current_extracted_fixture)

In [None]:
# Take a reproducible 50-row sample (fixed seed, index reset) and store it in
# the test data folder.
extracted_sample = extracted_reports.sample(n=50, random_state=42, ignore_index=True)
extracted_sample.to_pickle(os.path.join(test_output_folder, "extracted_reports.pkl"))

# Embeddings

In [None]:
# Sample every embeddings pickle from the engine output and write the reduced
# copy, under the same file name, into the test data folder.
source_embeddings_folder = os.path.join(output_folder, "embeddings")
test_embeddings_folder = os.path.join(test_output_folder, "embeddings")

embedding_files = os.listdir(source_embeddings_folder)

embedding_dfs = [pd.read_pickle(os.path.join(source_embeddings_folder, file)) for file in embedding_files]

# Reproducible 10-row sample per file; sample() raises ValueError if a file
# holds fewer than 10 rows.
embedding_dfs = [df.sample(n=10, random_state=42, ignore_index=True) for df in embedding_dfs]

# Create the *destination* folder. Previously this call targeted the source
# folder (which necessarily exists, since it was just listed), so the writes
# below failed whenever the test embeddings folder was missing.
os.makedirs(test_embeddings_folder, exist_ok=True)

for name, df in zip(embedding_files, embedding_dfs):
    df.to_pickle(os.path.join(test_embeddings_folder, name))
    print(df)

## Vector db

In [None]:
# Build the test vector db from the embeddings in the engine output folder.
import dotenv
import engine.utils.EngineOutputStorage as EngineOutputStorage

# Load variables from a .env file into the process environment, so that the
# Azure credentials read below are available.
dotenv.load_dotenv()

# NOTE(review): the arguments are passed positionally, so what each one means
# depends on the EngineOutputUploader signature, which is not visible here -
# confirm against engine/utils/EngineOutputStorage before reordering anything.
# The two os.environ lookups raise KeyError if the variables are not set.
# The paths are hard-coded rather than derived from output_folder.
uploader = EngineOutputStorage.EngineOutputUploader(
    os.environ['AZURE_STORAGE_ACCOUNT_NAME'],
    os.environ['AZURE_STORAGE_ACCOUNT_KEY'],
    "engineoutput",
    None,
    "../../tests/data/vector_db",
    "../../output/embeddings/safety_issues_embeddings.pkl",
    "../../output/embeddings/recommendations_embeddings.pkl",
    "../../output/embeddings/report_sections_embeddings.pkl",
    "../../output/embeddings/report_text_embeddings.pkl",
)

# The leading underscore marks this as a non-public method by convention.
# NOTE(review): sample_frac=0.01 presumably keeps about 1% of the embeddings -
# confirm against the implementation.
uploader._upload_embeddings(sample_frac=0.01)

In [None]:
import lancedb

# Open the test vector db and pull the "all_document_types" table into a
# DataFrame for inspection.
vector_db = lancedb.connect("../../tests/data/vector_db")
table = vector_db.open_table("all_document_types")
data = table.to_pandas()

data

In [None]:
# Count the rows whose `document` text contains "work".
contains_work = data["document"].str.contains("work")
contains_work.sum()

## Response classification

In [None]:
# Load the recommendation response classification output and display it.
rec_classification_path = os.path.join(output_folder, "recommendation_response_classification.pkl")
rec_classification = pd.read_pickle(rec_classification_path)
rec_classification

In [None]:
# ATSB website safety issues
#
# Reads the ATSB website safety issues pickle from the engine output folder
# and writes a subset of it, under the same file name, into the test data folder.

atsb_safety_issues = pd.read_pickle(os.path.join(output_folder, "atsb_website_safety_issues.pkl"))

# `[:-10]` keeps every row except the last 10.
# NOTE(review): if a small test subset was intended, `[:10]` or `[-10:]` may
# have been meant here - confirm before changing.
atsb_safety_issues[:-10].to_pickle(os.path.join(test_output_folder, "atsb_website_safety_issues.pkl"))