In [None]:
import re

import numpy as np
import pandas as pd
import torch
from transformers import pipeline

In [None]:
# Load the report table from a CSV located relative to the working directory.
df = pd.read_csv('reports_with_image_paths.csv')

In [None]:
# Remove the 'Unnamed: 0' column (presumably a leftover index from an earlier
# CSV export -- confirm). Rebinding instead of mutating in place, together with
# errors='ignore', keeps this cell safe to re-run: a second execution no longer
# raises KeyError because the column is already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Build one text blob per report by joining its sections row by row.
# `str(df['technique'])` would stringify the *entire* Series (its printed repr),
# giving every row the same garbage value, so the join is done with the
# vectorised `.str.cat` instead. Missing sections become empty strings rather
# than the literal 'nan', and a space separator keeps adjacent sections from
# being glued together.
section_cols = ['technique', 'comparison', 'findings', 'impression']
sections = df[section_cols].fillna('').astype(str)
df['merged_text'] = sections[section_cols[0]].str.cat(sections[section_cols[1:]], sep=' ')

In [None]:
# Instantiate the text2text generator once. Use the GPU when one is available
# and fall back to CPU otherwise, so the notebook still runs on machines
# without CUDA instead of failing at pipeline construction.
pipe = pipeline(
    "text2text-generation",
    model="valhalla/t5-small-qa-qg-hl",
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

In [None]:
def clean_text(text):
    """Normalise one report string, returning NaN when nothing meaningful remains.

    Parameters
    ----------
    text : str or missing value
        Raw text. Anything `pd.isna` reports as missing is passed through as NaN.

    Returns
    -------
    str or float
        The text with `___` placeholders and stand-alone `None` / `None.`
        tokens removed and surrounding whitespace trimmed, or `np.nan` when
        the result is empty.
    """
    if pd.isna(text):
        return np.nan
    text = text.replace('___', '')  # Remove placeholders
    # Match 'None' only as a whole word (optionally followed by a period) so
    # that words such as 'Nonetheless' are left intact.
    text = re.sub(r'\bNone\b\.?', '', text)
    # Strip *after* the removals: deleting tokens can leave whitespace-only
    # residue or a leading space, which must still collapse to NaN / be trimmed.
    text = text.strip()
    return text if text else np.nan

# Apply the cleaning function to each free-text column
text_columns = ['merged_text']
for col in text_columns:
    df[col] = df[col].apply(clean_text)

# Drop rows whose cleaned text is missing. A bare `dropna(how='all')` only
# removes rows where *every* column is NaN, which never happens while other
# columns (e.g. 'path') are populated, so empty reports would slip through.
df_cleaned = df.dropna(subset=text_columns, how='all')

# Standardize text by converting to lowercase. The 'path' column is skipped on
# purpose: lowercasing file paths breaks lookups on case-sensitive filesystems.
df_cleaned = df_cleaned.apply(
    lambda column: column.str.lower()
    if column.dtype == "object" and column.name != 'path'
    else column
)


In [None]:
# Display the column labels of the cleaned frame as a quick schema check.
df_cleaned.columns

In [None]:
def _run_prompt(generator, prompt):
    """Return the decoded text of the first sequence generated for `prompt`."""
    # A text2text-generation pipeline returns a list of dicts keyed by
    # 'generated_text', not a dict with 'Question' / 'Answer' entries.
    return generator(prompt)[0]['generated_text']


def _generate_qa_pairs(text, generator):
    """Produce (question, answer) tuples for one report text.

    Two passes over the same model: first extract candidate answer spans, then
    generate one question per span with that span highlighted in the text.

    NOTE(review): the 'extract answers:' / 'generate question:' prefixes and
    the '<hl>' / '<sep>' markers are taken from the model author's published
    usage for valhalla/t5-small-qa-qg-hl -- confirm against the model card.
    The whole text is highlighted as one span for extraction; highlighting one
    sentence at a time may yield more pairs.
    """
    raw_answers = _run_prompt(generator, f"extract answers: <hl> {text} <hl>")
    for marker in ('<pad>', '</s>'):
        raw_answers = raw_answers.replace(marker, '')
    # dict.fromkeys de-duplicates while preserving the model's output order.
    candidates = dict.fromkeys(part.strip() for part in raw_answers.split('<sep>'))

    pairs = []
    for answer in candidates:
        start = text.find(answer) if answer else -1
        if start < 0:
            # The span cannot be located verbatim, so it cannot be highlighted.
            continue
        end = start + len(answer)
        highlighted = f"{text[:start]}<hl> {answer} <hl>{text[end:]}"
        question = _run_prompt(generator, f"generate question: {highlighted}").strip()
        if question:
            pairs.append((question, answer))
    return pairs


QA = {'Question': [], 'Answer': [], 'Path': []}

# Iterate over row values directly: `df_cleaned[i]` is a *column* lookup and
# raises KeyError, and positional access would also break once rows have been
# dropped and the index is no longer contiguous.
for text, path in zip(df_cleaned['merged_text'], df_cleaned['path']):
    if not isinstance(text, str) or not text:
        continue  # skip rows whose text is missing (NaN) or empty
    for question, answer in _generate_qa_pairs(text, pipe):
        QA['Question'].append(question)
        QA['Answer'].append(answer)
        QA['Path'].append(path)
    

In [None]:
# QA is a plain dict of equal-length lists and has no `to_csv` method, so wrap
# it in a DataFrame before writing. index=False avoids emitting a spurious
# 'Unnamed: 0' column when the file is read back.
pd.DataFrame(QA).to_csv('Question-Corpus.csv', index=False)