In [None]:
import pandas as pd
import glob

In [None]:
# Folder that holds the generated report CSVs.
csv_folder = "/content/"

# Collect every CSV whose name starts with the expected prefix (adjust the
# prefix if the files are named differently). glob returns matches in
# arbitrary, filesystem-dependent order, so sort them: the row order of the
# combined dataset then stays the same from run to run.
csv_files = sorted(glob.glob(csv_folder + "final_reports_tinyllama*.csv"))

# List what was matched so a wrong folder or prefix is noticed immediately.
print(f"âœ… Found {len(csv_files)} CSV files:")
for f in csv_files:
    print(" -", f)


âœ… Found 1 CSV files:
 - /content/final_reports_tinyllama_fewshot_10k _valid.csv


In [None]:
# Load each matched CSV into its own DataFrame, then stack them into a
# single table with a fresh 0..n-1 index.
dfs = []
for csv_path in csv_files:
    dfs.append(pd.read_csv(csv_path))
combined_df = pd.concat(dfs, ignore_index=True)

# Show the total row count and preview the first three rows.
print(f"ðŸ“Š Combined dataset size: {len(combined_df)} rows")
print(combined_df.head(3))


ðŸ“Š Combined dataset size: 1000 rows
                                     Path  \
0  patient14487/study21/view1_frontal.jpg   
1   patient14487/study2/view1_frontal.jpg   
2  patient14487/study12/view1_frontal.jpg   

                                   Report_Impression  
0  The cardiomediastinal silhouette and pulmonary...  
1  Edema. Cardiomegaly. Pneumonia. Atelectasis. P...  
2  The cardiomediastinal silhouette and pulmonary...  


In [None]:
# Drop repeated (Path, Report_Impression) pairs, keeping the first (optional step).
before = len(combined_df)
combined_df = combined_df.drop_duplicates(subset=["Path", "Report_Impression"])
after = len(combined_df)
print(f"ðŸ§¹ Removed {before - after} duplicate rows. Remaining: {after}")

# Discard rows whose report is missing (NaN) or is empty once surrounding
# whitespace is stripped.
combined_df = combined_df.dropna(subset=["Report_Impression"])
has_text = combined_df["Report_Impression"].str.strip() != ""
combined_df = combined_df[has_text]

print(f"ðŸ§½ Cleaned empty rows. Remaining: {len(combined_df)}")


ðŸ§¹ Removed 0 duplicate rows. Remaining: 1000
ðŸ§½ Cleaned empty rows. Remaining: 992


In [None]:
def word_count(text):
    """Return the number of whitespace-separated tokens in ``str(text)``.

    The argument is stringified first, so a non-string value is counted by
    its string form (``None`` becomes ``"None"``, i.e. one token).
    """
    tokens = str(text).split()
    return len(tokens)

# Attach each report's word count as a new column on the combined table.
report_lengths = combined_df["Report_Impression"].apply(word_count)
combined_df["word_count"] = report_lengths

# Retain only reports with at least 15 words and renumber the index from 0.
long_enough = combined_df["word_count"] >= 15
clean_df = combined_df[long_enough].reset_index(drop=True)

print(f"âœ… Filtered dataset: {len(clean_df)} rows (â‰¥15 words)")


âœ… Filtered dataset: 907 rows (â‰¥15 words)


In [None]:
import re


# Regexes that flag a report as containing instruction/meta text. Every
# pattern is case-insensitive via the inline (?i) flag.
#
# Single-word markers are anchored with \b so they match whole words only:
# unanchored, "date" also hits "updated"/"consolidated" and "location" hits
# "dislocation", which would silently discard valid reports. The label
# patterns allow optional spaces/tabs before the separator so that
# "Explanation -" matches as well as "Explanation-" and "Explanation:".
bad_patterns = [
    r"(?i)\bexplanation[ \t]*[:\-]",  # "Explanation:" or "Explanation -"
    r"(?i)\boutput[ \t]*[:\-]",       # "Output:" or "Output -"
    r"(?i)the report must",           # instructions
    r"(?i)report title",              # meta format info
    r"(?i)\bauthors?\b",              # "author", "authors", "author(s)"
    r"(?i)\bdates?\b",                # "date" field mentions
    r"(?i)\blocations?\b",            # "location" field mentions
    r"(?i)standard format",           # instruction text
]


def is_bad_report(text):
    """Return True if ``text`` should be discarded as a bad report.

    Args:
        text: The report value to check.

    Returns:
        True when ``text`` is not a ``str`` (for example a missing value), or
        when any regex in ``bad_patterns`` matches somewhere in it; False
        otherwise.
    """
    # Anything that is not a string cannot be a usable report.
    if not isinstance(text, str):
        return True
    return any(re.search(pattern, text) for pattern in bad_patterns)

# Flag reports containing instruction/meta text, drop them, and report how
# many rows were removed.
before = len(clean_df)
flagged = clean_df["Report_Impression"].apply(is_bad_report)
clean_df = clean_df[~flagged].reset_index(drop=True)
after = len(clean_df)

print(f"ðŸ§¹ Removed {before - after} reports with instructions/meta text. Remaining: {after}")


ðŸ§¹ Removed 0 reports with instructions/meta text. Remaining: 907


In [None]:
# Destination for the cleaned dataset.
# NOTE(review): the input files matched earlier are "final_reports_tinyllama*",
# yet this name says "llama2_7b" — confirm the label is intended.
output_file = "/content/final_reports_llama2_7b_fewshot_10k_clean.csv"

# Leave the helper word_count column out of the saved file; index=False
# keeps the row index out of the CSV as well.
export_df = clean_df.drop(columns=["word_count"])
export_df.to_csv(output_file, index=False)

print(f"ðŸ’¾ Cleaned dataset saved to: {output_file}")


ðŸ’¾ Cleaned dataset saved to: /content/final_reports_llama2_7b_fewshot_10k_clean.csv
