In [7]:
import pandas as pd
import re

# Load the dataset into a DataFrame; the 'patient' column of this frame is
# what the summarizing step further down reads.
# NOTE(review): this is an absolute, machine-specific path — the cell only runs
# on a machine where this exact file exists. Consider a repo-relative path.
file_path = 'C:/Users/tejas/Downloads/DSAI First Sem Tejaswi/Biostatistics and Artificial Intelligence In medicine/Repository_Final_Project/medLLM/data/pmc_patients/PMC-Patients-oa-9995.csv'
df = pd.read_csv(file_path)

# Define a function to trim and summarize patient descriptions
def trim_patient_description(description):
    # Extract age
    age_match = re.search(r'\b(\d{1,3})[- ]?(year[- ]old|years? old)\b', description, re.IGNORECASE)
    age = age_match.group(1) + " years old" if age_match else "Age not specified"
    
    # Extract gender
    gender_match = re.search(r'\b(male|female|man|woman)\b', description, re.IGNORECASE)
    gender = gender_match.group(1).capitalize() if gender_match else "Gender not specified"
    
    # Extract symptoms and conditions
    keywords = ["COVID-19", "pneumonia", "ICU", "surgery", "diabetes", "hypertension", 
                "stroke", "cancer", "fever", "fatigue", "pain", "infection", "fracture"]
    found_conditions = [kw for kw in keywords if re.search(rf'\b{kw}\b', description, re.IGNORECASE)]
    
    # Extract durations and time references
    time_matches = re.findall(r'\b\d+\s+(day|week|month|year)[s]?\b', description, re.IGNORECASE)
    duration = ", ".join(time_matches) if time_matches else "No time details"
    
    # Combine extracted details into a summary
    conditions = ", ".join(found_conditions) if found_conditions else "No specific conditions mentioned"
    summary = f"{age}, {gender}. Conditions: {conditions}. Duration: {duration}."
    return summary

# Apply the function to the 'patient' column, storing one summary per row
df['trimmed_patient'] = df['patient'].apply(trim_patient_description)

# Preview the trimmed descriptions
print(df[['patient', 'trimmed_patient']].head())

# Save the trimmed dataset to a new file.
# The target previously ended in '.ipynb'; to_csv writes CSV text, so that
# would put CSV data into (and overwrite) a notebook-named file — presumably
# this very notebook. The data is written to a '.csv' file instead.
output_path = 'C:/Users/tejas/Downloads/DSAI First Sem Tejaswi/Biostatistics and Artificial Intelligence In medicine/Repository_Final_Project/medLLM/data/pmc_patients/pmc_dataset_trimming.csv'
df.to_csv(output_path, index=False)
print(f"Trimmed dataset saved to: {output_path}")


                                             patient  \
0  This 60-year-old male was hospitalized due to ...   
1  A 39-year-old man was hospitalized due to an i...   
2  One week after a positive COVID-19 result this...   
3  This 69-year-old male was admitted to the ICU ...   
4  This 57-year-old male was admitted to the ICU ...   

                                     trimmed_patient  
0  60 years old, Male. Conditions: COVID-19, feve...  
1  39 years old, Man. Conditions: fever. Duration...  
2  57 years old, Male. Conditions: COVID-19, ICU....  
3  69 years old, Male. Conditions: COVID-19, pneu...  
4  57 years old, Male. Conditions: COVID-19, ICU,...  
Trimmed dataset saved to: C:/Users/tejas/Downloads/DSAI First Sem Tejaswi/Biostatistics and Artificial Intelligence In medicine/Repository_Final_Project/medLLM/data/pmc_patients/pmc_dataset_trimming.ipynb
