In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Input CSV locations. These are relative paths, so they are looked up from
# the directory the script/notebook is run in.
ICD_FILE_PATH = 'icd_codes.csv'
CPT_FILE_PATH = 'cpt_codes.csv'

# --- Data Loading ---
# Both files are required by the analysis below. If either one is missing,
# print a hint and abort instead of continuing with undefined DataFrames.
try:
    print(f"Loading data from {ICD_FILE_PATH} and {CPT_FILE_PATH}...")
    icd_df = pd.read_csv(ICD_FILE_PATH)
    cpt_df = pd.read_csv(CPT_FILE_PATH)
except FileNotFoundError:
    print("\n--- ERROR ---")
    print("Could not find the data files. Please create dummy files named 'icd_codes.csv' and 'cpt_codes.csv' to run this script.")
    # Stop the script with a non-zero status. The bare exit() helper is only
    # injected by the `site` module for interactive use and exits with
    # status 0, which would make this failure look like success to callers.
    raise SystemExit(1)
else:
    print("Data loaded successfully.")

# --- The Analysis: Calculate Word Count ---
print("Analyzing description lengths...")
# Copy each source description column into a common 'description' column as
# text. Missing values are replaced with '' first: a bare astype(str) would
# turn NaN into the literal string 'nan', which would then be counted as a
# one-word description and skew the distribution and its mean.
icd_df['description'] = (
    icd_df['LONG DESCRIPTION (VALID ICD-10 FY2025)'].fillna('').astype(str)
)
cpt_df['description'] = cpt_df['DESCRIPTION'].fillna('').astype(str)

# Number of whitespace-separated tokens in each description
# (0 for empty or missing descriptions).
icd_df['word_count'] = icd_df['description'].str.split().str.len()
cpt_df['word_count'] = cpt_df['description'].str.split().str.len()


# --- Visualization ---
def _plot_length_histogram(axis, frame, bar_color, title):
    """Draw the word-count histogram of *frame* on *axis* with a mean marker."""
    sns.histplot(data=frame, x='word_count', bins=30, ax=axis, color=bar_color)
    axis.set_title(title, fontsize=16, pad=20)
    axis.set_xlabel('Number of Words in Description', fontsize=12)
    # Dashed red vertical line at the mean, labelled in the legend.
    avg_words = frame['word_count'].mean()
    axis.axvline(
        avg_words,
        color='red',
        linestyle='--',
        linewidth=2,
        label=f"Average: {avg_words:.1f} words",
    )
    axis.legend()


print("Creating plots...")
plt.style.use('seaborn-v0_8-whitegrid')
# One row, two columns: ICD codes on the left, CPT codes on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 7))

# Left panel: ICD codes. This is the only panel with a y-axis label.
_plot_length_histogram(
    ax1, icd_df, '#4c72b0', 'Distribution of ICD Code Description Lengths'
)
ax1.set_ylabel('Number of Codes (Frequency)', fontsize=12)

# Right panel: CPT codes. The y-axis label is blanked out.
_plot_length_histogram(
    ax2, cpt_df, '#dd8452', 'Distribution of CPT Code Description Lengths'
)
ax2.set_ylabel('')

# Write the figure to disk, then release it.
plt.tight_layout()
plt.savefig('description_length_analysis.png', dpi=300)
print("Saved plot to 'description_length_analysis.png'")
plt.close()

print("\nEDA script finished successfully.")


Loading data from icd_codes.csv and cpt_codes.csv...
Data loaded successfully.
Analyzing description lengths...
Creating plots...
Saved plot to 'description_length_analysis.png'

EDA script finished successfully.
