<a href="https://colab.research.google.com/github/Bempong-Sylvester-Obese/RSNA-Intracranial-Aneurysm-Detection/blob/main/EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Some Kaggle data sources are private.
# Run this cell to log in to Kaggle so that those data sources can be imported.
import kagglehub
kagglehub.login()


In [None]:
# Run this cell to import the Kaggle data sources used by this notebook;
# it may be deleted afterwards.
# Note: this notebook environment differs from Kaggle's Python environment,
# so libraries used by the notebook may be missing.
# NOTE(review): the return value is presumably the local directory holding the
# downloaded competition files -- confirm against the kagglehub documentation.

rsna_intracranial_aneurysm_detection_path = kagglehub.competition_download('rsna-intracranial-aneurysm-detection')

print('Data source import complete.')


In [None]:
import pandas as pd
import os
import numpy as np
import pydicom
import nibabel as nib
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from tqdm.notebook import tqdm
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Notebook-wide pandas display settings, applied in one pass.
display_options = {
    'display.max_columns': None,  # show every column
    'display.max_rows': 100,      # raise the number of rows shown
    'max_colwidth': None,         # never truncate cell contents
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [None]:
# Resolve the dataset root.
# The default is the Kaggle input mount. When that directory does not exist
# (e.g. the notebook runs on Colab) and the download cell has been run, use the
# path returned by kagglehub.competition_download() instead of a path that is
# not there. globals().get() keeps this cell working even if the download cell
# was deleted.
KAGGLE_INPUT_DIR = "/kaggle/input/rsna-intracranial-aneurysm-detection/"
_downloaded_dir = globals().get("rsna_intracranial_aneurysm_detection_path")
if _downloaded_dir and not os.path.isdir(KAGGLE_INPUT_DIR):
    DATA_DIR = _downloaded_dir
else:
    DATA_DIR = KAGGLE_INPUT_DIR

# NOTE(review): confirm the sub-folder names against the actual dataset layout
# (the segmentation folder may be named "segmentations").
SEGMENTATION_DIR = os.path.join(DATA_DIR, "segmentation")
SERIES_DIR = os.path.join(DATA_DIR, "series")

In [None]:
# --- 1. Load and Inspect CSV Data ---
# Read both CSV tables from DATA_DIR, report their row counts, then preview
# the first rows of each.
print("--- 1. Loading and Inspecting CSV Data ---")
train_csv_path = os.path.join(DATA_DIR, "train.csv")
localizers_csv_path = os.path.join(DATA_DIR, "train_localizers.csv")
train_df = pd.read_csv(train_csv_path)
localizers_df = pd.read_csv(localizers_csv_path)

n_train_rows = len(train_df)
n_localizer_rows = len(localizers_df)
print(f"Loaded {n_train_rows} rows from train.csv")
print(f"Loaded {n_localizer_rows} rows from train_localizers.csv")

print("\n--- train.csv head ---")
train_preview = train_df.head()
display(train_preview)

print("\n--- train_localizers.csv head ---")
localizers_preview = localizers_df.head()
display(localizers_preview)

In [None]:
# Show the column index of each table, training table first.
for frame in (train_df, localizers_df):
    print(frame.columns)

In [None]:
# --- 2. Statistical Analysis and Visualization of CSV Data ---
print("\n--- 2. Statistical Analysis and Visualization of CSV Data ---")

# Number of series per imaging modality.
plt.figure(figsize=(8, 5))
modality_ax = sns.countplot(data=train_df, x='Modality', palette='viridis')
modality_ax.set_title('Distribution of Modalities in Training Data')
modality_ax.set_xlabel('Modality')
modality_ax.set_ylabel('Number of Series')
modality_ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

# Number of series with and without an aneurysm.
plt.figure(figsize=(6, 4))
presence_ax = sns.countplot(data=train_df, x='Aneurysm Present', palette='coolwarm')
presence_ax.set_title('Distribution of Aneurysm Presence')
presence_ax.set_xlabel('Aneurysm Present (0: No, 1: Yes)')
presence_ax.set_ylabel('Number of Series')
# Replace the 0/1 tick labels with readable names.
presence_ax.set_xticks([0, 1])
presence_ax.set_xticklabels(['No Aneurysm', 'Aneurysm'])
presence_ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()

In [None]:
# 6. Analyze modalities
def analyze_modality_distribution(train_df):
    """Print and plot how many series each imaging modality contributes.

    Args:
        train_df: DataFrame that is expected to contain a 'Modality' column.

    Returns:
        The ``value_counts()`` Series of the 'Modality' column, or ``None``
        when the column is absent (a message is printed in that case).
    """
    banner = "="*50
    print("\n" + banner)
    print("🖼️ IMAGING MODALITY ANALYSIS")
    print(banner)

    # Nothing to analyse without the column: report it and bail out early.
    if 'Modality' not in train_df.columns:
        print("❌ No 'Modality' column found in train.csv")
        return None

    per_modality = train_df['Modality'].value_counts()
    print("Modality Distribution:")
    print(per_modality)

    # One figure, two panels: absolute counts (bar) and shares (pie).
    plt.figure(figsize=(12, 6))

    plt.subplot(1, 2, 1)
    per_modality.plot(kind='bar', color='skyblue', edgecolor='black')
    plt.title('Distribution of Imaging Modalities')
    plt.xlabel('Modality')
    plt.ylabel('Number of Series')
    plt.xticks(rotation=45)

    plt.subplot(1, 2, 2)
    plt.pie(
        per_modality.values,
        labels=per_modality.index,
        autopct='%1.1f%%',
        startangle=90,
    )
    plt.title('Modality Distribution (%)')

    plt.tight_layout()
    plt.show()

    return per_modality
# Run the modality analysis on the training table and keep its return value
# (the per-modality counts, or None when the 'Modality' column is missing).
modality_counts = analyze_modality_distribution(train_df)

In [None]:
# Group age into bins.
# Coerce PatientAge to numeric first: pd.cut needs numeric input, and the
# notebook otherwise only performs this conversion in a later cell.
train_df['PatientAge'] = pd.to_numeric(train_df['PatientAge'], errors='coerce')

# Bins are right-closed. include_lowest=True keeps age 0 in the first group,
# and the open-ended top edge makes the "81+" label cover every age above 80
# instead of silently dropping ages over 100.
train_df['AgeGroup'] = pd.cut(
    train_df['PatientAge'],
    bins=[0, 20, 40, 60, 80, np.inf],
    labels=["0–20", "21–40", "41–60", "61–80", "81+"],
    include_lowest=True,
)

# Sex distribution
sns.countplot(x="PatientSex", data=train_df)
plt.title("Patient Sex Distribution")
plt.show()


In [None]:
# Count of series in each age group, split by patient sex.
age_sex_ax = sns.countplot(x="AgeGroup", hue="PatientSex", data=train_df)
age_sex_ax.set_title("Age Group by Sex")
plt.show()

In [None]:
# Age Distribution by Aneurysm Status
import seaborn as sns
import matplotlib.pyplot as plt

# Make sure ages are numeric; entries that cannot be parsed become NaN.
train_df['PatientAge'] = pd.to_numeric(train_df['PatientAge'], errors='coerce')

# Overlaid step histograms (with KDE curves), one per aneurysm status.
plt.figure(figsize=(10, 5))
age_ax = sns.histplot(
    data=train_df,
    x='PatientAge',
    hue='Aneurysm Present',
    bins=30,
    kde=True,
    element='step',
    palette=['#1f77b4', '#ff7f0e'],
)
age_ax.set_title('Age Distribution by Aneurysm Status')
age_ax.set_xlabel('Age')
age_ax.set_ylabel('Count')
plt.show()

# Summary statistics of age for each group.
with_aneurysm = train_df['Aneurysm Present'] == 1
without_aneurysm = train_df['Aneurysm Present'] == 0
print("Age stats for aneurysm cases:\n", train_df.loc[with_aneurysm, 'PatientAge'].describe())
print("\nAge stats for non-aneurysm cases:\n", train_df.loc[without_aneurysm, 'PatientAge'].describe())

In [None]:
# Sex Distribution by Aneurysm Status
# Grouped bars: one bar per aneurysm status within each sex.
plt.figure(figsize=(6, 4))
sex_ax = sns.countplot(
    data=train_df,
    x='PatientSex',
    hue='Aneurysm Present',
    palette=['#1f77b4', '#ff7f0e'],
)
sex_ax.set_title('Sex Distribution by Aneurysm Status')
sex_ax.set_xlabel('Sex')
sex_ax.set_ylabel('Count')
plt.show()

# The same counts as a table, with row/column totals.
sex_by_status = pd.crosstab(
    train_df['PatientSex'],
    train_df['Aneurysm Present'],
    margins=True,
    margins_name="Total",
)
print(sex_by_status)

In [None]:

# Get location columns: every column of train_df that is not series/patient
# metadata. 'AgeGroup' is a derived column added by the age-binning cell, so it
# must be excluded as well; otherwise it would be treated as a brain location,
# pushed through the numeric coercion below (overwriting it in train_df) and
# shown in both charts.
non_location_cols = {
    'SeriesInstanceUID',
    'PatientAge',
    'PatientSex',
    'Modality',
    'Aneurysm Present',
    'AgeGroup',
}
location_cols = [col for col in train_df.columns if col not in non_location_cols]

# Convert to numeric if not already; unparsable or missing values count as 0.
train_df[location_cols] = train_df[location_cols].apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# Calculate count per location and each location's share of all location labels.
location_counts = train_df[location_cols].sum().sort_values(ascending=False)
location_percentages = location_counts / location_counts.sum() * 100

# Plotting
fig, axes = plt.subplots(1, 2, figsize=(18, 8))

# Bar chart
sns.barplot(x=location_counts.values, y=location_counts.index, ax=axes[0], palette='mako')
axes[0].set_title("Aneurysm Count per Brain Location")
axes[0].set_xlabel("Count")
axes[0].set_ylabel("Brain Location")

# Annotate each bar with its count and percentage, just right of the bar end.
for i, (count, percent) in enumerate(zip(location_counts.values, location_percentages.values)):
    axes[0].text(count + 1, i, f"{int(count)} ({percent:.1f}%)", va='center')

# Pie chart: labels already carry count and percentage, so autopct is disabled.
axes[1].pie(location_counts.values,
            labels=[f"{loc}\n{val} ({pct:.1f}%)" for loc, val, pct in zip(location_counts.index, location_counts.values, location_percentages.values)],
            autopct=None,
            startangle=140,
            colors=sns.color_palette("mako", len(location_counts)))
axes[1].set_title("Aneurysm Distribution by Location (Percentage)")

plt.tight_layout()
plt.show()


### train_localizers.csv

In [None]:
import ast

# The 'coordinates' column holds Python literals stored as text; evaluate each
# one safely and keep the parsed objects in a new 'coords' column.
parsed_coordinates = localizers_df['coordinates'].apply(ast.literal_eval)
localizers_df['coords'] = parsed_coordinates

In [None]:
# Pull the 'x' and 'y' entries out of each parsed coordinate dict into their
# own columns; a missing key yields NaN.
for axis_key in ('x', 'y'):
    localizers_df[axis_key] = localizers_df['coords'].apply(
        lambda coord, key=axis_key: coord.get(key, np.nan)
    )

preview_columns = ['SeriesInstanceUID', 'SOPInstanceUID', 'location', 'x', 'y']
print(localizers_df[preview_columns].head())

In [None]:
# Analyze Location Distribution
# Horizontal bars, most frequent location at the top.
locations_by_frequency = localizers_df['location'].value_counts().index
plt.figure(figsize=(10, 5))
location_ax = sns.countplot(data=localizers_df, y="location", order=locations_by_frequency)
location_ax.set_title("Brain Location Distribution (where aneurysm is present)")
plt.show()


In [None]:
# Heatmap of Coordinate Densities
# Only rows where both x and y are present can be binned.
heatmap_data = localizers_df.loc[:, ['x', 'y']].dropna()
x_values = heatmap_data['x']
y_values = heatmap_data['y']

# 50x50 two-dimensional histogram of the coordinate pairs.
plt.hist2d(x_values, y_values, bins=50, cmap='hot')
plt.colorbar(label='Frequency')
heatmap_ax = plt.gca()
heatmap_ax.set_title("Heatmap of Aneurysm Coordinates")
heatmap_ax.set_xlabel("x")
heatmap_ax.set_ylabel("y")
plt.show()

In [None]:
# Attach the localizer rows to the training table by series UID.
# A left join keeps every row of train_df; series with no localizer entry get
# NaN in the localizer columns.
merged_df = train_df.merge(localizers_df, how='left', on='SeriesInstanceUID')


In [None]:
# Age vs Location: one box of patient ages per aneurysm location.
plt.figure(figsize=(10, 6))
age_box_ax = sns.boxplot(x='location', y='PatientAge', data=merged_df)
plt.xticks(rotation=45)
age_box_ax.set_title("Patient Age by Aneurysm Location")
plt.show()

# Sex vs Location: per-location counts split by patient sex.
plt.figure(figsize=(10, 5))
sex_count_ax = sns.countplot(x='location', hue='PatientSex', data=merged_df)
plt.xticks(rotation=45)
sex_count_ax.set_title("Aneurysm Location by Patient Sex")
plt.tight_layout()
plt.show()

In [None]:
# Scatter of localizer (x, y) points for aneurysm-positive rows, coloured by
# location; the legend sits outside the axes on the right.
aneurysm_rows = merged_df[merged_df['Aneurysm Present'] == 1]

plt.figure(figsize=(10, 8))
scatter_ax = sns.scatterplot(
    data=aneurysm_rows,
    x='x',
    y='y',
    hue='location',
    palette='viridis',
    s=100,
    alpha=0.7,
)
scatter_ax.set_title('Aneurysm Spatial Distribution (x-y plane)')
scatter_ax.set_xlabel('X Coordinate')
scatter_ax.set_ylabel('Y Coordinate')
scatter_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

In [None]:
# Brain Location Frequency
# Count localizer rows per brain location among aneurysm-positive rows.
# Note: this rebinds `location_counts`, which the earlier location cell also assigns.
positive_locations = merged_df.loc[merged_df['Aneurysm Present'] == 1, 'location']
location_counts = positive_locations.value_counts()

# Horizontal bar chart of the counts.
plt.figure(figsize=(12, 6))
frequency_ax = sns.barplot(x=location_counts.values, y=location_counts.index, palette='rocket')
frequency_ax.set_title('Aneurysm Frequency by Brain Location')
frequency_ax.set_xlabel('Count')
frequency_ax.set_ylabel('Location')
plt.show()

In [None]:
# Display the columns of the merged table.
merged_df.columns

In [None]:
# Both plots below use only the aneurysm-positive rows of the merged table.
positive_cases = merged_df[merged_df['Aneurysm Present'] == 1]

# Age distribution by location
plt.figure(figsize=(12, 6))
age_by_location_ax = sns.boxplot(x='location', y='PatientAge', data=positive_cases, palette='Set3')
age_by_location_ax.set_title('Age Distribution by Aneurysm Location')
plt.xticks(rotation=45)
plt.show()

# Sex distribution by location
plt.figure(figsize=(12, 6))
sex_by_location_ax = sns.countplot(x='location', hue='PatientSex', data=positive_cases, palette='Set2')
sex_by_location_ax.set_title('Sex Distribution by Aneurysm Location')
plt.xticks(rotation=45)
sex_by_location_ax.legend(title='Sex')
plt.show()