Necessary packages:

In [None]:
import glob
import os
import warnings


import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import pydicom
from pydicom.data import get_testdata_files
from sklearn.model_selection import train_test_split

from IPython.display import Markdown, display

 # Import functions from the module
import importlib
import help_files._0_definitions 
import  help_files._1_visuals_script
# import  help_files._01_load_data
 # Reload the module to apply the changes to the script
importlib.reload(help_files._0_definitions)
importlib.reload(help_files._1_visuals_script)
# importlib.reload(help_files._01_load_data)
import  help_files._1_visuals_script  as pauls_vs
# Group by 'condition', 'level', and 'severity' and count occurrences
from help_files._0_definitions import count_severity_by_condition_level 
# Define the path
from pathlib import Path

pd.set_option("display.width", 1000)  # Set a large width to prevent line wrapping
 

Run _01_load_data.py and _0_definitions.py


In [None]:
### The definitions script holds the helper functions and globals used throughout this notebook.
# Both scripts are executed with exec() inside the notebook's own namespace, so every
# top-level name they define becomes a notebook global.
# NOTE(review): `data_path_vor`, which later cells use, is presumably defined by one of
# these scripts — confirm.
with open("help_files/_0_definitions.py") as file:
    exec(file.read())

with open("help_files/_01_load_data.py") as file:
    exec(file.read())


In [None]:
# Read the three CSV tables from data_path_vor into DataFrames, in the order of file_names
file_names = ["X_train.csv", "X_train_coor.csv", "X_train_des.csv"]
dataframes = []
for file_name in file_names:
    dataframes.append(pd.read_csv(data_path_vor / file_name))
X_train, X_train_coor, X_train_des = dataframes

### X_train manipulation

In [None]:
X_train

In [None]:
# Summary statistics of the dataframe (numeric and non-numeric columns alike)
summary = X_train.describe(include='all')
print(summary)

# Structural overview (column names, non-null counts, dtypes).
# DataFrame.info() writes its report directly to stdout and returns None, so it is
# called for its side effect only; storing and printing its return value would just
# print "None".
X_train.info()

In [None]:
X_train.dtypes

In [None]:
# Severity columns to process: one per (condition, spinal level) pair, ordered
# condition-major (all five levels of one condition, then the next condition).
spine_conditions = (
    'spinal_canal_stenosis',
    'left_neural_foraminal_narrowing',
    'right_neural_foraminal_narrowing',
    'left_subarticular_stenosis',
    'right_subarticular_stenosis',
)
spine_levels = ('l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1')
columns_to_iterate = [
    f"{condition_name}_{level_name}"
    for condition_name in spine_conditions
    for level_name in spine_levels
]


# Encode the severity labels as ordinal numbers (1, 2, 3); missing or unrecognised
# labels become NaN.
# Columns that are already numeric are skipped, so re-running this cell does not
# overwrite previously encoded values with NaN.
severity_codes = {'Normal/Mild': 1, 'Moderate': 2, 'Severe': 3}
for column in columns_to_iterate:
    if not pd.api.types.is_numeric_dtype(X_train[column]):
        X_train[column] = X_train[column].map(severity_codes)

X_train

Distribution of severity levels for each spinal condition

In [None]:
# Per-column severity distribution: absolute counts followed by percentage shares
severity_labels = {1.0: 'Normal/Mild', 2.0: 'Moderate', 3.0: 'Severe'}
distribution_data = []

for column in columns_to_iterate:
    counts = X_train[column].value_counts().sort_index()
    total_counts = counts.sum()
    row = {'Condition': column}
    # Absolute counts first ...
    for code, label in severity_labels.items():
        row[label] = counts.get(code, 0)
    # ... then the shares, so the column order is counts followed by percentages
    for code, label in severity_labels.items():
        row[f'{label} (%)'] = (counts.get(code, 0) / total_counts) * 100
    distribution_data.append(row)

# One row per column of columns_to_iterate
distribution_df = pd.DataFrame(distribution_data)
print(distribution_df)

 


In [None]:
X_train_des

In [None]:
## Reshape the data from wide to long: one row per (study_id, severity column)
# Columns to unpivot, built condition-major (all five levels of one condition, then the next).
reshape_list = [
    f"{condition_name}_{level_name}"
    for condition_name in (
        'spinal_canal_stenosis',
        'left_neural_foraminal_narrowing',
        'right_neural_foraminal_narrowing',
        'left_subarticular_stenosis',
        'right_subarticular_stenosis',
    )
    for level_name in ('l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1')
]

# value_vars restricts the melt to the severity columns. Without it pd.melt unpivots
# every column that is not listed in id_vars, so any additional column in X_train
# would end up as rows in 'categorie'.
reshaped_train = pd.melt(X_train,
                  id_vars=["study_id"],      # Keep study_id as is
                  value_vars=reshape_list,   # Only the severity columns are unpivoted
                  var_name="categorie",      # New column for the condition names
                  value_name="severity")     # New column for the values

# Optional export of the long table (ensure the file is not open in another program
# and that you have write permissions):
# reshaped_train.to_csv(path / "starfor_sorting_new.csv",  index=False)

# From here on X_train refers to the long-format table, sorted by study and column name.
X_train = reshaped_train.sort_values(by=['study_id', 'categorie'], ignore_index=True)
X_train.head()

In [None]:
# Break each column name at its last two underscores: everything before them is the
# condition (e.g. 'spinal_canal_stenosis'), the two trailing tokens form the level (e.g. 'l1_l2')
split_columns = X_train['categorie'].str.rsplit('_', n=2)
upper_vertebra = split_columns.str.get(1)
lower_vertebra = split_columns.str.get(2)
X_train['condition'] = split_columns.str.get(0)
X_train['level'] = upper_vertebra + '_' + lower_vertebra

X_train

In [None]:
## Adjust the strings used as join keys:
## X_train_coor keys are lower-cased, X_train underscores are replaced by ' ' and '/'
for key_column in ('condition', 'level'):
    X_train_coor[key_column] = X_train_coor[key_column].str.lower()

X_train['level'] = X_train['level'].str.replace('_', '/')
X_train['condition'] = X_train['condition'].str.replace('_', ' ')
X_train.head()

### Select which kind of damage

In [None]:
# Keep only rows whose condition contains 'spinal canal stenosis' and whose level is
# one of l3/l4, l4/l5 or l5/s1.
selected_levels = ['l3/l4', 'l4/l5', 'l5/s1']
is_canal_stenosis = X_train['condition'].str.contains('spinal canal stenosis')
# .copy() makes the result an independent frame, so later column assignments on it
# do not operate on a slice of the unfiltered table.
filtered_df = X_train[is_canal_stenosis & X_train['level'].isin(selected_levels)].copy()
print(filtered_df)

X_train = filtered_df


In [None]:
X_train 

In [None]:
# Tabulate how often each severity level occurs per ('categorie', 'condition') pair
severity_by_group = X_train.groupby(['categorie', 'condition'])['severity']
severity_counts = severity_by_group.value_counts().unstack(fill_value=0)
print(severity_counts)



In [None]:
# Number of distinct study_ids in the label table and in the coordinate table
unique_study_ids_train = X_train['study_id'].nunique()
unique_study_ids_train_coor = X_train_coor['study_id'].nunique()

for frame_name, id_count in (
    ("X_train", unique_study_ids_train),
    ("X_train_coor", unique_study_ids_train_coor),
):
    print(f"Number of unique study_id in {frame_name}: {id_count}")

In [None]:
 # get rid of persons with no coordinates
has_coordinates = X_train['study_id'].isin(X_train_coor['study_id'])
X_train = X_train[has_coordinates]

In [None]:
# Re-count the distinct study_ids after the filter above: X_train now only holds
# study_ids that also occur in X_train_coor
unique_study_ids_train = X_train['study_id'].nunique()
unique_study_ids_train_coor = X_train_coor['study_id'].nunique()

print(f"Number of unique study_id in X_train: {unique_study_ids_train}")
print(f"Number of unique study_id in X_train_coor: {unique_study_ids_train_coor}")

merging X_train and X_train_coor 

In [None]:
# Left join: keep every row of X_train and attach the matching columns of X_train_coor,
# matching on study_id, condition and level
merged_df = pd.merge(X_train, X_train_coor, on=['study_id', 'condition', 'level'], how='left')

# Order the merged rows by the original column name ('categorie')
merged_df = merged_df.sort_values(by=['categorie'], ignore_index=True) 
merged_df.head()

In [None]:
# Group by 'categorie' and 'condition', then count the occurrences of each severity level
# NOTE(review): this reads X_train, not merged_df, so it shows the label table as it was
# before the merge — confirm that this is intended.
severity_counts = X_train.groupby(['categorie', 'condition'])['severity'].value_counts().unstack(fill_value=0)
# Display the result
print(severity_counts)

some statistics of data set

In [None]:
# Row counts of the two input tables of the merge
print(f"Rows in X_train: {len(X_train)}")
print(f"Rows in X_train_coor: {len(X_train_coor)}")

# Row count of the merged DataFrame
print(f"Rows in merged_df: {len(merged_df)}")

# Columns contributed by X_train_coor: all of its columns except the three join keys
columns_from_X_train_coor = [col for col in X_train_coor.columns if col not in ['study_id', 'condition', 'level']]
# Rows in which ALL X_train_coor columns are missing, i.e. rows without a match in the left join
missing_rows = merged_df[columns_from_X_train_coor].isnull().all(axis=1).sum()

# All rows of every study_id that has at least one row with ANY missing X_train_coor column.
# Note the difference to missing_rows above, which requires all of those columns to be missing.
missing_study_ids = merged_df[merged_df[columns_from_X_train_coor].isnull().any(axis=1)]['study_id'].unique()
missing_persons_df = merged_df[merged_df['study_id'].isin(missing_study_ids)]
missing_persons_df = missing_persons_df.sort_values(by=['study_id', 'categorie'], ignore_index=True)
 

print(f"Rows in merged_df without matching rows in X_train_coor: {missing_rows}")
print(f"Rows in merged_df with matches from X_train_coor: {len(merged_df) - missing_rows}")

In [None]:
missing_persons_df.head()

In [None]:
# Drop rows that are missing a value in any of the columns that came from X_train_coor
merged_df = merged_df.dropna(subset=columns_from_X_train_coor)
# NOTE(review): the reminder below mentions an original size of 48692; this notebook does
# not show what that number counts — confirm.
display(Markdown('<span style="color:red">later on to take it back to the original shape : 48692</span>'))
merged_df

In [None]:
# Tabulate how often each severity level occurs per ('categorie', 'condition') pair in merged_df
merged_severity_by_group = merged_df.groupby(['categorie', 'condition'])['severity']
severity_counts = merged_severity_by_group.value_counts().unstack(fill_value=0)
print(severity_counts)

In [None]:
merged_df

Folder with images: preparing paths for merging with main data 

 define which mri type adequate for my goal

In [None]:
X_train_des.dtypes

In [None]:
mri_kind = ['Sagittal T2/STIR', 'Axial T2']

In [None]:
# Keep only the series whose description is one of the MRI kinds listed in mri_kind
is_wanted_series = X_train_des['series_description'].isin(mri_kind)
X_train_des = X_train_des[is_wanted_series]
X_train_des.head()

Building a DataFrame of image paths: two possible approaches

In [None]:
# using X_train_desÖ Ecxell table is better than taking the path from the folders 
import re

def paths_to_images(df, data_dir):
    """Collect the paths of all files stored for the series listed in ``df``.

    For every (study_id, series_id) pair the directory
    ``<data_dir>/<study_id>/<series_id>`` is listed and each entry is returned
    as a path joined onto that directory.

    Parameters
    ----------
    df : DataFrame or mapping
        Must provide the keys/columns ``'study_id'`` and ``'series_id'``;
        the two are iterated in parallel.
    data_dir : str
        Root directory that contains one sub-directory per study.

    Returns
    -------
    list of str
        Paths in the order of the pairs in ``df``; the entries of one series
        directory are sorted by name so the result is deterministic.

    Notes
    -----
    A series whose directory does not exist is skipped with a warning instead
    of aborting the whole scan with ``FileNotFoundError``.
    """
    image_paths = []
    for study_id, series_id in zip(df['study_id'], df['series_id']):
        series_dir = os.path.join(data_dir, str(study_id), str(series_id))

        if not os.path.isdir(series_dir):
            warnings.warn(f"Series directory not found, skipping: {series_dir}")
            continue

        # os.listdir returns entries in arbitrary order; sort them for reproducibility
        image_paths.extend(
            os.path.join(series_dir, img) for img in sorted(os.listdir(series_dir))
        )

    return image_paths

# Scan the image root for every series listed in X_train_des
image_paths = paths_to_images(X_train_des, "data/train_images_origin")

# Sort the image paths to ensure numerical order
def numerical_sort(value):
    """Build a sort key in which runs of digits compare as integers.

    ``value`` is split into alternating non-digit and digit chunks; chunks that
    consist only of digits are converted to ``int``, all others stay strings.
    Returns the resulting list, e.g. ``'a10b2'`` -> ``['a', 10, 'b', 2, '']``.
    """
    key = []
    for chunk in re.split(r'(\d+)', value):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk)
    return key

# Order the paths so that numeric components compare as numbers (e.g. 2 before 10)
image_paths.sort(key=numerical_sort)

# One-column table holding the image paths
df_image_paths = pd.DataFrame({'image_path': image_paths})
df_image_paths.head()

In [None]:
""" # brauche ich nicht Using real folders and images:  
# path to images folder and images inside the folder
import re
# Define the main directory
main_dir = "data/train_images"

def paths_to_images_2(main_dir):
    image_paths = []
    # Walk through the directory structure starting from the main directory
    for root, subdirs, files in os.walk(main_dir):
        for file in files:
            # Create the full path for each image
            print(file)
            file_path = os.path.join(root, file)
            image_paths.append(file_path)

    return image_paths

paths_to_images_2(main_dir)



################   sort path images in numerical order  
# Define the main directory
main_dir = "data/train_images"

# Get the image paths
image_paths2 = paths_to_images_2(main_dir)

# Sort the image paths to ensure numerical order
def numerical_sort(value):
    parts = re.split(r'(\d+)', value)
    return [int(part) if part.isdigit() else part for part in parts]

image_paths2 = sorted(image_paths2, key=numerical_sort)
image_paths2[:75]

df_image_paths2 = pd.DataFrame(image_paths2, columns=['image_path'])
df_image_paths2.head()
 
"""

In [None]:
# Normalise Windows path separators to forward slashes.
# regex=False makes the lone backslash a literal pattern: read as a regular expression
# it would be invalid, and the default of `regex` differs between pandas versions.
df_image_paths['image_path'] = df_image_paths['image_path'].str.replace('\\', '/', regex=False)
df_image_paths.head()

In [None]:
# Split the path and extract the study_id: make integer type of study_id
def safe_int_conversion(x):
    """Convert ``x`` to ``int``, returning ``None`` when that is not possible.

    ``int()`` raises ``ValueError`` for non-numeric strings and NaN,
    ``TypeError`` for unsupported types such as ``None`` and ``OverflowError``
    for infinite floats; all three are mapped to ``None`` so that one bad
    value does not abort the whole ``apply``.
    """
    try:
        return int(x)
    except (ValueError, TypeError, OverflowError):
        return None

# The paths have the form <root>/<study_id>/<series_id>/<image>, so the study_id is the
# third component from the end. Counting from the end keeps the extraction correct
# however many components the root directory has.
df_image_paths['study_id'] = df_image_paths['image_path'].apply(lambda x: safe_int_conversion(x.split('/')[-3]))
df_image_paths.head(75)

In [None]:
df_image_paths.dtypes

In [None]:
# df_image_paths['study_id'] = df_image_paths['study_id'].astype(int)
df_image_paths.head(75)

In [None]:
df_image_paths.dtypes

In [None]:
# Keep only the selected persons in the image table.
# keep_persons, study_ids_to_keep and all_persons come from help_files/_0_definitions.
# NOTE(review): the exact selection rule is not visible in this notebook — see that module.
from help_files._0_definitions import keep_persons, study_ids_to_keep, all_persons
voraluf = keep_persons(df_image_paths, study_ids_to_keep, all_persons)
df_image_paths = voraluf
df_image_paths

Join  X_train and df_image_paths

In [None]:
""" some checks 
df_image_paths['count'] = df_image_paths.groupby('study_id').cumcount() + 1
df_image_paths['count_per_person'] = df_image_paths.groupby('study_id')['study_id'].transform('count')
df_image_paths['mean_per_of_raws'] = df_image_paths.groupby('study_id').first()['count_per_person'].mean()
df_image_paths['first_row_flag'] = df_image_paths.groupby('study_id').cumcount().apply(lambda x: 1 if x == 0 else 0)
df_image_paths['mean_per_of_raws2'] = np.where(df_image_paths['first_row_flag'] == 1, df_image_paths['count_per_person'].mean(), np.nan)
df_image_paths.head(75)

print("at average there are 81 images per person")
"""

Merging: one-to-many left join of df_image_paths onto merged_df

In [None]:
# One-to-many left join: every merged_df row is paired with each image path of the
# same study_id; rows without any image keep a missing image_path
df_end = merged_df.merge(df_image_paths, on='study_id', how='left')
df_end.head()


In [None]:
# Call the function count_severity_by_condition_level for checking the count of severity by condition and level
 
print("\nMerged DataFrame:")
count_severity_by_condition_level(df_end)

In [None]:
# Remove the 'categorie' and 'instance_number' columns from the joined table.
# NOTE: drop() raises KeyError when these columns are already gone, so this cell
# cannot be re-run without re-running the merge first.

df_end = df_end.drop(columns=['categorie', 'instance_number'])
df_end.shape

df_end.head()   

In [None]:
# Call the function count_severity_by_condition_level for checking the count of severity by condition and level
 
print("\nMerged DataFrame:")
count_severity_by_condition_level(df_end)

In [None]:
df_end['missing_image'] = df_end['image_path'].isna()
df_end.head()

In [None]:
# Remove the rows that have no image path and report how many were dropped
before = df_end.shape[0]
# .copy() makes the filtered frame independent before a column is assigned on it
df_end = df_end.dropna(subset=['image_path']).copy()
# Refresh the flag so that it reflects the filtered frame (all False from here on)
df_end['missing_image'] = df_end['image_path'].isna()
# Sanity check: this prints an empty frame when the drop worked
print(df_end[df_end['image_path'].isna()])
after = df_end.shape[0]
# The first number is the row count BEFORE the drop, not the number of dropped rows
print(f"Rows before: {before}, after: {after}, dropped: {before - after}")


In [None]:
# Call the function count_severity_by_condition_level for checking the count of severity by condition and level
 
print("\nMerged DataFrame:")
count_severity_by_condition_level(df_end)


In [None]:
# Convert image_path to string if necessary
df_end['image_path'] = df_end['image_path'].astype(str)
# Print out the image_path values to inspect
print(df_end['image_path'].head())

In [None]:
df_end.dtypes

In [None]:
df_end

In [None]:
df_end = df_end.sort_values(by=['study_id', 'level'], ignore_index=True)
df_end.head()

random sample selection


In [None]:
print("\nMerged DataFrame:")
count_severity_by_condition_level(df_end)

Run the code below only once: it transforms the images and saves them (see _02_transform_images). Note that it transforms only the 'Sagittal T2/STIR' images; for the whole sample this took just 7 minutes.

In [None]:
# transform images: already predtransforemed
"""
with open("help_files/_02_transform_images.py") as file:
    exec(file.read())
""" 
 

Randomly select up to 80 rows for each combination of study_id and level

In [None]:
# NOTE: plain assignment binds a second name to the same DataFrame object (no copy),
# so the 'study_id_count' column added below is visible through df_end_full as well.
df_end_full  = df_end
df_end_full
# Running 1-based row number within each study_id
df_end['study_id_count'] = df_end.groupby('study_id').cumcount() + 1
df_end

In [None]:
RSEED = 42  # Random seed shared by the sampling and splitting steps below

# Randomly keep at most 80 rows per (study_id, level) group; groups with fewer than
# 80 rows are kept completely (min() prevents sampling more rows than the group has)
random_sample_df = df_end.groupby(['study_id', 'level']).apply(lambda x: x.sample(n=min(80, len(x)), random_state=RSEED)).reset_index(drop=True)
random_sample_df

# Sort the sampled rows by 'study_id' and, within a study, by 'study_id_count'
random_sample_df = random_sample_df.sort_values(by=['study_id', 'study_id_count'], ignore_index=True) 
random_sample_df

In [None]:
# Call the function count_severity_by_condition_level for checking the count of severity by condition and level
count_severity_by_condition_level(random_sample_df)

In [None]:
random_sample_df

Number of sampled rows for each severity level

In [None]:
# Function to get random samples for each severity level
def get_random_samples(df, severity, number, random_state=42):
    """Draw a random sample of the rows of ``df`` that have the given severity.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``'severity'`` column.
    severity : scalar
        Severity value to select (compared with ``==``).
    number : int
        Requested number of rows.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for reproducibility.

    Returns
    -------
    pandas.DataFrame
        ``number`` randomly chosen matching rows, or all matching rows (in
        random order) when fewer than ``number`` are available. Without this
        cap ``DataFrame.sample`` raises ``ValueError`` for such a class.
    """
    matching = df[df['severity'] == severity]
    return matching.sample(n=min(number, len(matching)), random_state=random_state)

# Requested sample size per severity level
number_persons_train_0 = 5000  # for severity 1
number_persons_train_1 = 5000  # for severity 2
number_persons_train_2 = 5000   # for severity 3

# One random sample per severity level
random_sample_severity_1 = get_random_samples(random_sample_df, 1.0, number_persons_train_0)
random_sample_severity_2 = get_random_samples(random_sample_df, 2.0, number_persons_train_1)
random_sample_severity_3 = get_random_samples(random_sample_df, 3.0, number_persons_train_2)

# Stack the three samples and renumber the rows from 0
random_samples_combined = pd.concat(
    [random_sample_severity_1, random_sample_severity_2, random_sample_severity_3],
    ignore_index=True,
)
print(random_samples_combined.head())

# Order the combined sample by study and renumber the rows again
random_samples_combined = random_samples_combined.sort_values(by='study_id', ignore_index=True)
random_samples_combined.head()


 

In [None]:
# Print the combined sample as plain text in wide format, without the index column
df_end = random_samples_combined
# NOTE(review): to_string() renders every row, so this output can become very long;
# consider printing df_end.head() instead.
print(df_end.to_string(index=False, header=True))

In [None]:
df_end.dtypes
#df_end['condition'] = df_end['condition'].astype('category').cat.codes
#df_end_full['condition'] = df_end_full['condition'].astype('category').cat.codes

In [None]:
df_end

df_end


In [None]:
df_end = random_samples_combined
 
print(df_end.head())

In [None]:
# Shift the severity labels down by one (1, 2, 3 -> 0, 1, 2).
# The result is derived from random_samples_combined with .assign() instead of being
# subtracted in place: an in-place subtraction would shift the labels again on every
# re-run of this cell and would also modify random_samples_combined, because df_end
# is only another name for that frame.
df_end = random_samples_combined.assign(severity=random_samples_combined['severity'] - 1)

# Random spot-check subset of at most 100 rows (capped so that small samples do not raise)
random_samples_test_check = df_end.sample(n=min(100, len(df_end)), random_state=RSEED)


test_Set

In [None]:
# Severity frequencies per condition
severity_per_condition = df_end.groupby(['condition'])['severity']
severity_counts = severity_per_condition.value_counts().unstack(fill_value=0)
print(severity_counts)

# Overall severity frequencies
severity_counts = df_end['severity'].value_counts()
print(severity_counts)

In [None]:
from sklearn.model_selection import train_test_split

# Split at the study level (80 % train / 20 % test) so that all rows of one study_id
# end up on the same side of the split
study_ids = df_end['study_id'].unique()
train_ids, test_ids = train_test_split(study_ids, test_size=0.2, random_state=RSEED)

# Select the rows belonging to each group of study_ids
in_train = df_end['study_id'].isin(train_ids)
in_test = df_end['study_id'].isin(test_ids)
train_df = df_end[in_train]
test_df = df_end[in_test]

# Report the size of both sets
for split_name, split_df in (("Training", train_df), ("Testing", test_df)):
    print(f"{split_name} set shape: {split_df.shape}")

In [None]:
test_df.head()

In [None]:
df_end

In [None]:
# with open("help_files/sample_for_images_colab.py") as file:
#    exec(file.read())

In [None]:
# Save the train and test splits to CSV files.
# The two lists must have the same length and order: zip() pairs them by position and
# silently stops at the shorter one, so an extra or misplaced frame would write the
# wrong table under a file name (e.g. the full table as "train", the train split as
# "test") and leave the test split unsaved.
dataframes = [train_df, test_df]
file_names = ["train_df_3_stenosis.csv", "test_df_3_stenosis.csv"]
assert len(dataframes) == len(file_names), "every DataFrame needs exactly one file name"
for df, file_name in zip(dataframes, file_names):
    print("df ", df.shape)
    df.to_csv(data_path_vor / file_name, index=False)

 

In [None]:
# NOTE(review): the meaning of these constants is not documented in this notebook —
# presumably three group sizes multiplied by a per-group factor; confirm or remove.
print((205+1247+520)*30)