In [None]:
# import libraries
import os
import pyreadr
import pandas as pd
import collections
import numpy as np
import re
import datetime
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu, chi2_contingency
import seaborn as sns
from matplotlib.patches import Patch
import pyreadr
from lifelines import KaplanMeierFitter

In [None]:
# Import the necessary library
import pyreadr

# Load the RDS file; read_r returns a mapping from which the DataFrame is taken below
# Note: Replace 'path/to/your/file.RDS' with the actual path to your RDS file
result = pyreadr.read_r('path/to/your/file.RDS')

# The DataFrame is stored in the result under the key None
df = result[None]

# Cast 'STUDY_NUMBER' to float (the imputed data loaded later gets the same cast,
# so both frames share one key dtype)
df['STUDY_NUMBER'] = df['STUDY_NUMBER'].astype('float64')

# Show the loaded DataFrame
df

In [None]:
# Load the imputed data from CSV and cast the 'STUDY_NUMBER' key to float in one step
# Note: Replace 'path/to/your/imputed_data.csv' with the actual path to your imputed data file
df_imp = (
    pd.read_csv('path/to/your/imputed_data.csv')
    .astype({'STUDY_NUMBER': float})
)

# Show the imputed DataFrame
df_imp

In [None]:
# Join original and imputed data on 'STUDY_NUMBER'; columns present in both frames
# keep their name for the original values and get a '_new' suffix for the imputed ones
merged_df = df.merge(df_imp, on='STUDY_NUMBER', suffixes=('', '_new'))

# For every shared column (except the join key), overwrite the original values with the
# imputed ones; pop() returns the suffixed column and removes it from the frame
for col in df.columns:
    if col == 'STUDY_NUMBER' or col not in df_imp.columns:
        continue
    merged_df[col] = merged_df.pop(col + '_new')

# From here on, `df` refers to the merged dataframe
df = merged_df

# Show the updated dataframe
df

In [None]:
# Load the IPH data from CSV
# Note: Replace 'path/to/your/iph_data.csv' with the actual path to your IPH data file
df_iph = pd.read_csv('path/to/your/iph_data.csv')

# Derive 'AENR' from 'case_id': keep the text after the last '/' and cut it at the
# first '.', i.e. everything from the first dot onwards is discarded
df_iph['AENR'] = [
    case_path.rpartition('/')[2].split('.', 1)[0]
    for case_path in df_iph['case_id']
]

# Derive the integer 'STUDY_NUMBER' by removing 'AE' from the 'AENR' string
df_iph['STUDY_NUMBER'] = [
    int(aenr.replace('AE', ''))
    for aenr in df_iph['AENR']
]

# Show the DataFrame
df_iph

In [None]:
# Collect the 'STUDY_NUMBER' values that occur in both dataframes
intersection = set(df['STUDY_NUMBER']).intersection(df_iph['STUDY_NUMBER'])

# Display how many 'STUDY_NUMBER' values are shared
len(intersection)

2220

In [None]:
# Keep only the IPH rows whose 'STUDY_NUMBER' is shared with the main dataframe
df_iph = df_iph.loc[df_iph['STUDY_NUMBER'].isin(intersection)]

# Keep only the main-dataframe rows whose 'STUDY_NUMBER' is shared with the IPH data
df = df.loc[df['STUDY_NUMBER'].isin(intersection)]

In [None]:
# Inner-join the main dataframe with the IPH dataframe on 'STUDY_NUMBER'
df_combined = pd.merge(df, df_iph, on='STUDY_NUMBER', how='inner')

# Show the combined dataframe
df_combined

In [None]:
# Columns used for the analysis: demographic information, medical history, medication
# usage and clinical measurements, selected per the updated requirements (26-08-2024)
analysis_columns = [
    'Age',                     # Age of the patient
    'Gender',                  # Gender of the patient
    "Symptoms.Update2G",       # Updated symptoms information
    'Med.Statin.LLD',          # Medication - Statin or Lipid-Lowering Drugs
    'MI_Dx',                   # History of myocardial infarction
    'Med.acetylsal',           # Medication - Acetylsalicylic acid (Aspirin)
    'Med.anticoagulants',      # Medication - Anticoagulants
    'Med.dipyridamole',        # Medication - Dipyridamole
    'Hypertension.composite',  # Composite measure of hypertension
    'stenosis_con_bin',        # Binary indicator of stenosis condition
    'CRP_avg',                 # Average C-reactive protein level
    'HDL_final',               # Final HDL cholesterol level
    'epcom.3years',            # Composite endpoint at 3 years
    "ep_com_t_3years",         # Time to composite endpoint at 3 years
    'IPH.bin',                 # Binary indicator of intraplaque hemorrhage (manual)
    'area',                    # Area measurement
    'prob',                    # Probability measurement
    'IPH',                     # Intraplaque hemorrhage indicator (model)
    'dateok',                  # Date of the observation
]

# Restrict the combined dataframe to the analysis columns, in the order listed above
df_analysis = df_combined[analysis_columns]


In [None]:
# Define the cut-off date for filtering
date_y = 2008
date_m = 3
date_d = 11
cutoff_date = pd.Timestamp(year=date_y, month=date_m, day=date_d)

# Boolean mask selecting the rows whose 'dateok' lies strictly before the cut-off.
# 'dateok' is normalised with pd.to_datetime and compared against a pd.Timestamp so the
# comparison works whether the column holds datetime.date objects or datetime64 values
# (pandas 2.x raises TypeError when a datetime64 column is compared with a datetime.date).
# Missing dates become NaT, compare as False and are therefore excluded.
date_filter = pd.to_datetime(df_analysis['dateok']) < cutoff_date

# Apply the date filter to the dataframe
df_analysis = df_analysis[date_filter]

In [None]:
# Count the missing values per column
df_analysis.isna().sum()

In [None]:
# Remove every row that has at least one missing value
df_analysis = df_analysis.dropna(axis=0, how='any')

In [None]:
# 'dateok' was only needed for the date filter; remove it from the analysis dataframe
df_analysis = df_analysis.drop(columns=['dateok'])

In [None]:
# Re-count the missing values per column after dropping incomplete rows
df_analysis.isna().sum()

In [None]:
# Tally how often each distinct 'Gender' value occurs
gender_counts = df_analysis.loc[:, 'Gender'].value_counts()

# Show the tally
gender_counts

In [None]:
# Mean of the 'ep_com_t_3years' column
mean_ep_com_t_3years = df_analysis.loc[:, "ep_com_t_3years"].mean()

# Show the mean
mean_ep_com_t_3years

In [None]:
# Tally the distinct values of 'epcom.3years' (the composite endpoint at 3 years)
epcom_counts = df_analysis.loc[:, "epcom.3years"].value_counts()

# Show the tally
epcom_counts

In [None]:
# Composite endpoint at 3 years ('epcom.3years'), tallied for male patients only

# Rows whose 'Gender' equals "male"
male_patients = df_analysis.loc[df_analysis["Gender"] == "male"]

# Tally the distinct 'epcom.3years' values within that subset
epcom_counts_male = male_patients.loc[:, "epcom.3years"].value_counts()

# Show the tally
epcom_counts_male

In [None]:
# Composite endpoint at 3 years ('epcom.3years'), tallied for female patients only

# Rows whose 'Gender' equals "female"
female_patients = df_analysis.loc[df_analysis["Gender"] == "female"]

# Tally the distinct 'epcom.3years' values within that subset
epcom_counts_female = female_patients.loc[:, "epcom.3years"].value_counts()

# Show the tally
epcom_counts_female

In [None]:
# Keep an independent copy of the cleaned dataframe so later cells can restore it.
# .copy() is required here: plain assignment would only bind a second name to the
# same object, so any in-place change to df_analysis would also alter the "copy".
df_copy = df_analysis.copy()

In [None]:
# Output location for the figures written by this notebook.
# The trailing slash matters wherever file names are appended by string concatenation.
SAVE_DIR = './your_directory/tmp/'


In [None]:
import os

# Create the SAVE_DIR directory (including missing parents) if it does not exist yet.
# exist_ok=True makes this a no-op for an existing directory and avoids the race
# between a separate existence check and the creation call.
os.makedirs(SAVE_DIR, exist_ok=True)

In [None]:
# Start again from the preserved cleaned dataframe
df_analysis = df_copy

# The dataframe carries four alternative IPH measures; each model run uses exactly one:
#   'IPH.bin' - original manual assessment
#   'IPH'     - CLAM prediction
#   'prob'    - CLAM prediction
#   'area'    - CLAM prediction
IPH_MEASURES = ['IPH.bin', 'prob', 'IPH', 'area']

# Measure to keep for this run; change this single value to switch the analysis
# instead of commenting/uncommenting alternative drop statements
IPH_MEASURE_TO_KEEP = 'area'

if IPH_MEASURE_TO_KEEP not in IPH_MEASURES:
    raise ValueError(
        f"IPH_MEASURE_TO_KEEP must be one of {IPH_MEASURES}, got {IPH_MEASURE_TO_KEEP!r}"
    )

# Drop the three measures that are not used in this run
df_analysis = df_analysis.drop(
    [measure for measure in IPH_MEASURES if measure != IPH_MEASURE_TO_KEEP],
    axis=1,
)


In [None]:
# Replace 'Gender' and 'Symptoms.Update2G' by their integer category codes.
# NOTE(review): the codes follow the category order of each column; the sex-specific
# cells further down select females with Gender == 0 and males with Gender == 1 -
# confirm that this matches the actual category order.
for col in ('Gender', 'Symptoms.Update2G'):
    df_analysis[col] = pd.Categorical(df_analysis[col]).codes

# Expand the remaining categorical columns into indicator columns,
# dropping the first level of each
df_analysis = pd.get_dummies(df_analysis, drop_first=True)

# Expose 'ep_com_t_3years' under the name 'time' for CoxPHFitter: pop() removes the
# old column and the assignment appends its values as the new 'time' column
df_analysis['time'] = df_analysis.pop("ep_com_t_3years")

In [None]:
from lifelines import CoxPHFitter

# Cox proportional hazards model for the whole cohort
cph_a = CoxPHFitter()

# Fit on the prepared dataframe: 'time' holds the durations and 'epcom.3years' the
# event indicator; show_progress=True reports progress while fitting
cph_a.fit(
    df_analysis,
    event_col='epcom.3years',
    duration_col='time',
    show_progress=True,
)

# Print the summary of the fitted model
cph_a.print_summary()

In [None]:
# Create the figure and keep a handle on its axes
fig, ax = plt.subplots(figsize=(10, 6))

# Draw the fitted model's coefficient plot on those axes
cph_a.plot(ax=ax)

# Title and axis labels
ax.set_title("Cox Proportional Hazards Model Summary")
ax.set_xlabel("Coefficient")
ax.set_ylabel("Variables")

# Hide the right and top spines for a cleaner look
ax.spines[['right', 'top']].set_visible(False)

# Show the plot
plt.show()

In [None]:
import matplotlib

# Set the default font size used by plots created from here on
matplotlib.rcParams['font.size'] = 11

In [None]:
# Partial-effects plot of the IPH covariate for the full-cohort model (cph_a)

# Set the font size first so every text element created below uses it
plt.rcParams.update({'font.size': 14})

# Turn off interactive plotting
plt.ioff()

# Create the figure explicitly and hand its axes to lifelines below. When no `ax` is
# passed, lifelines draws into a figure of its own, which leaves this one empty and
# the requested size unused.
fig, ax = plt.subplots(figsize=(10, 8))

# Candidate IPH covariates: column name -> (file label, values to plot, legend template)
iph_covariates = {
    'IPH.bin_yes': ("manual", [False, True], 'IPH={}'),
    'IPH': ("model", [False, True], 'IPH={}'),
    'area': ("area", [0, 0.25, 0.5, 0.75, 1], 'IPH area={}'),
    'prob': ("prob", [0, 0.25, 0.5, 0.75, 1], 'IPH prob={}'),
}

# Exactly one candidate is expected in the dataframe; if several are present the
# last one listed above is used. Fail with a clear message when none is present.
present_covariates = [name for name in iph_covariates if name in df_analysis.columns.values]
if not present_covariates:
    raise ValueError(
        f"None of the IPH covariates {list(iph_covariates)} is present in df_analysis"
    )
iph_covariate = present_covariates[-1]
label_file, iph_values, legend_template = iph_covariates[iph_covariate]

# Plot the survival curves for the chosen covariate values on the prepared axes
cph_a.plot_partial_effects_on_outcome(
    covariates=iph_covariate, values=iph_values, cmap='coolwarm', ax=ax
)

# Set y-axis limits
ax.set_ylim(0.60, 1.0)

# Set x and y labels
ax.set_xlabel("Time (years)")
ax.set_ylabel("Survival Probability")

# Remove the 'right' and 'top' spines for a cleaner look
ax.spines[['right', 'top']].set_visible(False)

# Access the lines in the plot
lines = ax.get_lines()

# Colour palette (the ten 'tab10' colours)
colorblind_palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
                      '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']

# Recolour every line on the axes, cycling through the palette
for i, line in enumerate(lines):
    line.set_color(colorblind_palette[i % len(colorblind_palette)])

# Build the legend after recolouring and relabel the first entries, one per plotted
# value; any further entries keep the label lifelines gave them
L = ax.legend()
for legend_text, value in zip(L.get_texts(), iph_values):
    legend_text.set_text(legend_template.format(value))

# Save the plot to a file in the specified directory
fig.savefig(os.path.join(SAVE_DIR, f'partial_effects_plot_all_IPH_{label_file}.pdf'), format='pdf', dpi=300)

# Turn on interactive plotting
plt.ion()

# Show the plot
plt.show()

In [None]:
# Check the proportional hazards assumption of the full-cohort model on the data it
# was fitted to; p_value_threshold is the significance level used by the check
cph_a.check_assumptions(
    df_analysis,
    p_value_threshold=0.05,
)

In [None]:
from lifelines import CoxPHFitter

# Cox proportional hazards model for female patients
cph_f = CoxPHFitter()

# Select the rows coded Gender == 0 (treated as female here) and remove the
# 'Gender' column, which is constant within the subset
female_patients = (
    df_analysis
    .loc[df_analysis['Gender'] == 0]
    .drop(columns=['Gender'])
)

# Fit on the subset: 'time' holds the durations and 'epcom.3years' the event
# indicator; show_progress=True reports progress while fitting
cph_f.fit(
    female_patients,
    event_col='epcom.3years',
    duration_col='time',
    show_progress=True,
)

# Print the summary of the fitted model
cph_f.print_summary()

In [None]:
# Create the figure and keep a handle on its axes
fig, ax = plt.subplots(figsize=(10, 6))

# Draw the female model's coefficient plot on those axes
cph_f.plot(ax=ax)

# Title and axis labels
ax.set_title("Cox Proportional Hazards Model Summary for Female Patients")
ax.set_xlabel("Coefficient")
ax.set_ylabel("Variables")

# Hide the right and top spines for a cleaner look
ax.spines[['right', 'top']].set_visible(False)

# Show the plot
plt.show()

In [None]:
# Partial-effects plot of the IPH covariate for the female model (cph_f)

# Set the font size first so every text element created below uses it
plt.rcParams.update({'font.size': 14})

# Turn off interactive plotting
plt.ioff()

# Create the figure explicitly and hand its axes to lifelines below. When no `ax` is
# passed, lifelines draws into a figure of its own, which leaves this one empty and
# the requested size unused.
fig, ax = plt.subplots(figsize=(10, 8))

# Candidate IPH covariates: column name -> (file label, values to plot, legend template)
iph_covariates = {
    'IPH.bin_yes': ("manual", [False, True], 'IPH={}'),
    'IPH': ("model", [False, True], 'IPH={}'),
    'area': ("area", [0, 0.25, 0.5, 0.75, 1], 'IPH area={}'),
    'prob': ("prob", [0, 0.25, 0.5, 0.75, 1], 'IPH prob={}'),
}

# Exactly one candidate is expected in the dataframe; if several are present the
# last one listed above is used. Fail with a clear message when none is present.
present_covariates = [name for name in iph_covariates if name in df_analysis.columns.values]
if not present_covariates:
    raise ValueError(
        f"None of the IPH covariates {list(iph_covariates)} is present in df_analysis"
    )
iph_covariate = present_covariates[-1]
label_file, iph_values, legend_template = iph_covariates[iph_covariate]

# Plot the survival curves for the chosen covariate values on the prepared axes
cph_f.plot_partial_effects_on_outcome(
    covariates=iph_covariate, values=iph_values, cmap='coolwarm', ax=ax
)

# Set y-axis limits
ax.set_ylim(0.60, 1.0)

# Set x and y labels
ax.set_xlabel("Time (years)")
ax.set_ylabel("Survival Probability")

# Remove the 'right' and 'top' spines for a cleaner look
ax.spines[['right', 'top']].set_visible(False)

# Access the lines in the plot
lines = ax.get_lines()

# Colour palette (the ten 'tab10' colours)
colorblind_palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
                      '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']

# Recolour every line on the axes, cycling through the palette
for i, line in enumerate(lines):
    line.set_color(colorblind_palette[i % len(colorblind_palette)])

# Build the legend after recolouring and relabel the first entries, one per plotted
# value; any further entries keep the label lifelines gave them
L = ax.legend()
for legend_text, value in zip(L.get_texts(), iph_values):
    legend_text.set_text(legend_template.format(value))

# Save the plot to a file in the specified directory
fig.savefig(os.path.join(SAVE_DIR, f'partial_effects_plot_female_IPH_{label_file}.pdf'), format='pdf', dpi=300)

# Turn on interactive plotting
plt.ion()

# Show the plot
plt.show()

In [None]:
# Check the proportional hazards assumption of the female model on the data it was
# fitted to; p_value_threshold is the significance level used by the check
cph_f.check_assumptions(
    female_patients,
    p_value_threshold=0.05,
)

In [None]:
# Cox proportional hazards model for male patients
cph_m = CoxPHFitter()

# Select the rows coded Gender == 1 (treated as male here) and remove the
# 'Gender' column, which is constant within the subset
male_patients = (
    df_analysis
    .loc[df_analysis['Gender'] == 1]
    .drop(columns=['Gender'])
)

# Fit on the subset: 'time' holds the durations and 'epcom.3years' the event
# indicator; show_progress=True reports progress while fitting
cph_m.fit(
    male_patients,
    event_col='epcom.3years',
    duration_col='time',
    show_progress=True,
)

# Print the summary of the fitted model
cph_m.print_summary()

In [None]:
# Create the figure and keep a handle on its axes
fig, ax = plt.subplots(figsize=(10, 6))

# Draw the male model's coefficient plot on those axes
cph_m.plot(ax=ax)

# Title and axis labels
ax.set_title("Cox Proportional Hazards Model Summary for Male Patients")
ax.set_xlabel("Coefficient")
ax.set_ylabel("Variables")

# Hide the right and top spines for a cleaner look
ax.spines[['right', 'top']].set_visible(False)

# Show the plot
plt.show()

In [None]:
# Partial-effects plot of the IPH covariate for the male model (cph_m)

# Set the font size first so every text element created below uses it
plt.rcParams.update({'font.size': 14})

# Turn off interactive plotting
plt.ioff()

# Create the figure explicitly and hand its axes to lifelines below. When no `ax` is
# passed, lifelines draws into a figure of its own, which leaves this one empty and
# the requested size unused.
fig, ax = plt.subplots(figsize=(10, 8))

# Candidate IPH covariates: column name -> (file label, values to plot, legend template)
iph_covariates = {
    'IPH.bin_yes': ("manual", [False, True], 'IPH={}'),
    'IPH': ("model", [False, True], 'IPH={}'),
    'area': ("area", [0, 0.25, 0.5, 0.75, 1], 'IPH area={}'),
    'prob': ("prob", [0, 0.25, 0.5, 0.75, 1], 'IPH prob={}'),
}

# Exactly one candidate is expected in the dataframe; if several are present the
# last one listed above is used. Fail with a clear message when none is present.
present_covariates = [name for name in iph_covariates if name in df_analysis.columns.values]
if not present_covariates:
    raise ValueError(
        f"None of the IPH covariates {list(iph_covariates)} is present in df_analysis"
    )
iph_covariate = present_covariates[-1]
label_file, iph_values, legend_template = iph_covariates[iph_covariate]

# Plot the survival curves for the chosen covariate values on the prepared axes
cph_m.plot_partial_effects_on_outcome(
    covariates=iph_covariate, values=iph_values, cmap='coolwarm', ax=ax
)

# Set y-axis limits
ax.set_ylim(0.6, 1.0)

# Set x and y labels
ax.set_xlabel("Time (years)")
ax.set_ylabel("Survival Probability")

# Remove the 'right' and 'top' spines for a cleaner look
ax.spines[['right', 'top']].set_visible(False)

# Access the lines in the plot
lines = ax.get_lines()

# Colour palette (the ten 'tab10' colours)
colorblind_palette = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
                      '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf']

# Recolour every line on the axes, cycling through the palette
for i, line in enumerate(lines):
    line.set_color(colorblind_palette[i % len(colorblind_palette)])

# Build the legend after recolouring and relabel the first entries, one per plotted
# value; any further entries keep the label lifelines gave them
L = ax.legend()
for legend_text, value in zip(L.get_texts(), iph_values):
    legend_text.set_text(legend_template.format(value))

# Save the plot to a file in the specified directory
fig.savefig(os.path.join(SAVE_DIR, f'partial_effects_plot_male_IPH_{label_file}.pdf'), format='pdf', dpi=300)

# Turn on interactive plotting
plt.ion()

# Show the plot
plt.show()

In [None]:
# Check the proportional hazards assumption of the male model on the data it was
# fitted to; p_value_threshold is the significance level used by the check
cph_m.check_assumptions(
    male_patients,
    p_value_threshold=0.05,
)