In [41]:
import pandas as pd
from sklearn.impute import SimpleImputer
import numpy as np

# Load the data
data = pd.read_csv("traindata_main/original_data.csv") # 2375 x 109 size
print(len(data))

# Drop rows with missing values in the 'adjuvantchemo' column
data = data.dropna(subset=['adjuvantchemo'])
print(len(data))

# Remove rows where 'OS' column is equal to 0
data = data[data['OS'] != 0]
print(len(data))

# Select covariates
X_orig = data[["DFI<12",'age', 'T_stage', 'N_stage', 'rightleft Rec', 'cea_carcinoembryonic antigen',
               'extrahepatic disease ', 'size', 'number_liver_mets', 'R0', 'KRAS']]

# Outcome to predict: 5-year overall survival
died = data['OSstatus'] == 1
times = data['OS']


# Count missing values in each column
missing_counts = X_orig.isnull().sum() # OS and OSstatus have no missing values :)

# Filter columns with missing values
columns_with_missing = missing_counts[missing_counts > 0].index
X_orig.reset_index(drop=True, inplace=True)


2375
1804
1799


In [46]:
import pandas as pd
import numpy as np
import scipy.stats as stats

# Assuming df is your DataFrame$
data = pd.read_csv("traindata/unmatched_train_data.csv") 

print(data.columns)
df = data  # Load your dataset

def calculate_characteristics(df):
    results = {}
    
    treated = df[df['adjuvantchemo'] == 1]
    untreated = df[df['adjuvantchemo'] == 0]
    
    # Total patients
    results['Total Patients'] = (len(treated), len(untreated), np.nan)
    
    # Function to calculate median and IQR
    def median_iqr(data):
        median = np.median(data)
        q1, q3 = np.percentile(data, [25, 75])
        iqr = q3 - q1
        return f"{median:.1f} ({q1:.1f}-{q3:.1f})"
    
    # Age (years)
    results['Age (years)'] = (median_iqr(treated['age']), median_iqr(untreated['age']), np.nan)

    # CEA (µg/L)
    results['CEA (µg/L)'] = (median_iqr(treated['cea_carcinoembryonic.antigen']), 
                             median_iqr(untreated['cea_carcinoembryonic.antigen']), np.nan)

    # Diameter of Largest CRLM (cm)
    results['Diameter of Largest CRLM (cm)'] = (median_iqr(treated['size']), 
                                                median_iqr(untreated['size']), np.nan)

    # Number of CRLMs
    results['Number of CRLMs'] = (median_iqr(treated['number_liver_mets']), 
                                  median_iqr(untreated['number_liver_mets']), np.nan)

    return results

# Generate the characteristics table
characteristics = calculate_characteristics(df)

# Output the characteristics to the LaTeX table (you can customize this further)
for key, value in characteristics.items():
    if isinstance(value, dict):
        print(f"{key}:")
        for sub_key, sub_value in value.items():
            print(f"  {sub_key}: {sub_value}")
    else:
        print(f"{key}: {value}")


Index(['age', 'size', 'cea_carcinoembryonic.antigen', 'number_liver_mets',
       'DFI.12', 'T_stage', 'N_stage', 'rightleft.Rec', 'extrahepatic.disease',
       'R0', 'KRAS', 'adjuvantchemo', 'died', 'time_to_event'],
      dtype='object')
Total Patients: (1197, 602, nan)
Age (years): ('60.0 (52.0-67.0)', '65.0 (58.0-73.0)', nan)
CEA (µg/L): ('8.0 (3.0-40.0)', '9.9 (3.4-44.6)', nan)
Diameter of Largest CRLM (cm): ('3.0 (2.0-4.0)', '3.3 (2.0-4.0)', nan)
Number of CRLMs: ('2.0 (1.0-3.0)', '2.0 (1.0-3.0)', nan)


In [44]:
import pandas as pd
import numpy as np

# Example categorical variable list
categorical_vars = ["grade","DFI<12", 'T_stage', 'N_stage', 'rightleft Rec', 'extrahepatic disease ', 'R0', 'KRAS']
#categorical_vars = ['T_stage', 'N_stage', 'rightleft Rec', 'extrahepatic disease ', 'R0', 'KRAS']
#categorical_vars = ['T_stage', 'N_stage', 'rightleft.Rec', 'extrahepatic.disease', 'R0', 'KRAS']

# Assuming df is your DataFrame
df = data  # Load your dataset

def calculate_percentages_including_missing(df, categorical_vars):
    results = []

    # Loop through each categorical variable in the list
    for col in categorical_vars:
        treated = df[df['adjuvantchemo'] == 1]
        untreated = df[df['adjuvantchemo'] == 0]

        # Total counts for treated and untreated groups, including missing values
        total_treated = treated.shape[0]
        total_untreated = untreated.shape[0]

        # Loop through each unique value in the column
        for value in df[col].unique():
            if pd.isna(value):
                continue  # We'll handle missing values separately
            
            # Calculate counts for the treated group
            treated_count = treated[treated[col] == value].shape[0]
            treated_percentage = treated_count / total_treated * 100 if total_treated > 0 else np.nan

            # Calculate counts for the untreated group
            untreated_count = untreated[untreated[col] == value].shape[0]
            untreated_percentage = untreated_count / total_untreated * 100 if total_untreated > 0 else np.nan

            # Append the result for this value
            results.append({
                'Variable': col,
                'Category': value,
                'Treated Count': treated_count,
                'Treated Percentage': f"{treated_percentage:.1f}%" if not np.isnan(treated_percentage) else "N/A",
                'Untreated Count': untreated_count,
                'Untreated Percentage': f"{untreated_percentage:.1f}%" if not np.isnan(untreated_percentage) else "N/A"
            })

        # Calculate missing value percentages for treated and untreated groups
        treated_missing = treated[col].isna().sum()
        untreated_missing = untreated[col].isna().sum()
        treated_missing_percentage = treated_missing / total_treated * 100 if total_treated > 0 else np.nan
        untreated_missing_percentage = untreated_missing / total_untreated * 100 if total_untreated > 0 else np.nan

        # Append missing value information
        results.append({
            'Variable': col,
            'Category': 'Missing',
            'Treated Count': treated_missing,
            'Treated Percentage': f"{treated_missing_percentage:.1f}%" if not np.isnan(treated_missing_percentage) else "N/A",
            'Untreated Count': untreated_missing,
            'Untreated Percentage': f"{untreated_missing_percentage:.1f}%" if not np.isnan(untreated_missing_percentage) else "N/A"
        })

    return pd.DataFrame(results)

# Generate percentages for the categorical variables including missing values
percentages_df = calculate_percentages_including_missing(df, categorical_vars)

# Display the DataFrame
percentages_df


Unnamed: 0,Variable,Category,Treated Count,Treated Percentage,Untreated Count,Untreated Percentage
0,grade,2,103,8.6%,56,9.3%
1,grade,1,229,19.1%,222,36.9%
2,grade,3,52,4.3%,39,6.5%
3,grade,0,65,5.4%,87,14.5%
4,grade,x,2,0.2%,8,1.3%
5,grade,Missing,746,62.3%,190,31.6%
6,DFI<12,0.0,447,37.3%,207,34.4%
7,DFI<12,1.0,750,62.7%,393,65.3%
8,DFI<12,Missing,0,0.0%,2,0.3%
9,T_stage,2.0,203,17.0%,58,9.6%


In [40]:
print(percentages_df)

                Variable Category  Treated Count Treated Percentage  \
0                T_stage        4             30              19.2%   
1                T_stage        3            107              68.6%   
2                T_stage        2             17              10.9%   
3                T_stage        1              2               1.3%   
4                T_stage        0              0               0.0%   
5                T_stage  Missing              0               0.0%   
6                N_stage        1            109              69.9%   
7                N_stage        0             47              30.1%   
8                N_stage  Missing              0               0.0%   
9          rightleft.Rec        1             51              32.7%   
10         rightleft.Rec        2             39              25.0%   
11         rightleft.Rec        0             66              42.3%   
12         rightleft.Rec  Missing              0               0.0%   
13  ex

DataFrame.reindex(labels=None, *, index=None, columns=None, axis=None, method=None, copy=None, level=None, fill_value=nan, limit=None, tolerance=None)[source]


In [12]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import pandas as pd
from scipy.sparse import hstack

# Define the numerical and categorical columns
numerical_cols = ['age', 'size', 'cea_carcinoembryonic antigen', 'number_liver_mets']
categorical_cols = ['T_stage', 'N_stage', 'rightleft Rec','extrahepatic disease ', 'R0', 'KRAS',"DFI<12"]
missing_counts = data[X_orig.columns.tolist()].isnull().sum() # OS and OSstatus have no missing values :)
print("columns that have missing data: ", missing_counts[missing_counts > 0])


# Create the transformers
imputer_numerical = IterativeImputer(max_iter=10, random_state=0)
imputer_categorical = SimpleImputer(strategy='most_frequent')
categorical_transformer = Pipeline(steps=[
    ('imputer', imputer_categorical),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', imputer_numerical, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Fit and transform the preprocessor on your data
X_numerical_imputed = imputer_numerical.fit_transform(X_orig[numerical_cols])
X_categorical_imputed = categorical_transformer.fit_transform(X_orig[categorical_cols])
# Generate new column names for the one-hot encoded columns
one_hot_column_names = []
for col, categories in zip(categorical_cols, categorical_transformer.named_steps['onehot'].categories_):
    one_hot_column_names.extend([f"{col}_{category}" for category in categories])

# Convert the numerical and categorical data to DataFrames
X_numerical_imputed_df = pd.DataFrame(X_numerical_imputed, columns=numerical_cols)

X_categorical_imputed_df = pd.DataFrame(X_categorical_imputed.toarray(), columns=one_hot_column_names)

# Concatenate the numerical and categorical DataFrames horizontally
X_orig_imputed_df = pd.concat([X_numerical_imputed_df, X_categorical_imputed_df], axis=1)
display(X_orig_imputed_df)


columns that have missing data:  DFI<12                            2
age                               1
T_stage                          26
N_stage                          19
rightleft Rec                     8
cea_carcinoembryonic antigen    320
size                            204
number_liver_mets                85
R0                               31
KRAS                              2
dtype: int64


Unnamed: 0,age,size,cea_carcinoembryonic antigen,number_liver_mets,T_stage_0.0,T_stage_1.0,T_stage_2.0,T_stage_3.0,T_stage_4.0,N_stage_0.0,...,rightleft Rec_1.0,rightleft Rec_2.0,extrahepatic disease _0.0,extrahepatic disease _1.0,R0_0.0,R0_1.0,KRAS_0.0,KRAS_1.0,DFI<12_0.0,DFI<12_1.0
0,61.0,0.4,6.3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,58.0,4.0,1.4,4.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
2,48.0,4.0,153.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
3,32.0,1.0,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0
4,65.0,3.0,43.1,2.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1794,63.0,3.0,6.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
1795,72.0,1.6,3.9,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0
1796,56.0,2.8,9.9,3.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1797,62.0,1.1,9.8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0


# careful with index being dropped: need to append only values otw there will be NaNs!!!

In [15]:
X_orig_imputed_df["adjuvantchemo"] = data["adjuvantchemo"].values
X_orig_imputed_df["died"] = died.values
X_orig_imputed_df["time_to_event"] = times.values
# Find columns where all values are 0
zero_columns = X_orig_imputed_df.columns[X_orig_imputed_df.eq(0).all()]

print("Columns with all values as 0:")
print(zero_columns)
# Find columns where exactly one value is not 0
one_non_zero_columns = X_orig_imputed_df.columns[(X_orig_imputed_df != 0).sum() == 1]

print("Columns with exactly one non-zero value:")
print(one_non_zero_columns)
X_orig_imputed_df

Columns with all values as 0:
Index([], dtype='object')
Columns with exactly one non-zero value:
Index([], dtype='object')


Unnamed: 0,age,size,cea_carcinoembryonic antigen,number_liver_mets,T_stage_0.0,T_stage_1.0,T_stage_2.0,T_stage_3.0,T_stage_4.0,N_stage_0.0,...,extrahepatic disease _1.0,R0_0.0,R0_1.0,KRAS_0.0,KRAS_1.0,DFI<12_0.0,DFI<12_1.0,adjuvantchemo,died,time_to_event
0,61.0,0.4,6.3,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,True,47.8
1,58.0,4.0,1.4,4.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,False,36.3
2,48.0,4.0,153.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,True,17.6
3,32.0,1.0,3.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,False,32.7
4,65.0,3.0,43.1,2.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,False,16.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1794,63.0,3.0,6.5,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,True,33.7
1795,72.0,1.6,3.9,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,False,6.2
1796,56.0,2.8,9.9,3.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,False,2.3
1797,62.0,1.1,9.8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,False,8.9


In [16]:
import pandas as pd


def concatenate_encoded_columns(df,prefix):
    concatenated_col = pd.Series()  # Initialize an empty Series
    # Iterate over each row in the DataFrame
    for index, row in df.iterrows():
        value = None  # Initialize the value to None for the current row

        # Iterate over each column in the DataFrame
        for col in df.columns:
            # Check if the column name starts with the specified prefix
            if col.endswith('.0'):
                # Extract the suffix from the column name
                suffix = col.split('_')[-1]

                # If the value in the current column is 1, assign the corresponding suffix value to t_stage_value
                if row[col] == 1:
                    # Convert the suffix to an integer and assign it to t_stage_value
                    value = int(float(suffix))
        # Append the value for the current row to the concatenated column
        concatenated_col = pd.concat([concatenated_col, pd.Series([value], index=[index])])

    # Rename the concatenated column
    concatenated_col.name = prefix

    return concatenated_col
 
# List of prefixes for the columns
prefixes = ["DFI<12",'T_stage','N_stage', 'rightleft Rec', 'extrahepatic disease ', 'R0', 'KRAS']

# Initialize an empty DataFrame to store the concatenated columns
concatenated_df = pd.DataFrame()
X_imputed =pd.DataFrame()
# Iterate over each prefix and concatenate the encoded columns
for prefix in prefixes:
    #display(X_orig)
    filtered_cols = [col for col in X_orig_imputed_df.columns if col.startswith(prefix)]
    #print(filtered_cols)
    output_series = concatenate_encoded_columns(X_orig_imputed_df[filtered_cols], prefix)
    #print(output_series)
    concatenated_df = pd.concat([concatenated_df, output_series], axis=1)
    #display(concatenated_df)
X_imputed = pd.concat([X_numerical_imputed_df,concatenated_df,X_orig_imputed_df["adjuvantchemo"],
                       X_orig_imputed_df["died"],
                       X_orig_imputed_df["time_to_event"]], axis=1)  # Concatenate new columns


['DFI<12_0.0', 'DFI<12_1.0']
0       0
1       0
2       0
3       1
4       0
       ..
1794    0
1795    1
1796    0
1797    0
1798    0
Name: DFI<12, Length: 1799, dtype: int64
['T_stage_0.0', 'T_stage_1.0', 'T_stage_2.0', 'T_stage_3.0', 'T_stage_4.0']
0       2
1       3
2       3
3       2
4       3
       ..
1794    3
1795    3
1796    3
1797    3
1798    3
Name: T_stage, Length: 1799, dtype: int64
['N_stage_0.0', 'N_stage_1.0']


  concatenated_col = pd.concat([concatenated_col, pd.Series([value], index=[index])])
  concatenated_col = pd.concat([concatenated_col, pd.Series([value], index=[index])])
  concatenated_col = pd.concat([concatenated_col, pd.Series([value], index=[index])])


0       0
1       0
2       0
3       1
4       0
       ..
1794    1
1795    0
1796    1
1797    1
1798    1
Name: N_stage, Length: 1799, dtype: int64
['rightleft Rec_0.0', 'rightleft Rec_1.0', 'rightleft Rec_2.0']
0       2
1       0
2       2
3       2
4       1
       ..
1794    1
1795    2
1796    0
1797    1
1798    1
Name: rightleft Rec, Length: 1799, dtype: int64
['extrahepatic disease _0.0', 'extrahepatic disease _1.0']
0       0
1       1
2       1
3       1
4       1
       ..
1794    0
1795    1
1796    0
1797    0
1798    0
Name: extrahepatic disease , Length: 1799, dtype: int64
['R0_0.0', 'R0_1.0']


  concatenated_col = pd.concat([concatenated_col, pd.Series([value], index=[index])])
  concatenated_col = pd.concat([concatenated_col, pd.Series([value], index=[index])])
  concatenated_col = pd.concat([concatenated_col, pd.Series([value], index=[index])])


0       1
1       0
2       1
3       0
4       0
       ..
1794    0
1795    0
1796    0
1797    0
1798    0
Name: R0, Length: 1799, dtype: int64
['KRAS_0.0', 'KRAS_1.0']
0       1
1       1
2       1
3       1
4       1
       ..
1794    1
1795    0
1796    0
1797    0
1798    0
Name: KRAS, Length: 1799, dtype: int64


  concatenated_col = pd.concat([concatenated_col, pd.Series([value], index=[index])])


In [19]:
X_imputed["DFI<12"].unique()

array([0, 1])

In [20]:
missing_counts = X_imputed.columns.isnull()#.sum() # OS and OSstatus have no missing values :)
missing_counts

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False])

In [21]:
# Define the file path where you want to save the CSV file
csv_file_path = "traindata/CRLM_imputed.csv"

# Save the DataFrame to a CSV file
X_imputed.to_csv(csv_file_path, index=False)

print(f"DataFrame saved to '{csv_file_path}'")


DataFrame saved to 'CRLM_imputed.csv'


In [24]:
X_imputed["adjuvantchemo"].unique()
X_imputed


Unnamed: 0,age,size,cea_carcinoembryonic antigen,number_liver_mets,DFI<12,T_stage,N_stage,rightleft Rec,extrahepatic disease,R0,KRAS,adjuvantchemo,died,time_to_event
0,61.0,0.4,6.3,1.0,0,2,0,2,0,1,1,0.0,True,47.8
1,58.0,4.0,1.4,4.0,0,3,0,0,1,0,1,1.0,False,36.3
2,48.0,4.0,153.0,1.0,0,3,0,2,1,1,1,1.0,True,17.6
3,32.0,1.0,3.0,2.0,1,2,1,2,1,0,1,1.0,False,32.7
4,65.0,3.0,43.1,2.0,0,3,0,1,1,0,1,1.0,False,16.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1794,63.0,3.0,6.5,1.0,0,3,1,1,0,0,1,0.0,True,33.7
1795,72.0,1.6,3.9,1.0,1,3,0,2,1,0,0,0.0,False,6.2
1796,56.0,2.8,9.9,3.0,0,3,1,0,0,0,0,0.0,False,2.3
1797,62.0,1.1,9.8,1.0,0,3,1,1,0,0,0,0.0,False,8.9
