#### This document includes FACT-E scoring guidelines.

In [1]:
import pandas as pd
import numpy as np

# Preliminary

In [3]:
# Read the CSV file - emotional distress 
file_path = "../Temp/EsophagealBank-EmotionalDistress_DATA_2025-02-13_1201.csv"
df = pd.read_csv(file_path)
df

  df = pd.read_csv(file_path)


Unnamed: 0,id,redcap_event_name,redcap_repeat_instrument,redcap_repeat_instance,qol_date,gp1,gp2,gp3,gp4,gp5,...,comp_pulm,other_pulm,grade_pulm,comp_renal,other_renal,grade_renal,comp_wound,other_wound,grade_wound,postop_comp
0,1,baseline_arm_1,,,,,,,,,...,,,,,,,,,,
1,1,5_years_postop_arm_1,,,,,,,,,...,,,,,,,,,,
2,1,surgery_arm_1,surgery_esd_emr,1.0,,,,,,,...,,,,,,,,,,
3,1,surgery_arm_1,postoperative_course,1.0,,,,,,,...,,,,,,,,,,1.0
4,1,surgery_arm_1,postoperative_complications,1.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9615,1756,baseline_arm_1,,,,,,,,,...,,,,,,,,,,
9616,1756,baseline_arm_1,functional_assessment_of_cancer_therapy_fact_e...,1.0,1/27/2025,3.0,0.0,2.0,0.0,,...,,,,,,,,,,
9617,1757,baseline_arm_1,,,,,,,,,...,,,,,,,,,,
9618,1757,baseline_arm_1,functional_assessment_of_cancer_therapy_fact_e...,1.0,1/27/2025,2.0,2.0,,,,...,,,,,,,,,,


In [None]:
# Extract columns for FACT-G with the patient id
columns_to_extract = ["id"] + [f"gp{i}" for i in range(1, 8)] + [f"gs{i}" for i in range(1, 8)] + \
                        [f"ge{i}" for i in range(1, 7)] + [f"gf{i}" for i in range(1, 8)]

df_extracted = df[columns_to_extract]

# Display the extracted data
print(df_extracted)

        id  gp1  gp2  gp3  gp4  gp5  gp6  gp7  gs1  gs2  ...  a_e1  a_e2  \
0        1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...   NaN   NaN   
1        1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...   NaN   NaN   
2        1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...   NaN   NaN   
3        1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...   NaN   NaN   
4        1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...   NaN   NaN   
...    ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   ...   
9615  1756  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...   NaN   NaN   
9616  1756  3.0  0.0  2.0  0.0  NaN  2.0  4.0  3.0  3.0  ...   4.0   2.0   
9617  1757  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...   NaN   NaN   
9618  1757  2.0  2.0  NaN  NaN  NaN  2.0  NaN  2.0  2.0  ...   0.0   0.0   
9619  1758  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...   NaN   NaN   

      a_e3  a_e4  a_e5  a_e6  a_e7  a_c6  a_c2  a_act11  
0      NaN   NaN   NaN   NaN 

# Scroring scheme

In [None]:
# Define all subscale columns
pwb_cols = [f"gp{i}" for i in range(1, 8)]  # Physical Well-Being
swb_cols = [f"gs{i}" for i in range(1, 8)]  # Social/Family Well-Being
ewb_cols = [f"ge{i}" for i in range(1, 7)]  # Emotional Well-Being
fwb_cols = [f"gf{i}" for i in range(1, 8)]  # Functional Well-Being


# Create function to calculate subscale scores
def calculate_subscale_score(df, cols, reverse_items=None, subscale_name=""):
    """
    Calculate subscale scores according to FACT scoring guidelines, only if >=50% items are answered
    
    Parameters:
    df (DataFrame): DataFrame containing the item responses
    cols (list): List of column names for the subscale items
    reverse_items (list): List of items that need to be reverse-scored (if any)
    subscale_name (str): Name of the subscale for column naming
    
    Returns:
    DataFrame: DataFrame with added subscale score column
    """
    # Create score columns for each item
    for col in cols:
        if reverse_items and col in reverse_items:
            # Reverse scoring (4 - score)
            df[f"{col}_score"] = 4 - df[col]
        else:
            # Regular scoring
            df[f"{col}_score"] = df[col]
    
    # Calculate the number of items and the 50% threshold
    total_items = len(cols)
    min_items_required = total_items / 2  # 50% threshold
    
    # Count non-NaN items for each row
    score_cols = [f"{col}_score" for col in cols]
    num_answered_items = df[score_cols].notna().sum(axis=1)
    
    # Calculate subscale score: sum of scores * number of items / number of answered items
    # Only calculate if >=50% items are answered
    df[f"{subscale_name}_subscale_score"] = np.where(
        num_answered_items >= min_items_required,
        (df[score_cols].sum(axis=1, skipna=True) * len(cols)) / 
        df[score_cols].notna().sum(axis=1),
        np.nan
    )
    
    return df


# Calculate Physical Well-Being (PWB) subscale
# All PWB items are reverse-scored
df_extracted = calculate_subscale_score(df_extracted, pwb_cols, reverse_items=pwb_cols, subscale_name="pwb")

# Calculate Social/Family Well-Being (SWB) subscale
df_extracted = calculate_subscale_score(df_extracted, swb_cols, subscale_name="swb")

# Calculate Emotional Well-Being (EWB) subscale
# All EWB items except GE2 are reverse-scored
ewb_reverse_items = [f"ge{i}" for i in range(1, 7) if i != 2]
df_extracted = calculate_subscale_score(df_extracted, ewb_cols, reverse_items=ewb_reverse_items, subscale_name="ewb")

# Calculate Functional Well-Being (FWB) subscale
df_extracted = calculate_subscale_score(df_extracted, fwb_cols, subscale_name="fwb")

# Calculate FACT-G total score (PWB + SWB + EWB + FWB)
fact_g_items = pwb_cols + swb_cols + ewb_cols + fwb_cols  # 27 items
fact_g_item_cols = [f"{col}_score" for col in fact_g_items]
fact_g_num_answered = df_extracted[fact_g_item_cols].notna().sum(axis=1)
fact_g_min_items = 22  # 80% of 27 items = 21.6, rounded up to 22

df_extracted["fact_g_total"] = np.where(
    # Condition 1: All subscales must be non-NaN (already ensures 50% of items per subscale)
    (df_extracted["pwb_subscale_score"].notna()) & 
    (df_extracted["swb_subscale_score"].notna()) & 
    (df_extracted["ewb_subscale_score"].notna()) & 
    (df_extracted["fwb_subscale_score"].notna()) &
    # Condition 2: At least 80% of FACT-G items must be answered
    (fact_g_num_answered >= fact_g_min_items),
    df_extracted["pwb_subscale_score"] + df_extracted["swb_subscale_score"] + 
    df_extracted["ewb_subscale_score"] + df_extracted["fwb_subscale_score"],
    np.nan
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{col}_score"] = 4 - df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{col}_score"] = 4 - df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[f"{col}_score"] = 4 - df[col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer]

# Results

In [None]:
print(df_extracted)

# Display the results
print("FACT-G Scoring Results:")
print(df_extracted[["id", "pwb_subscale_score", "swb_subscale_score", "ewb_subscale_score", 
                    "fwb_subscale_score", "fact_g_total"]])

        id  gp1  gp2  gp3  gp4  gp5  gp6  gp7  gs1  gs2  ...  a_e5_score  \
0        1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...         NaN   
1        1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...         NaN   
2        1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...         NaN   
3        1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...         NaN   
4        1  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...         NaN   
...    ...  ...  ...  ...  ...  ...  ...  ...  ...  ...  ...         ...   
9615  1756  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...         NaN   
9616  1756  3.0  0.0  2.0  0.0  NaN  2.0  4.0  3.0  3.0  ...         4.0   
9617  1757  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...         NaN   
9618  1757  2.0  2.0  NaN  NaN  NaN  2.0  NaN  2.0  2.0  ...         4.0   
9619  1758  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  NaN  ...         NaN   

      a_e6_score  a_e7_score  a_c6_score  a_c2_score  a_act11_score  \
0            NaN

In [None]:
# Export the results to a CSV file
output_file_path = "../results/FACT_G_scoring_results.csv"
df_extracted.to_csv(output_file_path, index=False)
print(f"Results exported to {output_file_path}")

Results exported to ../results/FACT_E_scoring_results.csv
