In [None]:
# Install lifelines into the environment of the running kernel.
# %pip (unlike !pip, which runs in a subshell) always targets the kernel's interpreter.
%pip install lifelines --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m349.3/349.3 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.7/115.7 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for autograd-gamma (setup.py) ... [?25l[?25hdone


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import KNNImputer

from lifelines.statistics import logrank_test
from lifelines import KaplanMeierFitter

In [None]:
def _categorize_delay(days, first_cut, second_cut, negative_code=None):
    """
    Buckets a Series of day counts into ordinal codes.

    Returns 0 when days <= first_cut, 1 when days <= second_cut and 2 otherwise.
    NaN compares False against every threshold, so it also ends up in bucket 2.
    When negative_code is given, values below zero receive that code instead.
    """
    conditions = [(days <= first_cut).to_numpy(), (days <= second_cut).to_numpy()]
    codes = [0, 1]
    if negative_code is not None:
        # Tested first so the -1 "missing date" sentinel never falls into bucket 0.
        conditions.insert(0, (days < 0).to_numpy())
        codes.insert(0, negative_code)
    return np.select(conditions, codes, default=2)


def preparing_dataset(df, topogrup='C34', ufresid='SP'):
    """
    Prepares the dataset for analysis, applying necessary filters and transformations.

    Rows are kept only when they match the requested topography group and state of
    residence, have BASEDIAG == 3, an ECGRUP other than '0', 'X' or 'Y', no
    HORMONIO / HORMOAPOS / TMO / TMOAPOS treatment, and an age range other than
    '00-09' or '10-19'. Derived time intervals, categorical codes and outcome
    flags are then added, and the columns not used in the analysis are dropped.

    Parameters:
    -----------
    df : pandas.DataFrame
        The dataset to be prepared. It is not modified.
    topogrup : str, default 'C34'
        Value of TOPOGRUP to keep (the default selects lung cancer).
    ufresid : str, default 'SP'
        Value of UFRESID to keep.

    Returns:
    --------
    pandas.DataFrame
        The dataset after the transformations (original row index preserved).

    Raises:
    -------
    KeyError
        If any column listed for removal is missing from the input.
    """

    # Build one combined mask and copy only the surviving rows, instead of
    # copying the full frame and re-filtering it step by step.
    keep = (
        (df.TOPOGRUP == topogrup)                      # topography group
        & (df.UFRESID == ufresid)                      # state of residence
        & (df.BASEDIAG == 3)                           # microscopic confirmation
        & ~df.ECGRUP.isin(['0', 'X', 'Y'])             # unusable staging groups
        & (df.HORMONIO == 0) & (df.HORMOAPOS == 0)     # no hormone therapy
        & (df.TMO == 0) & (df.TMOAPOS == 0)            # no TMO
        & ~df.FAIXAETAR.isin(['00-09', '10-19'])       # drop ages under 20
    )
    df_aux = df[keep].copy()

    # ECGRUP Categorization - stages I and II are 'Inicial', everything else 'Avançado'
    df_aux['ECGRUP_CAT'] = np.where(df_aux.ECGRUP.isin(['I', 'II']), 'Inicial', 'Avançado')

    # Date Columns - Convert to datetime objects
    for col_data in ['DTCONSULT', 'DTDIAG', 'DTTRAT', 'DTULTINFO']:
        df_aux[col_data] = pd.to_datetime(df_aux[col_data])

    # Time Calculations - Calculate time differences in days
    df_aux['CONSDIAG'] = (df_aux.DTDIAG - df_aux.DTCONSULT).dt.days
    df_aux['DIAGTRAT'] = (df_aux.DTTRAT - df_aux.DTDIAG).dt.days
    df_aux['TRATCONS'] = (df_aux.DTTRAT - df_aux.DTCONSULT).dt.days
    df_aux['ULTIDIAG'] = (df_aux.DTULTINFO - df_aux.DTDIAG).dt.days

    # A missing treatment date becomes the sentinel -1 (mapped to category 3 below).
    df_aux[['DIAGTRAT', 'TRATCONS']] = df_aux[['DIAGTRAT', 'TRATCONS']].fillna(-1)

    # Time Categories - Create categorical variables for time differences
    # NOTE(review): CONSDIAG is not sentinel-filled, so a missing consult/diagnosis
    # date is coded 2 (same as "> 60 days") - confirm this is intended.
    df_aux['CONSDIAG_CAT'] = _categorize_delay(df_aux.CONSDIAG, 30, 60)
    df_aux['TRATCONS_CAT'] = _categorize_delay(df_aux.TRATCONS, 60, 90, negative_code=3)
    df_aux['DIAGTRAT_CAT'] = _categorize_delay(df_aux.DIAGTRAT, 60, 90, negative_code=3)

    # DRS / DRS_INST - keep only the second space-separated token (the DRS number).
    # .str[1] yields NaN when there is no second token and also works on an empty
    # frame, where split(expand=True)[1] would raise KeyError.
    for col_drs in ['DRS', 'DRS_INST']:
        df_aux[col_drs] = df_aux[col_drs].str.split(' ').str[1]

    # Metastasis Presence - 1 when META01 is filled, 0 otherwise
    df_aux['PRESENCA_META'] = df_aux.META01.notnull().astype(int)

    # Recurrence Presence - 0 only when RECNENHUM == 1 and no recurrence date exists
    has_recurrence = (df_aux.RECNENHUM != 1) | df_aux.DTRECIDIVA.notnull()
    df_aux['PRESENCA_REC'] = has_recurrence.astype(int)

    # Outcome Variables - overall death, cancer death, and survival beyond 1..5 years
    df_aux['obito_geral'] = (df_aux.ULTINFO > 2).astype(int)
    df_aux['obito_cancer'] = (df_aux.ULTINFO == 3).astype(int)
    for year in range(1, 6):
        df_aux[f'sobrevida_ano{year}'] = (df_aux.ULTIDIAG > year * 365.25).astype(int)

    # Drop Unused Columns - Remove columns that will not be used in the analysis
    drop_cols = ['UFNASC', 'UFRESID', 'CIDADE', 'DTCONSULT', 'CLINICA', 'DTDIAG',
                 'BASEDIAG', 'TOPOGRUP', 'DESCTOPO', 'DESCMORFO', 'T', 'N', 'M',
                 'PT', 'PN', 'PM', 'S', 'G', 'LOCALTNM', 'IDMITOTIC', 'PSA',
                 'GLEASON', 'OUTRACLA', 'META01', 'META02', 'META03', 'META04',
                 'DTTRAT', 'NAOTRAT', 'TRATAMENTO', 'TRATHOSP', 'TRATFANTES',
                 'TRATFAPOS', 'HORMONIO', 'TMO', 'NENHUMANT', 'CIRURANT', 'RADIOANT',
                 'QUIMIOANT', 'HORMOANT', 'TMOANT', 'IMUNOANT', 'OUTROANT',
                 'HORMOAPOS', 'TMOAPOS', 'DTULTINFO', 'CICI', 'CICIGRUP', 'CICISUBGRU',
                 'FAIXAETAR', 'LATERALI', 'INSTORIG', 'ERRO', 'DTRECIDIVA',
                 'RECNENHUM', 'RECLOCAL', 'RECREGIO', 'RECDIST', 'REC01', 'REC02',
                 'REC03', 'REC04', 'CIDO', 'DSCCIDO', 'HABILIT', 'HABIT11',
                 'HABILIT1', 'CIDADEH', 'PERDASEG', 'CIDADE_INS']

    return df_aux.drop(columns=drop_cols)

# **Data Preparation**

In [None]:
# Database - Set/2024
# Downloads the source file with gdown, identified by the file ID below; --quiet suppresses progress output.
# NOTE(review): presumably this yields 'pacigeral_set24.csv', which the next cell reads - confirm.
!gdown 1aFSW3w4sgOIJdXvUni8Dv_DsIjVGBpRp --quiet

In [None]:
# Load Data and Display Information
# NOTE(review): the saved output of this cell appears to echo the read_csv line, which
# looks like residue of a pandas warning (possibly a mixed-type DtypeWarning) - TODO confirm
# and, if so, consider passing explicit dtypes for the affected columns.
df = pd.read_csv('pacigeral_set24.csv')  # Load the CSV file into a Pandas DataFrame
print(df.shape)  # Print the dimensions of the DataFrame (rows, columns)
df.head(3)  # Display the first 3 rows of the DataFrame

  df = pd.read_csv('pacigeral_set24.csv')


(1233793, 105)


Unnamed: 0,INSTITU,ESCOLARI,IDADE,SEXO,UFNASC,UFRESID,IBGE,CIDADE,CATEATEND,DTCONSULT,...,CIDO,DSCCIDO,HABILIT,HABIT11,HABILIT1,HABILIT2,CIDADEH,DRS_INST,RRAS_INST,CIDADE_INS
0,14,4,49,2,SC,SC,4209409,LAGUNA,9,2011-03-23,...,81603.0,COLANGIOCARCINOMA,14,Inativo,6,5,São Paulo,DRS 01 Grande Sao Paulo,RRAS 06,SAO PAULO
1,8672,9,54,1,SP,SP,3550308,SAO PAULO,9,2006-05-30,...,80903.0,CARCINOMA BASOCELULAR SOE,7,CACON com Serviço de Oncologia Pediátrica,3,2,São Paulo,DRS 01 Grande Sao Paulo,RRAS 06,SAO PAULO
2,19100,9,77,1,SP,SP,3504503,AVARE,9,2003-12-14,...,82113.0,ADENOCARCINOMA TUBULAR,14,Inativo,6,5,Avaré,DRS 06 Bauru,RRAS 09,AVARE


In [None]:
# Prepare Data for Lung Cancer Analysis
df_pulm = preparing_dataset(df) # Prepare the data, filtering for lung cancer (topography group 'C34')

print(df_pulm.shape) # Print the shape (rows, columns) of the filtered DataFrame
df_pulm.head(3) # Display the first 3 rows of the filtered DataFrame

(45729, 49)


Unnamed: 0,INSTITU,ESCOLARI,IDADE,SEXO,IBGE,CATEATEND,DIAGPREV,TOPO,MORFO,EC,...,DIAGTRAT_CAT,PRESENCA_META,PRESENCA_REC,obito_geral,obito_cancer,sobrevida_ano1,sobrevida_ano2,sobrevida_ano3,sobrevida_ano4,sobrevida_ano5
14,612374,2,51,2,3550308,2,2,C341,81403,IV,...,3,1,0,1,1,0,0,0,0,0
89,8,9,56,2,3548708,1,1,C342,83233,IB,...,0,0,0,0,0,1,1,1,1,1
142,8,9,53,1,3550308,9,2,C341,81403,IIIB,...,0,0,0,0,0,1,1,1,1,1


In [None]:
# Calculate and Adjust Survival Time in Months
FOLLOWUP_CAP_MONTHS = 60  # follow-up horizon; anything longer is censored

# Days from diagnosis to last information, expressed in 30-day months and rounded up
df_pulm['meses_diag'] = np.ceil(df_pulm['ULTIDIAG'] / 30).astype(int)

# Ensure no survival time is zero (set to 1 month)
df_pulm.loc[df_pulm.meses_diag == 0, 'meses_diag'] = 1

# Cap survival time at one month past the horizon and censor the event indicators.
# Both obito_geral and obito_cancer are reset: leaving obito_cancer at 1 would mark a
# patient as dead of cancer while obito_geral says the patient is not dead.
beyond_cap = df_pulm.meses_diag > FOLLOWUP_CAP_MONTHS
df_pulm.loc[beyond_cap, ['meses_diag', 'obito_geral', 'obito_cancer']] = [FOLLOWUP_CAP_MONTHS + 1, 0, 0]

df_pulm.meses_diag.value_counts().sort_index().tail() # Display the value counts for the last few survival times (for checking)

Unnamed: 0_level_0,count
meses_diag,Unnamed: 1_level_1
57,81
58,62
59,80
60,68
61,2833


In [None]:
# Final Columns - list the columns of the prepared lung dataset before it is saved
df_pulm.columns

Index(['INSTITU', 'ESCOLARI', 'IDADE', 'SEXO', 'IBGE', 'CATEATEND', 'DIAGPREV',
       'TOPO', 'MORFO', 'EC', 'ECGRUP', 'NENHUM', 'CIRURGIA', 'RADIO',
       'QUIMIO', 'IMUNO', 'OUTROS', 'NENHUMAPOS', 'CIRURAPOS', 'RADIOAPOS',
       'QUIMIOAPOS', 'IMUNOAPOS', 'OUTROAPOS', 'ULTINFO', 'CONSDIAG',
       'TRATCONS', 'DIAGTRAT', 'ANODIAG', 'DRS', 'RRAS', 'DSCINST', 'IBGEATEN',
       'HABILIT2', 'DRS_INST', 'RRAS_INST', 'ECGRUP_CAT', 'ULTIDIAG',
       'CONSDIAG_CAT', 'TRATCONS_CAT', 'DIAGTRAT_CAT', 'PRESENCA_META',
       'PRESENCA_REC', 'obito_geral', 'obito_cancer', 'sobrevida_ano1',
       'sobrevida_ano2', 'sobrevida_ano3', 'sobrevida_ano4', 'sobrevida_ano5',
       'meses_diag'],
      dtype='object')

**Saving the prepared dataset to a CSV file**

In [None]:
# Save the prepared dataset to 'pulmao.csv'; index=False leaves the row index out of the file
df_pulm.to_csv('pulmao.csv', index=False)