## **Import The Necessities**

In [None]:
import pandas as pd
import numpy as np

# Load the raw training and test tables from CSV files in the working directory.
train_df = pd.read_csv(filepath_or_buffer='./train_data.csv')
test_df = pd.read_csv(filepath_or_buffer='./test_data.csv')

# **Initial Exploration**

In [None]:
# Quick structural overview of the raw training data: size, a sample of rows,
# per-column dtypes / non-null counts, and the number of missing values.
print("Shape:", train_df.shape)
print("\nFirst 5 rows:")
display(train_df.head())

print("\nColumn info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
train_df.info()

print("\nMissing values:")
print(train_df.isnull().sum())

Shape: (19405, 22)

First 5 rows:


Unnamed: 0,id,Birth_Date,Weight,Height,Urban_Rural,Occupation,Insurance_Type,Family_History,Cancer_Type,Stage_at_Diagnosis,...,Tumor_Size,Surgery_Date,Chemotherapy_Drugs,Radiation_Sessions,Immunotherapy,Targeted_Therapy,Recurrence_Status,Smoking_History,Alcohol_Use,label
0,1,7/1/1994,64.9,155.0cm,Urban,Unemployed,UEBMI,No,Breast,II,...,8.0,10/19/2024,"Paclitaxel,Docetaxel,Doxorubicin",16.0,No,Yes,NO,Never,Regular,1.0
1,2,7/16/1992,61.4,171.0cm,Urban,Factory Worker,UEBMI,Yes,Breast,I,...,10.0,2/28/2021,"Cyclophosphamide,Paclitaxel,Doxorubicin,Docetaxel",10.0,No,No,Yes,Former,Regular,1.0
2,3,6/23/1948,60.7,170.0cm,Rural,Unemployed,NRCMS,No,Stomach,IV,...,13.0,9/25/2022,"Fluorouracil,Cisplatin",21.0,Yes,No,NO,Former,Never,0.0
3,4,11/26/1954,70.2,171.0cm,Urban,Farmer,URBMI,Yes,Cervical,IV,...,3.0,9/13/2024,Cisplatin,10.0,No,Yes,NO,Never,Regular,1.0
4,5,7/8/1979,100.3,186.0cm,Rural,Office Worker,Self-pay,Yes,Lung,II,...,12.0,12/8/2023,"Gemcitabine,Carboplatin",6.0,Yes,No,Yes,Former,Never,0.0



Column info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19405 entries, 0 to 19404
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  19405 non-null  int64  
 1   Birth_Date          19405 non-null  object 
 2   Weight              19405 non-null  float64
 3   Height              19405 non-null  object 
 4   Urban_Rural         19405 non-null  object 
 5   Occupation          19404 non-null  object 
 6   Insurance_Type      19404 non-null  object 
 7   Family_History      19404 non-null  object 
 8   Cancer_Type         19404 non-null  object 
 9   Stage_at_Diagnosis  19404 non-null  object 
 10  Diagnosis_Date      19404 non-null  object 
 11  Symptoms            19332 non-null  object 
 12  Tumor_Size          19404 non-null  float64
 13  Surgery_Date        18038 non-null  object 
 14  Chemotherapy_Drugs  18718 non-null  object 
 15  Radiation_Sessions  19404 non-null  flo

# **Data Cleaning and Preprocessing**

## Remove 'cm' from Height and convert to float type

In [None]:
def remove_cm_from_height(df):
  """Return a copy of ``df`` with 'Height' converted from text like '155.0cm' to float.

  The input frame is left untouched, so the cell that calls this function can
  be re-run safely. (Mutating the input made a second run fail, because the
  column was already numeric and no longer supported the ``.str`` accessor.)

  Args:
    df: DataFrame with a text 'Height' column whose values end in 'cm'.

  Returns:
    A new DataFrame, identical to ``df`` except that 'Height' is float.

  Raises:
    ValueError: if a value is still not numeric after removing 'cm'.
  """
  result = df.copy()
  # regex=False: 'cm' is a literal suffix, not a pattern.
  height_text = result['Height'].str.replace('cm', '', regex=False).str.strip()
  result['Height'] = height_text.astype(float)
  return result

# Apply the height clean-up to both splits.
train_df_height_fixed = remove_cm_from_height(df=train_df)
test_df_height_fixed = remove_cm_from_height(df=test_df)

# Spot-check the converted column: train preview, a blank spacer, test preview.
print(
    train_df_height_fixed['Height'].head(),
    "\n",
    test_df_height_fixed['Height'].head(),
    sep="\n",
)

0    155.0
1    171.0
2    170.0
3    171.0
4    186.0
Name: Height, dtype: float64


0    179.0
1    154.0
2    191.0
3    172.0
4    190.0
Name: Height, dtype: float64


## Parse date columns to datetime

In [None]:
def parse_date_columns(df, date_cols):
  """Convert each column named in ``date_cols`` to datetime, writing into ``df``.

  Values that cannot be parsed become NaT (``errors='coerce'``). The frame
  that was passed in is modified and is also the return value.
  """
  for name in date_cols:
    raw_values = df[name]
    df[name] = pd.to_datetime(raw_values, errors='coerce')
  return df

# Columns that hold calendar dates as text in the raw files.
date_cols = ['Birth_Date', 'Diagnosis_Date', 'Surgery_Date']

train_df_parsed_dates = parse_date_columns(df=train_df_height_fixed, date_cols=date_cols)
test_df_parsed_dates = parse_date_columns(df=test_df_height_fixed, date_cols=date_cols)

print("\nSample dates:")
# Show the parsed columns for the train split first, then the test split.
display(
    train_df_parsed_dates[date_cols].head(),
    test_df_parsed_dates[date_cols].head(),
)


Sample dates:


Unnamed: 0,Birth_Date,Diagnosis_Date,Surgery_Date
0,1994-07-01,2020-02-10,2024-10-19
1,1992-07-16,2014-08-17,2021-02-28
2,1948-06-23,2014-09-25,2022-09-25
3,1954-11-26,2021-01-04,2024-09-13
4,1979-07-08,2019-07-26,2023-12-08


Unnamed: 0,Birth_Date,Diagnosis_Date,Surgery_Date
0,1973-11-08,2018-12-16,2013-09-29
1,1949-02-16,2017-03-24,2014-08-29
2,1970-05-08,2013-02-05,2019-05-07
3,1960-04-01,2019-07-01,2023-01-21
4,1973-07-27,2019-02-09,2014-05-07


## Convert Yes/No columns and columns having null values to numerical ones

In [None]:
def convert_yes_no_to_numerical(df, cols):
  """Return a copy of ``df`` with the given yes/no text columns encoded as 1/0.

  Text is stripped and lower-cased before mapping, so 'Yes', ' NO ' and 'no'
  are all recognised. Anything other than yes/no (including missing values)
  becomes NaN.

  The input frame is not modified. Mutating it made the calling cell fail on
  a second run, because the columns were already numeric and no longer
  supported the ``.str`` accessor.

  Args:
    df: source DataFrame.
    cols: names of the text columns to encode.

  Returns:
    A new DataFrame with the listed columns replaced by 1 / 0 / NaN.
  """
  yes_no_codes = {'yes': 1, 'no': 0}
  result = df.copy()
  for col in cols:
    normalized = result[col].str.strip().str.lower()
    result[col] = normalized.map(yes_no_codes)
  return result

def convert_cols_having_null_to_numerical(df):
  """Add 0/1 presence flags derived from two columns that may be null.

  'Had_Surgery' is 1 where 'Surgery_Date' is non-null, and 'Had_Chemo' is 1
  where 'Chemotherapy_Drugs' is non-null. Both flags are written onto ``df``
  itself, which is also returned.
  """
  flag_sources = (
      ('Had_Surgery', 'Surgery_Date'),
      ('Had_Chemo', 'Chemotherapy_Drugs'),
  )
  for flag_name, source_name in flag_sources:
    df[flag_name] = df[source_name].notna().astype(int)
  return df


# Text columns whose values are yes/no answers.
yes_no_cols = ['Immunotherapy', 'Targeted_Therapy', 'Family_History', 'Recurrence_Status']

train_df_with_numerical_col = convert_yes_no_to_numerical(train_df_parsed_dates, yes_no_cols)
train_df_with_numerical_col = convert_cols_having_null_to_numerical(train_df_with_numerical_col)

test_df_with_numerical_col = convert_yes_no_to_numerical(test_df_parsed_dates, yes_no_cols)
test_df_with_numerical_col = convert_cols_having_null_to_numerical(test_df_with_numerical_col)


#See preview
# "\n" replaces the original "\F": that is not a valid escape sequence, so it
# printed a literal backslash instead of the intended blank line.
print("\nFixed features preview:")
display(train_df_with_numerical_col[['Had_Surgery', 'Had_Chemo', 'Immunotherapy', 'Targeted_Therapy',
                 'Family_History', 'Recurrence_Status']].head())

\Fixed features preview:


Unnamed: 0,Had_Surgery,Had_Chemo,Immunotherapy,Targeted_Therapy,Family_History,Recurrence_Status
0,1,1,0.0,1.0,0.0,0.0
1,1,1,0.0,0.0,1.0,1.0
2,1,1,1.0,0.0,0.0,0.0
3,1,1,0.0,1.0,1.0,0.0
4,1,1,1.0,0.0,1.0,1.0


## Handle sequential categorical features by Ordinal Ordering

In [None]:
#Cancer stages to numbers
def convert_stages_to_numbers(df):
  """Return a copy of ``df`` with 'Stage_at_Diagnosis' mapped from I..IV to 1..4.

  Values outside the map (including missing values) become NaN.

  The input frame is not modified. Mutating it meant that re-running the
  calling cell mapped the already-numeric column a second time and silently
  turned every stage into NaN.

  Args:
    df: DataFrame with a 'Stage_at_Diagnosis' column of Roman-numeral strings.

  Returns:
    A new DataFrame with the stage column replaced by its numeric rank.
  """
  stage_map = {'I': 1, 'II': 2, 'III': 3, 'IV': 4}
  result = df.copy()
  result['Stage_at_Diagnosis'] = result['Stage_at_Diagnosis'].map(stage_map)
  return result

def ordinal_ordering(df):
  """Return a copy of ``df`` with smoking and alcohol categories encoded as 0/1/2.

  'Smoking_History': Never=0, Former=1, Current=2.
  'Alcohol_Use':     Never=0, Occasional=1, Regular=2.
  Values outside these maps (including missing values) become NaN.

  The input frame is not modified. Mutating it meant that re-running the
  calling cell re-mapped the numeric codes and silently produced all-NaN
  columns.

  Args:
    df: DataFrame with text 'Smoking_History' and 'Alcohol_Use' columns.

  Returns:
    A new DataFrame with both columns replaced by their ordinal codes.
  """
  ordinal_maps = {
      'Smoking_History': {'Never': 0, 'Former': 1, 'Current': 2},
      'Alcohol_Use': {'Never': 0, 'Occasional': 1, 'Regular': 2},
  }
  result = df.copy()
  for column, codes in ordinal_maps.items():
    result[column] = result[column].map(codes)
  return result


# Encode the cancer stage first, then the smoking / alcohol categories, on both splits.
train_df_with_numerical_stages = convert_stages_to_numbers(train_df_with_numerical_col)
test_df_with_numerical_stages = convert_stages_to_numbers(test_df_with_numerical_col)

train_df_ordinal_order = ordinal_ordering(train_df_with_numerical_stages)
test_df_ordinal_order = ordinal_ordering(test_df_with_numerical_stages)

#See preview
# "\n" replaces the original "\F": that is not a valid escape sequence, so it
# printed a literal backslash instead of the intended blank line.
print("\nFixed features preview:")
display(train_df_ordinal_order[['Stage_at_Diagnosis', 'Smoking_History', 'Alcohol_Use']].head())

\Fixed features preview:


Unnamed: 0,Stage_at_Diagnosis,Smoking_History,Alcohol_Use
0,2.0,0.0,2.0
1,1.0,1.0,2.0
2,4.0,1.0,0.0
3,4.0,0.0,2.0
4,2.0,1.0,0.0


## Handle nominal categorical features

In [None]:
#Find all the distinct symptoms, occupations, Urban-Rural values, insurance types, cancer types, and drugs in the train data

def find_unique_items(df, col_name):
  """Collect the distinct comma-separated items found in one text column.

  Each cell is split on ',' and every piece is stripped of surrounding
  whitespace. Cells that are not strings (for example NaN for a missing
  value) and the literal text 'nan' are skipped. Empty pieces, such as the
  one produced by a trailing comma, are dropped so they cannot become a
  category of their own.

  Args:
    df: DataFrame to scan.
    col_name: name of the column holding comma-separated text.

  Returns:
    Alphabetically sorted list of the distinct items.
  """
  unique_items = set()
  for cell in df[col_name]:
    # Missing values arrive as float NaN, not as the string 'nan', so the type
    # check does the real work; the 'nan' comparison is kept for compatibility.
    if not isinstance(cell, str) or cell == 'nan':
      continue
    unique_items.update(piece.strip() for piece in cell.split(','))
  unique_items.discard('')
  return sorted(unique_items)

# Category vocabularies are taken from the training split only.
unique_symptoms = find_unique_items(df=train_df_ordinal_order, col_name='Symptoms')
unique_occupations = find_unique_items(df=train_df_ordinal_order, col_name='Occupation')
unique_urban_rural = find_unique_items(df=train_df_ordinal_order, col_name='Urban_Rural')
unique_insurance_types = find_unique_items(df=train_df_ordinal_order, col_name='Insurance_Type')
unique_cancer_types = find_unique_items(df=train_df_ordinal_order, col_name='Cancer_Type')
unique_chemo_drugs = find_unique_items(df=train_df_ordinal_order, col_name='Chemotherapy_Drugs')

# One vocabulary per output line.
print(
    unique_symptoms,
    unique_occupations,
    unique_urban_rural,
    unique_insurance_types,
    unique_cancer_types,
    unique_chemo_drugs,
    sep="\n",
)

['Blood in Stool', 'Cough', 'Fatigue', 'Lump', 'Nausea', 'Pain', 'Swelling', 'Vomiting', 'Weight Loss']
['Factory Worker', 'Farmer', 'Office Worker', 'Retired', 'Unemployed']
['Rural', 'Urba', 'Urban']
['NRCMS', 'Self-pay', 'UEBMI', 'URBMI']
['Breast', 'Cervical', 'Colorectal', 'Esophageal', 'Liver', 'Lung', 'Stomach']
['Carboplatin', 'Cisplatin', 'Cyclophosphamide', 'Docetaxel', 'Doxorubicin', 'Fluorouracil', 'Gemcitabine', 'Irinotecan', 'Leucovorin', 'Oxaliplatin', 'Paclitaxel', 'Sorafenib']


In [None]:
#for each unique item, we need to add a column with that name
def add_new_cols(df, col_list, prefix_str, corresponding_col_name_in_df):
  """Add one 0/1 indicator column per item in ``col_list``.

  For each item, the column ``prefix_str + item`` is 1 on rows whose
  comma-separated source cell contains that item (after stripping whitespace)
  and 0 otherwise. Rows with a missing source value get 0 everywhere.

  The columns are written onto ``df`` itself, which is also returned.

  Args:
    df: DataFrame to extend.
    col_list: item names to create indicator columns for.
    prefix_str: text prepended to each item name to form the column name.
    corresponding_col_name_in_df: column holding the comma-separated text.

  Returns:
    ``df``, with the indicator columns added.
  """
  # Split every row once, rather than once per output column.
  items_per_row = [
      set() if pd.isna(cell) else {piece.strip() for piece in str(cell).split(',')}
      for cell in df[corresponding_col_name_in_df]
  ]
  for col_name in col_list:
    df[prefix_str + col_name] = [1 if col_name in items else 0 for items in items_per_row]
  return df

# Expand each multi-valued / nominal column of the training split into
# indicator columns, one source column at a time.
train_df_add_symptoms = add_new_cols(
    df=train_df_ordinal_order, col_list=unique_symptoms,
    prefix_str="Symptom_", corresponding_col_name_in_df="Symptoms")
train_df_add_occupations = add_new_cols(
    df=train_df_add_symptoms, col_list=unique_occupations,
    prefix_str="Occupation_", corresponding_col_name_in_df="Occupation")
train_df_add_urban_rural = add_new_cols(
    df=train_df_add_occupations, col_list=unique_urban_rural,
    prefix_str="U/R_", corresponding_col_name_in_df="Urban_Rural")
train_df_add_insurance_types = add_new_cols(
    df=train_df_add_urban_rural, col_list=unique_insurance_types,
    prefix_str="InsuranceType_", corresponding_col_name_in_df="Insurance_Type")
train_df_add_cancer_types = add_new_cols(
    df=train_df_add_insurance_types, col_list=unique_cancer_types,
    prefix_str="CancerType_", corresponding_col_name_in_df="Cancer_Type")
train_df_add_drugs = add_new_cols(
    df=train_df_add_cancer_types, col_list=unique_chemo_drugs,
    prefix_str="Drug_", corresponding_col_name_in_df="Chemotherapy_Drugs")

# Lift the column display limit so the wide preview is not truncated.
pd.set_option('display.max_columns', None)
#Show the first 5 rows
display(train_df_add_drugs.head())

Unnamed: 0,id,Birth_Date,Weight,Height,Urban_Rural,Occupation,Insurance_Type,Family_History,Cancer_Type,Stage_at_Diagnosis,Diagnosis_Date,Symptoms,Tumor_Size,Surgery_Date,Chemotherapy_Drugs,Radiation_Sessions,Immunotherapy,Targeted_Therapy,Recurrence_Status,Smoking_History,Alcohol_Use,label,Had_Surgery,Had_Chemo,Symptom_Blood in Stool,Symptom_Cough,Symptom_Fatigue,Symptom_Lump,Symptom_Nausea,Symptom_Pain,Symptom_Swelling,Symptom_Vomiting,Symptom_Weight Loss,Occupation_Factory Worker,Occupation_Farmer,Occupation_Office Worker,Occupation_Retired,Occupation_Unemployed,U/R_Rural,U/R_Urba,U/R_Urban,InsuranceType_NRCMS,InsuranceType_Self-pay,InsuranceType_UEBMI,InsuranceType_URBMI,CancerType_Breast,CancerType_Cervical,CancerType_Colorectal,CancerType_Esophageal,CancerType_Liver,CancerType_Lung,CancerType_Stomach,Drug_Carboplatin,Drug_Cisplatin,Drug_Cyclophosphamide,Drug_Docetaxel,Drug_Doxorubicin,Drug_Fluorouracil,Drug_Gemcitabine,Drug_Irinotecan,Drug_Leucovorin,Drug_Oxaliplatin,Drug_Paclitaxel,Drug_Sorafenib
0,1,1994-07-01,64.9,155.0,Urban,Unemployed,UEBMI,0.0,Breast,2.0,2020-02-10,"Cough, Weight Loss",8.0,2024-10-19,"Paclitaxel,Docetaxel,Doxorubicin",16.0,0.0,1.0,0.0,0.0,2.0,1.0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0
1,2,1992-07-16,61.4,171.0,Urban,Factory Worker,UEBMI,1.0,Breast,1.0,2014-08-17,Blood in Stool,10.0,2021-02-28,"Cyclophosphamide,Paclitaxel,Doxorubicin,Docetaxel",10.0,0.0,0.0,1.0,1.0,2.0,1.0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0
2,3,1948-06-23,60.7,170.0,Rural,Unemployed,NRCMS,0.0,Stomach,4.0,2014-09-25,"Nausea, Vomiting",13.0,2022-09-25,"Fluorouracil,Cisplatin",21.0,1.0,0.0,0.0,1.0,0.0,0.0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0
3,4,1954-11-26,70.2,171.0,Urban,Farmer,URBMI,1.0,Cervical,4.0,2021-01-04,"Nausea, Vomiting",3.0,2024-09-13,Cisplatin,10.0,0.0,1.0,0.0,0.0,2.0,1.0,1,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,1979-07-08,100.3,186.0,Rural,Office Worker,Self-pay,1.0,Lung,2.0,2019-07-26,"Cough, Weight Loss",12.0,2023-12-08,"Gemcitabine,Carboplatin",6.0,1.0,0.0,1.0,1.0,0.0,0.0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0


In [None]:
#Do the same for the test set

# Build the test-split indicator columns from the vocabularies found in the
# training split, so both splits end up with the same column names.
test_df_add_symptoms = add_new_cols(
    df=test_df_ordinal_order, col_list=unique_symptoms,
    prefix_str="Symptom_", corresponding_col_name_in_df="Symptoms")
test_df_add_occupations = add_new_cols(
    df=test_df_add_symptoms, col_list=unique_occupations,
    prefix_str="Occupation_", corresponding_col_name_in_df="Occupation")
test_df_add_urban_rural = add_new_cols(
    df=test_df_add_occupations, col_list=unique_urban_rural,
    prefix_str="U/R_", corresponding_col_name_in_df="Urban_Rural")
test_df_add_insurance_types = add_new_cols(
    df=test_df_add_urban_rural, col_list=unique_insurance_types,
    prefix_str="InsuranceType_", corresponding_col_name_in_df="Insurance_Type")
test_df_add_cancer_types = add_new_cols(
    df=test_df_add_insurance_types, col_list=unique_cancer_types,
    prefix_str="CancerType_", corresponding_col_name_in_df="Cancer_Type")
test_df_add_drugs = add_new_cols(
    df=test_df_add_cancer_types, col_list=unique_chemo_drugs,
    prefix_str="Drug_", corresponding_col_name_in_df="Chemotherapy_Drugs")

# Lift the column display limit so the wide preview is not truncated.
pd.set_option('display.max_columns', None)
#Show the first 5 rows
display(test_df_add_drugs.head())

Unnamed: 0,id,Birth_Date,Weight,Height,Urban_Rural,Occupation,Insurance_Type,Family_History,Cancer_Type,Stage_at_Diagnosis,Diagnosis_Date,Symptoms,Tumor_Size,Surgery_Date,Chemotherapy_Drugs,Radiation_Sessions,Immunotherapy,Targeted_Therapy,Recurrence_Status,Smoking_History,Alcohol_Use,Had_Surgery,Had_Chemo,Symptom_Blood in Stool,Symptom_Cough,Symptom_Fatigue,Symptom_Lump,Symptom_Nausea,Symptom_Pain,Symptom_Swelling,Symptom_Vomiting,Symptom_Weight Loss,Occupation_Factory Worker,Occupation_Farmer,Occupation_Office Worker,Occupation_Retired,Occupation_Unemployed,U/R_Rural,U/R_Urba,U/R_Urban,InsuranceType_NRCMS,InsuranceType_Self-pay,InsuranceType_UEBMI,InsuranceType_URBMI,CancerType_Breast,CancerType_Cervical,CancerType_Colorectal,CancerType_Esophageal,CancerType_Liver,CancerType_Lung,CancerType_Stomach,Drug_Carboplatin,Drug_Cisplatin,Drug_Cyclophosphamide,Drug_Docetaxel,Drug_Doxorubicin,Drug_Fluorouracil,Drug_Gemcitabine,Drug_Irinotecan,Drug_Leucovorin,Drug_Oxaliplatin,Drug_Paclitaxel,Drug_Sorafenib
0,1,1973-11-08,73.7,179.0,Rural,Office Worker,UEBMI,0,Liver,3,2018-12-16,"Cough, Weight Loss",8.0,2013-09-29,"Doxorubicin,Sorafenib",18,1,0,0,1,1,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,2,1949-02-16,33.2,154.0,Urban,Retired,Self-pay,1,Lung,2,2017-03-24,"Lump, Swelling",10.0,2014-08-29,"Gemcitabine,Carboplatin",10,1,0,1,2,1,1,1,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0
2,3,1970-05-08,83.9,191.0,Rural,Office Worker,UEBMI,0,Colorectal,2,2013-02-05,"Cough, Weight Loss",14.0,2019-05-07,"Irinotecan,Oxaliplatin",1,0,0,1,2,2,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
3,4,1960-04-01,47.3,172.0,Rural,Unemployed,Self-pay,1,Cervical,1,2019-07-01,Blood in Stool,9.0,2023-01-21,"Cisplatin,Paclitaxel",14,0,1,0,2,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
4,5,1973-07-27,54.1,190.0,Urban,Office Worker,NRCMS,1,Breast,3,2019-02-09,"Nausea, Vomiting",2.0,2014-05-07,"Cyclophosphamide,Docetaxel,Doxorubicin",16,0,0,1,0,2,1,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0


## Add new features

In [None]:
def add_new_features(df):
  """Add engineered numeric features to ``df`` and return it.

  Columns written onto ``df``:
    Age_at_Diagnosis:   whole 365-day years between 'Birth_Date' and 'Diagnosis_Date'.
    Days_To_Surgery:    days from 'Diagnosis_Date' to 'Surgery_Date'
                        (NaN when either date is missing).
    Number_of_Symptoms: count of comma-separated entries in 'Symptoms' (0 if missing).
    Number_of_Drugs:    count of comma-separated entries in 'Chemotherapy_Drugs'
                        (0 if missing).
    Height_m:           'Height' divided by 100.
    BMI:                'Weight' / Height_m ** 2.

  Args:
    df: DataFrame whose three date columns are already datetime and whose
      'Height' and 'Weight' columns are numeric.

  Returns:
    ``df``, with the columns above added.
  """
  def count_items(text):
    # An empty string stands for a missing cell and counts as zero items.
    return len(text.split(',')) if text else 0

  #Age at diagnosis (in years)
  # NOTE(review): dividing by 365 ignores leap days, so the result can be one
  # year high within a few weeks before a birthday — confirm that is acceptable.
  df['Age_at_Diagnosis'] = (df['Diagnosis_Date'] - df['Birth_Date']).dt.days // 365

  #Days between diagnosis and surgery
  df['Days_To_Surgery'] = (df['Surgery_Date'] - df['Diagnosis_Date']).dt.days

  #Number of symptoms
  df['Number_of_Symptoms'] = df['Symptoms'].fillna('').apply(count_items)

  #Number of drugs
  df['Number_of_Drugs'] = df['Chemotherapy_Drugs'].fillna('').apply(count_items)

  #Calculate BMI
  df['Height_m'] = df['Height'] / 100
  df['BMI'] = df['Weight'] / (df['Height_m'] ** 2)

  return df

# Derive the engineered features for both splits.
train_df_with_new_features = add_new_features(df=train_df_add_drugs)
test_df_with_new_features = add_new_features(df=test_df_add_drugs)

#See the preview
# Only the engineered columns are shown.
display(
    train_df_with_new_features.head()[
        ['Age_at_Diagnosis', 'Days_To_Surgery', 'Number_of_Symptoms', 'BMI', 'Number_of_Drugs']
    ]
)



Unnamed: 0,Age_at_Diagnosis,Days_To_Surgery,Number_of_Symptoms,BMI,Number_of_Drugs
0,25.0,1713.0,2,27.013528,3
1,22.0,2387.0,1,20.997914,4
2,66.0,2922.0,2,21.00346,2
3,66.0,1348.0,2,24.007387,1
4,40.0,1596.0,2,28.991791,2


## Remove unnecessary columns

In [None]:
# Raw columns removed before modelling: the row identifier, the text columns
# that were expanded into indicator columns, and the date columns that were
# used to derive Age_at_Diagnosis / Days_To_Surgery.
# The list is defined once and shared, so the train and test feature sets
# cannot drift apart.
columns_to_drop = [
    'id',
    'Urban_Rural', 'Occupation', 'Insurance_Type', 'Cancer_Type',
    'Symptoms', 'Chemotherapy_Drugs',
    'Birth_Date', 'Diagnosis_Date', 'Surgery_Date',
]

print(train_df_with_new_features.shape)
final_train_df = train_df_with_new_features.drop(columns=columns_to_drop)
final_test_df = test_df_with_new_features.drop(columns=columns_to_drop)

print(final_train_df.shape)
print(final_test_df.shape)

pd.set_option('display.max_columns', None)
#Show the first 5 rows
display(final_train_df.head())

(19405, 70)
(19405, 60)
(4467, 59)


Unnamed: 0,Weight,Height,Family_History,Stage_at_Diagnosis,Tumor_Size,Radiation_Sessions,Immunotherapy,Targeted_Therapy,Recurrence_Status,Smoking_History,Alcohol_Use,label,Had_Surgery,Had_Chemo,Symptom_Blood in Stool,Symptom_Cough,Symptom_Fatigue,Symptom_Lump,Symptom_Nausea,Symptom_Pain,Symptom_Swelling,Symptom_Vomiting,Symptom_Weight Loss,Occupation_Factory Worker,Occupation_Farmer,Occupation_Office Worker,Occupation_Retired,Occupation_Unemployed,U/R_Rural,U/R_Urba,U/R_Urban,InsuranceType_NRCMS,InsuranceType_Self-pay,InsuranceType_UEBMI,InsuranceType_URBMI,CancerType_Breast,CancerType_Cervical,CancerType_Colorectal,CancerType_Esophageal,CancerType_Liver,CancerType_Lung,CancerType_Stomach,Drug_Carboplatin,Drug_Cisplatin,Drug_Cyclophosphamide,Drug_Docetaxel,Drug_Doxorubicin,Drug_Fluorouracil,Drug_Gemcitabine,Drug_Irinotecan,Drug_Leucovorin,Drug_Oxaliplatin,Drug_Paclitaxel,Drug_Sorafenib,Age_at_Diagnosis,Days_To_Surgery,Number_of_Symptoms,Number_of_Drugs,Height_m,BMI
0,64.9,155.0,0.0,2.0,8.0,16.0,0.0,1.0,0.0,0.0,2.0,1.0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,25.0,1713.0,2,3,1.55,27.013528
1,61.4,171.0,1.0,1.0,10.0,10.0,0.0,0.0,1.0,1.0,2.0,1.0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,22.0,2387.0,1,4,1.71,20.997914
2,60.7,170.0,0.0,4.0,13.0,21.0,1.0,0.0,0.0,1.0,0.0,0.0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,66.0,2922.0,2,2,1.7,21.00346
3,70.2,171.0,1.0,4.0,3.0,10.0,0.0,1.0,0.0,0.0,2.0,1.0,1,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,66.0,1348.0,2,1,1.71,24.007387
4,100.3,186.0,1.0,2.0,12.0,6.0,1.0,0.0,1.0,1.0,0.0,0.0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,40.0,1596.0,2,2,1.86,28.991791


## Move the label column in the train dataframe to be the last column

In [None]:
# Every column except the target, in its current order ...
cols = [name for name in final_train_df.columns if name != 'label']
# ... followed by the target, so 'label' ends up as the last column.
final_train_df = final_train_df.loc[:, cols + ['label']]

display(final_train_df.head())

Unnamed: 0,Weight,Height,Family_History,Stage_at_Diagnosis,Tumor_Size,Radiation_Sessions,Immunotherapy,Targeted_Therapy,Recurrence_Status,Smoking_History,Alcohol_Use,Had_Surgery,Had_Chemo,Symptom_Blood in Stool,Symptom_Cough,Symptom_Fatigue,Symptom_Lump,Symptom_Nausea,Symptom_Pain,Symptom_Swelling,Symptom_Vomiting,Symptom_Weight Loss,Occupation_Factory Worker,Occupation_Farmer,Occupation_Office Worker,Occupation_Retired,Occupation_Unemployed,U/R_Rural,U/R_Urba,U/R_Urban,InsuranceType_NRCMS,InsuranceType_Self-pay,InsuranceType_UEBMI,InsuranceType_URBMI,CancerType_Breast,CancerType_Cervical,CancerType_Colorectal,CancerType_Esophageal,CancerType_Liver,CancerType_Lung,CancerType_Stomach,Drug_Carboplatin,Drug_Cisplatin,Drug_Cyclophosphamide,Drug_Docetaxel,Drug_Doxorubicin,Drug_Fluorouracil,Drug_Gemcitabine,Drug_Irinotecan,Drug_Leucovorin,Drug_Oxaliplatin,Drug_Paclitaxel,Drug_Sorafenib,Age_at_Diagnosis,Days_To_Surgery,Number_of_Symptoms,Number_of_Drugs,Height_m,BMI,label
0,64.9,155.0,0.0,2.0,8.0,16.0,0.0,1.0,0.0,0.0,2.0,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,25.0,1713.0,2,3,1.55,27.013528,1.0
1,61.4,171.0,1.0,1.0,10.0,10.0,0.0,0.0,1.0,1.0,2.0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,1,0,22.0,2387.0,1,4,1.71,20.997914,1.0
2,60.7,170.0,0.0,4.0,13.0,21.0,1.0,0.0,0.0,1.0,0.0,1,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,66.0,2922.0,2,2,1.7,21.00346,0.0
3,70.2,171.0,1.0,4.0,3.0,10.0,0.0,1.0,0.0,0.0,2.0,1,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,66.0,1348.0,2,1,1.71,24.007387,1.0
4,100.3,186.0,1.0,2.0,12.0,6.0,1.0,0.0,1.0,1.0,0.0,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,40.0,1596.0,2,2,1.86,28.991791,0.0


# **Building and Evaluating Classification Models**

## Split the data

In [None]:
from sklearn.model_selection import train_test_split

# Keep only rows with no missing value in any column.
# NOTE(review): Days_To_Surgery is NaN wherever Surgery_Date is missing, so
# this also removes every row without a surgery date — confirm that is intended.
train_data_cleaned = final_train_df.dropna(axis=0, how='any')

# Features are every column except the target.
X, y = train_data_cleaned.drop(columns='label'), train_data_cleaned['label']

# 80 / 20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)


## Train multiple classifiers

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Candidate classifiers, keyed by the name printed in the results.
# Only scale for models that need it
models = {
    'SVM': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', SVC())
    ]),
    'Logistic Regression': Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(max_iter=1000))
    ]),
    'Random Forest': RandomForestClassifier(random_state=42),  # No scaling
    # use_label_encoder was removed: the installed XGBoost does not use it and
    # warned 'Parameters: { "use_label_encoder" } are not used.' on every fit.
    'XGBoost': XGBClassifier(eval_metric='logloss')  # No scaling
}


# Fit every candidate on the training split and report weighted metrics on
# the validation split.
for name, model in models.items():
    print(f"\n===== {name} =====")
    # fit() returns the fitted estimator, so prediction can be chained onto it.
    preds = model.fit(X_train, y_train).predict(X_val)

    acc = accuracy_score(y_val, preds)
    prec = precision_score(
        y_val, preds, average='weighted', zero_division=0
    )
    rec = recall_score(
        y_val, preds, average='weighted', zero_division=0
    )
    f1 = f1_score(
        y_val, preds, average='weighted', zero_division=0
    )

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")


===== SVM =====
Accuracy:  0.7631
Precision: 0.7633
Recall:    0.7631
F1 Score:  0.7632

===== Logistic Regression =====
Accuracy:  0.7014
Precision: 0.7014
Recall:    0.7014
F1 Score:  0.7014

===== Random Forest =====
Accuracy:  0.8763
Precision: 0.8763
Recall:    0.8763
F1 Score:  0.8763

===== XGBoost =====


Parameters: { "use_label_encoder" } are not used.



Accuracy:  0.8885
Precision: 0.8888
Recall:    0.8885
F1 Score:  0.8885


In [None]:
def create_best_model(X_train, y_train, X_eval, y_eval,
                      n_estimators_grid=range(50, 400, 25),
                      max_depth_grid=range(6, 20, 2)):
    """Grid-search a RandomForestClassifier and return the best settings.

    A forest (``random_state=42``) is fitted on the training data for every
    (n_estimators, max_depth) pair and scored by accuracy on the evaluation
    data. The first pair with the highest accuracy wins.

    Args:
        X_train, y_train: data each candidate forest is fitted on.
        X_eval, y_eval: data each candidate forest is scored on.
        n_estimators_grid: tree counts to try (default: 50, 75, ..., 375).
        max_depth_grid: maximum depths to try (default: 6, 8, ..., 18).

    Returns:
        ``[best_n_estimators, best_max_depth]``; ``[0, 0]`` if a grid is empty.
    """
    best_i = 0
    best_d = 0
    # Start below any possible accuracy so the first candidate is always kept,
    # even if it scores 0.
    best_acc_Score = -1.0
    for i in n_estimators_grid:
        for d in max_depth_grid:
            forest = RandomForestClassifier(
                n_estimators=i,
                max_depth=d,
                random_state=42,
            )
            forest.fit(X_train, y_train)
            acc = accuracy_score(y_eval, forest.predict(X_eval))
            if acc > best_acc_Score:
                best_acc_Score = acc
                best_i = i
                best_d = d
            # The value is an accuracy; the original label said "mse".
            print("i,d= ", i, d, "accuracy = ", acc)
    print("best result", best_i, best_d, best_acc_Score)
    return [best_i, best_d]

In [None]:
# Search on the training split and score each candidate on the validation split.
best_i, best_d = create_best_model(
    X_train=X_train,
    y_train=y_train,
    X_eval=X_val,
    y_eval=y_val,
)

i,d=  50 2 mse =  0.8504309146510981
i,d=  50 4 mse =  0.8773978315262719
i,d=  50 6 mse =  0.8879621907144843
i,d=  50 8 mse =  0.8929663608562691
i,d=  50 10 mse =  0.8932443703085905
i,d=  50 12 mse =  0.8904642757853767
i,d=  50 14 mse =  0.8876841812621629
i,d=  75 2 mse =  0.8546010564359188
i,d=  75 4 mse =  0.8843480678343064
i,d=  75 6 mse =  0.8932443703085905
i,d=  75 8 mse =  0.8938003892132332
i,d=  75 10 mse =  0.8915763135946623
i,d=  75 12 mse =  0.8904642757853767
i,d=  75 14 mse =  0.8899082568807339
i,d=  100 2 mse =  0.8609952738393105
i,d=  100 4 mse =  0.886294134000556
i,d=  100 6 mse =  0.8938003892132332
i,d=  100 8 mse =  0.892132332499305
i,d=  100 10 mse =  0.8899082568807339
i,d=  100 12 mse =  0.8901862663330553
i,d=  100 14 mse =  0.8879621907144843
i,d=  125 2 mse =  0.8668334723380595
i,d=  125 4 mse =  0.8887962190714485
i,d=  125 6 mse =  0.890742285237698
i,d=  125 8 mse =  0.8885182096191271
i,d=  125 10 mse =  0.8904642757853767
i,d=  125 12 mse = 

# Train the Final Selected model on the whole dataset

In [None]:
# Fit the final model on every cleaned training row (train + validation).
# X and y are passed directly instead of rebinding X_train / y_train, so the
# hold-out split stays intact if the evaluation cells above are re-run.
# NOTE(review): best_i / best_d were selected by create_best_model using a
# RandomForestClassifier but are applied to an XGBClassifier here — confirm
# that this is intended.
xgb = XGBClassifier(
    n_estimators=best_i,
    learning_rate=0.1,
    max_depth=best_d,
    random_state=42,
    verbosity=1
)
xgb.fit(X, y)



In [None]:
# The preprocessed test features are in `final_test_df`; the matching row ids
# are still available in `test_df_with_new_features`, from which it was derived.

# Step 1: Predict labels for test data
xgb_preds = xgb.predict(final_test_df)

# Step 2: Create the output DataFrame
# Use the ids that came with the test rows rather than assuming they are
# 1..N in file order; the two frames have the same rows in the same order.
assert len(xgb_preds) == len(test_df_with_new_features)
submission_df = pd.DataFrame({
    'id': test_df_with_new_features['id'].to_numpy(),
    'Label': xgb_preds
})

# Step 3 (optional): Preview the output
print(submission_df.head())

# Step 4: Save to CSV if needed
submission_df.to_csv("xgboost_submission.csv", index=False)


   id  Label
0   1      1
1   2      0
2   3      0
3   4      0
4   5      1
