In [1]:
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline 
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the lung-cancer dataset from the CSV file (path is relative to the
# notebook's working directory).
df = pd.read_csv(filepath_or_buffer='lung_cancer_data.csv')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Patient_ID,Age,Gender,Smoking_History,Tumor_Size_mm,Tumor_Location,Stage,Treatment,Survival_Months,Ethnicity,...,Alanine_Aminotransferase_Level,Aspartate_Aminotransferase_Level,Creatinine_Level,LDH_Level,Calcium_Level,Phosphorus_Level,Glucose_Level,Potassium_Level,Sodium_Level,Smoking_Pack_Years
0,Patient0000,68,Male,Current Smoker,81.678677,Lower Lobe,Stage III,Surgery,44,Hispanic,...,27.985571,46.801214,1.245849,239.240255,10.366307,3.547734,113.919243,4.968163,139.822861,17.006956
1,Patient0001,58,Male,Never Smoked,78.448272,Lower Lobe,Stage I,Radiation Therapy,101,Caucasian,...,30.120956,39.711531,1.463231,233.515237,10.081731,2.94502,101.321578,3.896795,135.449361,93.270893
2,Patient0002,44,Male,Former Smoker,67.714305,Lower Lobe,Stage I,Chemotherapy,69,African American,...,5.882418,32.640602,0.630109,169.03746,8.660892,4.637399,78.214177,4.36905,143.377155,70.348376
3,Patient0003,72,Male,Current Smoker,70.806008,Lower Lobe,Stage III,Chemotherapy,95,African American,...,38.908154,44.319393,0.594342,213.96759,8.832669,3.617098,127.895361,4.348474,138.586005,19.828128
4,Patient0004,37,Female,Never Smoked,87.272433,Lower Lobe,Stage IV,Radiation Therapy,105,Asian,...,26.344877,15.746906,1.478239,118.187543,9.247609,4.773255,148.801185,3.671976,141.230724,81.047456


In [4]:
# Columns removed from the frame before encoding and modelling.
COLUMNS_TO_DROP = ['Ethnicity', 'Insurance_Type', 'Gender', 'Patient_ID']

# Drop everything in one non-in-place call instead of four separate in-place
# passes. `errors="ignore"` skips columns that are already gone, so re-running
# this cell is a no-op rather than a KeyError, and a failure can no longer
# leave `df` with only some of the columns removed.
df = df.drop(columns=COLUMNS_TO_DROP, errors="ignore")

In [5]:
# Columns that are label-encoded below.
Categorical = [
    'Smoking_History',
    'Tumor_Location',
    'Stage',
    'Treatment',
    'Family_History',
    'Comorbidity_Diabetes',
    'Comorbidity_Hypertension',
    'Comorbidity_Heart_Disease',
    'Comorbidity_Chronic_Lung_Disease',
    'Comorbidity_Kidney_Disease',
    'Comorbidity_Autoimmune_Disease',
    'Comorbidity_Other',
]

# Integer label encoding is used here (rather than one-hot encoding). Each
# column gets its own LabelEncoder, stored in `label_encoders` under the
# column name so the fitted mapping remains available after the column in
# `df` has been overwritten with integer codes.
label_encoders = {name: LabelEncoder() for name in Categorical}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

In [6]:
df.head()

Unnamed: 0,Age,Smoking_History,Tumor_Size_mm,Tumor_Location,Stage,Treatment,Survival_Months,Family_History,Comorbidity_Diabetes,Comorbidity_Hypertension,...,Alanine_Aminotransferase_Level,Aspartate_Aminotransferase_Level,Creatinine_Level,LDH_Level,Calcium_Level,Phosphorus_Level,Glucose_Level,Potassium_Level,Sodium_Level,Smoking_Pack_Years
0,68,0,81.678677,0,2,2,44,0,1,1,...,27.985571,46.801214,1.245849,239.240255,10.366307,3.547734,113.919243,4.968163,139.822861,17.006956
1,58,2,78.448272,0,0,1,101,1,1,1,...,30.120956,39.711531,1.463231,233.515237,10.081731,2.94502,101.321578,3.896795,135.449361,93.270893
2,44,1,67.714305,0,0,0,69,1,0,0,...,5.882418,32.640602,0.630109,169.03746,8.660892,4.637399,78.214177,4.36905,143.377155,70.348376
3,72,0,70.806008,0,2,0,95,1,1,0,...,38.908154,44.319393,0.594342,213.96759,8.832669,3.617098,127.895361,4.348474,138.586005,19.828128
4,37,2,87.272433,0,3,1,105,0,1,1,...,26.344877,15.746906,1.478239,118.187543,9.247609,4.773255,148.801185,3.671976,141.230724,81.047456


In [7]:
# Separate the prediction target ('Stage') from the remaining feature columns.
y_target = df.loc[:, 'Stage']
X_features = df.drop(columns='Stage')

In [8]:
# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable across runs.
# NOTE(review): the split is not stratified on the target — consider
# `stratify=y_target` if the stage classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_target,
    random_state=42,
    test_size=0.2,
)

In [9]:
# Random forest with 100 trees; the fixed seed keeps training reproducible.
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)


In [10]:
# Recursive feature elimination driven by the random forest: one feature is
# removed per iteration (step=1) until five features remain.
rfe = RFE(
    estimator=model,
    step=1,
    n_features_to_select=5,
)

In [11]:
# Chain the two stages: RFE narrows the feature set first, then the random
# forest is trained on the selected columns.
pipeline = Pipeline(
    steps=[('feature_selection', rfe), ('classifier', model)]
)

In [12]:
# Train feature selection and the classifier together on the training split.
pipeline.fit(X=X_train, y=y_train)