In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import plotly.express as px
from sklearn.model_selection import KFold, cross_val_score



In [2]:
# Load the colorectal-cancer patient table from a CSV file located via a relative path.
data = pd.read_csv("Colorectal_Cancer_copy.csv")

In [13]:
# Quick size check of the loaded frame, shown as (rows, columns).
data.shape

(167497, 28)

In [3]:
# Reuse the frame already loaded into `data` instead of importing pandas again
# and parsing the same CSV a second time. The copy keeps this preview frame
# independent of `data`, exactly as the separate read did.
Colorectal_Cancer = data.copy()
Colorectal_Cancer

Unnamed: 0,Patient_ID,Country,Age,Gender,Cancer_Stage,Tumor_Size_mm,Family_History,Smoking_History,Alcohol_Consumption,Obesity_BMI,...,Survival_5_years,Mortality,Healthcare_Costs,Incidence_Rate_per_100K,Mortality_Rate_per_100K,Urban_or_Rural,Economic_Classification,Healthcare_Access,Insurance_Status,Survival_Prediction
0,1,UK,77,M,Localized,69,No,No,Yes,Overweight,...,Yes,No,54413,50,5,Urban,Developed,Moderate,Insured,Yes
1,2,UK,59,M,Localized,33,No,No,No,Overweight,...,Yes,No,76553,37,25,Urban,Developing,High,Uninsured,Yes
2,3,Japan,66,M,Regional,17,No,Yes,No,Normal,...,Yes,No,62805,54,27,Urban,Developed,Moderate,Uninsured,No
3,4,USA,83,M,Regional,14,No,No,No,Obese,...,Yes,No,89393,45,11,Urban,Developed,Moderate,Insured,Yes
4,5,France,66,M,Localized,34,No,Yes,No,Normal,...,Yes,No,66425,15,27,Urban,Developing,High,Insured,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
167492,167493,USA,69,M,Localized,49,No,Yes,No,Overweight,...,No,Yes,100924,57,13,Rural,Developed,Moderate,Insured,Yes
167493,167494,USA,79,F,Regional,29,Yes,Yes,Yes,Overweight,...,Yes,Yes,90331,39,22,Urban,Developed,Low,Insured,Yes
167494,167495,USA,74,M,Metastatic,62,Yes,Yes,Yes,Normal,...,Yes,Yes,90631,13,19,Urban,Developing,Low,Insured,Yes
167495,167496,UK,68,F,Localized,35,No,Yes,Yes,Normal,...,Yes,No,114385,18,23,Urban,Developed,Moderate,Insured,Yes


In [4]:
# Class balance of the target label; the title lets the figure stand on its own.
x = data['Survival_Prediction']
px.histogram(x, title='Distribution of Survival_Prediction')

In [5]:
# Frequency of each BMI category; the title lets the figure stand on its own.
x = data['Obesity_BMI']
px.histogram(x, title='Distribution of Obesity_BMI')

In [6]:
# Frequency of each smoking-history value; the title lets the figure stand on its own.
x = data['Smoking_History']
px.histogram(x, title='Distribution of Smoking_History')

In [7]:
# Label the model should learn, and the predictors it is allowed to see.
target_column = 'Survival_Prediction'
feature_columns = ['Age', 'Tumor_Size_mm', 'Gender', 'Obesity_BMI', 'Smoking_History']

# Keep only the modelling columns and discard every row that has a missing value.
# Column selection followed by dropna() yields a new frame, so `data` is untouched.
Model_Dataframe = data[[*feature_columns, target_column]].dropna()

In [8]:
# Column groups, split by the preprocessing each one receives.
numerical_features = ['Age', 'Tumor_Size_mm']
categorical_features = ['Gender', 'Obesity_BMI', 'Smoking_History']

# Predictor matrix and label vector taken from the cleaned modelling frame.
X = Model_Dataframe[feature_columns]
y = Model_Dataframe[target_column]

# Standardise the numeric columns; one-hot encode the categorical ones with the
# first level of each dropped and unknown categories ignored.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        (
            'cat',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            categorical_features,
        ),
    ]
)

In [9]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=32,
)

In [10]:
# Chain the preprocessing step and a class-weighted random forest into a single
# estimator so both are fitted on the training rows together.
rf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'classifier',
            RandomForestClassifier(class_weight='balanced', random_state=30),
        ),
    ]
)

rf.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [11]:
# The pipeline is already built and fitted (same steps, same seeds, same training
# rows) in the preceding cell, so retraining here would only repeat that work.
# Display the fitted estimator instead.
rf

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [12]:
# Hold-out evaluation: predict labels for the test split with the fitted pipeline.
predictions = rf.predict(X_test)

# 5-fold cross-validation over the full X / y, shuffled with a fixed seed so the
# folds are repeatable.
kf = KFold(n_splits=5, shuffle=True, random_state=32)
score = cross_val_score(rf, X, y, cv=kf)

print(classification_report(y_test, predictions))
print("\n\n")
print(score)

# Interpolate the mean inside the f-string. The previous form passed a set
# literal as a second print() argument, which rendered as "{np.float64(...)}".
print(f"Mean Score is {np.mean(score)}")

              precision    recall  f1-score   support

          No       0.40      0.44      0.42     13468
         Yes       0.60      0.56      0.58     20032

    accuracy                           0.51     33500
   macro avg       0.50      0.50      0.50     33500
weighted avg       0.52      0.51      0.52     33500




[0.51310448 0.5118806  0.51419445 0.51514971 0.51434371]
Mean Score is  {np.float64(0.5137345891813211)}
