## Breast Cancer prediction:
The dataset is large, so we preprocess it (cleaning, imputing missing values, encoding and scaling) before training any models on it.
We will start by importing some libraries:

In [222]:
#importing the necessary libraries
import pandas as pd
import numpy as np


### Step 1: Read data and check missing values...

In [None]:
# Load the raw CSV, preview it, then report its shape, duplicated rows and per-column missing counts
data = pd.read_csv("data.csv")
print(data.head())

for caption, summary in (
    ("\nData Shape: ", data.shape),
    ("\nNumber of duplicated rows: ", data.duplicated().sum()),
    ("\nNumber of Missing Values per column: ", data.isna().sum()),
):
    print(caption, summary)


### Step 2: Drop the unnecessary columns and outliers

In [224]:
# Fill in the cause of death for the single record where it is missing (patient 5130)
is_patient_5130 = data['patient_id'] == 5130
data.loc[is_patient_5130, 'death_from_cancer'] = 'Died of Disease'

# Remove the columns that are not used in the rest of the notebook
irrelevant_columns = [
    "er_status_measured_by_ihc",
    "radio_therapy",
    "primary_tumor_laterality",
    "chemotherapy",
    "patient_id",
    "3-gene_classifier_subtype",
    "cohort",
]
data = data.drop(columns=irrelevant_columns)

# Remove the rows treated as outliers: subtype 'NC' and tumor stage '0'.
# NOTE(review): tumor_stage is compared against the *string* '0'; if the column is
# numeric this comparison matches nothing and no row is dropped — confirm the dtype.
for column_name, unwanted_value in (('pam50_+_claudin-low_subtype', 'NC'), ('tumor_stage', '0')):
    rows_to_remove = data[data[column_name] == unwanted_value].index
    data = data.drop(index=rows_to_remove)


### Step 3: Complete the missing records

##### 1) Completing the missing **tumor_size** records

In [225]:
# Impute missing 'tumor_size' values with the mean size of the record's
# (cancer_type, pam50 subtype) group.
group_keys = ['cancer_type', 'pam50_+_claudin-low_subtype']

# Rows with a measured tumor size and the per-group mean computed from them
# (kept under their original names for inspection)
temp_data = data.dropna(subset=['tumor_size'])
grouped = temp_data.groupby(group_keys)['tumor_size']
avg_tumor_size = grouped.mean()

# transform('mean') broadcasts each group's mean back onto the rows of `data`, so the
# fill is a plain index-aligned fillna. Unlike a row-wise lookup into avg_tumor_size,
# it does not raise KeyError when a group has no measured size or a key is NaN;
# such rows simply stay NaN.
group_mean_size = data.groupby(group_keys)['tumor_size'].transform('mean')
data['tumor_size'] = data['tumor_size'].fillna(group_mean_size)


##### 2) Completing the missing **neoplasm_histologic_grade** records

In [226]:
# Impute missing 'neoplasm_histologic_grade' values by rearranging the relation used below:
#     nottingham_prognostic_index = 0.2 * tumor_size + lymph_node_stage + grade
#
# The estimate is computed on the rows whose grade is missing, and the original index
# is kept throughout so every estimate is written back to the row it was computed from.

# Rows where 'neoplasm_histologic_grade' is missing
grade_missing = data['neoplasm_histologic_grade'].isna()
grade_pred = data[grade_missing]

# Inputs of the formula for those rows; rows lacking any input cannot be estimated
Stage = data.loc[grade_missing, ["nottingham_prognostic_index", "tumor_size", "lymph_nodes_examined_positive"]]
Stage = Stage.dropna()

# Calculate the lymph node stage.
# NOTE(review): np.select takes the first matching condition, and the second condition
# already covers every row with positive nodes and 20 <= tumor_size <= 200, so stages 3
# and 4 are only reached when tumor_size > 200 — confirm these rules are intended.
conditions = [
    (Stage['lymph_nodes_examined_positive'] == 0) | ((Stage['lymph_nodes_examined_positive'] > 0) & (Stage['tumor_size'] <= 20)),
    (Stage['lymph_nodes_examined_positive'].between(1, 3, inclusive='both')) | ((Stage['lymph_nodes_examined_positive'] > 0) & (Stage['tumor_size'].between(20, 200, inclusive='both'))),
    Stage['lymph_nodes_examined_positive'].between(4, 9, inclusive='both'),
    Stage['lymph_nodes_examined_positive'] >= 10
]
choices = [1, 2, 3, 4]
Stage['lymph_node_stage'] = np.select(conditions, choices, default=0)

# Calculate the histological grade from the NPI.
# NOTE(review): the 20/200 thresholds above suggest tumor_size is in millimetres —
# confirm that the 0.2 factor is meant to be applied to that unit.
Stage['histological_grade'] = (Stage['nottingham_prognostic_index'] - (Stage['tumor_size'] * 0.2) - Stage['lymph_node_stage'])

# Truncate to an integer and ensure the histological grade is at least 1
Stage['histological_grade'] = Stage['histological_grade'].astype(int).clip(lower=1)

# Expand to the full index of `data`; rows without an estimate become NaN
Stage = Stage.reindex(data.index)

# Write the estimates back (index-aligned); rows that could not be estimated fall back to grade 1
data.loc[grade_missing, 'neoplasm_histologic_grade'] = Stage.loc[grade_missing, 'histological_grade'].fillna(1).astype(int)

##### 3) Completing the missing **tumor_stage** records

In [None]:
#We are building a decision tree classifier to predict the missing values in the tumor_stage column, which has around 395 missing values
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import numpy as np

# The three features used to predict tumor_stage
tumor_stage_features = ["tumor_size", "lymph_nodes_examined_positive", "neoplasm_histologic_grade"]

# Rows whose tumor_stage is missing and that have all three features available.
# reset_index() is applied twice on purpose: it adds the helper columns "index" and
# "level_0", which a later cell drops by name.
tumor_stage_pred = data[data['tumor_stage'].isnull()].reset_index()
tumor_stage_pred = tumor_stage_pred.dropna(subset=tumor_stage_features)
tumor_stage_pred = tumor_stage_pred.reset_index()

# Training table: rows where the label and all three features are known.
# Built by chaining instead of in-place calls on a slice of `data`.
Stage = data[["tumor_size","tumor_stage","lymph_nodes_examined_positive","neoplasm_histologic_grade"]].dropna().reset_index()
x = Stage.drop(columns="tumor_stage")   #features
x = pd.get_dummies(x)
x = x.drop(columns="index")
y = Stage["tumor_stage"]                #label which is tumor_stage

# Seeded so the split, and therefore every score reported below, is reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, shuffle = True, random_state=3)

#We will run gridsearch to find the best combination of hyperparameters
param_grid = {'max_depth':[2,5,10,20],
              'min_samples_split':[5,10,15],
              'criterion':['gini','entropy','log_loss']}

# Seeded so the fitted tree does not change between runs
model = DecisionTreeClassifier(random_state=3)

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

grid_search.fit(x_train, y_train)

best_params = grid_search.best_params_
print("Best tree Hyperparameters:", best_params)

best_score = grid_search.best_score_
print("Best Cross-Validation Score:", best_score)

best_model = grid_search.best_estimator_
print("Best Model:", best_model)

# Predictions of the refitted best estimator on the held-out split
y_pred = grid_search.predict(x_test)

def plot_learning_curves(model, x, y):
    """Plot training and cross-validation accuracy of ``model`` against training-set size.

    Scores come from ``learning_curve`` with 5-fold cross-validation at ten training
    sizes spread evenly between 10% and 100% of the data. Each curve shows the mean
    over folds with a shaded band of one standard deviation either side.
    """
    sizes, fit_scores, cv_scores = learning_curve(
        model, x, y,
        cv=5,
        scoring='accuracy',
        train_sizes=np.linspace(0.1, 1.0, 10),
    )

    # (mean over folds, std over folds, colour, legend label) for each curve
    curves = [
        (np.mean(fit_scores, axis=1), np.std(fit_scores, axis=1), "r", "Training score"),
        (np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1), "g", "Cross-validation score"),
    ]

    plt.figure(figsize=(10, 6))
    plt.title("Learning Curve")
    plt.xlabel("Training Size")
    plt.ylabel("Score")
    plt.grid()

    # Shaded bands first, then the mean lines on top
    for centre, spread, colour, _ in curves:
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)

    for centre, _, colour, caption in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=caption)

    plt.legend(loc="best")
    plt.show()


In [None]:
#Plotting training vs cross validation curves
# The tuned estimator is passed rather than the GridSearchCV wrapper: with the wrapper,
# learning_curve would repeat the whole grid search for every fold and training size.
# This also matches how the other model sections call plot_learning_curves.
plot_learning_curves(best_model, x_train, y_train)

#Best classifier to predict tumor_stage
print("\n",best_model)
y_pred = best_model.predict(x_test)
print("Tumor stage Predictor's accuracy: ", accuracy_score(y_test, y_pred))


In [None]:
#Checking the accuracy and the score of the model through different metrics
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Per-class scores on the held-out split (average=None gives one value per class)
tumor_f1 = f1_score(y_test, y_pred, average = None)
tumor_recall = recall_score(y_test, y_pred, average = None)
tumor_precision = precision_score(y_test, y_pred, average = None)

print("\nTest tumor F1_score:", tumor_f1)
print("\nTest tumor Recall:", tumor_recall)
print("\nTest tumor Precision:", tumor_precision)

# Confusion matrix of true vs predicted stages
print("\nTest tumor confusion matrix:", confusion_matrix(y_test, y_pred))

# Text report.
# NOTE(review): target_names hard-codes three display names; this presumably assumes
# exactly three stages occur in y_test / y_pred — confirm against the data.
print("\nTest tumor classification report: ", classification_report(y_test, y_pred, target_names=["1", "2", "3"]))


##### Here we use our classifier to predict the missing tumor_stage values for all rows of the dataset where that column was empty

In [None]:
#fill the null values of tumor stage with model predictions
# Feature matrix for the rows whose tumor_stage is unknown. dropna() returns a new
# frame, so nothing is modified in place on a slice of tumor_stage_pred.
X_tumor_test = tumor_stage_pred[["tumor_size","lymph_nodes_examined_positive","neoplasm_histologic_grade"]].dropna()

#print("\nmissing values after: ", X_tumor_test.isna().sum())
y_pred = best_model.predict(X_tumor_test)    #predictions for the rows that had a null tumor_stage

# Write each prediction to the row it was computed from. Assigning through the index of
# X_tumor_test stays correct even if dropna() above removed rows.
tumor_stage_pred.loc[X_tumor_test.index, "tumor_stage"] = y_pred

# Keep the rows whose stage was already known ...
data = data.dropna(subset=["tumor_stage"])

# ... and append the rows with predicted stages. ignore_index=True gives the combined
# frame a fresh unique index; keeping both indexes would leave duplicate labels, because
# tumor_stage_pred was re-indexed from 0.
data = pd.concat([data, tumor_stage_pred], ignore_index=True, sort=False)
print("Data shape: ", data.shape, "\nData: ",data.head())
print(data["tumor_stage"])

In [None]:
# Remove the two helper columns left behind by reset_index(), then report the shape
# and the total number of missing values that remain after predicting tumor_stage
data = data.drop(["level_0", "index"], axis=1)
print(data.shape)

total_missing = data.isna().sum().sum()
print("\nmissing values: ", total_missing)

#### Encoding categorical features and then filling null values with means.


In [None]:
#Using Label Encoder to encode categorical features in the data
from sklearn.preprocessing import LabelEncoder
# Collect every non-numeric column, except the target 'cancer_type', which is left as raw labels
categorical = data.select_dtypes(exclude="number")
categorical = categorical.drop(columns="cancer_type")


# Remove those columns from `data`; they are re-attached below once encoded
data = data.drop(columns=categorical)

# Cast every value to str before encoding.
# NOTE(review): presumably this also turns missing values into a literal string that
# gets its own label code, so they are no longer seen as missing afterwards — confirm.
categorical = categorical.astype(str)

# NOTE: `label_encoder` is not used by the next statement, which creates its own
# LabelEncoder and applies its fit_transform to each column in turn
label_encoder = LabelEncoder()
categorical = categorical.apply(LabelEncoder().fit_transform)

# Re-attach the encoded columns next to the numeric ones and print the result
data = pd.concat([data,categorical], axis=1)
print("full dataset with encoded values :", data)


In [None]:
# Replace every remaining missing value with the mean of its column.
# The target column is set aside first so that only feature columns are averaged.
print(data.isna().sum().sum())          # total missing values before filling

label = data["cancer_type"]
data = data.drop("cancer_type", axis=1)
print(data)

mean = data.mean()                      # per-column means
data = data.fillna(mean)
print(data.shape)
print(data.isna().sum().sum())          # total missing values after filling

# Put the target back as the first column and display the frame
data = pd.concat([label, data], axis=1)
data

In [234]:
# Sanity check: total number of missing values left in the whole frame
# (sum over columns of the per-column NaN counts)
print(data.isna().sum().sum())

0


### Normalizing the data with z-score

In [235]:
#Normalizing the data
from sklearn.preprocessing import StandardScaler

# Standardise every feature column to zero mean and unit standard deviation (z-score).
# The target column is set aside first and re-attached afterwards.
label = data["cancer_type"]
data = data.drop("cancer_type", axis=1)

# Each column is rescaled with its own mean and standard deviation.
# NOTE(review): the statistics are taken over all rows, before the train/test split
# made later in the notebook — confirm this is acceptable.
data_z_scaled = data.apply(lambda column: (column - column.mean()) / column.std())

# A column with zero standard deviation divides 0 by 0 and becomes all-NaN;
# those entries are replaced with 0.
data = pd.concat([label, data_z_scaled], axis=1).fillna(0)
  


### Splitting the data 

In [236]:
#Splitting the data (70-30)
from sklearn.model_selection import train_test_split

# Separate the target from the features
y = data["cancer_type"]
x = data.drop("cancer_type", axis=1)

# Hold out 30% of the rows as the test set
x_train_val, x_test, y_train_val, y_test = train_test_split(
    x, y,
    test_size=0.3,
    random_state=3,
    shuffle=True,
)

# Split the remaining 70% again: 10% of it becomes a validation set
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val,
    test_size=0.1,
    random_state=3,
    shuffle=True,
)

### Optional: Oversampling Or Undersampling since dataset is imbalanced

In [126]:
# Re-attach the labels to the features so each split can be resampled row-wise:
# temp holds the train+validation split, temp1 holds the test split
temp = pd.concat([x_train_val,y_train_val], axis=1)
temp1 = pd.concat([x_test,y_test], axis=1)
def oversample(df, random_state=None):
    """Balance ``df`` by randomly oversampling every minority ``cancer_type`` class.

    Every class present in ``df['cancer_type']`` is brought up to the size of the
    largest class. All original rows are kept; each smaller class is topped up with
    rows drawn from it at random with replacement. The classes are read from the data,
    so no class needs to appear in a fixed position or be present at all.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``cancer_type`` column. It is not modified.
    random_state : int, numpy random state or None, default None
        Passed to ``DataFrame.sample`` so the drawn rows can be made reproducible.

    Returns
    -------
    pandas.DataFrame
        The original rows followed by the extra sampled rows, with a fresh 0..n-1 index.
    """
    class_sizes = df['cancer_type'].value_counts()
    most = class_sizes.max()

    # Start from the untouched data, then append only the rows each class is short of
    parts = [df]
    for class_name, size in class_sizes.items():
        shortfall = most - size
        if shortfall > 0:
            class_rows = df[df['cancer_type'] == class_name]
            parts.append(class_rows.sample(shortfall, replace=True, random_state=random_state))

    final_df = pd.concat(parts, axis=0)
    final_df = final_df.reset_index(drop=True)
    return final_df

# Resample both frames with oversample() and split them back into features and labels.
# NOTE(review): temp1 is built from x_test / y_test, so the test split is resampled as
# well; metrics computed on it afterwards no longer reflect the original class
# distribution — confirm this is intended.
temp = oversample(temp)
temp1 = oversample(temp1)
y_train_val = temp["cancer_type"]
x_train_val = temp.drop(columns="cancer_type")
y_test = temp1["cancer_type"]
x_test = temp1.drop(columns="cancer_type")

#### Random Oversampling (imblearn `RandomOverSampler`, not SMOTE)

In [217]:
from imblearn.over_sampling import RandomOverSampler

# Resample the train+validation split with imblearn's RandomOverSampler (seeded with 42)
# and rebind x_train_val / y_train_val to the resampled data.
# NOTE(review): the manual oversample() cell and the undersampling cell rebind the same
# variables; presumably only one of the three is meant to be run — confirm.
ros = RandomOverSampler(random_state=42)
x_train_val, y_train_val = ros.fit_resample(x_train_val, y_train_val)

### Undersampling

In [240]:
from imblearn.under_sampling import RandomUnderSampler

# Resample the train+validation split with imblearn's RandomUnderSampler (seeded with 42,
# sampling_strategy="not minority") and rebind x_train_val / y_train_val to the result.
# NOTE(review): if an oversampling cell was run before this one, it operates on the
# already-oversampled data — presumably these cells are alternatives; confirm.
rus = RandomUnderSampler(sampling_strategy="not minority",random_state=42)
x_train_val, y_train_val = rus.fit_resample(x_train_val, y_train_val)

### Training different models and tuning the hyperparameters with gridsearch

In [None]:
#importing necessary libraries
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import learning_curve
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

#!pip install keras.wrappers
#!pip install tensorflow
#!pip install scikeras
#!pip install keras==2.15.0
#from scikeras.wrappers import KerasClassifier
#from tensorflow import keras
#from keras.models import Sequential
#from keras.layers import Flatten
#from tensorflow.keras.layers import Dense
#from sklearn.model_selection import RandomizedSearchCV
# Display the training features that every model below is fitted on
x_train_val


#### SVM Model

In [None]:

# Hyperparameter grid for the support vector classifier
svm_parameters = dict(
    kernel=['linear', 'rbf', 'sigmoid', 'poly'],
    gamma=['scale', 'auto'],
    C=[0.1, 1, 10, 100, 1000],
)

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
svm_CLF = SVC()
svm_grid_search = GridSearchCV(estimator=svm_CLF, param_grid=svm_parameters, cv=5, scoring='accuracy')
svm_grid_search.fit(x_train_val, y_train_val)

# Collect the winning estimator, its parameters and its cross-validation score
svm_model = svm_grid_search.best_estimator_
svm_best_params = svm_grid_search.best_params_
svm_best_score = svm_grid_search.best_score_

print("Best svm Model:", svm_model)
print("Best svm Hyperparameters:", svm_best_params)
print("Best svm Cross-Validation Score:", svm_best_score)

##### Learning curve

In [None]:
#plot validation vs training curve
def plot_learning_curves(model, x, y, cv=5, scoring='accuracy', train_sizes=None):
    """Plot training and cross-validation scores of ``model`` against training-set size.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``learning_curve``.
    x, y : array-like
        Features and labels the curve is computed on.
    cv : int or cross-validation splitter, default 5
        Cross-validation strategy passed to ``learning_curve``.
    scoring : str or callable, default 'accuracy'
        Metric passed to ``learning_curve``.
    train_sizes : array-like or None, default None
        Training-set sizes to evaluate. ``None`` uses ten fractions spread evenly
        between 0.1 and 1.0.

    The figure shows, for both curves, the mean score over the folds with a shaded
    band of one standard deviation either side. Nothing is returned.
    """
    if train_sizes is None:
        train_sizes = np.linspace(0.1, 1.0, 10)

    train_sizes, train_scores, val_scores = learning_curve(model, x, y, cv=cv, scoring=scoring, train_sizes=train_sizes)

    # Mean and spread over the cross-validation folds at every training size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    val_scores_mean = np.mean(val_scores, axis=1)
    val_scores_std = np.std(val_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.title("Learning Curve")
    plt.xlabel("Training Size")
    plt.ylabel("Score")
    plt.grid()

    # Shaded +/- one standard deviation bands
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, val_scores_mean - val_scores_std, val_scores_mean + val_scores_std, alpha=0.1, color="g")

    # Mean curves
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, val_scores_mean, 'o-', color="g", label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()

# Learning curve of the tuned SVM on the train+validation split
plot_learning_curves(svm_model, x_train_val, y_train_val)

##### F1 score, recall, precision and confusion matrix

In [None]:
#check F1, recall, accuracy
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Evaluate the tuned SVM on the test split
svm_pred = svm_model.predict(x_test)
print("\nsvm model's accuracy is : " ,accuracy_score(y_test,svm_pred))

# Per-class scores; with average=None the values follow the sorted order of the class labels
svm_f1 = f1_score(y_test, svm_pred, average = None)
print("\nTest svm F1_score:", svm_f1)
svm_recall = recall_score(y_test, svm_pred, average = None)
print("\nTest svm Recall:", svm_recall)
svm_precision = precision_score(y_test, svm_pred, average = None)
print("\nTest svm Precision:", svm_precision)

print("\nTest svm confusion matrix:", confusion_matrix(y_test, svm_pred))

# No target_names: the labels are already the cancer-type strings, and target_names is
# matched by position against the *sorted* labels. The previously hard-coded list was
# not in sorted order, so three of the four report rows carried the wrong class name.
print("\nTest svm classification report: ", classification_report(y_test, svm_pred))

#### Gradient Boosting Model (scikit-learn `GradientBoostingClassifier`, named `xgb` in the code)

In [None]:
# Gradient boosting: hyperparameter grid
xgb_parameters = dict(
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.6, 0.8, 1.0],
    max_depth=[3, 4, 5, 6, 7, 8, 9, 10],
    n_estimators=[100, 200, 300, 400, 500],
)

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
xgb_CLF = GradientBoostingClassifier()
xgb_grid_search = GridSearchCV(estimator=xgb_CLF, param_grid=xgb_parameters, cv=5, scoring='accuracy')
xgb_grid_search.fit(x_train_val, y_train_val)

# Collect the winning estimator, its parameters and its cross-validation score
xgb_best_model = xgb_grid_search.best_estimator_
xgb_best_params = xgb_grid_search.best_params_
xgb_best_score = xgb_grid_search.best_score_

print("Best xgb Model:", xgb_best_model)
print("Best xgb Hyperparameters:", xgb_best_params)
print("Best xgb Cross-Validation Score:", xgb_best_score)


##### Learning curve

In [64]:
# Learning curve of the tuned gradient boosting model on the train+validation split
plot_learning_curves(xgb_best_model, x_train_val, y_train_val)

##### F1, recall, precision and confusion matrix

In [None]:
# Evaluate the tuned gradient boosting model on the test split
xgb_pred = xgb_best_model.predict(x_test)
# The message previously said "svm" although the accuracy is that of the xgb model
print("\nxgb model's accuracy is : " ,accuracy_score(y_test,xgb_pred))

# Per-class scores; with average=None the values follow the sorted order of the class labels
xgb_f1 = f1_score(y_test, xgb_pred, average = None)
print("\nTest xgb F1_score:", xgb_f1)
xgb_recall = recall_score(y_test, xgb_pred, average = None)
print("\nTest xgb Recall:", xgb_recall)
xgb_precision = precision_score(y_test, xgb_pred, average = None)
print("\nTest xgb Precision:", xgb_precision)

print("\nTest xgb confusion matrix:", confusion_matrix(y_test, xgb_pred))

# No target_names: the labels are already the cancer-type strings, and target_names is
# matched by position against the *sorted* labels. The previously hard-coded list was
# not in sorted order, so three of the four report rows carried the wrong class name.
print("\nTest xgb classification report: ", classification_report(y_test, xgb_pred))

#### Decision tree Model

In [None]:
# Hyperparameter grid for the decision tree
tree_parameters = dict(
    max_depth=[2, 5, 10],
    min_samples_split=[5, 10, 15],
    criterion=['gini', 'entropy', 'log_loss'],
)

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
tree_CLF = DecisionTreeClassifier()
tree_grid_search = GridSearchCV(estimator=tree_CLF, param_grid=tree_parameters, cv=5, scoring='accuracy')
tree_grid_search.fit(x_train_val, y_train_val)

# Collect the winning estimator, its parameters and its cross-validation score
best_modeltree = tree_grid_search.best_estimator_
best_paramstree = tree_grid_search.best_params_
best_scoretree = tree_grid_search.best_score_

print("Best tree Model:", best_modeltree)
print("Best tree Hyperparameters:", best_paramstree)
print("Best tree Cross-Validation Score:", best_scoretree)


##### Learning curve

In [None]:
# Learning curve of the tuned decision tree on the train+validation split
plot_learning_curves(best_modeltree, x_train_val, y_train_val)

In [None]:
# Evaluate the tuned decision tree on the test split
tree_pred = best_modeltree.predict(x_test)
print("\ntree model's accuracy is : " ,accuracy_score(y_test,tree_pred))

# Per-class scores; with average=None the values follow the sorted order of the class labels
tree_f1 = f1_score(y_test, tree_pred, average = None)
print("\nTest tree F1_score:", tree_f1)
tree_recall = recall_score(y_test, tree_pred, average = None)
print("\nTest tree Recall:", tree_recall)
tree_precision = precision_score(y_test, tree_pred, average = None)
print("\nTest tree Precision:", tree_precision)

print("\nTest tree confusion matrix:", confusion_matrix(y_test, tree_pred))

# No target_names: the labels are already the cancer-type strings, and target_names is
# matched by position against the *sorted* labels. The previously hard-coded list was
# not in sorted order, so three of the four report rows carried the wrong class name.
print("\nTest tree classification report: ", classification_report(y_test, tree_pred))

### Random Forest Model

In [None]:
# Hyperparameter grid for the random forest
rf_parameters = dict(
    max_depth=[2, 3, 5, 10, 20],
    criterion=['gini', 'entropy', 'log_loss'],
    min_samples_split=[2, 5, 10],
    n_estimators=[50, 100, 200],
)

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy
rf_CLF = RandomForestClassifier()
rf_grid_search = GridSearchCV(estimator=rf_CLF, param_grid=rf_parameters, cv=5, scoring='accuracy')
rf_grid_search.fit(x_train_val, y_train_val)

# Collect the winning estimator, its parameters and its cross-validation score
best_modelrf = rf_grid_search.best_estimator_
best_paramsrf = rf_grid_search.best_params_
best_scorerf = rf_grid_search.best_score_

print("Best rf Model:", best_modelrf)
print("Best rf Hyperparameters:", best_paramsrf)
print("Best rf Cross-Validation Score:", best_scorerf)


##### Learning curve

In [None]:
# Learning curve of the tuned random forest on the train+validation split
plot_learning_curves(best_modelrf, x_train_val, y_train_val)

### F1, recall, precision and confusion matrix

In [None]:
# Evaluate the tuned random forest on the test split
rf_pred = best_modelrf.predict(x_test)
print("\nrf model's accuracy is : " ,accuracy_score(y_test,rf_pred))

# Per-class scores; with average=None the values follow the sorted order of the class labels
rf_f1 = f1_score(y_test, rf_pred, average = None)
print("\nTest rf F1_score:", rf_f1)
rf_recall = recall_score(y_test, rf_pred, average = None)
print("\nTest rf Recall:", rf_recall)
rf_precision = precision_score(y_test, rf_pred, average = None)
print("\nTest rf Precision:", rf_precision)

print("\nTest rf confusion matrix:", confusion_matrix(y_test, rf_pred))

# No target_names: the labels are already the cancer-type strings, and target_names is
# matched by position against the *sorted* labels. The previously hard-coded list was
# not in sorted order, so three of the four report rows carried the wrong class name.
print("\nTest rf classification report: ", classification_report(y_test, rf_pred))