In [38]:
#importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
import os
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [39]:
# Read the glioma CSV into a DataFrame and report its (rows, columns) shape.
glioma_data = pd.read_csv(filepath_or_buffer='TCGA_InfoWithGrade.csv')
print(glioma_data.shape)


(839, 24)


In [40]:
# Separate the target column ("Grade") from the predictor columns.
y = glioma_data["Grade"]
X = glioma_data.drop(columns="Grade")

# Remove the two listed columns from the predictors used downstream.
drop_list1 = ['BCOR', 'CSMD3']
X1 = X.drop(columns=drop_list1)

print(X1.shape)
print(y.shape)

(839, 21)
(839,)


In [41]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X1,
    y,
    test_size=0.2,
    random_state=123,
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(671, 21) (671,)
(168, 21) (168,)


In [42]:
# Feature scaling: learn mean/variance on the training split only, then apply
# the same transformation to both splits so no test statistics leak into training.

from sklearn.preprocessing import StandardScaler

sc_scaler = StandardScaler().fit(X_train)

X_train = sc_scaler.transform(X_train)
X_test = sc_scaler.transform(X_test)

print(X_train.shape)

(671, 21)


In [43]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only (the test split is left untouched) with a
# seeded SMOTE instance so the synthetic samples are repeatable.
smote = SMOTE(random_state=42, sampling_strategy='auto')
resampled_training_set = smote.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_training_set

**INDIVIDUAL MODELS**

**1. GaussianProcessClassifier**

In [44]:
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import accuracy_score

# Kernel handed to the classifier: unit constant factor times an RBF with
# length scale 1. Previously a separate `kernel = RBF(1.0)` was defined but never
# used, while a different expression was passed inline; define it once and pass it.
kernel = 1.0 * RBF(length_scale=1.0)
gpc = GaussianProcessClassifier(kernel=kernel, n_restarts_optimizer=0)

# Fit the classifier on the SMOTE-resampled training data
gpc.fit(X_train_resampled, y_train_resampled)

# Predict on the held-out test data
y_pred = gpc.predict(X_test)

# Report test-set accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9107142857142857


**2. Logistic Regression**

In [45]:
import warnings
# NOTE(review): this silences every warning for the rest of the kernel session,
# not just for this cell.
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score

# Despite the variable name, nothing is pre-trained here: a new L2-regularised
# logistic regression (C=0.2) is created and fitted on the resampled training data.
pretrained_model = LogisticRegression(C=0.2, penalty='l2').fit(
    X_train_resampled, y_train_resampled
)

# Score the fitted model on the held-out test set.
y_pred = pretrained_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9047619047619048


**4. DecisionTreeClassifier**

In [46]:
from sklearn.tree import DecisionTreeClassifier

# random_state is fixed so the fitted tree (and the accuracy printed below) is
# reproducible: scikit-learn permutes candidate features at every split, which can
# change the tree between runs when several splits are equally good.
dtree = DecisionTreeClassifier(
    criterion="gini",
    max_depth=5,
    min_samples_leaf=2,
    min_impurity_decrease=0.1,
    random_state=42,
)
# Train the classifier on the resampled training data
dtree.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = dtree.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9047619047619048


**5. GaussianNaiveBayes**

In [47]:
from sklearn.naive_bayes import GaussianNB

# Build the Gaussian Naive Bayes model and fit it on the resampled training data
# in one step (fit returns the estimator itself).
gnb = GaussianNB(var_smoothing=0.01).fit(X_train_resampled, y_train_resampled)

# Predict the held-out test set and report accuracy.
y_pred = gnb.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9107142857142857


**6. XGBoost**

In [48]:
# XGBClassifier: fit on the resampled training data, then print test-set accuracy.
import xgboost as xgb

xgb_clf = xgb.XGBClassifier(
    n_estimators=50,
    max_depth=3,
    learning_rate=0.2,
    gamma=0,
)
xgb_clf.fit(X_train_resampled, y_train_resampled)
print(xgb_clf.score(X_test, y_test))

0.8988095238095238


**7. GradientBoostingClassifier**

In [49]:
from sklearn.ensemble import GradientBoostingClassifier

# subsample=0.8 makes every boosting stage train on a random 80% of the rows, so
# the fit is stochastic; random_state pins that sampling to keep the reported
# accuracy reproducible from run to run.
gbc_clf = GradientBoostingClassifier(
    learning_rate=0.01,
    max_depth=5,
    min_samples_leaf=1,
    min_samples_split=10,
    n_estimators=50,
    subsample=0.8,
    random_state=42,
)
gbc_clf.fit(X_train_resampled, y_train_resampled)
# Make predictions on the test set
y_pred = gbc_clf.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9166666666666666


**ENSEMBLE METHODS: VOTING**


In [50]:
from sklearn.ensemble import VotingClassifier

# Majority-vote ("hard") ensemble over five of the base models defined above.
estimators = [("GNB", gnb), ("DT", dtree), ("XGB", xgb_clf), ("GPC", gpc), ("GBC", gbc_clf)]
voting_clf1 = VotingClassifier(estimators=estimators, voting="hard").fit(
    X_train_resampled, y_train_resampled
)

voter1_test_accuracy = voting_clf1.score(X_test, y_test)
print("Voter 1's accuracy: %.2f%%" % (100*voter1_test_accuracy))


Voter 1's accuracy: 91.07%


In [51]:
from itertools import combinations
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_curve, roc_auc_score, confusion_matrix, precision_recall_curve)
estimators = [("GNB", gnb), ("DT", dtree), ("XGB", xgb_clf), ("GPC", gpc), ("GBC", gbc_clf)]

# One hard-voting ensemble for every combination of 3, 4 or all 5 base estimators.
vot_classifiers = [
    VotingClassifier(estimators=list(subset), voting='hard')
    for n in range(3, 6)
    for subset in combinations(estimators, n)
]

# Rows are collected in plain lists and turned into DataFrames once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
metric_rows = []
confusion_rows = []

for vc in vot_classifiers:
    vc.fit(X_train_resampled, y_train_resampled)
    y_pred = vc.predict(X_test)
    clasf_names_str = ', '.join(name for name, _ in vc.estimators)

    metric_rows.append({'Classifier': clasf_names_str,
                        'Accuracy': accuracy_score(y_test, y_pred),
                        'Precision': precision_score(y_test, y_pred),
                        'Recall': recall_score(y_test, y_pred),
                        'F1 Score': f1_score(y_test, y_pred)})

    # For a 2x2 confusion matrix scikit-learn's ravel() order is tn, fp, fn, tp
    # (rows are true labels, columns are predictions). The previous unpacking
    # (tp, fp, fn, tn) stored TP and TN under each other's column.
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    confusion_rows.append({'Classifier': clasf_names_str, 'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn})

results_df = pd.DataFrame(metric_rows,
                          columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
results_dff = pd.DataFrame(confusion_rows, columns=['Classifier', 'TP', 'FP', 'FN', 'TN'])


In [52]:
# Only the numeric score columns are highlighted. Applying the highlighter to
# every column also marked the alphabetically last 'Classifier' label, which says
# nothing about model quality.
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

def highlight_max(s):
    """Return one CSS string per cell, colouring every cell equal to the column maximum."""
    is_max = s == s.max()
    return ['background: lightgreen' if cell else '' for cell in is_max]

combo = results_df.style.format(precision=3)
combo.apply(highlight_max, subset=metric_columns)


Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1 Score
0,"GNB, DT, XGB",0.911,0.85,0.958,0.901
1,"GNB, DT, GPC",0.911,0.85,0.958,0.901
2,"GNB, DT, GBC",0.917,0.852,0.972,0.908
3,"GNB, XGB, GPC",0.905,0.848,0.944,0.893
4,"GNB, XGB, GBC",0.911,0.85,0.958,0.901
5,"GNB, GPC, GBC",0.911,0.85,0.958,0.901
6,"DT, XGB, GPC",0.905,0.848,0.944,0.893
7,"DT, XGB, GBC",0.917,0.852,0.972,0.908
8,"DT, GPC, GBC",0.917,0.852,0.972,0.908
9,"XGB, GPC, GBC",0.905,0.848,0.944,0.893


**STACKING ENSEMBLE**

In [33]:
from sklearn.ensemble import StackingClassifier

estimators = [("GNB", gnb), ("DT", dtree), ("XGB", xgb_clf), ("GPC", gpc), ("GBC", gbc_clf)]

# One stacking ensemble for every combination of 3, 4 or all 5 base estimators,
# each using the logistic-regression model as the final (meta) estimator.
stk_classifiers = [
    StackingClassifier(estimators=list(subset), final_estimator=pretrained_model)
    for n in range(3, 6)
    for subset in combinations(estimators, n)
]

# Rows are collected in plain lists and turned into DataFrames once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the frame on every call.
stk_metric_rows = []
stk_confusion_rows = []

for stk in stk_classifiers:
    stk.fit(X_train_resampled, y_train_resampled)
    y_pred = stk.predict(X_test)
    clasf_names_str = ', '.join(name for name, _ in stk.estimators)

    stk_metric_rows.append({'Classifier': clasf_names_str,
                            'Accuracy': accuracy_score(y_test, y_pred),
                            'Precision': precision_score(y_test, y_pred),
                            'Recall': recall_score(y_test, y_pred),
                            'F1 Score': f1_score(y_test, y_pred)})

    # For a 2x2 confusion matrix scikit-learn's ravel() order is tn, fp, fn, tp
    # (rows are true labels, columns are predictions). The previous unpacking
    # (tp, fp, fn, tn) stored TP and TN under each other's column.
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    stk_confusion_rows.append({'Classifier': clasf_names_str, 'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn})

stk_results = pd.DataFrame(stk_metric_rows,
                           columns=['Classifier', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
stk_resultss = pd.DataFrame(stk_confusion_rows, columns=['Classifier', 'TP', 'FP', 'FN', 'TN'])

In [34]:
# Only the numeric score columns are highlighted. Applying the highlighter to
# every column also marked the alphabetically last 'Classifier' label, which says
# nothing about model quality.
stk_metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

def highlight_max(s):
    """Return one CSS string per cell, colouring every cell equal to the column maximum."""
    is_max = s == s.max()
    return ['background: lightgreen' if cell else '' for cell in is_max]

stk_combo = stk_results.style.format(precision=3)
stk_combo.apply(highlight_max, subset=stk_metric_columns)


Unnamed: 0,Classifier,Accuracy,Precision,Recall,F1 Score
0,"GNB, DT, XGB",0.911,0.85,0.958,0.901
1,"GNB, DT, GPC",0.911,0.85,0.958,0.901
2,"GNB, DT, GBC",0.917,0.861,0.958,0.907
3,"GNB, XGB, GPC",0.911,0.85,0.958,0.901
4,"GNB, XGB, GBC",0.917,0.861,0.958,0.907
5,"GNB, GPC, GBC",0.917,0.861,0.958,0.907
6,"DT, XGB, GPC",0.911,0.85,0.958,0.901
7,"DT, XGB, GBC",0.917,0.852,0.972,0.908
8,"DT, GPC, GBC",0.917,0.852,0.972,0.908
9,"XGB, GPC, GBC",0.905,0.848,0.944,0.893
