In [None]:
# Importing the Python libraries

import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

warnings.filterwarnings('ignore')

# Machine learning on the RNA-seq dataset

In [None]:
# Read the Excel workbook for this section into a DataFrame and preview the first five rows.

df = pd.read_excel('cancer_rcc.xlsx')
df.head(5)

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts)

df.info()

In [None]:
# Count the missing (NaN) values in each column of the DataFrame

df.isna().sum()

In [None]:
# Descriptive statistics per column, rounded to two decimals
df.describe().round(2)

In [None]:
# Pairwise scatter plots of the columns, coloured by the 'cancer' label
sns.pairplot(df, hue = 'cancer')

In [None]:
# Target: the 'cancer' column; features: every other column of df.
y = df['cancer']
X = df.drop(columns = 'cancer')

In [None]:
# Sanity check: X and y must have the same number of rows
X.shape, y.shape

In [None]:
# Hold out 20% of the samples as a test set; random_state fixes the split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 50)

In [None]:
# Class counts in the training and test splits
y_train.value_counts(), y_test.value_counts()

In [None]:
n_features = [10, 5, 3, 2, 1]

# A fixed random_state makes the tree (and therefore the RFE ranking and the
# scores printed below) reproducible across kernel restarts.
model = DecisionTreeClassifier(random_state = 50)
column_names = X.columns.tolist()

# Iterate over the n_features_to_select values
for n in n_features:
    # Recursive feature elimination down to n features, fitted on the training split only
    rfe = RFE(estimator = model, n_features_to_select = n)
    rfe.fit(X_train, y_train)

    # rfe.support_ is a boolean mask over the columns of X_train
    selected_feature_indices = np.flatnonzero(rfe.support_)

    X_train_selected = X_train.iloc[:, selected_feature_indices]
    X_test_selected = X_test.iloc[:, selected_feature_indices]

    # Refit the tree on the reduced feature set and score it on the held-out split
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_test_selected)

    # Calculate the accuracy score
    accuracy = accuracy_score(y_test, y_pred)

    # Print the selected features and accuracy
    print(f"Number of Selected Features = {n}")
    print("Selected Features")
    for col in X_train_selected.columns:
        print(col)
    print("-------------------------------------------")
    print("Accuracy:", accuracy)
    print("-------------------------------------------")
    print("Classification report")
    print()
    print(classification_report(y_test, y_pred))
    print("-------------------------------------------")

In [None]:
# Select two candidate features from RFE
# (hard-coded column names; they must exist in df)

cols = ['NDUFA4L2', 'SLC6A3']

In [None]:
# Restrict the features to the two selected columns; the target is unchanged.
X = df.loc[:, cols]
y = df.loc[:, 'cancer']

In [None]:
# Split data into 75% training set and 25% testing set (test_size = 0.25);
# random_state fixes the split. train_test_split is imported in the first cell.

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state = 55,
                                                    test_size = 0.25)

In [None]:
# Class counts in the training and test splits
y_train.value_counts(), y_test.value_counts()

In [None]:
# Standardize the features with StandardScaler.
# The scaler is fitted on the training split only and the SAME mean/scale is
# then applied to the test split; refitting on the test data would leak test
# statistics and put the two splits on different scales.

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

### Train and compare supervised machine learning algorithms
- Logistic regression
- K-nearest neighbors
- Decision tree
- Random forest
- Support vector machine
- Artificial neural network

In [None]:
# Display name -> unfitted estimator with default hyperparameters.
models = {
    'Logistic Regression': LogisticRegression(),
    'K-Nearest Neighbors ': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC(),
    'Multilayer Perceptron': MLPClassifier(),
}

In [None]:
# Define a function that fits every model and tabulates its test score

def model_score(models, X_train, X_test, y_train, y_test, seed = 50):
    """Fit each model on the training data and score it on the test data.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``. The estimators are fitted in place.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Test features and labels passed to ``score``.
    seed : int or None, default 50
        Seed for NumPy's global random generator, set before any model is
        fitted. Pass ``None`` to leave the global random state untouched.

    Returns
    -------
    pandas.DataFrame
        One row per model (index = display name) with a single ``'Score'``
        column, sorted from best to worst score.
    """
    if seed is not None:
        # Side effect: reseeds the process-wide NumPy generator.
        np.random.seed(seed)
    scores = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        scores[model_name] = model.score(X_test, y_test)
    # A dict of scalars plus a one-element index gives a single 'Score' row;
    # transposing turns the model names into the row index.
    score_table = pd.DataFrame(scores, index=['Score']).transpose()
    return score_table.sort_values('Score', ascending=False)

In [None]:
# Fit and score every model in `models`, then show the score table with a red colour gradient
supervised_model_scores = model_score(models, X_train, X_test, y_train, y_test)
supervised_model_scores.style.background_gradient(cmap = 'Reds')

In [None]:
# Seeded decision tree, its hyperparameter grid, and a seeded, shuffled
# 3-fold stratified splitter for the grid search.
dt = DecisionTreeClassifier(random_state=50)
param_grid = dict(
    max_depth=[3, 5, 7, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=50)

In [None]:
# Exhaustive cross-validated search over param_grid on the training split

grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)
print("Best parameters for decision tree:", grid_search.best_params_, sep="\n")
# best_score_ is the mean cross-validated score of the best parameter setting
print("Best score for decision tree: {:.2f}%".format(grid_search.best_score_*100))

In [None]:
# Evaluate the decision tree model's performance on the testing data.
# accuracy_score and classification_report are imported in the first cell; the
# former mid-notebook import of plot_confusion_matrix / plot_roc_curve raised
# ImportError on scikit-learn >= 1.2, where those helpers no longer exist.

y_pred = grid_search.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of decision tree on testing data: {:.2f}%".format(test_accuracy*100))
print()
print(classification_report(y_test, y_pred))

In [None]:
# Plot the confusion matrix of the tuned decision tree on the test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed in scikit-learn 1.2.

ConfusionMatrixDisplay.from_estimator(grid_search, X_test, y_test, cmap = plt.cm.Blues)
plt.title('Confusion Matrix (Decision Tree)', fontsize = 16)
plt.xlabel('Predicted Label', fontsize = 14)
plt.ylabel('True Label', fontsize = 14)
plt.show()

In [None]:
# ROC curve of the tuned decision tree on the test split.
# RocCurveDisplay.from_estimator replaces plot_roc_curve (removed in scikit-learn 1.2).
RocCurveDisplay.from_estimator(grid_search, X_test, y_test)
plt.show()

In [None]:
# Plot the decision tree selected by the grid search.
# feature_names is passed as a plain list: X_train is a NumPy array after
# scaling, so the names come from X, and some scikit-learn releases reject a
# pandas Index for this argument.
# NOTE(review): class_names assumes exactly two classes in the order 0, 1 - confirm.

plt.figure(figsize = (10, 8))
plot_tree(grid_search.best_estimator_, feature_names = list(X.columns), class_names=['Class 0', 'Class 1'], filled=True)
plt.title('Decision Tree', fontsize=16)
plt.show()

# Machine learning on the IHC dataset

In [None]:
# Read the CSV file for this section into a DataFrame and preview the first rows
df2 = pd.read_csv('CCRCC.csv')
df2.head()

In [None]:
# Number of samples per value of the RCC_subtype column
df2.RCC_subtype.value_counts()

In [None]:
# List the column names available in df2
df2.columns

In [None]:
# Feature columns used for this dataset (hard-coded; they must exist in df2)
cols_df2 = ['NDUFA4L2', 'DAT']

In [None]:
# Features: the columns listed in cols_df2; target: the RCC_subtype column

X2 = df2.loc[:, cols_df2]
y2 = df2.loc[:, 'RCC_subtype']

In [None]:
# Address the class imbalance between categories 0 and 1 by oversampling
# with SMOTE (Synthetic Minority Oversampling Technique).
# sampling_strategy = 0.90 is the requested minority/majority ratio after
# resampling; random_state makes the generated samples reproducible.

sm = SMOTE(sampling_strategy = 0.90, random_state = 40)
X_resampled, y_resampled = sm.fit_resample(X2, y2)

In [None]:
# Shapes before (X2, y2) and after (X_resampled, y_resampled) oversampling
X2.shape, y2.shape, X_resampled.shape, y_resampled.shape

In [None]:
# Class counts before and after oversampling
y2.value_counts(), y_resampled.value_counts()

In [None]:
# Split data into train and test set, then oversample ONLY the training part.
# Splitting the original data first keeps the test set free of synthetic
# samples and of samples interpolated from test points, so the test scores are
# not inflated by leakage. stratify keeps the class ratio equal in both splits.
# NOTE(review): SMOTE needs more minority samples in the training split than
# its k_neighbors setting (default 5) - confirm the dataset is large enough.

X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size = 0.2, random_state = 50, stratify = y2)
X_train, y_train = SMOTE(sampling_strategy = 0.90, random_state = 40).fit_resample(X_train, y_train)

In [None]:
# Candidate models for a small dataset: logistic regression, decision tree,
# random forest and gradient boosting.
# More complex models (such as SVM or MLP) require a large amount of data to generalize well.

models2 = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
}

In [None]:
# Fit and score every model in `models2`, then show the score table with a red colour gradient
supervised_model_scores = model_score(models2, X_train, X_test, y_train, y_test)
supervised_model_scores.style.background_gradient(cmap = 'Reds')

## Random Forest

In [None]:
# Hyperparameter grid for tuning the Random Forest with GridSearchCV.
# 'auto' is not listed for max_features: for classifiers it was an alias of
# 'sqrt' (so it only repeated fits) and scikit-learn 1.3 removed it, which
# makes every fit that uses it fail.

grid_values = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'criterion' :['gini', 'entropy']
}
cross_validation = StratifiedKFold(n_splits = 3, shuffle=True, random_state= 42)
# A fixed random_state makes the bootstrap samples and feature draws, and so
# the selected parameters, reproducible across runs.
rfc = RandomForestClassifier(random_state = 42)

In [None]:
# Cross-validated grid search over grid_values on the training split, using all CPU cores (n_jobs = -1)
grid_search_RF = GridSearchCV(rfc, param_grid = grid_values, cv = cross_validation, n_jobs = -1, verbose = 3)
grid_search_RF.fit(X_train, y_train)

In [None]:
# Report the winning parameter set and its mean cross-validated score (in percent)
print("Best parameters for Random Forest:", grid_search_RF.best_params_, sep="\n")
accuracy = 100 * grid_search_RF.best_score_
print("Accuracy: {:.2f}%".format(accuracy))

In [None]:
# Score the tuned Random Forest on the test split
y_prediction = grid_search_RF.predict(X_test)
test_accuracy = 100 * accuracy_score(y_test, y_prediction)

print("Accuracy of RF is: {:.2f}%".format(test_accuracy))
# Blank line, then the per-class precision / recall / F1 table
print("", classification_report(y_test, y_prediction), sep="\n")

In [None]:
# Plot the confusion matrix of Random Forest model on the test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed in scikit-learn 1.2.

ConfusionMatrixDisplay.from_estimator(grid_search_RF, X_test, y_test, cmap=plt.cm.Blues)
plt.title('Confusion Matrix (RF)', fontsize = 16)
plt.xlabel('Predicted Label', fontsize = 14)
plt.ylabel('True Label', fontsize = 14)
plt.show()

In [None]:
# ROC curve of the tuned Random Forest on the test split.
# RocCurveDisplay.from_estimator replaces plot_roc_curve (removed in scikit-learn 1.2).
RocCurveDisplay.from_estimator(grid_search_RF, X_test, y_test)
plt.show()

## Logistic Regression

In [None]:
# random_state fixes the data shuffling used by the 'liblinear', 'sag' and 'saga' solvers.
logreg = LogisticRegression(max_iter=10000, random_state=50)

# One sub-grid per penalty so that only solver/penalty pairs accepted by
# LogisticRegression are searched. A plain Cartesian grid wastes fits on
# invalid pairs (e.g. 'lbfgs' with 'l1'), which fail and score NaN.
# 'elasticnet' needs 'saga' and an explicit l1_ratio.
# Unpenalized fits use penalty=None (the string 'none' was removed in
# scikit-learn 1.4; None needs scikit-learn >= 1.2) and ignore C.
C_values = [0.1, 1, 10, 100]
param_grid = [
    {'penalty': ['l2'], 'C': C_values, 'solver': ['lbfgs', 'liblinear', 'sag', 'saga']},
    {'penalty': ['l1'], 'C': C_values, 'solver': ['liblinear', 'saga']},
    {'penalty': ['elasticnet'], 'C': C_values, 'solver': ['saga'], 'l1_ratio': [0.25, 0.5, 0.75]},
    {'penalty': [None], 'solver': ['lbfgs', 'sag', 'saga']},
]
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=50)

In [None]:
# Exhaustive cross-validated search over param_grid on the training split

grid_search = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)
print("Best parameters for logistic regression:", grid_search.best_params_, sep="\n")
# best_score_ is the mean cross-validated score of the best parameter setting
print("Best score for logistic regression: {:.2f}%".format(grid_search.best_score_*100))

In [None]:
# Evaluate the logistic regression model's performance on the testing data.
# accuracy_score and classification_report are imported in the first cell; the
# former mid-notebook import of plot_confusion_matrix / plot_roc_curve raised
# ImportError on scikit-learn >= 1.2, where those helpers no longer exist.

y_pred = grid_search.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of logistic regression on testing data: {:.2f}%".format(test_accuracy*100))
print()
print(classification_report(y_test, y_pred))

In [None]:
# Plot the confusion matrix of the tuned logistic regression on the test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed in scikit-learn 1.2.

ConfusionMatrixDisplay.from_estimator(grid_search, X_test, y_test, cmap=plt.cm.Blues)
plt.title('Confusion Matrix (Logistic Regression)', fontsize=16)
plt.xlabel('Predicted Label', fontsize=14)
plt.ylabel('True Label', fontsize=14)
plt.show()

In [None]:
# ROC curve of the tuned logistic regression on the test split.
# RocCurveDisplay.from_estimator replaces plot_roc_curve (removed in scikit-learn 1.2).
RocCurveDisplay.from_estimator(grid_search, X_test, y_test)
plt.show()

## Decision Tree

In [None]:
# Seeded decision tree, its hyperparameter grid, and a seeded, shuffled
# 3-fold stratified splitter for the grid search.
dt = DecisionTreeClassifier(random_state = 10)
param_grid = dict(
    max_depth=[3, 5, 7, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)
cv = StratifiedKFold(n_splits = 3, shuffle = True, random_state = 10)

In [None]:
# Exhaustive cross-validated search over param_grid on the training split

grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)
print("Best parameters for decision tree:", grid_search.best_params_, sep="\n")
# best_score_ is the mean cross-validated score of the best parameter setting
print("Best score for decision tree: {:.2f}%".format(grid_search.best_score_*100))

In [None]:
# Evaluate the decision tree model's performance on the testing data.
# accuracy_score and classification_report are imported in the first cell; the
# former mid-notebook import of plot_confusion_matrix / plot_roc_curve raised
# ImportError on scikit-learn >= 1.2, where those helpers no longer exist.

y_pred = grid_search.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of decision tree on testing data: {:.2f}%".format(test_accuracy*100))
print()
print(classification_report(y_test, y_pred))

In [None]:
# Plot the confusion matrix of the tuned decision tree on the test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed in scikit-learn 1.2.

ConfusionMatrixDisplay.from_estimator(grid_search, X_test, y_test, cmap=plt.cm.Blues)
plt.title('Confusion Matrix (Decision Tree)', fontsize=16)
plt.xlabel('Predicted Label', fontsize=14)
plt.ylabel('True Label', fontsize=14)
plt.show()

In [None]:
# ROC curve of the tuned decision tree on the test split.
# RocCurveDisplay.from_estimator replaces plot_roc_curve (removed in scikit-learn 1.2).
RocCurveDisplay.from_estimator(grid_search, X_test, y_test)
plt.show()

## Gradient Boosting

In [None]:
# A fixed random_state makes the search reproducible: subsample < 1.0 and
# max_features both draw random rows/features while the trees are grown.
gbc = GradientBoostingClassifier(random_state = 50)

# 5 * 3 * 6 * 5 * 2 = 900 parameter combinations, each fitted once per CV fold
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'learning_rate': [0.1, 0.01, 0.001],
    'max_depth' : [3, 4, 5, 6, 7, 8],
    'subsample' : [0.6, 0.7, 0.8, 0.9, 1.0],
    'max_features': ['sqrt', 'log2']
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=50)

In [None]:
# Cross-validated grid search over param_grid on the training split, using all CPU cores (n_jobs = -1)
grid_search_GB = GridSearchCV(gbc, param_grid = param_grid, cv = cv, n_jobs = -1, verbose = 3)
grid_search_GB.fit(X_train, y_train)

In [None]:
# Report the winning parameter set and its mean cross-validated score (in percent)
print("Best parameters for Gradient Boosting:", grid_search_GB.best_params_, sep="\n")
accuracy = 100 * grid_search_GB.best_score_
print("Accuracy: {:.2f}%".format(accuracy))

In [None]:
# Score the tuned Gradient Boosting model on the test split
y_prediction = grid_search_GB.predict(X_test)
test_accuracy = 100 * accuracy_score(y_test, y_prediction)
print("Accuracy of GB is: {:.2f}%".format(test_accuracy))
# Blank line, then the per-class precision / recall / F1 table
print("", classification_report(y_test, y_prediction), sep="\n")

In [None]:
# Plot the confusion matrix of Gradient Boosting model on the test split.
# ConfusionMatrixDisplay.from_estimator replaces plot_confusion_matrix, which
# was removed in scikit-learn 1.2.

ConfusionMatrixDisplay.from_estimator(grid_search_GB, X_test, y_test, cmap=plt.cm.Blues)
plt.title('Confusion Matrix (GB)', fontsize=16)
plt.xlabel('Predicted Label', fontsize=14)
plt.ylabel('True Label', fontsize=14)
plt.show()

In [None]:
# ROC curve of the tuned Gradient Boosting model on the test split.
# RocCurveDisplay.from_estimator replaces plot_roc_curve (removed in scikit-learn 1.2).
RocCurveDisplay.from_estimator(grid_search_GB, X_test, y_test)
plt.show()