# Trial version

In [None]:
import pandas as pd

# Read the two tab-separated training tables: the call data and the clinical labels.
dataset, labels = (
    pd.read_csv(path, sep='\t')
    for path in ('Data/Train_call.txt', 'Data/Train_clinical.txt')
)

In [None]:
# Preview the first 20 rows of the raw call table.
dataset.head(20)

In [None]:
# Transpose the call table, discard the four annotation rows, and move the
# former column names into an 'index' column.
dataset_transposed = (
    dataset.T
    .drop(['Start', 'End', 'Nclone', 'Chromosome'])
    .reset_index()
)

# Attach the clinical table column-wise. After reset_index both frames carry a
# positional index, so rows are paired purely by position.
# NOTE(review): this assumes both files list the samples in the same order — confirm.
trainset = (
    pd.concat([dataset_transposed, labels], axis=1)
    .set_index('Sample')
    .drop(columns=['index'])
)

In [None]:
# Display the assembled training frame (its last column is used as the label below).
trainset

## Bare-bones ML model trials

### Support Vector Classification

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Features are every column except the last; the last column holds the labels.
X, y = trainset.iloc[:, :-1].values, trainset.iloc[:, -1].values

# Hold out 20% of the samples for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [None]:
# Standardise the features with statistics learned from the training split only,
# then apply the same transformation to both splits.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)
X_train.shape

In [None]:
# Fit a linear-kernel SVM (with probability estimates enabled) on the training split.
classifier = SVC(probability=True, random_state=0, kernel='linear')
classifier.fit(X_train, y_train)

In [None]:
# Predict the held-out split with the fitted SVM and report plain accuracy.
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)

### Naive Bayes Classification

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Gaussian Naive Bayes baseline: fit on the training split, score on the held-out split.
gnb = GaussianNB().fit(X_train, y_train)
y_pred = gnb.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {:.2f}%".format(100 * accuracy))

### Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [None]:
# Random-forest baseline (100 trees, fixed seed): fit, predict, report accuracy.
rfc = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred = rfc.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {:.2f}%".format(100 * accuracy))

# Final Version 

## Data Processing [Make sure to run this Block before running anything else]

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_validate, KFold
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import warnings
# Silences every warning category for the rest of the session, which also hides
# any warnings emitted by the grid searches below.
warnings.filterwarnings("ignore")

In [None]:
# Reload the tab-separated call data and clinical labels for the final pipeline.
dataset, labels = (
    pd.read_csv(path, sep='\t')
    for path in ('Data/Train_call.txt', 'Data/Train_clinical.txt')
)

In [None]:
# Transpose the call table, remove the annotation rows, and keep the former
# column names in an 'index' column.
dataset_transposed = (
    dataset.T
    .drop(['Start', 'End', 'Nclone', 'Chromosome'])
    .reset_index()
)

# Join the labels column-wise; both frames have a positional index here, so rows
# are matched by position.
# NOTE(review): relies on both files sharing the same sample order — confirm.
trainset = (
    pd.concat([dataset_transposed, labels], axis=1)
    .set_index('Sample')
    .drop(columns=['index'])
)

In [None]:
# Feature matrix = all but the last column; label vector = the last column.
X, y = trainset.iloc[:, :-1].values, trainset.iloc[:, -1].values

# 80/20 split with a fixed seed; the *_full names mark the full-feature partition.
X_train_full, X_test_full, y_train_full, y_test_full = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)

In [None]:
# Number of feature columns. The last column of trainset is the label (X is built
# from trainset.iloc[:, :-1]), so it must not be counted as a feature.
n_fet = trainset.shape[1] - 1

In [None]:
import numpy as np

# Class balance of the training partition created in this section. The split
# above defines y_train_full; y_train only exists if the trial cells were run.
unique_values, counts = np.unique(y_train_full, return_counts=True)
print(dict(zip(unique_values, counts)))

## Nested cross-validation (NCV) on SVM/RF

In [None]:
import math

In [None]:
# Hyperparameter grids searched for the SVC and the random forest.
#
svc_params = {
    'C': [0.001, 0.1, 1, 10],
    'gamma': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'sigmoid', 'rbf'],
}
rf_params = {
    'n_estimators': [50, 100, 250, 500],
    'max_depth': [5, 10, 30],
    'min_samples_split': [2, 5, 10],
    # Three fixed sizes plus the rounded square root of n_fet.
    'max_features': [100, 500, 1000, round(math.sqrt(n_fet))],
}

In [None]:
# Define the outer and inner CV loops.
# Both splitters shuffle, so each gets a fixed seed; without one the fold
# assignment (and every score derived from it) changes on every run.
#
outer_cv = KFold(n_splits=10, shuffle=True, random_state=0)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=1)

In [None]:
# Nested cross-validation for the SVC.
#
# Everything runs on the training partition created in this section
# (X_train_full / y_train_full), so the 20% hold-out is never used for tuning and
# the cell does not depend on X_train / y_train left over from other sections.
svc_grid = GridSearchCV(estimator=SVC(), param_grid=svc_params, cv=inner_cv)

# Outer loop: cross_val_score clones svc_grid, so the inner grid search is re-run
# inside every outer training fold and scored on the matching outer test fold.
svc_scores = cross_val_score(estimator=svc_grid, X=X_train_full, y=y_train_full, cv=outer_cv)

# A single search over the whole training partition, only to report one tuned configuration.
svc_grid.fit(X_train_full, y_train_full)
svc_best_params = svc_grid.best_params_

In [None]:
# Report the tuned SVC configuration and the spread of the outer-fold scores.
print("SVC best params:", svc_best_params)
print("SVC: Mean accuracy=%.3f, std=%.3f" % (np.mean(svc_scores), np.std(svc_scores)))

In [None]:
# Nested cross-validation for the random forest.
#
# Runs on this section's training partition (X_train_full / y_train_full), so the
# hold-out is not used for tuning and no variables from other sections are needed.
rf_grid = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rf_params, cv=inner_cv)

# Outer loop: the grid search is cloned and re-fitted inside each outer training fold.
rf_scores = cross_val_score(estimator=rf_grid, X=X_train_full, y=y_train_full, cv=outer_cv)

# A single search over the whole training partition, only to report one tuned configuration.
rf_grid.fit(X_train_full, y_train_full)
rf_best_params = rf_grid.best_params_

In [None]:
# Random forest with one fixed hyperparameter set, trained and scored on this
# section's own full-feature split rather than on X_train / X_test from other sections.
rfc = RandomForestClassifier(n_estimators = 100, max_depth = 5, max_features = 1000, min_samples_split = 10, random_state=42)
rfc.fit(X_train_full, y_train_full)

y_pred = rfc.predict(X_test_full)

accuracy = accuracy_score(y_test_full, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

In [None]:
# Report the tuned random-forest configuration and the spread of the outer-fold scores.
print("\nRandom Forest best params:", rf_best_params)
print("Random Forest: Mean accuracy=%.3f, std=%.3f" % (np.mean(rf_scores), np.std(rf_scores)))

## RFE dimensionality reduction

In [None]:
from numpy import mean, std
from sklearn.feature_selection import RFE, RFECV
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold

# Linear SVC used both as RFE's ranking model and as the pipeline's final classifier.
estimator = SVC(C = 0.1, kernel='linear')

# NOTE(review): n_features_to_select equals the number of columns in X_train, so
# RFE is asked to keep every feature — presumably a smaller target was intended; confirm.
rfe = RFE(estimator, n_features_to_select=X_train.shape[1], verbose=1)
pipeline = Pipeline(steps=[('s',rfe),('e',estimator)])

# Fit RFE directly so that its support_ / ranking_ attributes can be read in the cells below.
rfe.fit(X_train, y_train)

# Score the selection + classifier pipeline with 10-fold stratified CV repeated 3 times.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
n_scores = cross_val_score(pipeline, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')

print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

In [None]:
import numpy as np

# Column indices that RFE kept (support_ is a boolean mask over the features).
support = np.where(rfe.support_)[0]
# Per-feature rank array from RFE. The previous np.where(rfe.ranking_) simply
# listed every column, because every rank is non-zero.
ranking = rfe.ranking_
print("Selected features: ", support)

In [None]:
# Print every column that RFE marks as selected but whose rank is not 1.
# NOTE(review): scikit-learn documents rank 1 for selected features, so this
# filter presumably never matches — confirm what was meant to be listed.
for i in range(X_train.shape[1]):
  if not rfe.support_[i] or rfe.ranking_[i] == 1:
    continue
  print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))

## RFECV

In [None]:
import numpy as np
from numpy import mean, std
from sklearn.feature_selection import RFE, RFECV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold

"""
RFECV automatically finds the best features as well as the optimal number of them for best performance.
"""

In [None]:
# Linear SVC shared by the RFECV selector and the pipeline's final step.
estimator = SVC(kernel='linear', C=0.1, gamma=0.1)
rfecv = RFECV(estimator=estimator)
# The pipeline holds this selector object; it is scored later with cross_val_score.
pipeline = Pipeline(steps=[('s', rfecv), ('e', estimator)])
rfecv = rfecv.fit(X, y)

In [None]:
# RFECV driven by a random forest, scored by accuracy over 10 stratified folds.
rf = RandomForestClassifier(n_estimators= 150, max_depth = 5, max_features = 1000, min_samples_split = 10)
rfecv = RFECV(estimator=rf, cv=StratifiedKFold(10), scoring='accuracy')
rfecv.fit(X,y)
print('Optimal number of features: {}'.format(rfecv.n_features_))

# X is a plain array taken from trainset.iloc[:, :-1], so the feature labels come
# from the same columns of trainset. X_train is a numpy array with no .columns.
feature_names = trainset.columns[:-1]
features = [f for f, s in zip(feature_names, rfecv.support_) if s]
print(features)

In [None]:
# Score the selector + classifier pipeline with 10-fold stratified CV repeated 3 times.
cv = RepeatedStratifiedKFold(
    n_splits=10,
    n_repeats=3,
    random_state=0,
)
n_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='accuracy')

In [None]:
# Report mean and standard deviation of the CV scores, first as fractions and
# then multiplied by 100.
score_mean, score_std = mean(n_scores), std(n_scores)
print('Accuracy: %.3f; Std: %.3f' % (score_mean, score_std))
print('Accuracy: %.3f; Std: %.3f' % (score_mean*100, score_std*100))

In [None]:
# Column positions kept by the most recently fitted RFECV (support_ is a boolean mask).
selected_indices = np.flatnonzero(rfecv.support_)
# n_features_ is an integer count, so print it as one (it used to show as e.g. "10.000").
print('# selected features = %d' % (rfecv.n_features_))
print('List the selected features = ', selected_indices)

## SVC with Selected features

In [None]:
import pandas as pd

"""
Best ranked features based on RFE with SVC (Run RFECV block above to find out).
PS: The RFECV simulation will take a long time on CPUs. 
"""

# Keep only the ten hard-coded columns (selected features from RFECV) and display the result.
X_o = pd.DataFrame(X)[[192, 1061, 1569, 1678, 1900, 2024, 2026, 2184, 2213, 2750]]
X_o

In [None]:
# 80/20 split of the selected-feature frame, using the same seed as the earlier splits.
X_train, X_test, y_train, y_test = train_test_split(
    X_o,
    y,
    random_state=0,
    test_size=0.2,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the selected features with statistics from the training split only.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

# Linear SVC trained on the standardised selected features.
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train, y_train)

# Make predictions on the testing set
#
y_pred = classifier.predict(X_test)

# Evaluate the performance of the model
#
# Score the model's own predictions. The previous code compared y_test with a
# constant array of 20 "HR+" labels, which measured a majority-class guess and
# ignored y_pred entirely.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy * 100, "%")

## SVC with Selected Features & Randomized batch simulations [1000 runs]

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import random

# Baseline: average accuracy of a linear SVC trained on randomly chosen features,
# for comparison with the hand-picked feature subset above.
N_RUNS = 1000
N_RANDOM_FEATURES = 10

# Seeded generator so the sampled feature subsets are repeatable.
rng = random.Random(0)

# Build the frame once; its columns are the positional feature indices 0..n-1.
X_all = pd.DataFrame(X)
feature_ids = range(X_all.shape[1])

sum_acc = 0

for run in range(N_RUNS):
    X_temp = X_all[rng.sample(feature_ids, N_RANDOM_FEATURES)]

    X_train_temp, X_test_temp, y_train_temp, y_test_temp = train_test_split(X_temp, y, test_size=0.2, random_state=0)

    # Train the SVM model on THIS run's random features. It was previously fitted
    # on X_train / y_train from the selected-feature section.
    #
    classifier = SVC(kernel='linear', random_state=0)
    classifier.fit(X_train_temp, y_train_temp)

    # Make predictions on the testing set
    #
    y_pred = classifier.predict(X_test_temp)

    # Evaluate against the labels that belong to this run's split.
    #
    accuracy = accuracy_score(y_test_temp, y_pred)

    sum_acc += accuracy

# Mean accuracy over all runs as a percentage (was divided by a fixed 5).
print('Average Accuracy: ', sum_acc / N_RUNS * 100, "%")

## Random Forest CV simulations

In [None]:
# Random-forest grid search, followed by cross-validation of the search object itself.
#
# NOTE(review): rf_params lists max_features of 100/500/1000; if X_train is still
# the 10-column selected-feature matrix from the section above, those values
# exceed the number of features — confirm which X_train is intended here.
# NOTE(review): the outer CV below runs on X_test / y_test (the hold-out split)
# rather than on the training split — confirm this is deliberate.
rf_grid = GridSearchCV(estimator=RandomForestClassifier(), param_grid=rf_params, cv=inner_cv)
rf_grid.fit(X_train, y_train)
rf_best_params = rf_grid.best_params_
rf_scores = cross_val_score(estimator=rf_grid, X=X_test, y=y_test, cv=outer_cv)
print('avr RF scores:', rf_scores.mean()*100,'%')

## Creation of ROC plots


In [None]:
from sklearn.metrics import RocCurveDisplay, roc_curve
import matplotlib.pyplot as plt

In [None]:
# multi-class classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

# Grid-searched random forest that the ROC section later wraps in a one-vs-rest classifier.
rf_grid_roc = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=rf_params,
    cv=inner_cv,
)
rf_grid_roc.fit(X_train, y_train)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string class labels as integers. The ROC cells below index the
# probability columns by these integer codes.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# map numerical labels to string labels
label_map = {i: label for i, label in enumerate(le.classes_)}

# Split the REAL data for the ROC analysis. The previous code replaced X and y
# with a synthetic make_classification dataset, so the curves (labelled with the
# real class names) and every later cell using X / y ran on random toy data.
# Stratifying keeps every class present in the test split, which per-class ROC needs.
X_train, X_test, y_train_encoded, y_test = train_test_split(
    X, y_encoded, test_size=0.4, random_state=42, stratify=y_encoded
)

### Random Forest ROC plot

In [None]:
# One-vs-rest wrapper around the random-forest grid search, fitted on the encoded labels.
clf = OneVsRestClassifier(rf_grid_roc)
clf.fit(X_train, y_train_encoded)
pred = clf.predict(X_test)
pred_prob = clf.predict_proba(X_test)

# Per-class ROC ingredients, keyed by the integer class code.
fpr, tpr, thresh, auc = {}, {}, {}, {}

n_class = 3

for i in range(n_class):
    class_scores = pred_prob[:, i]
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test, class_scores, pos_label=i)
    auc[i] = roc_auc_score(y_test == i, class_scores)

# One curve per class, then the chance diagonal.
for i, color in enumerate(('orange', 'green', 'blue')):
    plt.plot(fpr[i], tpr[i], linestyle='-', color=color, label=f'{label_map[i]} vs Rest (AUC = {auc[i]:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='red', label='Chance (AUC = 0.5)')

plt.title('Multiclass One-vs-Rest Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.savefig('Multiclass ROC',dpi=300);


### SVC ROC plot

In [None]:
# fit model
# ROC curves need class probabilities, and SVC only exposes predict_proba when it
# is built with probability=True. Reuse the hyperparameters of `classifier` but
# switch probability estimates on, instead of wrapping `classifier` directly.
svc_for_roc = SVC(**{**classifier.get_params(), 'probability': True})
clf_svc = OneVsRestClassifier(svc_for_roc)
clf_svc.fit(X_train, y_train_encoded)
pred_svc = clf_svc.predict(X_test)
pred_prob_svc = clf_svc.predict_proba(X_test)

# roc curve for classes, keyed by the integer class code
fpr_svc = {}
tpr_svc = {}
thresh_svc ={}
auc_svc = {}

# One curve per class the fitted wrapper knows about.
n_class = len(clf_svc.classes_)

for i in range(n_class):
    fpr_svc[i], tpr_svc[i], thresh_svc[i] = roc_curve(y_test, pred_prob_svc[:,i], pos_label=i)
    auc_svc[i] = roc_auc_score(y_test == i, pred_prob_svc[:,i])

# plotting on a fresh figure so the curves never land on a previous plot
plt.figure()
curve_colors = ['orange', 'green', 'blue']
for i in range(n_class):
    plt.plot(fpr_svc[i], tpr_svc[i], linestyle='-', color=curve_colors[i % len(curve_colors)],
             label=f'{label_map[i]} vs Rest (AUC = {auc_svc[i]:.2f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='red', label='Chance (AUC = 0.5)')

plt.title('Multiclass One-vs-Rest Receiver Operating Characteristic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
# Separate file name: 'Multiclass ROC' is already written by the random-forest
# plot, and saving under the same name would overwrite it.
plt.savefig('Multiclass ROC SVC',dpi=300);


In [None]:
# pickle was never imported anywhere in the notebook.
import pickle

rfc = RandomForestClassifier(n_estimators = 100, max_depth = 5, max_features = 1000, min_samples_split = 10, random_state=42)

# Estimate accuracy on a hold-out that the model has not seen. The previous code
# fitted on all of X and then scored on X_test, i.e. on rows it was trained on.
X_fit, X_holdout, y_fit, y_holdout = train_test_split(X, y, test_size=0.2, random_state=0)
rfc.fit(X_fit, y_fit)

y_pred = rfc.predict(X_holdout)

accuracy = accuracy_score(y_holdout, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

# Refit on every available sample for the model that is actually shipped, and
# close the file handle deterministically.
rfc.fit(X, y)
with open('model_akhil.pkl', 'wb') as model_file:
    pickle.dump(rfc, model_file)

# Finding Estimate

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

# Frame view of X; only used to drive the fold splitter.
X_o = pd.DataFrame(X)

# 15 consecutive (unshuffled) folds.
k = 15
kf = KFold(n_splits=k)

# Same fixed random-forest configuration as used elsewhere in the notebook.
model = RandomForestClassifier(n_estimators = 100, max_depth = 5, max_features = 1000, min_samples_split = 10, random_state=42)

# Accuracy of each fold, in fold order.
acc_scores = []

for train_index, test_index in kf.split(X_o):
    # Slice features and labels for this fold.
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]

    # Refit on the fold's training rows and predict its test rows.
    y_pred = model.fit(X_train, y_train).predict(X_test)

    # Record this fold's accuracy.
    acc_score = accuracy_score(y_test, y_pred)
    acc_scores.append(acc_score)

# Mean accuracy over the k folds.
avg_acc_score = sum(acc_scores) / k

# Scale the mean accuracy to the number of rows in X, truncated to an integer.
num_correct_predictions = int(avg_acc_score * len(X))

print("Estimated number of correct predictions on unseen data:", num_correct_predictions)

In [None]:
# Scale the mean fold accuracy to 57 samples, truncate to an integer, and keep it as text.
correct_est = str(int(57 * avg_acc_score))
print("Estimated for the #of correct predictions on Competition data: ", correct_est)

# Persist the estimate; correct_est is already a string.
with open("estimate.txt", "w") as f:
    f.write(correct_est)