# Classification Task with Early Data Integration

In [None]:
# Import libraries for feature scaling, model selection, SVC, kNN and comparison metrics
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score, plot_roc_curve

In [None]:
# Load the flux table from disk
metabolic_data = pd.read_csv('fluxes.csv', encoding='utf-8')
# Keep only the columns that hold at least one flux of magnitude >= 1e-7;
# columns whose values are all smaller than that are treated as null fluxes
has_active_flux = metabolic_data.abs().ge(1e-7).any(axis=0)
metabolic_data = metabolic_data.loc[:, has_active_flux]
# Load the gene expression table from disk
gene_expression_data = pd.read_csv('gene_expression_data.csv', encoding='utf-8')

In [None]:
# Early integration: place the flux columns and the gene expression columns side by side.
# pd.concat(axis=1) aligns rows on the index, so both tables are assumed to describe
# the same samples in the same order -- TODO confirm against the CSV files.
dataset = pd.concat([metabolic_data, gene_expression_data], axis=1)
# Split by position: every column but the last is a feature, the last column is the label.
# Positional indexing stays correct even when the two tables share a column name;
# label-based selection would return duplicated feature columns (or a 2-D label) in that case.
X = dataset.iloc[:, :-1]
Y = dataset.iloc[:, -1]
# Encode the class labels as integers 0..n_classes-1
Y = LabelEncoder().fit_transform(Y)

In [None]:
# Fraction of the samples held out as the test set
percent_test = 0.3
# Fixed seed so that the shuffled split is identical on every run
rndm_state = 0
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=percent_test,
    random_state=rndm_state,
    shuffle=True,
)

In [None]:
# Learn the per-feature scaling statistics from the training set only
stdscaler = StandardScaler().fit(X_train)
# Standardize the training features with those statistics
X_train = stdscaler.transform(X_train)
# Apply the same training-set statistics to the test features
X_test = stdscaler.transform(X_test)

In [None]:
# Hyper-parameter grid for the SVC search: candidate kernels and candidate values of C
svc_params = dict(
    kernel=['poly', 'rbf', 'sigmoid'],
    C=[1, 5, 10],
)

In [None]:
# Exhaustive search over svc_params with 10-fold cross-validation on the training data.
# NOTE(review): no `scoring` is passed, so candidates are ranked with the estimator's
# default score, while the test-set evaluation later in the notebook reports balanced
# accuracy -- confirm that this mismatch is intended.
grd_search_scv = GridSearchCV(SVC(), svc_params, cv=10)
grd_search_scv.fit(X_train, Y_train)

In [None]:
# Take the best SVC found by the grid search.
# GridSearchCV refits the winning configuration on the whole training set by default
# (refit=True), so best_estimator_ is already trained with the optimal parameters.
# Reusing it avoids a redundant second fit and guarantees that every tuned parameter
# is carried over, even if more entries are added to the grid later.
svc = grd_search_scv.best_estimator_
# Display the selected model as the cell output
svc

In [None]:
# Hyper-parameter grid for the kNN search: number of neighbours and the power p of the
# Minkowski metric (p=1 is the Manhattan distance, p=2 the Euclidean distance)
knn_params = {
    'n_neighbors': [5, 7, 9],
    'p': [1, 2]
    }
# Exhaustive search over knn_params with 10-fold cross-validation on the training data
grd_search_knn = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=knn_params, cv=10)
grd_search_knn.fit(X_train, Y_train)
# Take the best kNN found by the grid search.
# GridSearchCV refits the winning configuration on the whole training set by default
# (refit=True), so best_estimator_ is already trained with the optimal parameters and
# no manual re-construction from best_params_ is needed.
kNN = grd_search_knn.best_estimator_
# Display the selected model as the cell output
kNN

In [None]:
# Balanced accuracy of the tuned SVC on the held-out test set
svc_acc = balanced_accuracy_score(Y_test, svc.predict(X_test))
# Balanced accuracy of the tuned kNN on the same test set
Y_pred = kNN.predict(X_test)
knn_acc = balanced_accuracy_score(Y_test, Y_pred)
# Report the two balanced accuracy scores on one line
print('SVC accuracy: {}, kNN accuracy: {}'.format(svc_acc, knn_acc))

# Draw on the current axes (matplotlib creates them if none exist yet)
ax = plt.gca()
# Overlay the ROC curves of both classifiers on those axes: SVC first, then kNN.
# NOTE(review): plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# newer releases offer RocCurveDisplay.from_estimator instead -- check the pinned version.
svc_roc, knn_roc = (
    plot_roc_curve(model, X_test, Y_test, ax=ax)
    for model in (svc, kNN)
)
# Title the shared figure and render it
knn_roc.figure_.suptitle('ROC curves')
plt.show()