### Predicting drug response on cell lines from gene expression data with SVMs

In [4]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import pandas as pd

# Step 1: Load and preprocess the dataset 

gene_expression = pd.read_csv("/Users/deeaciobi/Documents/TU:e/Machine learning for medical imaging and biology/data/RNA_expression_curated.csv", sep=',', header=0, index_col=0)
drug_response = pd.read_csv("/Users/deeaciobi/Documents/TU:e/Machine learning for medical imaging and biology/data/drug_response_curated.csv", sep=',', header=0, index_col=0)

X = gene_expression.values  
y = (drug_response.values > 0).astype(int)  # Convert drug response to binary labels

# Step 2: Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

y_train = y_train.ravel()
y_test = y_test.ravel()

# Step 3: Define a grid of hyper-parameters to search over
param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['rbf', 'linear']
}

# Step 4: Use GridSearchCV to perform cross-validation and find the best hyper-parameters
svm = SVC()
grid_search = GridSearchCV(svm, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Step 5: Train the SVM classifier with the best parameters
best_svm = grid_search.best_estimator_

# Step 6: Evaluate the precision of predictions manually
y_pred = best_svm.predict(X_test)
precision_manual = np.sum(y_pred == y_test) / len(y_test)

# Step 7: Compare precision with the results from classification_report
classification_rep = classification_report(y_test, y_pred, zero_division=1)

print(f"Precision (Manual Calculation): {precision_manual}")
print(f"Classification Report:\n{classification_rep}")


Precision (Manual Calculation): 0.8
Classification Report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        24
           1       1.00      0.00      0.00         6

    accuracy                           0.80        30
   macro avg       0.90      0.50      0.44        30
weighted avg       0.84      0.80      0.71        30



### Random forests

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# Step 1: Load and preprocess the dataset 

gene_expression = pd.read_csv("/Users/deeaciobi/Documents/TU:e/Machine learning for medical imaging and biology/data/RNA_expression_curated.csv", sep=',', header=0, index_col=0)
drug_response = pd.read_csv("/Users/deeaciobi/Documents/TU:e/Machine learning for medical imaging and biology/data/drug_response_curated.csv", sep=',', header=0, index_col=0)

X = gene_expression.values  
y = (drug_response.values > 0).astype(int)  # Convert drug response to binary labels

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert y to 1-dimensional array
y_train = y_train.ravel()
y_test = y_test.ravel()

# SVM Classifier
svm_param_grid = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['rbf', 'linear']
}

svm = SVC()
svm_grid_search = GridSearchCV(svm, svm_param_grid, cv=5)
svm_grid_search.fit(X_train, y_train)
best_svm = svm_grid_search.best_estimator_

y_pred_svm = best_svm.predict(X_test)
precision_svm = np.sum(y_pred_svm == y_test) / len(y_test)
classification_rep_svm = classification_report(y_test, y_pred_svm, zero_division=1)

# Random Forest Classifier
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10]
}

rf = RandomForestClassifier()
rf_grid_search = GridSearchCV(rf, rf_param_grid, cv=5)
rf_grid_search.fit(X_train, y_train)
best_rf = rf_grid_search.best_estimator_

y_pred_rf = best_rf.predict(X_test)
precision_rf = np.sum(y_pred_rf == y_test) / len(y_test)
classification_rep_rf = classification_report(y_test, y_pred_rf, zero_division=1)


# Print results
print("SVM Classifier:")
print(f"Precision (Manual Calculation): {precision_svm}")
print(f"Classification Report:\n{classification_rep_svm}\n")

print("Random Forest Classifier:")
print(f"Precision (Manual Calculation): {precision_rf}")
print(f"Classification Report:\n{classification_rep_rf}\n")


SVM Classifier:
Precision (Manual Calculation): 0.8
Classification Report:
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        24
           1       1.00      0.00      0.00         6

    accuracy                           0.80        30
   macro avg       0.90      0.50      0.44        30
weighted avg       0.84      0.80      0.71        30


Random Forest Classifier:
Precision (Manual Calculation): 0.8666666666666667
Classification Report:
              precision    recall  f1-score   support

           0       0.86      1.00      0.92        24
           1       1.00      0.33      0.50         6

    accuracy                           0.87        30
   macro avg       0.93      0.67      0.71        30
weighted avg       0.89      0.87      0.84        30


