# Understanding tumour growth variability in patient-derived breast cancer xenograft models identifies early responders and biomarkers of resistance to PARP inhibition

D. Voulgarelis$^{1,2,3}$, J.V. Forment$^{4}$, A. Herencia Ropero$^{5}$, D. Polychronopoulos$^{3}$, J. Cohen-Setton$^{3}$, A. Bender$^{6,7}$, V. Serra$^{5}$, M.J. O’Connor$^{4*}$, J. W.T. Yates$^{2†}$, K. C. Bulusu$^{3*}$

$^{1}$AstraZeneca Postdoc Programme; $^{2}$DMPK Oncology R&D, AstraZeneca, Cambridge, UK; $^{3}$Oncology Data Science, Oncology R&D, AstraZeneca, Cambridge, UK; $^{4}$Bioscience, Oncology R&D, AstraZeneca, Cambridge, UK; $^{5}$Experimental Therapeutics Group, Vall d’Hebron Institute of Oncology, Barcelona, Spain; $^{6}$Clinical Pharmacology & Safety Sciences, AstraZeneca, Cambridge, UK; $^{7}$Centre for Molecular Informatics, Department of Chemistry, University of Cambridge, Cambridge, UK

### Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from sklearn.model_selection import StratifiedKFold

### Loading the data

In [2]:
data_df = pd.read_csv('path_to_file') #'Resistance fraction mRNA and RECIST.csv'

In [3]:
data_columns = list(data_df.columns)[6:]

In [4]:
binary_RECISTs = []
for r in list(data_df['mRECIST']):
    if r == 'PD':
        binary_RECISTs.append(0)
    else:
        binary_RECISTs.append(1)
        
data_df['binary_RECIST'] = binary_RECISTs

In [5]:
np.unique(data_df['binary_RECIST'], return_counts=True)

(array([0, 1]), array([20,  7]))

### Splitting the data into train and test sets for cross validation
We are using a stratified split to ensure consistent distribution of classes across train and test sets.

In [6]:
'''
# Non-stratified version
indices = np.random.permutation(len(data_df))
test_sets = [
    indices[:5],
    indices[5:10],
    indices[10:15],
    indices[15:21],
    indices[21:]
]
train_sets = [np.concatenate([indices[:i], indices[j:]]) for i, j in zip([0,5,10,15,21],[5,10,15,21,27])]

print(sorted(train_sets[1]), sorted(test_sets[1]))
'''

'\n# Non-stratified version\nindices = np.random.permutation(len(data_df))\ntest_sets = [\n    indices[:5],\n    indices[5:10],\n    indices[10:15],\n    indices[15:21],\n    indices[21:]\n]\ntrain_sets = [np.concatenate([indices[:i], indices[j:]]) for i, j in zip([0,5,10,15,21],[5,10,15,21,27])]\n\nprint(sorted(train_sets[1]), sorted(test_sets[1]))\n'

In [7]:
strat_k = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

indices = np.arange(len(binary_RECISTs))

for i, (train_idx, test_idx) in enumerate(strat_k.split(indices, binary_RECISTs), 1):
    print(train_idx, test_idx)

[ 1  2  3  5  6  7  8 10 11 12 14 15 16 17 18 20 21 23 24 25 26] [ 0  4  9 13 19 22]
[ 0  1  2  3  4  5  7  9 10 11 12 13 14 15 16 18 19 20 22 24 25] [ 6  8 17 21 23 26]
[ 0  1  2  3  4  6  8  9 10 11 12 13 15 16 17 18 19 21 22 23 25 26] [ 5  7 14 20 24]
[ 0  1  4  5  6  7  8  9 10 11 13 14 15 16 17 19 20 21 22 23 24 26] [ 2  3 12 18 25]
[ 0  2  3  4  5  6  7  8  9 12 13 14 17 18 19 20 21 22 23 24 25 26] [ 1 10 11 15 16]


### Classification of RECIST values

In [8]:
feature_importances_gbc = pd.DataFrame(index = data_columns)
feature_importances_rf = pd.DataFrame(index = data_columns)
feature_importances_lr = pd.DataFrame(index = data_columns)

'''
for n in range(5): # if using the non-stratified version
    train_idxs = train_sets[n]
    test_idxs = test_sets[n]
'''

accs_gbc = []
accs_rf = []
accs_lr = []

f1s_gbc = []
f1s_rf = []
f1s_lr = []

aucs_gbc = []
aucs_rf = []
aucs_lr = []


for n, (train_idxs, test_idxs) in enumerate(strat_k.split(indices, binary_RECISTs), 1):
    
    X_train = data_df.loc[train_idxs][data_columns].values
    y_train = list(data_df.loc[train_idxs]['binary_RECIST'])
    
    X_test = data_df.loc[test_idxs][data_columns].values
    y_test = list(data_df.loc[test_idxs]['binary_RECIST'])
    
    print('Fold: ' + str(n), '\n')
    
    # Fitting a gradient boosting classifier and making predictions on the test set
    gbc = GradientBoostingClassifier().fit(X_train, y_train)
    y_pred_gbc = gbc.predict_proba(X_test)[:,1]
    y_pred_bin_gbc = gbc.predict(X_test)
    
    acc_gbc = accuracy_score(y_test, y_pred_bin_gbc)
    f1_gbc = f1_score(y_test, y_pred_bin_gbc)
    try:
        auc_gbc = roc_auc_score(y_test, y_pred_gbc)
    except:
        auc_gbc = 'AUC not applicable'
        
    print('Gradient Boosting:', 'Accuracy:', acc_gbc, ', F1:', f1_gbc, ', AUC:', auc_gbc)
    
    accs_gbc.append(acc_gbc)
    f1s_gbc.append(f1_gbc)
    aucs_gbc.append(auc_gbc)
        
    # Fitting a random forest classifier and making predictions on the test set
    rf = RandomForestClassifier().fit(X_train, y_train)
    y_pred_rf = rf.predict_proba(X_test)[:,1]
    y_pred_bin_rf = rf.predict(X_test)
    
    acc_rf = accuracy_score(y_test, y_pred_bin_rf)
    f1_rf = f1_score(y_test, y_pred_bin_rf)
    try:
        auc_rf = roc_auc_score(y_test, y_pred_rf)
    except:
        auc_rf = 'AUC not applicable'
    
    
    print('Random Forest:', 'Accuracy:', acc_rf, ', F1:', f1_rf, ', AUC:', auc_rf)
    
    accs_rf.append(acc_rf)
    f1s_rf.append(f1_rf)
    aucs_rf.append(auc_rf)
    
    # Fitting a Logistic Regression model and making predictions on the test set
    lr = LogisticRegression(max_iter=500).fit(X_train, y_train)
    y_pred_lr = lr.predict_proba(X_test)[:,1]
    y_pred_bin_lr = lr.predict(X_test)
    
    acc_lr = accuracy_score(y_test, y_pred_bin_lr)
    f1_lr = f1_score(y_test, y_pred_bin_lr)
    try:
        auc_lr = roc_auc_score(y_test, y_pred_lr)
    except:
        auc_lr = 'AUC not applicable'
    
    
    print('Logistic Regression:', 'Accuracy:', acc_lr, ', F1:', f1_lr, ', AUC:', auc_lr)
    
    accs_lr.append(acc_lr)
    f1s_lr.append(f1_lr)
    aucs_lr.append(auc_lr)
    
    # Extracting feature importances
    feature_importances_gbc['Fold ' + str(n)] = gbc.feature_importances_
    feature_importances_rf['Fold ' + str(n)] = rf.feature_importances_
    feature_importances_lr['Fold ' + str(n)] = lr.coef_[0]
    
    print('\n\n')

Fold: 1 

Gradient Boosting: Accuracy: 0.5 , F1: 0.4 , AUC: 0.5
Random Forest: Accuracy: 0.6666666666666666 , F1: 0.0 , AUC: 0.375
Logistic Regression: Accuracy: 0.6666666666666666 , F1: 0.0 , AUC: 0.75



Fold: 2 

Gradient Boosting: Accuracy: 0.5 , F1: 0.4 , AUC: 0.5
Random Forest: Accuracy: 0.6666666666666666 , F1: 0.0 , AUC: 0.75
Logistic Regression: Accuracy: 0.5 , F1: 0.0 , AUC: 0.5



Fold: 3 

Gradient Boosting: Accuracy: 0.8 , F1: 0.0 , AUC: 0.25
Random Forest: Accuracy: 0.8 , F1: 0.0 , AUC: 1.0
Logistic Regression: Accuracy: 1.0 , F1: 1.0 , AUC: 1.0



Fold: 4 

Gradient Boosting: Accuracy: 0.8 , F1: 0.6666666666666666 , AUC: 0.875
Random Forest: Accuracy: 0.8 , F1: 0.0 , AUC: 0.25
Logistic Regression: Accuracy: 0.8 , F1: 0.0 , AUC: 0.25



Fold: 5 

Gradient Boosting: Accuracy: 1.0 , F1: 1.0 , AUC: 1.0
Random Forest: Accuracy: 0.8 , F1: 0.0 , AUC: 1.0
Logistic Regression: Accuracy: 0.6 , F1: 0.0 , AUC: 0.75





In [9]:
mean_acc_gbc = np.mean(accs_gbc)
mean_f1_gbc = np.mean(f1s_gbc)
mean_auc_gbc = np.mean(aucs_gbc)

mean_acc_rf = np.mean(accs_rf)
mean_f1_rf = np.mean(f1s_rf)
mean_auc_rf = np.mean(aucs_rf)

mean_acc_lr = np.mean(accs_lr)
mean_f1_lr = np.mean(f1s_lr)
mean_auc_lr = np.mean(aucs_lr)

print(mean_acc_lr, mean_f1_lr, mean_auc_lr)

0.7133333333333334 0.2 0.65


### Aggregating important features

In [10]:
feature_importances_gbc['mean_importance'] = feature_importances_gbc.mean(axis=1)
feature_importances_rf['mean_importance'] = feature_importances_rf.mean(axis=1)
feature_importances_lr['mean_weight'] = feature_importances_lr.mean(axis=1) 
# Logistic regression feature extraction works slightly differently, as weights can be positive or negative.
# For logistic regression, negative weights imply a relationship with the '0' class in binary classification
# if your input data is positive.

In [11]:
feature_importances_gbc.sort_values(by='mean_importance', ascending=False)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,mean_importance
KIF23,0.0,1.021309e-07,0.594184,0.784314,0.292779,0.334255
DNMT1,1.0,0.000000e+00,0.004507,0.000000,0.000121,0.200925
SRSF2,0.0,4.895156e-01,0.000572,0.000000,0.000081,0.098034
ASNS,0.0,2.940222e-01,0.000000,0.000000,0.031644,0.065133
CTAG1B,0.0,0.000000e+00,0.110033,0.051964,0.045853,0.041570
...,...,...,...,...,...,...
IDH1,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000
IGF2,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000
IKZF1,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000
IL6,0.0,0.000000e+00,0.000000,0.000000,0.000000,0.000000


In [12]:
feature_importances_rf.sort_values(by='mean_importance', ascending=False)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,mean_importance
BIRC5,0.032359,0.028957,0.023306,0.002444,0.055967,0.028607
KIF23,0.020000,0.012106,0.042954,0.033697,0.020818,0.025915
SMARCB1,0.048066,0.026071,0.011294,0.003492,0.010000,0.019785
PTPN11,0.018125,0.005882,0.038195,0.008167,0.020503,0.018175
ASNS,0.007812,0.024399,0.013668,0.024932,0.019967,0.018156
...,...,...,...,...,...,...
BRAF,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
RRM2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
HGF,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
BIRC3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [13]:
feature_importances_lr.sort_values(by='mean_weight', ascending=False)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,mean_weight
CTAG1B,0.155227,0.280230,0.422331,0.410267,0.394186,0.332448
ALCAM,0.176145,0.135400,0.207341,0.110831,0.127230,0.151389
CTAG2,0.253365,0.070837,0.073239,0.173859,0.161940,0.146648
PTEN,0.200457,0.005845,0.124930,0.165159,0.222162,0.143711
PRKAA2,0.154942,0.122740,0.164724,0.085900,0.159802,0.137622
...,...,...,...,...,...,...
IRS2,-0.055499,-0.235964,-0.139651,-0.174069,-0.147408,-0.150518
STMN1,-0.154497,-0.141567,-0.158995,-0.152297,-0.202972,-0.162065
ASNS,-0.173067,-0.164915,-0.139546,-0.139955,-0.196351,-0.162767
BRCA1,-0.235328,-0.127588,-0.241059,-0.185449,-0.157894,-0.189464


### Regression on FR

In [15]:
from sklearn.metrics import mean_squared_error

feature_importances_gbc_regression = pd.DataFrame(index = data_columns)
feature_importances_rf_regression = pd.DataFrame(index = data_columns)
feature_importances_lr_regression = pd.DataFrame(index = data_columns)

rmses_gbc = []
rmses_rf = []
rmses_lr = []

for n, (train_idxs, test_idxs) in enumerate(strat_k.split(indices, binary_RECISTs), 1):
    
    X_train = data_df.loc[train_idxs][data_columns].values
    y_train = list(data_df.loc[train_idxs]['FR'])
    
    X_test = data_df.loc[test_idxs][data_columns].values
    y_test = list(data_df.loc[test_idxs]['FR'])
    
    print('Fold: ' + str(n), '\n')
    
    # Fitting a gradient boosting regressor and making predictions on the test set
    gbc = GradientBoostingRegressor().fit(X_train, y_train)
    y_pred_gbc = gbc.predict(X_test)
    rmse_gbc = mean_squared_error(y_test, y_pred_gbc, squared=False)
        
    print('Gradient Boosting RMSE:', rmse_gbc)
        
    # Fitting a random forest regressor and making predictions on the test set
    rf = RandomForestRegressor().fit(X_train, y_train)
    y_pred_rf = rf.predict(X_test)
    rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
    
    print('Random Forest RMSE:', rmse_rf)
    
    # Fitting a Linear Regression model and making predictions on the test set
    lr = LinearRegression().fit(X_train, y_train)
    y_pred_lr = lr.predict(X_test)
    rmse_lr = mean_squared_error(y_test, y_pred_lr, squared=False)
    
    print('Linear Regression RMSE:', rmse_lr)
    
    # Extracting feature importances
    feature_importances_gbc_regression['Fold ' + str(n)] = gbc.feature_importances_
    feature_importances_rf_regression['Fold ' + str(n)] = rf.feature_importances_
    feature_importances_lr_regression['Fold ' + str(n)] = lr.coef_
    
    rmses_gbc.append(rmse_gbc)
    rmses_rf.append(rmse_rf)
    rmses_lr.append(rmse_lr)
    
    print('\n\n')

Fold: 1 

Gradient Boosting RMSE: 0.34793763606201467
Random Forest RMSE: 0.27566154421495137
Linear Regression RMSE: 0.3123230835447615



Fold: 2 

Gradient Boosting RMSE: 0.18344293812464446
Random Forest RMSE: 0.29153518163782144
Linear Regression RMSE: 0.38575968814548445



Fold: 3 

Gradient Boosting RMSE: 0.16844826689796374
Random Forest RMSE: 0.27914056290361317
Linear Regression RMSE: 0.2852446422234919



Fold: 4 

Gradient Boosting RMSE: 0.4265110954777959
Random Forest RMSE: 0.33817754229709196
Linear Regression RMSE: 0.3717796304668118



Fold: 5 

Gradient Boosting RMSE: 0.40887894835078453
Random Forest RMSE: 0.38255530960295503
Linear Regression RMSE: 0.29762767982652044





In [16]:
mean_rmse_gbc = np.mean(rmses_gbc)
mean_rmse_rf = np.mean(rmses_rf)
mean_rmse_lr = np.mean(rmses_lr)

print(mean_rmse_lr)

0.33054694484141406


### Aggregating important features

In [17]:
feature_importances_gbc_regression['mean_importance'] = feature_importances_gbc_regression.mean(axis=1)
feature_importances_rf_regression['mean_importance'] = feature_importances_rf_regression.mean(axis=1)
feature_importances_lr_regression['mean_weight'] = feature_importances_lr_regression.mean(axis=1) 

In [18]:
feature_importances_gbc_regression.sort_values(by='mean_importance', ascending=False)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,mean_importance
WEE1,0.002540,5.573566e-01,6.434921e-01,8.170467e-01,6.915182e-05,4.041010e-01
BIRC5,0.000044,7.903801e-05,1.459831e-05,3.247748e-09,8.058391e-01,1.611953e-01
DNMT1,0.621210,6.764859e-06,1.364842e-08,0.000000e+00,1.562516e-04,1.242746e-01
PIK3R1,0.103091,0.000000e+00,5.998884e-06,1.409289e-03,1.557876e-06,2.090153e-02
TOP1,0.090270,4.380599e-08,0.000000e+00,2.199939e-05,7.855052e-09,1.805831e-02
...,...,...,...,...,...,...
VEGFA,0.000000,7.614061e-11,0.000000e+00,0.000000e+00,0.000000e+00,1.522812e-11
DEFA1,0.000000,3.640093e-11,0.000000e+00,0.000000e+00,0.000000e+00,7.280187e-12
FGFR3,0.000000,1.305436e-11,0.000000e+00,0.000000e+00,0.000000e+00,2.610873e-12
CTAG2,0.000000,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00


In [19]:
feature_importances_rf_regression.sort_values(by='mean_importance', ascending=False)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,mean_importance
WEE1,9.846156e-02,1.039598e-01,6.635692e-02,3.278945e-01,4.096376e-02,1.275273e-01
BIRC5,4.068037e-02,2.707658e-02,9.182347e-02,4.127016e-02,3.169795e-01,1.035660e-01
NCOA2,8.957957e-03,4.436437e-02,9.189420e-02,8.512777e-03,7.673276e-05,3.076121e-02
SRSF2,3.145667e-02,5.947551e-02,4.591485e-02,2.383009e-05,8.306323e-03,2.903544e-02
PBK,4.208618e-02,0.000000e+00,1.590687e-02,8.169484e-03,6.403157e-02,2.603882e-02
...,...,...,...,...,...,...
FOXP1,5.106910e-08,7.268007e-07,1.020286e-05,1.149565e-05,0.000000e+00,4.495278e-06
ASCL1,2.140627e-06,4.126101e-06,2.921789e-06,1.542437e-06,4.431779e-06,3.032547e-06
EFNA2,8.759570e-07,1.718023e-06,0.000000e+00,2.962272e-06,5.048031e-08,1.121346e-06
FGF19,0.000000e+00,9.249390e-11,0.000000e+00,2.280883e-07,1.342105e-06,3.140572e-07


In [20]:
feature_importances_lr_regression.sort_values(by='mean_weight', ascending=False)

Unnamed: 0,Fold 1,Fold 2,Fold 3,Fold 4,Fold 5,mean_weight
CDK6,0.023344,0.025780,0.023612,0.021492,0.026634,0.024172
BCL2,0.019763,0.019984,0.016553,0.013392,0.016125,0.017163
ASNS,0.017238,0.014240,0.010751,0.013157,0.019186,0.014915
DNMT1,0.018822,0.016006,0.015459,0.009153,0.013001,0.014488
ERCC5,0.013456,0.017352,0.013816,0.010284,0.010970,0.013175
...,...,...,...,...,...,...
ALCAM,-0.016654,-0.013775,-0.018366,-0.009181,-0.016169,-0.014829
AEBP1,-0.009432,-0.004923,-0.027738,-0.018801,-0.019934,-0.016166
PTEN,-0.015857,-0.009255,-0.013903,-0.029915,-0.025227,-0.018831
CTAG2,-0.030920,-0.015034,-0.010777,-0.022720,-0.024873,-0.020865
