### Notebook:
- Fits logistic regression models (`mgcv::gam`, binomial family with logit link) to the discovery and validation cohorts
    - P(mutation) ~ $\log$(PHBR-I) * $\log$(PHBR-II) * sex
    - P(mutation) ~ $\log$(PHBR-I) * $\log$(PHBR-II) * age

In [1]:
library(mgcv)
library(oddsratio)

Loading required package: nlme
This is mgcv 1.8-23. For overview type 'help("mgcv-package")'.


In [22]:
# install.packages('oddsratio')

### 1. Discovery cohort
- TCGA

##### 1A. Load discovery cohort

In [4]:
# Read the discovery-cohort model input, then print its dimensions and the
# first rows of the columns used in the models below.
discovery_data <- read.csv("../generated_data/gam_input.expressed_mutations.2.csv.gz")
cat(dim(discovery_data))
head(
  discovery_data[c(
    "y..has_mutation.",
    "centered_log_phbrI",
    "centered_log_phbrII",
    "centered_sex",
    "patient_ids",
    "disease"
  )]
)

472264 15

y..has_mutation.,centered_log_phbrI,centered_log_phbrII,centered_sex,patient_ids,disease
0,3.890015,1.7543377,-0.4079498,TCGA-02-0047,TCGA-GBM
0,0.5786314,0.7571816,-0.4079498,TCGA-02-0047,TCGA-GBM
0,1.6918168,0.3475884,-0.4079498,TCGA-02-0047,TCGA-GBM
0,0.3443303,-0.6320063,-0.4079498,TCGA-02-0047,TCGA-GBM
0,-3.7273039,1.1537346,-0.4079498,TCGA-02-0047,TCGA-GBM
0,-4.1717617,0.7393688,-0.4079498,TCGA-02-0047,TCGA-GBM


### 1B. Table 1

##### Age

In [5]:
# Binomial (logit) model of mutation presence on the full three-way interaction
# centered_log_phbrI x centered_log_phbrII x centered_age, discovery cohort.
# NOTE(review): `random` is an argument of mgcv::gamm(), not mgcv::gam(); here
# it appears to be absorbed by `...`, so no per-patient random intercept would
# be fitted -- TODO confirm (in gam() a random effect is normally written as
# s(patient_ids, bs = "re") inside the formula).
g_age <- mgcv::gam(
  formula = y..has_mutation. ~
    centered_log_phbrI * centered_log_phbrII * centered_age,
  family = binomial(link = "logit"),
  data = discovery_data,
  random = list(patient_ids = ~1)
)
summary(g_age)


Family: binomial 
Link function: logit 

Formula:
y..has_mutation. ~ centered_log_phbrI * centered_log_phbrII * 
    centered_age

Parametric coefficients:
                                                     Estimate Std. Error
(Intercept)                                         -5.324911   0.022854
centered_log_phbrI                                   0.095362   0.018580
centered_log_phbrII                                  0.278578   0.023209
centered_age                                        -0.002545   0.001495
centered_log_phbrI:centered_log_phbrII              -0.067884   0.016343
centered_log_phbrI:centered_age                     -0.001148   0.001221
centered_log_phbrII:centered_age                    -0.004346   0.001532
centered_log_phbrI:centered_log_phbrII:centered_age  0.002300   0.001081
                                                     z value Pr(>|z|)    
(Intercept)                                         -232.999  < 2e-16 ***
centered_log_phbrI                    

##### Sex 


In [6]:
# Binomial (logit) model of mutation presence on the full three-way interaction
# centered_log_phbrI x centered_log_phbrII x centered_sex, discovery cohort.
# NOTE(review): `random` is an argument of mgcv::gamm(), not mgcv::gam(); here
# it appears to be absorbed by `...`, so no per-patient random intercept would
# be fitted -- TODO confirm (in gam() a random effect is normally written as
# s(patient_ids, bs = "re") inside the formula).
g_sex <- mgcv::gam(
  formula = y..has_mutation. ~
    centered_log_phbrI * centered_log_phbrII * centered_sex,
  family = binomial(link = "logit"),
  data = discovery_data,
  random = list(patient_ids = ~1)
)
summary(g_sex)


Family: binomial 
Link function: logit 

Formula:
y..has_mutation. ~ centered_log_phbrI * centered_log_phbrII * 
    centered_sex

Parametric coefficients:
                                                    Estimate Std. Error
(Intercept)                                         -5.32352    0.02283
centered_log_phbrI                                   0.09454    0.01859
centered_log_phbrII                                  0.27929    0.02324
centered_sex                                        -0.04618    0.04676
centered_log_phbrI:centered_log_phbrII              -0.06717    0.01638
centered_log_phbrI:centered_sex                      0.04004    0.03828
centered_log_phbrII:centered_sex                     0.11891    0.04800
centered_log_phbrI:centered_log_phbrII:centered_sex -0.01882    0.03381
                                                     z value Pr(>|z|)    
(Intercept)                                         -233.137  < 2e-16 ***
centered_log_phbrI                             

##### Odds ratio analysis
- Quantifies the influence of both PHBR-I and PHBR-II scores on probability of mutation using odds ratios between respective 25th and 75th percentiles

In [14]:
# Binomial GAM with a single bivariate smooth over both centered log PHBR
# scores; this fit is the model handed to or_gam() for the odds-ratio table.
# NOTE(review): `random` is an argument of mgcv::gamm(), not mgcv::gam(); here
# it appears to be absorbed by `...` and therefore to have no effect -- TODO
# confirm.
gam_compare <- mgcv::gam(
  formula = y..has_mutation. ~ s(centered_log_phbrI, centered_log_phbrII),
  family = "binomial",
  data = discovery_data,
  random = list(patient_ids = ~1)
)

In [15]:
# 25th / 75th percentiles of each centered log PHBR score (unnamed scalars);
# *_x refers to PHBR-I and *_z to PHBR-II.
low_x <- quantile(discovery_data[["centered_log_phbrI"]], probs = 0.25, names = FALSE)
high_x <- quantile(discovery_data[["centered_log_phbrI"]], probs = 0.75, names = FALSE)
low_z <- quantile(discovery_data[["centered_log_phbrII"]], probs = 0.25, names = FALSE)
high_z <- quantile(discovery_data[["centered_log_phbrII"]], probs = 0.75, names = FALSE)

In [23]:
# Odds ratio from the bivariate-smooth fit when one PHBR score moves from its
# 25th to its 75th percentile: results1 for PHBR-I, results2 for PHBR-II.
results1 <- or_gam(
  data = discovery_data,
  model = gam_compare,
  pred = "centered_log_phbrI",
  values = c(low_x, high_x)
)
results2 <- or_gam(
  data = discovery_data,
  model = gam_compare,
  pred = "centered_log_phbrII",
  values = c(low_z, high_z)
)

In [24]:
# Two-slot list containers (one slot per PHBR score) for the odds-ratio table.
predicted <- vector("list", 2)
CI_high <- vector("list", 2)
CI_low <- vector("list", 2)
OR <- vector("list", 2)

In [25]:
# Slot 1: values taken from results1 (the centered_log_phbrI odds-ratio call).
predicted[[1]] <- results1[["predictor"]]
OR[[1]] <- results1[["oddsratio"]]
CI_low[[1]] <- results1[["CI_low (2.5%)"]]
CI_high[[1]] <- results1[["CI_high (97.5%)"]]

# Slot 2: values taken from results2 (the centered_log_phbrII odds-ratio call).
predicted[[2]] <- results2[["predictor"]]
OR[[2]] <- results2[["oddsratio"]]
CI_low[[2]] <- results2[["CI_low (2.5%)"]]
CI_high[[2]] <- results2[["CI_high (97.5%)"]]

In [26]:
# Column-bind the four containers into one table, one row per PHBR score.
cbind(OR, CI_low, CI_high, predicted)

OR,CI_low,CI_high,predicted
1.252424,1.232899,1.272258,centered_log_phbrI
2.1066,2.012956,2.204599,centered_log_phbrII


### 1C. Table S1: PHBR-II * sex * age

In [7]:
# Binomial (logit) model of mutation presence on the full three-way interaction
# centered_log_phbrII x centered_sex x centered_age, discovery cohort.
# NOTE(review): `random` is an argument of mgcv::gamm(), not mgcv::gam(); here
# it appears to be absorbed by `...`, so no per-patient random intercept would
# be fitted -- TODO confirm (in gam() a random effect is normally written as
# s(patient_ids, bs = "re") inside the formula).
g_sex_age_II <- mgcv::gam(
  formula = y..has_mutation. ~
    centered_log_phbrII * centered_sex * centered_age,
  family = binomial(link = "logit"),
  data = discovery_data,
  random = list(patient_ids = ~1)
)
summary(g_sex_age_II)


Family: binomial 
Link function: logit 

Formula:
y..has_mutation. ~ centered_log_phbrII * centered_sex * centered_age

Parametric coefficients:
                                               Estimate Std. Error  z value
(Intercept)                                   -5.351721   0.021996 -243.299
centered_log_phbrII                            0.306101   0.021243   14.409
centered_sex                                  -0.053199   0.045179   -1.178
centered_age                                  -0.002007   0.001449   -1.386
centered_log_phbrII:centered_sex               0.122354   0.043764    2.796
centered_log_phbrII:centered_age              -0.003560   0.001398   -2.546
centered_sex:centered_age                      0.004639   0.002946    1.575
centered_log_phbrII:centered_sex:centered_age -0.003794   0.002852   -1.330
                                              Pr(>|z|)    
(Intercept)                                    < 2e-16 ***
centered_log_phbrII                            < 2e-

### 1D. Table S2: Excluding cancer types with significant mutational signature ratios

In [8]:
# Disease codes to exclude from the discovery cohort for this table.
sig_mutsig_tumor_types <- c(
  "TCGA-LIHC",
  "TCGA-GBM",
  "TCGA-HNSC",
  "TCGA-SKCM",
  "TCGA-STAD"
)

# Keep only rows whose disease is not in the exclusion list, then print the
# dimensions of the reduced table.
discovery_data_mutsig <- discovery_data[
  !(discovery_data[["disease"]] %in% sig_mutsig_tumor_types),
]
cat(dim(discovery_data_mutsig))

319865 15

##### Age

In [9]:
# Same PHBR-I x PHBR-II x age binomial (logit) model, fitted on the discovery
# subset discovery_data_mutsig.
# NOTE(review): `random` is an argument of mgcv::gamm(), not mgcv::gam(); here
# it appears to be absorbed by `...`, so no per-patient random intercept would
# be fitted -- TODO confirm (in gam() a random effect is normally written as
# s(patient_ids, bs = "re") inside the formula).
g_age_mutsig <- mgcv::gam(
  formula = y..has_mutation. ~
    centered_log_phbrI * centered_log_phbrII * centered_age,
  family = binomial(link = "logit"),
  data = discovery_data_mutsig,
  random = list(patient_ids = ~1)
)
summary(g_age_mutsig)


Family: binomial 
Link function: logit 

Formula:
y..has_mutation. ~ centered_log_phbrI * centered_log_phbrII * 
    centered_age

Parametric coefficients:
                                                     Estimate Std. Error
(Intercept)                                         -5.316521   0.027690
centered_log_phbrI                                   0.149520   0.023222
centered_log_phbrII                                  0.306862   0.028765
centered_age                                        -0.001905   0.001758
centered_log_phbrI:centered_log_phbrII              -0.117922   0.021048
centered_log_phbrI:centered_age                     -0.002548   0.001483
centered_log_phbrII:centered_age                    -0.004670   0.001842
centered_log_phbrI:centered_log_phbrII:centered_age  0.002755   0.001349
                                                     z value Pr(>|z|)    
(Intercept)                                         -192.004  < 2e-16 ***
centered_log_phbrI                    

##### Sex

In [10]:
# Same PHBR-I x PHBR-II x sex binomial (logit) model, fitted on the discovery
# subset discovery_data_mutsig.
# NOTE(review): `random` is an argument of mgcv::gamm(), not mgcv::gam(); here
# it appears to be absorbed by `...`, so no per-patient random intercept would
# be fitted -- TODO confirm (in gam() a random effect is normally written as
# s(patient_ids, bs = "re") inside the formula).
g_sex_mutsig <- mgcv::gam(
  formula = y..has_mutation. ~
    centered_log_phbrI * centered_log_phbrII * centered_sex,
  family = binomial(link = "logit"),
  data = discovery_data_mutsig,
  random = list(patient_ids = ~1)
)
summary(g_sex_mutsig)


Family: binomial 
Link function: logit 

Formula:
y..has_mutation. ~ centered_log_phbrI * centered_log_phbrII * 
    centered_sex

Parametric coefficients:
                                                    Estimate Std. Error
(Intercept)                                         -5.30919    0.02759
centered_log_phbrI                                   0.14787    0.02318
centered_log_phbrII                                  0.30264    0.02867
centered_sex                                        -0.06717    0.05558
centered_log_phbrI:centered_log_phbrII              -0.11556    0.02104
centered_log_phbrI:centered_sex                      0.04263    0.04695
centered_log_phbrII:centered_sex                     0.10378    0.05817
centered_log_phbrI:centered_log_phbrII:centered_sex -0.02519    0.04268
                                                     z value Pr(>|z|)    
(Intercept)                                         -192.450  < 2e-16 ***
centered_log_phbrI                             

### 1E. Table S3: Discovery cohort without expressed mutation requirement

In [11]:
# Load the discovery-cohort model input built from all mutations (no
# expressed-mutation requirement) for Table S3.
discovery_data_S3 <- read.csv("../generated_data/gam_input.all_mutations.2.csv.gz")

# Inspect the table that was just loaded. This cell previously printed
# `discovery_data` again, so the all-mutations input was never actually checked.
# NOTE(review): assumes this file carries the same columns as the
# expressed-mutations input (including `disease`) -- confirm.
cat(dim(discovery_data_S3))
head(
  discovery_data_S3[c(
    "y..has_mutation.",
    "centered_log_phbrI",
    "centered_log_phbrII",
    "centered_sex",
    "patient_ids",
    "disease"
  )]
)

472264 15

y..has_mutation.,centered_log_phbrI,centered_log_phbrII,centered_sex,patient_ids,disease
0,3.890015,1.7543377,-0.4079498,TCGA-02-0047,TCGA-GBM
0,0.5786314,0.7571816,-0.4079498,TCGA-02-0047,TCGA-GBM
0,1.6918168,0.3475884,-0.4079498,TCGA-02-0047,TCGA-GBM
0,0.3443303,-0.6320063,-0.4079498,TCGA-02-0047,TCGA-GBM
0,-3.7273039,1.1537346,-0.4079498,TCGA-02-0047,TCGA-GBM
0,-4.1717617,0.7393688,-0.4079498,TCGA-02-0047,TCGA-GBM


##### Age

In [12]:
# PHBR-I x PHBR-II x age binomial (logit) model, fitted on discovery_data_S3
# (the all-mutations input).
# NOTE(review): `random` is an argument of mgcv::gamm(), not mgcv::gam(); here
# it appears to be absorbed by `...`, so no per-patient random intercept would
# be fitted -- TODO confirm (in gam() a random effect is normally written as
# s(patient_ids, bs = "re") inside the formula).
g_age_allmut <- mgcv::gam(
  formula = y..has_mutation. ~
    centered_log_phbrI * centered_log_phbrII * centered_age,
  family = binomial(link = "logit"),
  data = discovery_data_S3,
  random = list(patient_ids = ~1)
)
summary(g_age_allmut)


Family: binomial 
Link function: logit 

Formula:
y..has_mutation. ~ centered_log_phbrI * centered_log_phbrII * 
    centered_age

Parametric coefficients:
                                                     Estimate Std. Error
(Intercept)                                         -5.361188   0.022126
centered_log_phbrI                                   0.075500   0.017359
centered_log_phbrII                                  0.273793   0.022449
centered_age                                        -0.001723   0.001451
centered_log_phbrI:centered_log_phbrII              -0.078044   0.015488
centered_log_phbrI:centered_age                     -0.001145   0.001144
centered_log_phbrII:centered_age                    -0.004537   0.001485
centered_log_phbrI:centered_log_phbrII:centered_age  0.002549   0.001026
                                                     z value Pr(>|z|)    
(Intercept)                                         -242.301  < 2e-16 ***
centered_log_phbrI                    

##### Sex

In [13]:
# PHBR-I x PHBR-II x sex binomial (logit) model, fitted on discovery_data_S3
# (the all-mutations input).
# NOTE(review): `random` is an argument of mgcv::gamm(), not mgcv::gam(); here
# it appears to be absorbed by `...`, so no per-patient random intercept would
# be fitted -- TODO confirm (in gam() a random effect is normally written as
# s(patient_ids, bs = "re") inside the formula).
g_sex_allmut <- mgcv::gam(
  formula = y..has_mutation. ~
    centered_log_phbrI * centered_log_phbrII * centered_sex,
  family = binomial(link = "logit"),
  data = discovery_data_S3,
  random = list(patient_ids = ~1)
)
summary(g_sex_allmut)


Family: binomial 
Link function: logit 

Formula:
y..has_mutation. ~ centered_log_phbrI * centered_log_phbrII * 
    centered_sex

Parametric coefficients:
                                                    Estimate Std. Error
(Intercept)                                         -5.36063    0.02213
centered_log_phbrI                                   0.07401    0.01738
centered_log_phbrII                                  0.27435    0.02249
centered_sex                                        -0.06383    0.04540
centered_log_phbrI:centered_log_phbrII              -0.07681    0.01551
centered_log_phbrI:centered_sex                      0.03605    0.03584
centered_log_phbrII:centered_sex                     0.13477    0.04659
centered_log_phbrI:centered_log_phbrII:centered_sex -0.01739    0.03206
                                                     z value Pr(>|z|)    
(Intercept)                                         -242.271  < 2e-16 ***
centered_log_phbrI                             

### 2. Validation cohort

In [2]:
# Read the validation-cohort model input and preview its first rows.
validation_data <- read.csv("../generated_data/gam_input.validation.all.csv.gz")
head(validation_data)

y..has_mutation.,log_phbr_i,log_phbr_ii,phbr_i,phbr_ii,patient_ids,sex,age,centered_log_phbr_i,centered_log_phbr_ii,centered_sex,centered_age
0,0.436009,1.2207901,1.5465227,3.389865,1,1,48,0.1070133,-1.4116194,0.5801282,-11.02132
0,1.0962891,1.0089661,2.9930387,2.742764,1,1,48,0.7672935,-1.6234434,0.5801282,-11.02132
0,0.1366724,1.9118222,1.1464525,6.765405,1,1,48,-0.1923233,-0.7205873,0.5801282,-11.02132
0,1.4222226,3.7839778,4.1463258,43.990679,1,1,48,1.0932269,1.1515683,0.5801282,-11.02132
0,-0.230733,0.9104665,0.7939514,2.485482,1,1,48,-0.5597287,-1.721943,0.5801282,-11.02132
0,0.9009311,4.105375,2.4618944,60.665489,1,1,48,0.5719354,1.4729655,0.5801282,-11.02132


### 2A. Table S4

##### Age

In [3]:
# Binomial (logit) model of mutation presence on the full three-way interaction
# centered_log_phbr_i x centered_log_phbr_ii x centered_age, validation cohort.
# NOTE(review): `random` is an argument of mgcv::gamm(), not mgcv::gam(); here
# it appears to be absorbed by `...`, so no per-patient random intercept would
# be fitted -- TODO confirm (in gam() a random effect is normally written as
# s(patient_ids, bs = "re") inside the formula).
g_age_val <- mgcv::gam(
  formula = y..has_mutation. ~
    centered_log_phbr_i * centered_log_phbr_ii * centered_age,
  family = binomial(link = "logit"),
  data = validation_data,
  random = list(patient_ids = ~1)
)
summary(g_age_val)


Family: binomial 
Link function: logit 

Formula:
y..has_mutation. ~ centered_log_phbr_i * centered_log_phbr_ii * 
    centered_age

Parametric coefficients:
                                                       Estimate Std. Error
(Intercept)                                           -7.430063   0.047020
centered_log_phbr_i                                    0.107785   0.038031
centered_log_phbr_ii                                   0.148932   0.043206
centered_age                                           0.006536   0.003273
centered_log_phbr_i:centered_log_phbr_ii              -0.039020   0.030469
centered_log_phbr_i:centered_age                       0.002881   0.002628
centered_log_phbr_ii:centered_age                     -0.008808   0.003020
centered_log_phbr_i:centered_log_phbr_ii:centered_age -0.001410   0.002094
                                                       z value Pr(>|z|)    
(Intercept)                                           -158.018  < 2e-16 ***
centered_log_p

##### Sex

In [4]:
# Binomial (logit) model of mutation presence on the full three-way interaction
# centered_log_phbr_i x centered_log_phbr_ii x centered_sex, validation cohort.
# NOTE(review): `random` is an argument of mgcv::gamm(), not mgcv::gam(); here
# it appears to be absorbed by `...`, so no per-patient random intercept would
# be fitted -- TODO confirm (in gam() a random effect is normally written as
# s(patient_ids, bs = "re") inside the formula).
g_sex_val <- mgcv::gam(
  formula = y..has_mutation. ~
    centered_log_phbr_i * centered_log_phbr_ii * centered_sex,
  family = binomial(link = "logit"),
  data = validation_data,
  random = list(patient_ids = ~1)
)
summary(g_sex_val)


Family: binomial 
Link function: logit 

Formula:
y..has_mutation. ~ centered_log_phbr_i * centered_log_phbr_ii * 
    centered_sex

Parametric coefficients:
                                                       Estimate Std. Error
(Intercept)                                           -7.447859   0.045993
centered_log_phbr_i                                    0.102842   0.036822
centered_log_phbr_ii                                   0.144100   0.042402
centered_sex                                           0.233978   0.091317
centered_log_phbr_i:centered_log_phbr_ii              -0.046528   0.029213
centered_log_phbr_i:centered_sex                       0.194812   0.074351
centered_log_phbr_ii:centered_sex                      0.003549   0.084854
centered_log_phbr_i:centered_log_phbr_ii:centered_sex -0.155578   0.060398
                                                       z value Pr(>|z|)    
(Intercept)                                           -161.936  < 2e-16 ***
centered_log_p