# Imports

In [1]:
import pandas as pd
import os
import math
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
import data_prep_utils

# Task 1

### Beat Drug

In [3]:
# Read the tab-separated drug-response table that lives in the data folder
# next to the notebook's working directory, then show its size and a preview.
beat_drug_fp = os.path.join(
    os.getcwd(),
    "medical_genomics_2021_data",
    "beat_drug",
)
beat_drug_df = pd.read_csv(
    beat_drug_fp,
    lineterminator='\n',
    sep='\t',
)
print(beat_drug_df.shape)
beat_drug_df.head()

(79, 198)


Unnamed: 0,12-00066,12-00150,12-00211,12-00258,12-00294,12-00372,13-00028,13-00034,13-00098,13-00118,...,16-01102,16-01138,16-01151,16-01185,16-01216,16-01219,16-01227,16-01237,16-01262,16-01270
A-674563,0.302313,0.183573,10.0,2.250759,0.312102,1.112644,3.699257,1.522665,5.132768,1.83987,...,0.726144,0.454933,0.103835,0.192891,0.515219,0.531155,1.261865,0.674077,0.518016,0.259479
Afatinib (BIBW-2992),0.270839,0.095105,10.0,1.042656,0.412103,0.492191,1.126676,0.31333,4.442185,1.09213,...,1.146014,1.047075,0.556972,0.615777,0.456265,0.809377,1.279045,0.98327,2.017039,1.303088
Alisertib (MLN8237),2.292925,10.0,10.0,10.0,1.731348,6.149094,10.0,10.0,10.0,10.0,...,10.0,10.0,2.080804,8.928137,6.927002,9.425326,4.785701,10.0,10.0,10.0
Axitinib (AG-013736),0.137656,10.0,10.0,10.0,0.124162,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,0.418123,10.0,10.0,10.0,10.0,0.299026
AZD1480,2.429158,,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,6.874162,10.0,10.0,3.607693,7.004078,10.0,10.0,10.0,10.0


In [4]:
# Summary statistics for every column of the raw drug table; `count` omits
# missing values, so columns with count < number of rows contain NaNs.
beat_drug_df.describe()

Unnamed: 0,12-00066,12-00150,12-00211,12-00258,12-00294,12-00372,13-00028,13-00034,13-00098,13-00118,...,16-01102,16-01138,16-01151,16-01185,16-01216,16-01219,16-01227,16-01237,16-01262,16-01270
count,69.0,70.0,78.0,79.0,79.0,78.0,79.0,79.0,79.0,79.0,...,79.0,77.0,78.0,79.0,77.0,78.0,77.0,79.0,79.0,78.0
mean,2.412734,6.335849,8.527381,7.40957,3.264417,5.883175,6.617408,6.929836,8.324497,7.984273,...,8.255971,6.404386,6.044692,6.118522,3.080462,7.342661,7.688149,7.247408,8.262772,4.240337
std,3.775972,4.299582,3.146445,3.745823,4.02559,4.27538,3.974038,4.222027,3.351549,3.474718,...,3.330924,4.001485,4.466019,4.325312,3.899469,3.839974,3.755343,3.908462,3.363649,4.264927
min,0.0014,0.006859,0.013717,0.013717,0.005953,0.013717,0.013717,0.013717,0.006859,0.1891,...,0.064853,0.013717,0.012358,0.013717,0.002706,0.097225,0.013717,0.042784,0.111219,0.010583
25%,0.118563,1.085082,10.0,3.894494,0.248112,1.028161,2.198463,1.468053,10.0,7.130972,...,9.681701,1.927824,0.918891,1.0,0.220118,3.142175,5.0,4.22276,10.0,0.459808
50%,0.283653,10.0,10.0,10.0,0.897805,6.654102,10.0,10.0,10.0,10.0,...,10.0,9.102755,10.0,10.0,0.702216,10.0,10.0,10.0,10.0,1.825458
75%,2.292925,10.0,10.0,10.0,7.096167,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,5.840653,10.0,10.0,10.0,10.0,10.0
max,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,...,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0


In [5]:
# Total number of missing cells in the drug table (per-column NaN counts, summed).
beat_drug_df.isna().sum().sum()

378

#### Transformation

In [6]:
# log10-transform the drug-response values and reshape to one row per sample.
#
# The log is taken on the raw values so that missing measurements stay NaN
# through the transform instead of being turned into log10(0) = -inf first.
# Cells that were missing in the input are then set to the placeholder 0.0
# (the same value the previous fill-then-patch approach ended up with).
# ToDo: Change fillna - 0.0 in log space is only a placeholder for missing values.
trans_beat_drug = (
    np.log10(beat_drug_df)
    .where(beat_drug_df.notna(), 0.0)
    .transpose()
    .reset_index()
    .rename(columns={"index": "sampleID"})
)
# Infinities can now only come from genuine zero / infinite inputs.
# ToDo: Deal with that
trans_beat_drug = trans_beat_drug.replace(-math.inf, 0.0).replace(math.inf, 1)
trans_beat_drug.head()

Unnamed: 0,sampleID,A-674563,Afatinib (BIBW-2992),Alisertib (MLN8237),Axitinib (AG-013736),AZD1480,Barasertib (AZD1152-HQPA),BEZ235,BMS-345541,Bortezomib (Velcade),...,TG100-115,Tofacitinib (CP-690550),Tozasertib (VX-680),Trametinib (GSK1120212),Vandetanib (ZD6474),Vargetef,Vatalanib (PTK787),Vismodegib (GDC-0449),VX-745,YM-155
0,12-00066,-0.519543,-0.56729,0.36039,-0.861206,0.385456,1.0,-1.417073,0.09566,-0.744268,...,0.502356,0.869371,0.2967,-1.862741,0.216112,-1.333854,0.08879,0.0,-1.106552,-0.661768
1,12-00150,-0.736191,-1.021795,1.0,1.0,0.0,1.0,1.0,-0.10227,-0.996701,...,1.0,1.0,1.0,1.0,0.53073,-0.051009,0.409831,0.0,1.0,-0.197125
2,12-00211,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.888241,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,12-00258,0.352329,0.018141,1.0,1.0,1.0,1.0,0.511842,0.515246,0.310652,...,1.0,-1.862741,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,12-00294,-0.505703,-0.384994,0.238384,-0.906013,1.0,1.0,-1.597193,-0.022079,-0.317022,...,1.0,0.245287,-0.136855,-1.862741,0.858581,-0.771062,0.588563,1.0,-0.256898,-0.009582


### Beat RNASeq

In [7]:
# Read the tab-separated RNA-seq expression table from the data folder under
# the current working directory, then show its size and a preview.
beat_rnaseq_fp = os.path.join(
    os.getcwd(),
    "medical_genomics_2021_data",
    "beat_rnaseq",
)
beat_rnaseq_df = pd.read_csv(
    beat_rnaseq_fp,
    lineterminator='\n',
    sep='\t',
)
print(beat_rnaseq_df.shape)
beat_rnaseq_df.head()

(22843, 198)


Unnamed: 0,12-00066,12-00150,12-00211,12-00258,12-00294,12-00372,13-00028,13-00034,13-00098,13-00118,...,16-01102,16-01138,16-01151,16-01185,16-01216,16-01219,16-01227,16-01237,16-01262,16-01270
TSPAN6,0.070215,1.709459,0.0,0.0,0.0,0.921085,0.0,0.0,0.0,0.115955,...,0.052768,0.271781,0.118997,0.164221,0.648789,0.085224,0.395749,0.249059,0.142059,0.150957
DPM1,23.311415,32.137834,25.11747,29.351673,31.24077,22.940781,28.176101,34.303196,39.203413,24.089688,...,21.089525,25.547442,26.208989,32.023086,34.238344,28.607021,25.101824,38.785308,33.703438,16.575035
SCYL3,10.532266,17.778376,37.659392,15.647852,39.189992,33.73475,23.989937,17.804722,15.450059,25.510139,...,6.842223,34.244444,30.433366,26.234297,13.477109,20.823184,17.950065,7.245358,16.461058,16.21274
C1orf112,11.129094,11.25394,12.541923,8.190423,18.875313,11.110592,12.907338,6.758411,23.516117,12.465182,...,8.864988,24.520713,14.755571,13.219787,5.721136,16.874449,6.105849,4.845333,8.647826,6.944004
FGR,906.79296,49.004499,111.834945,1004.458236,267.8528,170.631056,68.186164,619.473686,11.03152,201.790997,...,451.393235,187.045931,102.961759,186.329197,583.113507,22.300408,24.508199,77.751751,45.565344,51.898882


In [8]:
# Summary statistics for every column of the raw expression table.
beat_rnaseq_df.describe()

Unnamed: 0,12-00066,12-00150,12-00211,12-00258,12-00294,12-00372,13-00028,13-00034,13-00098,13-00118,...,16-01102,16-01138,16-01151,16-01185,16-01216,16-01219,16-01227,16-01237,16-01262,16-01270
count,22843.0,22843.0,22843.0,22843.0,22843.0,22843.0,22843.0,22843.0,22843.0,22843.0,...,22843.0,22843.0,22843.0,22843.0,22843.0,22843.0,22843.0,22843.0,22843.0,22843.0
mean,43.777087,43.777087,43.777087,43.777087,43.777087,43.777087,43.777087,43.777087,43.777087,43.777087,...,43.777087,43.777087,43.777087,43.777087,43.777087,43.777087,43.777087,43.777087,43.777087,43.777087
std,239.0362,179.037496,199.633444,255.030349,188.159584,165.383966,181.704322,357.619163,204.69839,190.984151,...,287.677166,262.096631,261.028608,229.596117,215.486602,290.598365,498.113439,430.670074,364.90668,272.110655
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.315968,0.398874,0.437118,0.286824,0.45798,0.546894,0.295178,0.25557,0.415165,0.434832,...,0.22866,0.483167,0.475986,0.492663,0.530827,0.426122,0.339214,0.316984,0.248603,0.332105
50%,3.896938,4.558558,5.245415,3.888061,5.659323,5.871919,4.53501,3.577982,4.181302,5.53686,...,2.603211,4.922261,4.581367,5.296126,4.305597,4.460082,2.996389,3.418903,3.480439,4.10602
75%,29.560559,32.565199,37.121401,30.435231,38.601161,37.534227,36.575262,29.986899,29.891861,38.047793,...,24.035727,33.489496,32.248063,34.979063,28.119088,32.939266,20.819249,23.762511,30.862263,28.591181
max,18392.67095,8743.627777,15104.91318,25132.07536,14059.75309,8427.297469,13429.99078,44188.0817,10933.57086,12421.37949,...,17424.5739,19765.17431,16845.03158,15343.55351,13268.70023,20064.25983,42172.56169,34482.24489,42426.73351,13589.05342


In [9]:
# Total number of missing cells in the expression table (per-column NaN counts, summed).
beat_rnaseq_df.isna().sum().sum()

0

#### Transformation

In [10]:
# log2(x + 1)-transform the expression values and reshape to one row per sample.
# The +1 pseudo-count keeps zero expression finite (log2(1) = 0); missing values
# are treated as zero expression before the transform.
trans_beat_rna = (
    np.log2(beat_rnaseq_df.fillna(0.0) + 1)
    .transpose()
    .reset_index()
    .rename(columns={"index": "sampleID"})
)
# Defensive clean-up of any infinities (only possible for inputs of -1 or inf).
# ToDo: Deal with that
trans_beat_rna = trans_beat_rna.replace(-math.inf, 0.0).replace(math.inf, 1)
trans_beat_rna.head()

Unnamed: 0,sampleID,TSPAN6,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,RP11-218F10.3,RP11-686O6.2,RP11-313P22.1,RP11-548H3.1,RP11-102N12.3,RP11-196O16.1,RP4-671G15.2,RP11-731C17.2,RP4-621B10.8,RP11-114I8.4
0,12-00066,0.097901,4.603562,3.527604,3.6004,9.826219,2.52093,5.487015,5.502739,5.868784,...,0.57677,0.189578,0.275776,1.327472,0.275776,0.396124,0.144468,1.200994,0.049781,0.643137
1,12-00150,1.438005,5.050407,4.231,3.615174,5.643986,3.211384,6.419366,5.334356,5.046681,...,0.192137,0.702038,0.192137,1.208258,1.311231,1.058426,0.0,0.702038,0.0,0.227674
2,12-00211,0.0,4.706943,5.272747,3.759361,6.81807,1.640779,5.908769,6.094865,6.294083,...,1.329468,0.381416,0.182054,1.609326,0.093897,0.981932,0.093897,1.857068,0.454038,1.386245
3,12-00258,0.0,4.923704,4.057264,3.200131,9.973637,4.972847,4.95517,5.492224,5.719514,...,0.363815,0.870742,0.173099,1.865981,0.654113,0.433561,0.089144,0.967966,0.467209,0.290525
4,12-00294,0.0,5.010814,5.328764,4.312906,8.070673,1.76045,5.260227,5.843724,5.932174,...,1.566947,0.258552,0.297473,1.689064,0.511232,0.046439,0.297473,2.513655,0.86221,0.913222


## Join

In [20]:
# Join expression features and drug responses on the shared sample identifier
# (inner join: only samples present in both tables are kept).
# validate="one_to_one" makes the merge fail loudly if a sample ID is duplicated
# on either side instead of silently multiplying rows.
beat_rna_and_drug = pd.merge(trans_beat_rna, trans_beat_drug, on="sampleID", validate="one_to_one")
print(f"Num of samples in rna: {trans_beat_rna.shape[0]}\nNum of samples in drug: {trans_beat_drug.shape[0]}\nNum of samples in joined: {beat_rna_and_drug.shape[0]}")

# Sanity check: number of infinite values among the non-key columns.
print(np.isinf(beat_rna_and_drug.iloc[:, 1:]).values.sum())

# Column layout of the merged frame: [sampleID, genes..., drugs...].
# The positions below are derived from the inputs rather than hard-coded, so
# they stay correct if the number of genes or drugs changes.
n_rna_cols = len(trans_beat_rna.columns)        # sampleID + genes
n_merged_cols = len(beat_rna_and_drug.columns)  # sampleID + genes + drugs
print(f"Cols in rna: {n_rna_cols}\nCols in drugs: {len(trans_beat_drug.columns)}    \nCols in merged: {n_merged_cols}")
print(trans_beat_rna.columns[-1])
print(trans_beat_drug.columns[1])
# Boundary check: last gene column followed by the first two drug columns.
print(beat_rna_and_drug.columns[n_rna_cols - 1:n_rna_cols + 2])
# All drug columns followed by the sample identifier.
drugs_reorg = beat_rna_and_drug.iloc[:, list(range(n_rna_cols, n_merged_cols)) + [0]]
print(len(drugs_reorg.columns))
beat_rna_and_drug.head()

Num of samples in rna: 198
Num of samples in drug: 198
Num of samples in joined: 198
0
Cols in rna: 22844
Cols in drugs: 80    
Cols in merged: 22923
RP11-114I8.4
A-674563
Index(['RP11-114I8.4', 'A-674563', 'Afatinib (BIBW-2992)'], dtype='object')
80


Unnamed: 0,sampleID,TSPAN6,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,TG100-115,Tofacitinib (CP-690550),Tozasertib (VX-680),Trametinib (GSK1120212),Vandetanib (ZD6474),Vargetef,Vatalanib (PTK787),Vismodegib (GDC-0449),VX-745,YM-155
0,12-00066,0.097901,4.603562,3.527604,3.6004,9.826219,2.52093,5.487015,5.502739,5.868784,...,0.502356,0.869371,0.2967,-1.862741,0.216112,-1.333854,0.08879,0.0,-1.106552,-0.661768
1,12-00150,1.438005,5.050407,4.231,3.615174,5.643986,3.211384,6.419366,5.334356,5.046681,...,1.0,1.0,1.0,1.0,0.53073,-0.051009,0.409831,0.0,1.0,-0.197125
2,12-00211,0.0,4.706943,5.272747,3.759361,6.81807,1.640779,5.908769,6.094865,6.294083,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
3,12-00258,0.0,4.923704,4.057264,3.200131,9.973637,4.972847,4.95517,5.492224,5.719514,...,1.0,-1.862741,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
4,12-00294,0.0,5.010814,5.328764,4.312906,8.070673,1.76045,5.260227,5.843724,5.932174,...,1.0,0.245287,-0.136855,-1.862741,0.858581,-0.771062,0.588563,1.0,-0.256898,-0.009582


## Linear Regression

In [None]:
# Baseline: ordinary least squares predicting one drug's transformed response
# from all gene-expression features, scored by R^2 on a 20% hold-out split.
t_1_LR = LinearRegression()
# The feature count is taken from the frame that was actually merged (every
# column except sampleID), not from beat_rnaseq_df, which other cells of the
# notebook rebind to a differently shaped frame.
num_of_features = trans_beat_rna.shape[1] - 1
print(f"Num of features (genes): {num_of_features}")
# Merged layout: column 0 is sampleID, columns 1..num_of_features are genes,
# drug columns start at num_of_features + 1.
X = beat_rna_and_drug.iloc[:, 1:num_of_features + 1]
# NOTE(review): num_of_features + 2 is the SECOND drug column; the first drug
# sits at num_of_features + 1. Confirm that skipping the first drug is intended.
y = beat_rna_and_drug.iloc[:, num_of_features + 2]
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
t_1_LR.fit(x_train, y_train)
y_pred = t_1_LR.predict(x_test)
score = r2_score(y_test, y_pred)
print(score)

Num of features (genes): 22843
-0.22014216665460618


# Beat X TCGA

In [13]:
# Load both expression tables through the shared project helper.
# Note: this rebinds beat_rnaseq_df, replacing the frame loaded for Task 1.
beat_rnaseq_df, tcga_rna = (
    data_prep_utils.get_df(dataset_name)
    for dataset_name in ("beat_rnaseq", "tcga_rna")
)

In [14]:
# Move the gene identifiers out of the index into an explicit "gene" column so
# the two cohorts can be compared by gene name.
# Each frame is only converted if it has no "gene" column yet, which makes the
# cell safe to re-run (a second unguarded run would add a duplicate column).
if "gene" not in beat_rnaseq_df.columns:
    beat_rnaseq_df = beat_rnaseq_df.rename_axis("gene").reset_index()
if "gene" not in tcga_rna.columns:
    tcga_rna = tcga_rna.rename_axis("gene").reset_index()

In [15]:
# Preview the TCGA expression table (genes as rows, one column per TCGA sample).
tcga_rna.head()

Unnamed: 0,gene,TCGA.AB.2803,TCGA.AB.2805,TCGA.AB.2806,TCGA.AB.2807,TCGA.AB.2808,TCGA.AB.2810,TCGA.AB.2811,TCGA.AB.2812,TCGA.AB.2813,...,TCGA.AB.3000,TCGA.AB.3001,TCGA.AB.3002,TCGA.AB.3005,TCGA.AB.3006,TCGA.AB.3007,TCGA.AB.3008,TCGA.AB.3009,TCGA.AB.3011,TCGA.AB.3012
0,A1BG,21.786275,5.25116,10.866631,18.869409,8.199836,13.152894,7.799243,7.607058,13.230136,...,12.968585,16.882928,13.293426,8.819212,5.905764,18.960694,8.283968,7.088477,12.933981,15.191255
1,A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055844,0.0,0.0
2,A2BP1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A2LD1,3.772166,3.007237,2.951634,2.491801,2.934105,1.334901,1.770624,5.297683,3.233179,...,1.918685,1.936251,1.171619,5.119481,2.640735,3.583209,2.029229,6.645094,1.965216,1.59301
4,A2M,24.673141,0.995091,0.40656,18.714176,15.931201,0.138116,0.198614,0.108805,12.367011,...,2.423215,9.456122,3.107597,4.403161,5.800682,0.544561,46.349288,14.462855,1.439471,27.238767


In [16]:
# Preview the BEAT expression table, now carrying an explicit "gene" column.
beat_rnaseq_df.head()

Unnamed: 0,gene,12-00066,12-00150,12-00211,12-00258,12-00294,12-00372,13-00028,13-00034,13-00098,...,16-01102,16-01138,16-01151,16-01185,16-01216,16-01219,16-01227,16-01237,16-01262,16-01270
0,TSPAN6,0.070215,1.709459,0.0,0.0,0.0,0.921085,0.0,0.0,0.0,...,0.052768,0.271781,0.118997,0.164221,0.648789,0.085224,0.395749,0.249059,0.142059,0.150957
1,DPM1,23.311415,32.137834,25.11747,29.351673,31.24077,22.940781,28.176101,34.303196,39.203413,...,21.089525,25.547442,26.208989,32.023086,34.238344,28.607021,25.101824,38.785308,33.703438,16.575035
2,SCYL3,10.532266,17.778376,37.659392,15.647852,39.189992,33.73475,23.989937,17.804722,15.450059,...,6.842223,34.244444,30.433366,26.234297,13.477109,20.823184,17.950065,7.245358,16.461058,16.21274
3,C1orf112,11.129094,11.25394,12.541923,8.190423,18.875313,11.110592,12.907338,6.758411,23.516117,...,8.864988,24.520713,14.755571,13.219787,5.721136,16.874449,6.105849,4.845333,8.647826,6.944004
4,FGR,906.79296,49.004499,111.834945,1004.458236,267.8528,170.631056,68.186164,619.473686,11.03152,...,451.393235,187.045931,102.961759,186.329197,583.113507,22.300408,24.508199,77.751751,45.565344,51.898882


In [45]:
# Keep only the BEAT rows whose gene also appears in the TCGA table.
beat_rnaseq_x_tcga = beat_rnaseq_df.loc[
    beat_rnaseq_df["gene"].isin(tcga_rna["gene"])
]

In [18]:
# Compare table sizes before and after restricting to genes shared with TCGA.
print(beat_rnaseq_df.shape, beat_rnaseq_x_tcga.shape, sep="\n")

(22843, 199)
(14187, 199)


In [46]:
# Index the TCGA-overlapping expression table by gene name.
# The frame is a filtered selection of beat_rnaseq_df, so it is rebound to a new
# indexed frame rather than mutated in place; the guard makes the cell safe to
# re-run once "gene" has already become the index.
if "gene" in beat_rnaseq_x_tcga.columns:
    beat_rnaseq_x_tcga = beat_rnaseq_x_tcga.set_index("gene")

In [47]:
# Preview the gene-indexed table restricted to genes shared with TCGA.
beat_rnaseq_x_tcga.head()

Unnamed: 0_level_0,12-00066,12-00150,12-00211,12-00258,12-00294,12-00372,13-00028,13-00034,13-00098,13-00118,...,16-01102,16-01138,16-01151,16-01185,16-01216,16-01219,16-01227,16-01237,16-01262,16-01270
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TSPAN6,0.070215,1.709459,0.0,0.0,0.0,0.921085,0.0,0.0,0.0,0.115955,...,0.052768,0.271781,0.118997,0.164221,0.648789,0.085224,0.395749,0.249059,0.142059,0.150957
DPM1,23.311415,32.137834,25.11747,29.351673,31.24077,22.940781,28.176101,34.303196,39.203413,24.089688,...,21.089525,25.547442,26.208989,32.023086,34.238344,28.607021,25.101824,38.785308,33.703438,16.575035
SCYL3,10.532266,17.778376,37.659392,15.647852,39.189992,33.73475,23.989937,17.804722,15.450059,25.510139,...,6.842223,34.244444,30.433366,26.234297,13.477109,20.823184,17.950065,7.245358,16.461058,16.21274
C1orf112,11.129094,11.25394,12.541923,8.190423,18.875313,11.110592,12.907338,6.758411,23.516117,12.465182,...,8.864988,24.520713,14.755571,13.219787,5.721136,16.874449,6.105849,4.845333,8.647826,6.944004
FGR,906.79296,49.004499,111.834945,1004.458236,267.8528,170.631056,68.186164,619.473686,11.03152,201.790997,...,451.393235,187.045931,102.961759,186.329197,583.113507,22.300408,24.508199,77.751751,45.565344,51.898882


In [48]:
# Reshape to one row per sample via the project helper.
# NOTE(review): judging by the preview below, the helper names the identifier
# column "SampleID", whereas the Task 1 frames use "sampleID" - confirm before
# merging the two.
beat_rnaseq_x_tcga_trans = data_prep_utils.transpose_df(beat_rnaseq_x_tcga)

In [49]:
# Preview the transposed table (samples as rows, shared genes as columns).
beat_rnaseq_x_tcga_trans.head()

gene,SampleID,TSPAN6,DPM1,SCYL3,C1orf112,FGR,CFH,FUCA2,GCLC,NFYA,...,TERC,TSIX,SRXN1,SNORD11B,GTF2H5,NUDT3,SNORA28,DOC2B,SNURF,DGCR11
0,12-00066,0.070215,23.311415,10.532266,11.129094,906.79296,4.73952,43.849332,44.340838,57.435955,...,0.105323,0.0,1.053227,0.035108,10.18119,246.770983,0.28086,8.109845,0.070215,2.457529
1,12-00150,1.709459,32.137834,17.778376,11.25394,49.004499,8.262386,84.589743,39.346054,32.052361,...,0.256419,0.0,0.769257,0.028491,7.721058,161.116537,0.113964,0.455856,0.28491,1.851914
2,12-00211,0.0,25.11747,37.659392,12.541923,111.834945,2.118341,59.078171,67.349788,77.470749,...,0.235371,1.513101,0.638865,0.033624,11.196944,274.644567,0.60524,3.900437,0.235371,2.622708
3,12-00258,0.0,29.351673,15.647852,8.190423,1004.458236,30.403362,30.02093,44.011575,51.692089,...,0.223085,0.223085,1.402251,0.0,11.887268,106.475505,0.223085,0.350563,0.159347,1.784684
4,12-00294,0.0,31.24077,39.189992,18.875313,267.8528,2.388038,37.32536,56.429663,60.060789,...,0.163564,0.0,0.032713,0.130851,11.842051,272.759727,0.130851,0.0,0.327128,1.995484


In [None]:
# Inspect the label of the first column of the transposed table.
beat_rnaseq_x_tcga_trans.columns[0]