In [1]:
import numpy as np 
import pandas as pd 

import os
import torch
from torch import nn, optim
import matplotlib.pyplot as plt

import re

In [2]:
# ccle

expression = pd.read_csv('../../data/CCLE+TCGA/DepMap24Q2/OmicsExpressionProteinCodingGenesTPMLogp1BatchCorrected.csv')
cn = pd.read_csv('../../data/CCLE+TCGA/DepMap24Q2/OmicsCNGene.csv')
mut = pd.read_csv('../../data/CCLE+TCGA/DepMap24Q2/OmicsSomaticMutations.csv', low_memory=False)

celllineinfo = pd.read_csv('../../data/CCLE+TCGA/DepMap24Q2/Model.csv')

# Expression, CNA

In [4]:
expression.rename(columns={"Unnamed: 0":"ModelID"}, inplace=True)
cn.rename(columns={"Unnamed: 0":"ModelID"}, inplace=True)

In [5]:
cellline_name = celllineinfo[['ModelID', 'StrippedCellLineName', 'DepmapModelType']]

exp_name = pd.merge(cellline_name, expression, how='inner', on='ModelID')
cn_name = pd.merge(cellline_name, cn, how='inner', on='ModelID')

In [6]:
exp_name.columns = exp_name.columns.str.replace(r"\(.*\)","", regex=True)
cn_name.columns = cn_name.columns.str.replace(r"\(.*\)","", regex=True)

exp_name

Unnamed: 0,ModelID,StrippedCellLineName,DepmapModelType,ZNF891,ARMC10,PTGER4,EIF1AD,ABCG5,CXCR4,CAPNS1,...,FAM13C,MUC20,MST1R,TELO2,THSD8,FCRL6,DNMT3B,ZCCHC10,PRSS2,ADAMTSL4
0,ACH-000001,NIHOVCAR3,HGSOC,1.023536,4.659173,1.109597,4.765514,0.008593,0.622993,8.674651,...,0.734130,3.768433,1.792734,4.453836,0.452506,0.037944,3.484582,4.814547,4.796283,0.447313
1,ACH-000002,HL60,AML,0.836636,4.548480,1.843107,4.292430,0.021226,6.536572,7.231733,...,0.020196,0.110305,-0.062923,6.587490,0.142297,-0.003950,0.909984,3.539725,-0.025975,0.387281
2,ACH-000003,CACO2,COAD,0.483737,4.171159,0.149463,5.072440,-0.004166,1.029053,8.022616,...,0.144566,0.869018,3.260954,4.807801,0.825749,-0.003950,5.826128,5.466383,1.771658,4.237364
3,ACH-000004,HEL,AML,1.176561,4.465143,-0.030949,4.480605,0.008593,4.000047,8.431914,...,0.090609,1.207451,0.197755,4.520240,0.979880,0.065201,5.549709,4.835700,0.702163,4.075715
4,ACH-000005,HEL9217,AML,1.688499,4.984734,-0.044769,4.801512,-0.004166,2.416750,8.187010,...,0.020196,0.496218,0.291776,3.044135,0.175605,0.277884,6.455340,5.447698,0.193754,2.398383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1512,ACH-003161,ABMT9430,ZIMMPSC,1.090094,4.531712,1.241391,5.653260,0.049049,0.325238,9.326489,...,2.502624,0.566635,0.828714,4.494504,0.328631,0.007287,2.434028,5.194968,0.100699,1.440669
1513,ACH-003181,NRHLMS1,LMS,0.792679,4.865415,1.981531,4.654791,0.008207,0.226431,9.740253,...,0.279926,0.278288,0.348269,4.345693,0.646117,0.036904,1.158250,4.139542,0.100699,2.816797
1514,ACH-003183,NRHMFS3,MFS,0.439214,5.725886,1.263201,5.363120,0.008207,2.247287,8.624777,...,0.244900,0.385486,0.378412,3.289709,0.672173,0.007287,1.883266,3.199413,0.100699,1.194257
1515,ACH-003184,NRHLMS2,LMS,0.603562,4.531140,1.079348,4.444915,0.008207,0.255350,8.555376,...,0.868375,0.305871,0.407981,2.943492,0.735571,0.065946,0.943899,4.015194,0.129964,1.757523


In [7]:
A549_ex = exp_name[exp_name['StrippedCellLineName']=='A549']
A549_cna = cn_name[cn_name['StrippedCellLineName']=='A549']

HELA_ex = exp_name[exp_name['StrippedCellLineName']=='HELA']
HELA_cna = cn_name[cn_name['StrippedCellLineName']=='HELA']

JURKAT_ex = exp_name[exp_name['StrippedCellLineName']=='JURKAT']
JURKAT_cna = cn_name[cn_name['StrippedCellLineName']=='JURKAT']

K562_ex = exp_name[exp_name['StrippedCellLineName']=='K562']
K562_cna = cn_name[cn_name['StrippedCellLineName']=='K562']

HCT116_ex = exp_name[exp_name['StrippedCellLineName']=='HCT116']
HCT116_cna = cn_name[cn_name['StrippedCellLineName']=='HCT116']

MDAMB468_ex = exp_name[exp_name['StrippedCellLineName']=='MDAMB468']
MDAMB468_cna = cn_name[cn_name['StrippedCellLineName']=='MDAMB468']

c22RV1_ex = exp_name[exp_name['StrippedCellLineName']=='22RV1']
c22RV1_cna = cn_name[cn_name['StrippedCellLineName']=='22RV1']

A375_ex = exp_name[exp_name['StrippedCellLineName']=='A375']
A375_cna = cn_name[cn_name['StrippedCellLineName']=='A375']

GI1_ex = exp_name[exp_name['StrippedCellLineName']=='GI1']
GI1_cna = cn_name[cn_name['StrippedCellLineName']=='GI1']

HT29_ex = exp_name[exp_name['StrippedCellLineName']=='HT29']
HT29_cna = cn_name[cn_name['StrippedCellLineName']=='HT29']

IPC298_ex = exp_name[exp_name['StrippedCellLineName']=='IPC298']
IPC298_cna = cn_name[cn_name['StrippedCellLineName']=='IPC298']

MELJUSO_ex = exp_name[exp_name['StrippedCellLineName']=='MELJUSO']
MELJUSO_cna = cn_name[cn_name['StrippedCellLineName']=='MELJUSO']

MEWO_ex = exp_name[exp_name['StrippedCellLineName']=='MEWO']
MEWO_cna = cn_name[cn_name['StrippedCellLineName']=='MEWO']

OVCAR8_ex = exp_name[exp_name['StrippedCellLineName']=='OVCAR8']
OVCAR8_cna = cn_name[cn_name['StrippedCellLineName']=='OVCAR8']

OVCAR8_ex

Unnamed: 0,ModelID,StrippedCellLineName,DepmapModelType,ZNF891,ARMC10,PTGER4,EIF1AD,ABCG5,CXCR4,CAPNS1,...,FAM13C,MUC20,MST1R,TELO2,THSD8,FCRL6,DNMT3B,ZCCHC10,PRSS2,ADAMTSL4
687,ACH-000696,OVCAR8,HGSOC,0.922844,2.859775,0.948189,5.569033,-0.004166,3.270866,9.429063,...,0.295295,0.097996,2.781263,5.120405,1.312198,-0.00395,2.626343,5.137805,0.145017,5.547337


# Mutation

In [8]:
# From 24Q2
mut['VariantInfo'] = mut['VariantInfo'].map(lambda a: a.split("&"))
mut = pd.DataFrame.explode(mut, 'VariantInfo')

mut

Unnamed: 0,Chrom,Pos,Ref,Alt,AF,DP,RefCount,AltCount,GT,PS,...,GwasDisease,GwasPmID,GtexGene,ProveanPrediction,AMClass,AMPathogenicity,Rescue,ModelID,Hotspot,EntrezGeneID
0,chr1,818203,G,A,0.240,27,21,6,0/1,,...,,,,,,,False,ACH-000062,False,400728.0
0,chr1,818203,G,A,0.240,27,21,6,0/1,,...,,,,,,,False,ACH-000062,False,400728.0
1,chr1,924657,C,G,0.437,17,9,8,0/1,,...,,,,,,,False,ACH-000693,False,148398.0
2,chr1,924750,C,T,0.625,19,7,12,0/1,,...,,,,,,,False,ACH-000930,False,148398.0
3,chr1,924909,G,A,0.285,52,37,15,0/1,,...,,,,,,,False,ACH-001691,False,148398.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676188,chrY,26414504,T,A,0.987,80,0,80,1|1,,...,,,,,,,False,ACH-000968,False,
676189,chrY,26543783,G,A,0.978,45,0,45,1|1,,...,,,,,,,False,ACH-000987,False,
676189,chrY,26543783,G,A,0.978,45,0,45,1|1,,...,,,,,,,False,ACH-000987,False,
676190,chrY,26614964,C,A,0.702,8,2,6,0/1,,...,,,,,,,False,ACH-000774,False,


In [9]:
mut_name = pd.merge(cellline_name, mut, how='inner', on='ModelID')

A549_mut = mut_name[mut_name['StrippedCellLineName']=='A549'].copy()
HCT116_mut = mut_name[mut_name['StrippedCellLineName']=='HCT116'].copy()
HELA_mut = mut_name[mut_name['StrippedCellLineName']=='HELA'].copy()
JURKAT_mut = mut_name[mut_name['StrippedCellLineName']=='JURKAT'].copy()
K562_mut = mut_name[(mut_name['StrippedCellLineName']=='K562')].copy()


A549_mut_short = A549_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]
HCT116_mut_short = HCT116_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]
HELA_mut_short = HELA_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]
JURKAT_mut_short = JURKAT_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]
K562_mut_short = K562_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]

A549_mut_short['HugoSymbol'] = A549_mut_short['HugoSymbol'].str.replace(" ", "")
HCT116_mut_short['HugoSymbol'] = HCT116_mut_short['HugoSymbol'].str.replace(" ", "")
HELA_mut_short['HugoSymbol'] = HELA_mut_short['HugoSymbol'].str.replace(" ", "")
JURKAT_mut_short['HugoSymbol'] = JURKAT_mut_short['HugoSymbol'].str.replace(" ", "")
K562_mut_short['HugoSymbol'] = K562_mut_short['HugoSymbol'].str.replace(" ", "")


A549_mut_cross = pd.crosstab([A549_mut_short['StrippedCellLineName'], A549_mut_short['HugoSymbol']], A549_mut_short['VariantInfo'])
A549_mut_cross.reset_index(inplace=True)
A549_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)

HCT116_mut_cross = pd.crosstab([HCT116_mut_short['StrippedCellLineName'], HCT116_mut_short['HugoSymbol']], HCT116_mut_short['VariantInfo'])
HCT116_mut_cross.reset_index(inplace=True)
HCT116_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)

HELA_mut_cross = pd.crosstab([HELA_mut_short['StrippedCellLineName'], HELA_mut_short['HugoSymbol']], HELA_mut_short['VariantInfo'])
HELA_mut_cross.reset_index(inplace=True)
HELA_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)

JURKAT_mut_cross = pd.crosstab([JURKAT_mut_short['StrippedCellLineName'], JURKAT_mut_short['HugoSymbol']], JURKAT_mut_short['VariantInfo'])
JURKAT_mut_cross.reset_index(inplace=True)
JURKAT_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)

K562_mut_cross = pd.crosstab([K562_mut_short['StrippedCellLineName'], K562_mut_short['HugoSymbol']], K562_mut_short['VariantInfo'])
K562_mut_cross.reset_index(inplace=True)
K562_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)


A549_mut_cross

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A549_mut_short['HugoSymbol'] = A549_mut_short['HugoSymbol'].str.replace(" ", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HCT116_mut_short['HugoSymbol'] = HCT116_mut_short['HugoSymbol'].str.replace(" ", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  HELA_mut_short['HugoSymbol'] = HELA_mut

VariantInfo,StrippedCellLineName,Gene,frameshift_variant,inframe_deletion,missense_variant,splice_acceptor_variant,splice_donor_variant,splice_region_variant,start_lost,stop_gained,stop_lost
0,A549,ABCB1,0,0,1,0,0,0,0,0,0
1,A549,ABRAXAS2,0,0,1,0,0,0,0,0,0
2,A549,ABTB3,0,0,1,0,0,0,0,0,0
3,A549,ACADL,0,0,1,0,0,0,0,0,0
4,A549,ADAM11,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
393,A549,ZNF749,0,0,1,0,0,0,0,0,0
394,A549,ZNF766,0,0,1,0,0,0,0,0,0
395,A549,ZNF90,0,0,1,0,0,0,0,0,0
396,A549,ZSCAN1,0,0,1,0,0,0,0,0,0


In [10]:
#mut_name = pd.merge(cellline_name, mut, how='inner', on='ModelID')

c22RV1_mut = mut_name[mut_name['StrippedCellLineName']=='22RV1'].copy()
A375_mut = mut_name[mut_name['StrippedCellLineName']=='A375'].copy()
GI1_mut = mut_name[mut_name['StrippedCellLineName']=='GI1'].copy()
HT29_mut = mut_name[mut_name['StrippedCellLineName']=='HT29'].copy()
IPC298_mut = mut_name[(mut_name['StrippedCellLineName']=='IPC298')].copy()
MELJUSO_mut = mut_name[(mut_name['StrippedCellLineName']=='MELJUSO')].copy()
MEWO_mut = mut_name[(mut_name['StrippedCellLineName']=='MEWO')].copy()
OVCAR8_mut = mut_name[(mut_name['StrippedCellLineName']=='OVCAR8')].copy()
MDAMB468_mut = mut_name[(mut_name['StrippedCellLineName']=='MDAMB468')].copy()

c22RV1_mut_short = c22RV1_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]
A375_mut_short = A375_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]
GI1_mut_short = GI1_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]
HT29_mut_short = HT29_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]
IPC298_mut_short = IPC298_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]
MELJUSO_mut_short = MELJUSO_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]
MEWO_mut_short = MEWO_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]
OVCAR8_mut_short = OVCAR8_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]
MDAMB468_mut_short = MDAMB468_mut[['StrippedCellLineName', 'HugoSymbol', 'VariantInfo']]

MDAMB468_mut_short

Unnamed: 0,StrippedCellLineName,HugoSymbol,VariantInfo
189974,MDAMB468,DVL1,missense_variant
189975,MDAMB468,PADI6,missense_variant
189976,MDAMB468,SLC30A2,missense_variant
189977,MDAMB468,GPN2,missense_variant
189978,MDAMB468,SESN2,missense_variant
...,...,...,...
190265,MDAMB468,BCOR,missense_variant
190266,MDAMB468,DGKK,missense_variant
190267,MDAMB468,STARD8,missense_variant
190268,MDAMB468,TBX22,missense_variant


In [11]:
c22RV1_mut_short['HugoSymbol'] = c22RV1_mut_short['HugoSymbol'].str.replace(" ", "")
c22RV1_mut_short['HugoSymbol'] = c22RV1_mut_short['HugoSymbol'].str.replace(" ", "")
A375_mut_short['HugoSymbol'] = A375_mut_short['HugoSymbol'].str.replace(" ", "")
GI1_mut_short['HugoSymbol'] = GI1_mut_short['HugoSymbol'].str.replace(" ", "")
HT29_mut_short['HugoSymbol'] = HT29_mut_short['HugoSymbol'].str.replace(" ", "")
IPC298_mut_short['HugoSymbol'] = IPC298_mut_short['HugoSymbol'].str.replace(" ", "")
MELJUSO_mut_short['HugoSymbol'] = MELJUSO_mut_short['HugoSymbol'].str.replace(" ", "")
MEWO_mut_short['HugoSymbol'] = MEWO_mut_short['HugoSymbol'].str.replace(" ", "")
OVCAR8_mut_short['HugoSymbol'] = OVCAR8_mut_short['HugoSymbol'].str.replace(" ", "")
MDAMB468_mut_short['HugoSymbol'] = MDAMB468_mut_short['HugoSymbol'].str.replace(" ", "")

MDAMB468_mut_short

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c22RV1_mut_short['HugoSymbol'] = c22RV1_mut_short['HugoSymbol'].str.replace(" ", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c22RV1_mut_short['HugoSymbol'] = c22RV1_mut_short['HugoSymbol'].str.replace(" ", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  A375_mut_short['HugoSymbol'] = A375

Unnamed: 0,StrippedCellLineName,HugoSymbol,VariantInfo
189974,MDAMB468,DVL1,missense_variant
189975,MDAMB468,PADI6,missense_variant
189976,MDAMB468,SLC30A2,missense_variant
189977,MDAMB468,GPN2,missense_variant
189978,MDAMB468,SESN2,missense_variant
...,...,...,...
190265,MDAMB468,BCOR,missense_variant
190266,MDAMB468,DGKK,missense_variant
190267,MDAMB468,STARD8,missense_variant
190268,MDAMB468,TBX22,missense_variant


In [12]:
c22RV1_mut_cross = pd.crosstab([c22RV1_mut_short['StrippedCellLineName'], c22RV1_mut_short['HugoSymbol']], c22RV1_mut_short['VariantInfo'])
c22RV1_mut_cross.reset_index(inplace=True)
c22RV1_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)

A375_mut_cross = pd.crosstab([A375_mut_short['StrippedCellLineName'], A375_mut_short['HugoSymbol']], A375_mut_short['VariantInfo'])
A375_mut_cross.reset_index(inplace=True)
A375_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)

GI1_mut_cross = pd.crosstab([GI1_mut_short['StrippedCellLineName'], GI1_mut_short['HugoSymbol']], GI1_mut_short['VariantInfo'])
GI1_mut_cross.reset_index(inplace=True)
GI1_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)

HT29_mut_cross = pd.crosstab([HT29_mut_short['StrippedCellLineName'], HT29_mut_short['HugoSymbol']], HT29_mut_short['VariantInfo'])
HT29_mut_cross.reset_index(inplace=True)
HT29_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)

IPC298_mut_cross = pd.crosstab([IPC298_mut_short['StrippedCellLineName'], IPC298_mut_short['HugoSymbol']], IPC298_mut_short['VariantInfo'])
IPC298_mut_cross.reset_index(inplace=True)
IPC298_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)

MELJUSO_mut_cross = pd.crosstab([MELJUSO_mut_short['StrippedCellLineName'], MELJUSO_mut_short['HugoSymbol']], MELJUSO_mut_short['VariantInfo'])
MELJUSO_mut_cross.reset_index(inplace=True)
MELJUSO_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)

MEWO_mut_cross = pd.crosstab([MEWO_mut_short['StrippedCellLineName'], MEWO_mut_short['HugoSymbol']], MEWO_mut_short['VariantInfo'])
MEWO_mut_cross.reset_index(inplace=True)
MEWO_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)

OVCAR8_mut_cross = pd.crosstab([OVCAR8_mut_short['StrippedCellLineName'], OVCAR8_mut_short['HugoSymbol']], OVCAR8_mut_short['VariantInfo'])
OVCAR8_mut_cross.reset_index(inplace=True)
OVCAR8_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)

MDAMB468_mut_cross = pd.crosstab([MDAMB468_mut_short['StrippedCellLineName'], MDAMB468_mut_short['HugoSymbol']], MDAMB468_mut_short['VariantInfo'])
MDAMB468_mut_cross.reset_index(inplace=True)
MDAMB468_mut_cross.rename(columns={"HugoSymbol":"Gene"}, inplace=True)

MDAMB468_mut_cross

VariantInfo,StrippedCellLineName,Gene,coding_sequence_variant,frameshift_variant,inframe_deletion,inframe_insertion,missense_variant,splice_acceptor_variant,splice_donor_variant,splice_region_variant,start_lost,stop_gained,stop_lost
0,MDAMB468,A1CF,0,0,0,0,1,0,0,0,0,0,0
1,MDAMB468,AASS,0,0,0,0,1,0,0,0,0,0,0
2,MDAMB468,ABCF2,0,0,0,0,1,0,0,0,0,0,0
3,MDAMB468,ABLIM2,0,0,0,0,1,0,0,0,0,0,0
4,MDAMB468,ACSF2,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,MDAMB468,ZFYVE9,0,0,0,0,1,0,0,0,0,0,0
276,MDAMB468,ZNF407,0,0,0,0,1,0,0,0,0,0,0
277,MDAMB468,ZNF445,0,1,0,0,0,0,0,0,0,0,0
278,MDAMB468,ZNF528,0,0,0,0,1,0,0,0,0,0,0


# Feature generation

In [13]:
def ToExpressionCNA(df_exp, df_cna):
    df_exp_t = df_exp.transpose()
    df_exp_t.columns = df_exp_t.iloc[0]
    df_exp_t.drop(df_exp_t.index[0:3], inplace=True)
    df_exp_t.reset_index(inplace=True)
    df_exp_t = df_exp_t.rename_axis(None, axis=1)
    df_exp_t = df_exp_t.add_suffix('_Exp')
    df_exp_t = df_exp_t.rename({'index_Exp':'Gene'}, axis=1)
    df_exp_t
    
    df_cna_t = df_cna.transpose()
    df_cna_t.columns = df_cna_t.iloc[0]
    df_cna_t.drop(df_cna_t.index[0:3], inplace=True)
    df_cna_t.reset_index(inplace=True)
    df_cna_t = df_cna_t.rename_axis(None, axis=1)
    df_cna_t = df_cna_t.add_suffix('_Cna')
    df_cna_t = df_cna_t.rename({'index_Cna':'Gene'}, axis=1)
    df_cna_t
    
    #df_excna = pd.merge(df_exp_t, df_cna_t, how='inner', on='Gene')
    df_excna = pd.merge(df_exp_t, df_cna_t, how='left', left_on='Gene', right_on='Gene')
    df_excna

    return df_excna

In [14]:
A549_excna = ToExpressionCNA(A549_ex, A549_cna)
HELA_excna = ToExpressionCNA(HELA_ex, HELA_cna)
JURKAT_excna = ToExpressionCNA(JURKAT_ex, JURKAT_cna)
K562_excna = ToExpressionCNA(K562_ex, K562_cna)
c22RV1_excna = ToExpressionCNA(c22RV1_ex, c22RV1_cna)
A375_excna = ToExpressionCNA(A375_ex, A375_cna)
GI1_excna = ToExpressionCNA(GI1_ex, GI1_cna)
HT29_excna = ToExpressionCNA(HT29_ex, HT29_cna)
IPC298_excna = ToExpressionCNA(IPC298_ex, IPC298_cna)
MELJUSO_excna = ToExpressionCNA(MELJUSO_ex, MELJUSO_cna)
MEWO_excna = ToExpressionCNA(MEWO_ex, MEWO_cna)
OVCAR8_excna = ToExpressionCNA(OVCAR8_ex, OVCAR8_cna)
MDAMB468_excna = ToExpressionCNA(MDAMB468_ex, MDAMB468_cna)
HCT116_excna = ToExpressionCNA(HCT116_ex, HCT116_cna)

A549_excna['Gene'] = A549_excna['Gene'].str.replace(" ", "")
HELA_excna['Gene'] = HELA_excna['Gene'].str.replace(" ", "")
JURKAT_excna['Gene'] = JURKAT_excna['Gene'].str.replace(" ", "")
K562_excna['Gene'] = K562_excna['Gene'].str.replace(" ", "")
c22RV1_excna['Gene'] = c22RV1_excna['Gene'].str.replace(" ", "")
A375_excna['Gene'] = A375_excna['Gene'].str.replace(" ", "")
GI1_excna['Gene'] = GI1_excna['Gene'].str.replace(" ", "")
HT29_excna['Gene'] = HT29_excna['Gene'].str.replace(" ", "")
IPC298_excna['Gene'] = IPC298_excna['Gene'].str.replace(" ", "")
MELJUSO_excna['Gene'] = MELJUSO_excna['Gene'].str.replace(" ", "")
MEWO_excna['Gene'] = MEWO_excna['Gene'].str.replace(" ", "")
OVCAR8_excna['Gene'] = OVCAR8_excna['Gene'].str.replace(" ", "")
MDAMB468_excna['Gene'] = MDAMB468_excna['Gene'].str.replace(" ", "")
HCT116_excna['Gene'] = HCT116_excna['Gene'].str.replace(" ", "")

HCT116_excna

Unnamed: 0,Gene,ACH-000971_Exp,ACH-000971_Cna
0,ZNF891,-0.019059,0.99098
1,ARMC10,5.285371,0.988569
2,PTGER4,0.804196,0.987542
3,EIF1AD,5.873076,0.989453
4,ABCG5,0.021226,0.982726
...,...,...,...
19132,FCRL6,0.024116,0.982712
19133,DNMT3B,3.576248,0.99314
19134,ZCCHC10,5.001986,0.988286
19135,PRSS2,0.001704,0.988569


In [15]:
c22RV1_mut_cross_short = c22RV1_mut_cross.drop(['StrippedCellLineName'], axis=1)
c22RV1_mut_cross_sum  = c22RV1_mut_cross_short.groupby(["Gene"], as_index=False).sum()
c22RV1_mut_cross_sum

c22RV1_excnamut = pd.merge(c22RV1_excna, c22RV1_mut_cross_sum, on='Gene', how='left')
c22RV1_excnamut.fillna(0, inplace=True)
c22RV1_excnamut.rename(columns={"ACH-000956_Exp":"Expression", "ACH-000956_Cna":"CNA"}, inplace=True)
c22RV1_excnamut

Unnamed: 0,Gene,Expression,CNA,coding_sequence_variant,frameshift_variant,inframe_deletion,inframe_insertion,intron_variant,missense_variant,non_coding_transcript_exon_variant,splice_acceptor_variant,splice_donor_variant,splice_region_variant,start_lost,stop_gained,stop_lost
0,ZNF891,1.700438,1.386429,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,5.613438,1.450845,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,1.339696,0.959258,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,4.826810,0.967708,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,0.141149,0.488352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,-0.003950,1.448325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,2.405204,0.962280,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,4.792525,0.971097,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,0.094555,1.433159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
A375_mut_cross_short = A375_mut_cross.drop(['StrippedCellLineName'], axis=1)
A375_mut_cross_sum  = A375_mut_cross_short.groupby(["Gene"], as_index=False).sum()
A375_mut_cross_sum

A375_excnamut = pd.merge(A375_excna, A375_mut_cross_sum, on='Gene', how='left')
A375_excnamut.fillna(0, inplace=True)
A375_excnamut.rename(columns={"ACH-000219_Exp":"Expression", "ACH-000219_Cna":"CNA"}, inplace=True)
A375_excnamut

Unnamed: 0,Gene,Expression,CNA,frameshift_variant,inframe_deletion,intron_variant,missense_variant,non_coding_transcript_exon_variant,non_coding_transcript_variant,splice_acceptor_variant,splice_donor_variant,splice_region_variant,stop_gained,upstream_gene_variant
0,ZNF891,0.901792,1.092508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,4.699055,1.160546,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,0.519341,1.167579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,5.456987,1.063112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,0.008593,1.042177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,0.037944,1.398191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,1.490064,1.056700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,5.267407,1.161576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,-0.054202,3.344684,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
GI1_mut_cross_short = GI1_mut_cross.drop(['StrippedCellLineName'], axis=1)
GI1_mut_cross_sum  = GI1_mut_cross_short.groupby(["Gene"], as_index=False).sum()
GI1_mut_cross_sum

GI1_excnamut = pd.merge(GI1_excna, GI1_mut_cross_sum, on='Gene', how='left')
GI1_excnamut.fillna(0, inplace=True)
GI1_excnamut.rename(columns={"ACH-000756_Exp":"Expression", "ACH-000756_Cna":"CNA"}, inplace=True)
GI1_excnamut

Unnamed: 0,Gene,Expression,CNA,5_prime_UTR_variant,coding_sequence_variant,frameshift_variant,inframe_deletion,inframe_insertion,intron_variant,missense_variant,splice_acceptor_variant,splice_donor_region_variant,splice_donor_variant,splice_region_variant,start_lost,stop_gained,upstream_gene_variant
0,ZNF891,0.964007,0.939406,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,4.435384,1.423603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,0.208657,1.874872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,5.241849,1.210110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,0.008593,0.915362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,0.010153,1.439086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,1.222753,1.197913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,4.700475,0.972241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,0.217511,1.409193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
HT29_mut_cross_short = HT29_mut_cross.drop(['StrippedCellLineName'], axis=1)
HT29_mut_cross_sum  = HT29_mut_cross_short.groupby(["Gene"], as_index=False).sum()
HT29_mut_cross_sum

HT29_excnamut = pd.merge(HT29_excna, HT29_mut_cross_sum, on='Gene', how='left')
HT29_excnamut.fillna(0, inplace=True)
HT29_excnamut.rename(columns={"ACH-000552_Exp":"Expression", "ACH-000552_Cna":"CNA"}, inplace=True)
HT29_excnamut

Unnamed: 0,Gene,Expression,CNA,frameshift_variant,inframe_insertion,intron_variant,missense_variant,non_coding_transcript_exon_variant,splice_acceptor_variant,splice_donor_variant,splice_region_variant,stop_gained,stop_lost
0,ZNF891,0.957231,1.010382,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,5.215082,1.357073,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,0.433141,1.407945,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,4.930540,1.327831,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,0.021226,1.079358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,0.051638,0.988641,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,0.916536,1.592958,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,4.916421,1.022519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,1.581321,1.386769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
IPC298_mut_cross_short = IPC298_mut_cross.drop(['StrippedCellLineName'], axis=1)
IPC298_mut_cross_sum  = IPC298_mut_cross_short.groupby(["Gene"], as_index=False).sum()
IPC298_mut_cross_sum

IPC298_excnamut = pd.merge(IPC298_excna, IPC298_mut_cross_sum, on='Gene', how='left')
IPC298_excnamut.fillna(0, inplace=True)
IPC298_excnamut.rename(columns={"ACH-000915_Exp":"Expression", "ACH-000915_Cna":"CNA"}, inplace=True)
IPC298_excnamut

Unnamed: 0,Gene,Expression,CNA,coding_sequence_variant,frameshift_variant,inframe_deletion,intron_variant,missense_variant,splice_acceptor_variant,splice_donor_5th_base_variant,splice_donor_variant,splice_region_variant,start_lost,stop_gained,stop_lost,upstream_gene_variant
0,ZNF891,0.679777,1.403095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,4.925675,1.422447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,0.757949,0.852100,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,4.970687,0.904906,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,0.046124,1.129922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,-0.003950,0.873788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,1.190467,1.218367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,4.839013,0.869643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,-0.054202,1.422447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
MELJUSO_mut_cross_short = MELJUSO_mut_cross.drop(['StrippedCellLineName'], axis=1)
MELJUSO_mut_cross_sum  = MELJUSO_mut_cross_short.groupby(["Gene"], as_index=False).sum()
MELJUSO_mut_cross_sum

MELJUSO_excnamut = pd.merge(MELJUSO_excna, MELJUSO_mut_cross_sum, on='Gene', how='left')
MELJUSO_excnamut.fillna(0, inplace=True)
MELJUSO_excnamut.rename(columns={"ACH-000881_Exp":"Expression", "ACH-000881_Cna":"CNA"}, inplace=True)
MELJUSO_excnamut

Unnamed: 0,Gene,Expression,CNA,coding_sequence_variant,frameshift_variant,inframe_deletion,inframe_insertion,intron_variant,missense_variant,non_coding_transcript_variant,protein_altering_variant,splice_acceptor_variant,splice_donor_variant,splice_region_variant,start_lost,stop_gained,stop_lost
0,ZNF891,0.185593,0.979581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,4.525603,0.977569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,0.403189,0.841375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,5.235392,0.892918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,0.351825,0.958348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,0.024116,0.933237,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,1.593196,1.283174,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,5.138710,1.317119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,0.042240,1.385227,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
MEWO_mut_cross_short = MEWO_mut_cross.drop(['StrippedCellLineName'], axis=1)
MEWO_mut_cross_sum  = MEWO_mut_cross_short.groupby(["Gene"], as_index=False).sum()
MEWO_mut_cross_sum

MEWO_excnamut = pd.merge(MEWO_excna, MEWO_mut_cross_sum, on='Gene', how='left')
MEWO_excnamut.fillna(0, inplace=True)
MEWO_excnamut.rename(columns={"ACH-000987_Exp":"Expression", "ACH-000987_Cna":"CNA"}, inplace=True)
MEWO_excnamut

Unnamed: 0,Gene,Expression,CNA,5_prime_UTR_variant,NMD_transcript_variant,coding_sequence_variant,frameshift_variant,inframe_deletion,inframe_insertion,intron_variant,...,non_coding_transcript_variant,protein_altering_variant,splice_acceptor_variant,splice_donor_5th_base_variant,splice_donor_variant,splice_polypyrimidine_tract_variant,splice_region_variant,start_lost,stop_gained,stop_lost
0,ZNF891,0.310515,0.975758,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,4.945285,0.984556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,0.351802,0.961415,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,4.928580,0.971287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,0.118045,0.997045,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,0.105128,1.007756,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,0.999187,0.981418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,4.768444,0.987627,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,0.015345,0.984556,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
OVCAR8_mut_cross_short = OVCAR8_mut_cross.drop(['StrippedCellLineName'], axis=1)
OVCAR8_mut_cross_sum  = OVCAR8_mut_cross_short.groupby(["Gene"], as_index=False).sum()
OVCAR8_mut_cross_sum

OVCAR8_excnamut = pd.merge(OVCAR8_excna, OVCAR8_mut_cross_sum, on='Gene', how='left')
OVCAR8_excnamut.fillna(0, inplace=True)
OVCAR8_excnamut.rename(columns={"ACH-000696_Exp":"Expression", "ACH-000696_Cna":"CNA"}, inplace=True)
OVCAR8_excnamut

Unnamed: 0,Gene,Expression,CNA,coding_sequence_variant,frameshift_variant,inframe_deletion,inframe_insertion,intron_variant,missense_variant,non_coding_transcript_variant,splice_acceptor_variant,splice_donor_5th_base_variant,splice_donor_variant,splice_polypyrimidine_tract_variant,splice_region_variant,start_lost,stop_gained
0,ZNF891,0.922844,1.260240,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,2.859775,0.433125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,0.948189,1.348864,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,5.569033,1.254125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,-0.004166,0.889224,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,-0.003950,0.854034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,2.626343,1.277046,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,5.137805,1.239378,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,0.145017,2.531575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
MDAMB468_mut_cross_short = MDAMB468_mut_cross.drop(['StrippedCellLineName'], axis=1)
MDAMB468_mut_cross_sum  = MDAMB468_mut_cross_short.groupby(["Gene"], as_index=False).sum()
MDAMB468_mut_cross_sum

MDAMB468_excnamut = pd.merge(MDAMB468_excna, MDAMB468_mut_cross_sum, on='Gene', how='left')
MDAMB468_excnamut.fillna(0, inplace=True)
MDAMB468_excnamut.rename(columns={"ACH-000849_Exp":"Expression", "ACH-000849_Cna":"CNA"}, inplace=True)
MDAMB468_excnamut

Unnamed: 0,Gene,Expression,CNA,coding_sequence_variant,frameshift_variant,inframe_deletion,inframe_insertion,missense_variant,splice_acceptor_variant,splice_donor_variant,splice_region_variant,start_lost,stop_gained,stop_lost
0,ZNF891,0.549187,0.884447,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,5.132134,1.138737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,2.360305,1.075559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,5.507535,1.431356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,-0.004166,1.427336,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,0.169270,1.354241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,2.136818,1.052460,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,4.665617,0.976861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,0.384091,1.138737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
HCT116_mut_cross_short = HCT116_mut_cross.drop(['StrippedCellLineName'], axis=1)
HCT116_mut_cross_sum  = HCT116_mut_cross_short.groupby(["Gene"], as_index=False).sum()
HCT116_mut_cross_sum

HCT116_excnamut = pd.merge(HCT116_excna, HCT116_mut_cross_sum, on='Gene', how='left')
HCT116_excnamut.fillna(0, inplace=True)
HCT116_excnamut.rename(columns={"ACH-000971_Exp":"Expression", "ACH-000971_Cna":"CNA"}, inplace=True)

HCT116_excnamut

Unnamed: 0,Gene,Expression,CNA,coding_sequence_variant,frameshift_variant,inframe_deletion,intron_variant,missense_variant,non_coding_transcript_exon_variant,non_coding_transcript_variant,protein_altering_variant,splice_acceptor_variant,splice_donor_5th_base_variant,splice_donor_variant,splice_region_variant,start_lost,stop_gained,stop_lost
0,ZNF891,-0.019059,0.990980,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,5.285371,0.988569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,0.804196,0.987542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,5.873076,0.989453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,0.021226,0.982726,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,0.024116,0.982712,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,3.576248,0.993140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,5.001986,0.988286,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,0.001704,0.988569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
A549_mut_cross_short = A549_mut_cross.drop(['StrippedCellLineName'], axis=1)
A549_mut_cross_sum  = A549_mut_cross_short.groupby(["Gene"], as_index=False).sum()
A549_mut_cross_sum

A549_excnamut = pd.merge(A549_excna, A549_mut_cross_sum, on='Gene', how='left')
A549_excnamut.fillna(0, inplace=True)
A549_excnamut.rename(columns={"ACH-000681_Exp":"Expression", "ACH-000681_Cna":"CNA"}, inplace=True)

A549_excnamut

Unnamed: 0,Gene,Expression,CNA,frameshift_variant,inframe_deletion,missense_variant,splice_acceptor_variant,splice_donor_variant,splice_region_variant,start_lost,stop_gained,stop_lost
0,ZNF891,1.199543,1.135109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,5.009358,1.154509,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,0.718209,1.181176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,4.630616,1.113310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,-0.004166,1.056327,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,-0.003950,1.121489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,3.215394,1.157622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,5.203779,1.146539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,-0.054202,1.732522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
HELA_mut_cross_short = HELA_mut_cross.drop(['StrippedCellLineName'], axis=1)
HELA_mut_cross_sum  = HELA_mut_cross_short.groupby(["Gene"], as_index=False).sum()
HELA_mut_cross_sum

HELA_excnamut = pd.merge(HELA_excna, HELA_mut_cross_sum, on='Gene', how='left')
HELA_excnamut.fillna(0, inplace=True)
HELA_excnamut.rename(columns={"ACH-001086_Exp":"Expression", "ACH-001086_Cna":"CNA"}, inplace=True)

HELA_excnamut

Unnamed: 0,Gene,Expression,CNA,coding_sequence_variant,frameshift_variant,inframe_deletion,intron_variant,missense_variant,splice_acceptor_variant,splice_donor_variant,splice_region_variant,stop_gained,synonymous_variant
0,ZNF891,0.696291,0.865076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,4.195089,1.165565,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,1.607464,2.697693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,4.798854,1.136561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,-0.004166,0.865746,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,-0.003950,1.120104,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,1.634147,1.108389,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,5.007916,1.130673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,0.229243,0.864966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
JURKAT_mut_cross_short = JURKAT_mut_cross.drop(['StrippedCellLineName'], axis=1)
JURKAT_mut_cross_sum  = JURKAT_mut_cross_short.groupby(["Gene"], as_index=False).sum()
JURKAT_mut_cross_sum

JURKAT_excnamut = pd.merge(JURKAT_excna, JURKAT_mut_cross_sum, on='Gene', how='left')
JURKAT_excnamut.fillna(0, inplace=True)
JURKAT_excnamut.rename(columns={"ACH-000995_Exp":"Expression", "ACH-000995_Cna":"CNA"}, inplace=True)

JURKAT_excnamut

Unnamed: 0,Gene,Expression,CNA,frameshift_variant,inframe_deletion,inframe_insertion,missense_variant,non_coding_transcript_variant,splice_acceptor_variant,splice_donor_variant,splice_region_variant,start_lost,stop_gained,synonymous_variant
0,ZNF891,1.739502,1.020269,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,5.006481,0.998743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,4.598735,0.999992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,5.333405,1.006714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,0.033735,0.562951,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,-0.003950,1.017233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,2.897058,1.059664,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,5.363947,0.999992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,0.912276,0.998743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
K562_mut_cross_short = K562_mut_cross.drop(['StrippedCellLineName'], axis=1)
K562_mut_cross_sum  = K562_mut_cross_short.groupby(["Gene"], as_index=False).sum()
K562_mut_cross_sum

K562_excnamut = pd.merge(K562_excna, K562_mut_cross_sum, on='Gene', how='left')
K562_excnamut.fillna(0, inplace=True)
K562_excnamut.rename(columns={"ACH-000551_Exp":"Expression", "ACH-000551_Cna":"CNA"}, inplace=True)

K562_excnamut

Unnamed: 0,Gene,Expression,CNA,frameshift_variant,inframe_deletion,inframe_insertion,missense_variant,splice_acceptor_variant,splice_donor_variant,splice_region_variant,start_lost,stop_gained
0,ZNF891,1.017047,0.985622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ARMC10,5.630714,1.299036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,PTGER4,-0.058727,1.318882,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,EIF1AD,4.623501,0.980222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABCG5,-0.004166,0.994354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
19132,FCRL6,0.010153,1.026953,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19133,DNMT3B,4.814656,1.004611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19134,ZCCHC10,5.270726,0.991217,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19135,PRSS2,0.693753,1.620288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
c22RV1_excnamut['Mut'] = c22RV1_excnamut.iloc[:, 3:].sum(axis=1)
c22RV1_excnamut_final = c22RV1_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
c22RV1_excnamut_final

A375_excnamut['Mut'] = A375_excnamut.iloc[:, 3:].sum(axis=1)
A375_excnamut_final = A375_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
A375_excnamut_final

GI1_excnamut['Mut'] = GI1_excnamut.iloc[:, 3:].sum(axis=1)
GI1_excnamut_final = GI1_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
GI1_excnamut_final

HT29_excnamut['Mut'] = HT29_excnamut.iloc[:, 3:].sum(axis=1)
HT29_excnamut_final = HT29_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
HT29_excnamut_final

IPC298_excnamut['Mut'] = IPC298_excnamut.iloc[:, 3:].sum(axis=1)
IPC298_excnamut_final = IPC298_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
IPC298_excnamut_final

MELJUSO_excnamut['Mut'] = MELJUSO_excnamut.iloc[:, 3:].sum(axis=1)
MELJUSO_excnamut_final = MELJUSO_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
MELJUSO_excnamut_final

MEWO_excnamut['Mut'] = MEWO_excnamut.iloc[:, 3:].sum(axis=1)
MEWO_excnamut_final = MEWO_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
MEWO_excnamut_final

OVCAR8_excnamut['Mut'] = OVCAR8_excnamut.iloc[:, 3:].sum(axis=1)
OVCAR8_excnamut_final = OVCAR8_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
OVCAR8_excnamut_final

MDAMB468_excnamut['Mut'] = MDAMB468_excnamut.iloc[:, 3:].sum(axis=1)
MDAMB468_excnamut_final = MDAMB468_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
MDAMB468_excnamut_final

HCT116_excnamut['Mut'] = HCT116_excnamut.iloc[:, 3:].sum(axis=1)
HCT116_excnamut_final = HCT116_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
HCT116_excnamut_final

A549_excnamut['Mut'] = A549_excnamut.iloc[:, 3:].sum(axis=1)
A549_excnamut_final = A549_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
A549_excnamut_final

HELA_excnamut['Mut'] = HELA_excnamut.iloc[:, 3:].sum(axis=1)
HELA_excnamut_final = HELA_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
HELA_excnamut_final

JURKAT_excnamut['Mut'] = JURKAT_excnamut.iloc[:, 3:].sum(axis=1)
JURKAT_excnamut_final = JURKAT_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
JURKAT_excnamut_final

K562_excnamut['Mut'] = K562_excnamut.iloc[:, 3:].sum(axis=1)
K562_excnamut_final = K562_excnamut[['Gene', 'Expression', 'CNA', 'Mut']].copy()
K562_excnamut_final

Unnamed: 0,Gene,Expression,CNA,Mut
0,ZNF891,1.017047,0.985622,0.0
1,ARMC10,5.630714,1.299036,0.0
2,PTGER4,-0.058727,1.318882,0.0
3,EIF1AD,4.623501,0.980222,0.0
4,ABCG5,-0.004166,0.994354,0.0
...,...,...,...,...
19132,FCRL6,0.010153,1.026953,0.0
19133,DNMT3B,4.814656,1.004611,0.0
19134,ZCCHC10,5.270726,0.991217,0.0
19135,PRSS2,0.693753,1.620288,0.0


In [30]:
c22RV1_excnamut_final['Gene'] = c22RV1_excnamut_final['Gene'].str.replace(" ", "")
A375_excnamut_final['Gene'] = A375_excnamut_final['Gene'].str.replace(" ", "")
GI1_excnamut_final['Gene'] = GI1_excnamut_final['Gene'].str.replace(" ", "")
HT29_excnamut_final['Gene'] = HT29_excnamut_final['Gene'].str.replace(" ", "")
IPC298_excnamut_final['Gene'] = IPC298_excnamut_final['Gene'].str.replace(" ", "")
MELJUSO_excnamut_final['Gene'] = MELJUSO_excnamut_final['Gene'].str.replace(" ", "")
MEWO_excnamut_final['Gene'] = MEWO_excnamut_final['Gene'].str.replace(" ", "")
OVCAR8_excnamut_final['Gene'] = OVCAR8_excnamut_final['Gene'].str.replace(" ", "")
MDAMB468_excnamut_final['Gene'] = MDAMB468_excnamut_final['Gene'].str.replace(" ", "")
HCT116_excnamut_final['Gene'] = HCT116_excnamut_final['Gene'].str.replace(" ", "")
A549_excnamut_final['Gene'] = A549_excnamut_final['Gene'].str.replace(" ", "")
HELA_excnamut_final['Gene'] = HELA_excnamut_final['Gene'].str.replace(" ", "")
JURKAT_excnamut_final['Gene'] = JURKAT_excnamut_final['Gene'].str.replace(" ", "")
K562_excnamut_final['Gene'] = K562_excnamut_final['Gene'].str.replace(" ", "")

# Protein

In [31]:
#https://cellmodelpassports.sanger.ac.uk/downloads
proteomics = pd.read_csv('../../data/CCLE+TCGA/proteomics_all_20220713.csv')
proteomics

Unnamed: 0,uniprot_id,model_id,model_name,protein_intensity,zscore,symbol
0,A0A075B6K4,SIDM00483,SK-GT-4,-0.577590,-1.284310,IGLV3-10
1,A0A075B6K4,SIDM00689,JM1,3.408550,0.129406,IGLV3-10
2,A0A075B6K4,SIDM01259,GR-ST,6.300060,1.154900,IGLV3-10
3,A0A075B6N1,SIDM00846,HeLa,0.127721,-1.034210,TRBV19
4,A0A075B6N1,SIDM00958,CML-T1,4.283890,1.352450,TRBV19
...,...,...,...,...,...,...
4925077,Q9Y6Y8,SIDM01248,CGTH-W-1,3.812190,0.723363,SEC23IP
4925078,Q9Y6Y8,SIDM01251,H9,3.854180,0.787636,SEC23IP
4925079,Q9Y6Y8,SIDM01259,GR-ST,3.802910,0.709159,SEC23IP
4925080,Q9Y6Y8,SIDM01261,YMB-1-E,4.540440,1.838070,SEC23IP


In [10]:
proteomics.loc[(proteomics['model_name']=='Hs-936-T')]

Unnamed: 0,uniprot_id,model_id,model_name,protein_intensity,zscore,symbol


In [32]:
#https://cellmodelpassports.sanger.ac.uk/downloads
#proteomics = pd.read_csv('/home/songyeon/20230427/dataset/DepMap23Q2/proteomics_all_20220713.csv')

cell_proteins = proteomics.loc[(proteomics['model_name']=='22RV1')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

c22RV1_excnamut_protein = pd.merge(c22RV1_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
c22RV1_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
c22RV1_excnamut_protein = c22RV1_excnamut_protein.fillna(0)
c22RV1_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,1.700438,1.386429,0.0,0.00000
1,ARMC10,5.613438,1.450845,0.0,3.62348
2,PTGER4,1.339696,0.959258,0.0,0.00000
3,EIF1AD,4.826810,0.967708,0.0,0.00000
4,ABCG5,0.141149,0.488352,0.0,0.00000
...,...,...,...,...,...
19132,FCRL6,-0.003950,1.448325,0.0,0.00000
19133,DNMT3B,2.405204,0.962280,0.0,0.00000
19134,ZCCHC10,4.792525,0.971097,0.0,0.00000
19135,PRSS2,0.094555,1.433159,0.0,0.00000


In [33]:
cell_proteins = proteomics.loc[(proteomics['model_name']=='A375')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

A375_excnamut_protein = pd.merge(A375_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
A375_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
A375_excnamut_protein = A375_excnamut_protein.fillna(0)
A375_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,0.901792,1.092508,0.0,0.00000
1,ARMC10,4.699055,1.160546,0.0,3.94049
2,PTGER4,0.519341,1.167579,0.0,0.00000
3,EIF1AD,5.456987,1.063112,0.0,1.04775
4,ABCG5,0.008593,1.042177,0.0,0.00000
...,...,...,...,...,...
19132,FCRL6,0.037944,1.398191,0.0,0.00000
19133,DNMT3B,1.490064,1.056700,0.0,0.00000
19134,ZCCHC10,5.267407,1.161576,0.0,0.00000
19135,PRSS2,-0.054202,3.344684,0.0,0.00000


In [34]:
cell_proteins = proteomics.loc[(proteomics['model_name']=='GI-1')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

GI1_excnamut_protein = pd.merge(GI1_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
GI1_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
GI1_excnamut_protein = GI1_excnamut_protein.fillna(0)
GI1_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,0.964007,0.939406,0.0,0.00000
1,ARMC10,4.435384,1.423603,0.0,3.15417
2,PTGER4,0.208657,1.874872,0.0,0.00000
3,EIF1AD,5.241849,1.210110,0.0,2.57237
4,ABCG5,0.008593,0.915362,0.0,0.00000
...,...,...,...,...,...
19132,FCRL6,0.010153,1.439086,0.0,0.00000
19133,DNMT3B,1.222753,1.197913,0.0,0.00000
19134,ZCCHC10,4.700475,0.972241,0.0,0.00000
19135,PRSS2,0.217511,1.409193,0.0,0.00000


In [35]:
cell_proteins = proteomics.loc[(proteomics['model_name']=='HT-29')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

HT29_excnamut_protein = pd.merge(HT29_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
HT29_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
HT29_excnamut_protein = HT29_excnamut_protein.fillna(0)
HT29_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,0.957231,1.010382,0.0,0.00000
1,ARMC10,5.215082,1.357073,0.0,4.22043
2,PTGER4,0.433141,1.407945,0.0,0.00000
3,EIF1AD,4.930540,1.327831,0.0,0.00000
4,ABCG5,0.021226,1.079358,0.0,0.00000
...,...,...,...,...,...
19132,FCRL6,0.051638,0.988641,0.0,0.00000
19133,DNMT3B,0.916536,1.592958,0.0,0.00000
19134,ZCCHC10,4.916421,1.022519,0.0,0.00000
19135,PRSS2,1.581321,1.386769,0.0,0.00000


In [36]:
cell_proteins = proteomics.loc[(proteomics['model_name']=='IPC-298')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

IPC298_excnamut_protein = pd.merge(IPC298_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
IPC298_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
IPC298_excnamut_protein = IPC298_excnamut_protein.fillna(0)
IPC298_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,0.679777,1.403095,0.0,0.00000
1,ARMC10,4.925675,1.422447,0.0,3.95862
2,PTGER4,0.757949,0.852100,0.0,0.00000
3,EIF1AD,4.970687,0.904906,0.0,2.19603
4,ABCG5,0.046124,1.129922,0.0,0.00000
...,...,...,...,...,...
19132,FCRL6,-0.003950,0.873788,0.0,0.00000
19133,DNMT3B,1.190467,1.218367,0.0,0.00000
19134,ZCCHC10,4.839013,0.869643,0.0,0.00000
19135,PRSS2,-0.054202,1.422447,0.0,0.00000


In [37]:
cell_proteins = proteomics.loc[(proteomics['model_name']=='MEL-JUSO')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

MELJUSO_excnamut_protein = pd.merge(MELJUSO_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
MELJUSO_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
MELJUSO_excnamut_protein = MELJUSO_excnamut_protein.fillna(0)
MELJUSO_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,0.185593,0.979581,0.0,0.00000
1,ARMC10,4.525603,0.977569,0.0,2.65473
2,PTGER4,0.403189,0.841375,0.0,0.00000
3,EIF1AD,5.235392,0.892918,0.0,1.65119
4,ABCG5,0.351825,0.958348,0.0,0.00000
...,...,...,...,...,...
19132,FCRL6,0.024116,0.933237,0.0,0.00000
19133,DNMT3B,1.593196,1.283174,0.0,0.00000
19134,ZCCHC10,5.138710,1.317119,0.0,0.00000
19135,PRSS2,0.042240,1.385227,0.0,0.00000


In [38]:
cell_proteins = proteomics.loc[(proteomics['model_name']=='Mewo')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

MEWO_excnamut_protein = pd.merge(MEWO_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
MEWO_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
MEWO_excnamut_protein = MEWO_excnamut_protein.fillna(0)
MEWO_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,0.310515,0.975758,0.0,0.00000
1,ARMC10,4.945285,0.984556,0.0,3.41281
2,PTGER4,0.351802,0.961415,0.0,0.00000
3,EIF1AD,4.928580,0.971287,0.0,0.00000
4,ABCG5,0.118045,0.997045,0.0,0.00000
...,...,...,...,...,...
19132,FCRL6,0.105128,1.007756,1.0,0.00000
19133,DNMT3B,0.999187,0.981418,0.0,0.00000
19134,ZCCHC10,4.768444,0.987627,0.0,0.00000
19135,PRSS2,0.015345,0.984556,0.0,0.00000


In [39]:
cell_proteins = proteomics.loc[(proteomics['model_name']=='OVCAR-8')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

OVCAR8_excnamut_protein = pd.merge(OVCAR8_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
OVCAR8_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
OVCAR8_excnamut_protein = OVCAR8_excnamut_protein.fillna(0)
OVCAR8_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,0.922844,1.260240,0.0,0.000000
1,ARMC10,2.859775,0.433125,0.0,0.000000
2,PTGER4,0.948189,1.348864,0.0,0.000000
3,EIF1AD,5.569033,1.254125,0.0,0.808256
4,ABCG5,-0.004166,0.889224,0.0,0.000000
...,...,...,...,...,...
19132,FCRL6,-0.003950,0.854034,0.0,0.000000
19133,DNMT3B,2.626343,1.277046,0.0,-0.222284
19134,ZCCHC10,5.137805,1.239378,0.0,0.000000
19135,PRSS2,0.145017,2.531575,0.0,0.000000


In [40]:
cell_proteins = proteomics.loc[(proteomics['model_name']=='MDA-MB-468')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

MDAMB468_excnamut_protein = pd.merge(MDAMB468_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
MDAMB468_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
MDAMB468_excnamut_protein = MDAMB468_excnamut_protein.fillna(0)
MDAMB468_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,0.549187,0.884447,0.0,0.00000
1,ARMC10,5.132134,1.138737,0.0,4.48645
2,PTGER4,2.360305,1.075559,0.0,0.00000
3,EIF1AD,5.507535,1.431356,0.0,1.58370
4,ABCG5,-0.004166,1.427336,0.0,0.00000
...,...,...,...,...,...
19132,FCRL6,0.169270,1.354241,0.0,0.00000
19133,DNMT3B,2.136818,1.052460,0.0,0.00000
19134,ZCCHC10,4.665617,0.976861,0.0,0.00000
19135,PRSS2,0.384091,1.138737,0.0,0.00000


In [41]:
cell_proteins = proteomics.loc[(proteomics['model_name']=='HCT-116')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

HCT116_excnamut_protein = pd.merge(HCT116_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
HCT116_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
HCT116_excnamut_protein = HCT116_excnamut_protein.fillna(0)
HCT116_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,-0.019059,0.990980,0.0,0.00000
1,ARMC10,5.285371,0.988569,0.0,3.54307
2,PTGER4,0.804196,0.987542,0.0,0.00000
3,EIF1AD,5.873076,0.989453,0.0,1.35707
4,ABCG5,0.021226,0.982726,0.0,0.00000
...,...,...,...,...,...
19132,FCRL6,0.024116,0.982712,0.0,0.00000
19133,DNMT3B,3.576248,0.993140,0.0,2.34951
19134,ZCCHC10,5.001986,0.988286,0.0,0.00000
19135,PRSS2,0.001704,0.988569,0.0,0.00000


In [42]:
cell_proteins = proteomics.loc[(proteomics['model_name']=='A549')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

A549_excnamut_protein = pd.merge(A549_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
A549_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
A549_excnamut_protein = A549_excnamut_protein.fillna(0)
A549_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,1.199543,1.135109,0.0,0.000000
1,ARMC10,5.009358,1.154509,0.0,3.252030
2,PTGER4,0.718209,1.181176,0.0,0.000000
3,EIF1AD,4.630616,1.113310,0.0,0.831381
4,ABCG5,-0.004166,1.056327,0.0,0.000000
...,...,...,...,...,...
19132,FCRL6,-0.003950,1.121489,0.0,0.000000
19133,DNMT3B,3.215394,1.157622,0.0,0.000000
19134,ZCCHC10,5.203779,1.146539,0.0,0.000000
19135,PRSS2,-0.054202,1.732522,0.0,0.000000


In [43]:
cell_proteins = proteomics.loc[(proteomics['model_name']=='Jurkat')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

JURKAT_excnamut_protein = pd.merge(JURKAT_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
JURKAT_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
JURKAT_excnamut_protein = JURKAT_excnamut_protein.fillna(0)
JURKAT_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,1.739502,1.020269,0.0,0.00000
1,ARMC10,5.006481,0.998743,0.0,2.47094
2,PTGER4,4.598735,0.999992,0.0,0.00000
3,EIF1AD,5.333405,1.006714,0.0,1.60377
4,ABCG5,0.033735,0.562951,0.0,0.00000
...,...,...,...,...,...
19132,FCRL6,-0.003950,1.017233,0.0,0.00000
19133,DNMT3B,2.897058,1.059664,2.0,0.00000
19134,ZCCHC10,5.363947,0.999992,0.0,0.00000
19135,PRSS2,0.912276,0.998743,0.0,0.00000


In [44]:
cell_proteins = proteomics.loc[(proteomics['model_name']=='K-562')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

K562_excnamut_protein = pd.merge(K562_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
K562_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
K562_excnamut_protein = K562_excnamut_protein.fillna(0)
K562_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,1.017047,0.985622,0.0,0.000000
1,ARMC10,5.630714,1.299036,0.0,3.211030
2,PTGER4,-0.058727,1.318882,0.0,0.000000
3,EIF1AD,4.623501,0.980222,0.0,2.267930
4,ABCG5,-0.004166,0.994354,0.0,0.000000
...,...,...,...,...,...
19132,FCRL6,0.010153,1.026953,0.0,0.000000
19133,DNMT3B,4.814656,1.004611,0.0,0.198405
19134,ZCCHC10,5.270726,0.991217,0.0,0.000000
19135,PRSS2,0.693753,1.620288,0.0,0.000000


In [45]:
cell_proteins = proteomics.loc[(proteomics['model_name']=='HeLa')]
cell_proteins = cell_proteins[['symbol', 'protein_intensity']]
cell_proteins.reset_index(inplace=True, drop=True)
cell_proteins.rename(columns={"symbol": "Gene"}, inplace=True)
cell_proteins

cell_proteins.pivot_table(index='Gene', columns=cell_proteins.groupby('Gene').cumcount(), values='protein_intensity')

HELA_excnamut_protein = pd.merge(HELA_excnamut_final, cell_proteins, left_on='Gene', right_on='Gene', how='left')
HELA_excnamut_protein.rename(columns={"protein_intensity":"ProteinIntensity"}, inplace=True)
HELA_excnamut_protein = HELA_excnamut_protein.fillna(0)
HELA_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,0.696291,0.865076,0.0,0.00000
1,ARMC10,4.195089,1.165565,0.0,2.53377
2,PTGER4,1.607464,2.697693,0.0,0.00000
3,EIF1AD,4.798854,1.136561,0.0,1.39979
4,ABCG5,-0.004166,0.865746,0.0,0.00000
...,...,...,...,...,...
19132,FCRL6,-0.003950,1.120104,1.0,0.00000
19133,DNMT3B,1.634147,1.108389,0.0,0.00000
19134,ZCCHC10,5.007916,1.130673,0.0,0.00000
19135,PRSS2,0.229243,0.864966,0.0,0.00000


In [46]:
c22RV1_excnamut_protein.isnull().any().any()

False

In [47]:
c22RV1_excnamut_protein.fillna(0, inplace=True)
A375_excnamut_protein.fillna(0, inplace=True)
GI1_excnamut_protein.fillna(0, inplace=True)
HT29_excnamut_protein.fillna(0, inplace=True)
IPC298_excnamut_protein.fillna(0, inplace=True)
MELJUSO_excnamut_protein.fillna(0, inplace=True)
MEWO_excnamut_protein.fillna(0, inplace=True)
OVCAR8_excnamut_protein.fillna(0, inplace=True)
MDAMB468_excnamut_protein.fillna(0, inplace=True)
HCT116_excnamut_protein.fillna(0, inplace=True)
A549_excnamut_protein.fillna(0, inplace=True)
JURKAT_excnamut_protein.fillna(0, inplace=True)
K562_excnamut_protein.fillna(0, inplace=True)
HELA_excnamut_protein.fillna(0, inplace=True)

In [50]:
HCT116_excnamut_protein

Unnamed: 0,Gene,Expression,CNA,Mut,ProteinIntensity
0,ZNF891,-0.019059,0.990980,0.0,0.00000
1,ARMC10,5.285371,0.988569,0.0,3.54307
2,PTGER4,0.804196,0.987542,0.0,0.00000
3,EIF1AD,5.873076,0.989453,0.0,1.35707
4,ABCG5,0.021226,0.982726,0.0,0.00000
...,...,...,...,...,...
19132,FCRL6,0.024116,0.982712,0.0,0.00000
19133,DNMT3B,3.576248,0.993140,0.0,2.34951
19134,ZCCHC10,5.001986,0.988286,0.0,0.00000
19135,PRSS2,0.001704,0.988569,0.0,0.00000


In [52]:
'''
c22RV1_excnamut_protein.to_csv('../../data/22RV1_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
A375_excnamut_protein.to_csv('../../data/A375_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
GI1_excnamut_protein.to_csv('../../data/GI1_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
HT29_excnamut_protein.to_csv('../../data/HT29_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
IPC298_excnamut_protein.to_csv('../../data/IPC298_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
MELJUSO_excnamut_protein.to_csv('../../data/MELJUSO_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
MEWO_excnamut_protein.to_csv('../../data/MEWO_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
OVCAR8_excnamut_protein.to_csv('../../data/OVCAR8_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
MDAMB468_excnamut_protein.to_csv('../../data/MDAMB468_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
HCT116_excnamut_protein.to_csv('../../data/HCT116_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
A549_excnamut_protein.to_csv('../../data/A549_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
JURKAT_excnamut_protein.to_csv('../../data/JURKAT_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
K562_excnamut_protein.to_csv('../../data/K562_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
HELA_excnamut_protein.to_csv('../../data/HELA_ccle_excnamutprot_final_24Q2-ljoin.csv', index=False)
'''