# Contrastive Learning for Predicting Cancer Prognosis Using Gene Expression Values

## Generate CPTAC-3 GeneExp Match File

*CPTAC3_Preprocessing.ipynb* notebook is offering detailed step-by-step instructions on how to prepare CPTAC-3 Matched GeneExp file from orignial CPTAC-3 and DKFZ data files.

In [1]:
import pandas as pd
clinical_df = pd.read_csv('CPTAC3/clinical.project-CPTAC-3.2023-04-14.txt', sep='\t')

In [2]:
clinical_df.primary_diagnosis.unique()

array(['Renal cell carcinoma, NOS', 'Adenocarcinoma, NOS',
       'Squamous cell carcinoma, NOS', 'Glioblastoma',
       'Endometrioid adenocarcinoma, NOS',
       'Infiltrating duct carcinoma, NOS', nan], dtype=object)

In [3]:
clinical_df.tissue_or_organ_of_origin.unique()

array(['Kidney, NOS', 'Upper lobe, lung', 'Lower lobe, lung', 'Lung, NOS',
       'Frontal lobe', 'Middle lobe, lung', 'Corpus uteri',
       'Head of pancreas', 'Brain, NOS', 'Endometrium', 'Larynx, NOS',
       'Temporal lobe', 'Gum, NOS', 'Pancreas, NOS', 'Body of pancreas',
       'Floor of mouth, NOS', 'Cheek mucosa', 'Base of tongue, NOS',
       'Parietal lobe', 'Lip, NOS',
       'Overlapping lesion of lip, oral cavity and pharynx',
       'Tongue, NOS', 'Oropharynx, NOS', 'Tail of pancreas',
       'Occipital lobe', 'Tonsil, NOS', 'Head, face or neck, NOS', nan],
      dtype=object)

In [4]:
cancer_df = clinical_df[['primary_diagnosis', 'tissue_or_organ_of_origin']].drop_duplicates()
cancer_df

Unnamed: 0,primary_diagnosis,tissue_or_organ_of_origin
0,"Renal cell carcinoma, NOS","Kidney, NOS"
1,"Adenocarcinoma, NOS","Upper lobe, lung"
2,"Squamous cell carcinoma, NOS","Upper lobe, lung"
3,"Adenocarcinoma, NOS","Lower lobe, lung"
4,"Squamous cell carcinoma, NOS","Lung, NOS"
5,Glioblastoma,Frontal lobe
6,"Squamous cell carcinoma, NOS","Middle lobe, lung"
7,"Adenocarcinoma, NOS","Lung, NOS"
8,"Endometrioid adenocarcinoma, NOS",Corpus uteri
11,"Infiltrating duct carcinoma, NOS",Head of pancreas


In [5]:
cancer_map_dic = pd.read_excel('CPTAC3/cancer_map.xlsx')
cancer_map_dic

Unnamed: 0,primary_diagnosis,tissue_or_organ_of_origin,cancer
0,"Renal cell carcinoma, NOS","Kidney, NOS",KIRC
1,"Adenocarcinoma, NOS","Upper lobe, lung",LUAD
2,"Squamous cell carcinoma, NOS","Upper lobe, lung",LUSC
3,"Adenocarcinoma, NOS","Lower lobe, lung",LUAD
4,"Squamous cell carcinoma, NOS","Lung, NOS",LUSC
5,Glioblastoma,Frontal lobe,
6,"Squamous cell carcinoma, NOS","Middle lobe, lung",LUSC
7,"Adenocarcinoma, NOS","Lung, NOS",LUAD
8,"Endometrioid adenocarcinoma, NOS",Corpus uteri,
9,"Infiltrating duct carcinoma, NOS",Head of pancreas,


In [6]:
clinical_df['cancer'] = clinical_df.apply(
    lambda x: cancer_map_dic.loc[
        (cancer_map_dic['primary_diagnosis'] == x['primary_diagnosis']) &
        (cancer_map_dic['tissue_or_organ_of_origin'] == x['tissue_or_organ_of_origin']),
        'cancer'].values,
    axis=1
)
clinical_df = clinical_df.dropna(subset=['last_known_disease_status'])
clinical_df

Unnamed: 0,case_id,case_submitter_id,project_id,age_at_index,age_is_obfuscated,cause_of_death,cause_of_death_source,country_of_residence_at_enrollment,days_to_birth,days_to_death,...,treatment_dose,treatment_dose_units,treatment_effect,treatment_effect_indicator,treatment_frequency,treatment_intent_type,treatment_or_therapy,treatment_outcome,treatment_type,cancer
0,6bddd554-1643-462b-828c-d292d6f8b10f,C3L-00966,CPTAC-3,'--,'--,'--,'--,'--,-21878,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,[KIRC]
1,94bb61ca-3e9c-4754-b20f-a5eb4b3cd5dd,C3L-03462,CPTAC-3,'--,'--,'--,'--,'--,-22465,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,[LUAD]
2,6c28fff9-68e9-465f-bebb-7574901e333c,C3L-02969,CPTAC-3,'--,'--,'--,'--,'--,-28108,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,[LUSC]
3,c9486a50-8d6f-4871-b4c4-692e0da5803e,C3N-00572,CPTAC-3,'--,'--,'--,'--,'--,-17745,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,[LUAD]
4,6c5d5475-c22c-464c-abe1-3ea94bc80538,C3N-00555,CPTAC-3,'--,'--,'--,'--,'--,-21771,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,[LUSC]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1164,fdd6c700-c04d-4857-a522-8f83f67f5f95,C3N-00297,CPTAC-3,'--,'--,'--,'--,'--,-25254,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,[nan]
1165,fed33fff-0310-42ad-a7dc-d563792f18cc,C3N-03853,CPTAC-3,'--,'--,Unknown,'--,'--,-17976,393,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,[nan]
1166,fed70927-708c-479e-942c-139414370672,C3L-02856,CPTAC-3,'--,'--,'--,'--,'--,-32731,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,[KIRC]
1167,ff527151-6797-414e-8760-edcb73306a28,C3L-03984,CPTAC-3,'--,'--,'--,'--,'--,-21834,'--,...,'--,'--,'--,'--,'--,'--,'--,'--,'--,[LUAD]


In [7]:
clinical_df.days_to_last_known_disease_status

0       1647.0
1       1436.0
2       1427.0
3        431.0
4        787.0
         ...  
1164    1734.0
1165     393.0
1166      15.0
1167    1086.0
1168    1801.0
Name: days_to_last_known_disease_status, Length: 1169, dtype: object

In [8]:
status = clinical_df.last_known_disease_status.drop_duplicates()
status

0               Tumor free
1               With tumor
3     Unknown tumor status
10            not reported
Name: last_known_disease_status, dtype: object

In [9]:
len(clinical_df[(clinical_df['last_known_disease_status'] == 'Tumor free') & (clinical_df['cancer'].str[0] == 'KIRC')])

124

In [13]:
len(clinical_df[(clinical_df['last_known_disease_status'] == 'Tumor free') & (clinical_df['cancer'].str[0] == 'LUSC')])

66

In [14]:
len(clinical_df[(clinical_df['last_known_disease_status'] == 'Tumor free') & (clinical_df['cancer'].str[0] == 'LUAD')])

127

In [9]:
import json

# Load JSON data from file
with open('CPTAC3/mapping.json') as f:
    json_data = json.load(f)

# Extract necessary information and create mapping dictionary
mapping_data = {item['file_name'].replace('tsv', 'txt'): item['cases'][0]['case_id'] for item in json_data}

mapping_data

{'df0ae930-d49f-4269-b4cb-942b0544f2c4.rna_seq.augmented_star_gene_counts.txt': 'faf81be5-f575-49a6-b8aa-40add7d3fa2f',
 'd097bf34-fa01-4b9c-8b71-e47effe7724e.rna_seq.augmented_star_gene_counts.txt': '936f72b9-e16d-448b-b6ba-23078dd211b8',
 '64be61be-941f-4bb5-a06a-d281c3d4883f.rna_seq.augmented_star_gene_counts.txt': '939f730c-2fe0-4777-a067-48fdabc82f17',
 '6cbe403c-5c2c-4cb7-8f8c-37424f2917a2.rna_seq.augmented_star_gene_counts.txt': 'ffef8d1d-f99d-4cc0-9f49-46488bfca131',
 '17488f1e-ff21-486e-bc32-5aa1379672ae.rna_seq.augmented_star_gene_counts.txt': 'fd0cd324-0557-4946-8ea9-55ad2032a31b',
 '3c05c010-995e-4576-a4a6-8843ecc67368.rna_seq.augmented_star_gene_counts.txt': 'ffef8d1d-f99d-4cc0-9f49-46488bfca131',
 '00167e25-7f06-44d8-be51-bd022bf36a4d.rna_seq.augmented_star_gene_counts.txt': '000ead0d-abf5-4606-be04-1ea31b999840',
 'f093ea86-510e-403e-8568-d0486f5ec824.rna_seq.augmented_star_gene_counts.txt': '000ead0d-abf5-4606-be04-1ea31b999840',
 '8fafe7f7-907b-41e3-acaa-34e44b7e8055.r

In [10]:
import os
from tqdm import tqdm
path = 'CPTAC3/MatchedGeneExp'
files = os.listdir(path)
KIRC_count = 0
LUAD_count = 0
LUSC_count = 0
for file in tqdm(files, desc='Processing files', unit='file'):
    id_get = mapping_data[file]
    # Retrieve the row with the given case_id
    selected_row = clinical_df.loc[clinical_df['case_id'] == id_get]

    # Print the selected row
    try:
        PFI = selected_row.last_known_disease_status.values[0]
        if PFI == 'Unknown tumor status' or PFI == 'not reported':
            PFI_status = 0
        else:
            PFI_status = 1
        PFI_time = selected_row.days_to_last_known_disease_status.values[0]
        cancer = selected_row.cancer.values[0][0]
        if cancer == "KIRC":
            KIRC_count += 1
        elif cancer == "LUAD":
            LUAD_count += 1
        elif cancer == "LUSC":
            LUSC_count += 1
        #print(PFI_status, PFI_time, cancer)
        data_df = pd.read_csv(os.path.join(path, file))

        if PFI == 'Tumor free':
            data_df.type = PFI
        data_df.PFI = PFI_status
        data_df.PFItime = PFI_time
        data_df.gen_id = cancer
        data_df.bar = id_get
        data_df.to_csv(os.path.join(path, file), index=False)
    except:
        print(file, selected_row.days_to_last_known_disease_status, selected_row.cancer, selected_row.last_known_disease_status)

# Print the counts
print("KIRC count:", KIRC_count)
print("LUAD count:", LUAD_count)
print("LUSC count:", LUSC_count)

Processing files:   7%|▋         | 149/2192 [01:12<16:49,  2.02file/s]

0fdb2f1a-f0d9-4ae6-9de4-98aa7b846330.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:   9%|▉         | 208/2192 [01:40<16:07,  2.05file/s]

164118aa-fd54-4a7c-a86b-40bb1958e634.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:  12%|█▏        | 268/2192 [02:09<15:32,  2.06file/s]

1cc0c765-652d-4fb9-be28-2d46e04db917.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:  18%|█▊        | 399/2192 [03:12<14:17,  2.09file/s]

2b7911ee-4714-4669-8c3a-7cf3a6c153dc.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:  23%|██▎       | 495/2192 [03:58<13:29,  2.10file/s]

382d5298-8c0d-40bf-9391-b962f037c04a.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:  25%|██▌       | 551/2192 [04:25<13:18,  2.05file/s]

3f5e42a8-7295-4a29-9bca-771a361324ab.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:  37%|███▋      | 808/2192 [06:29<11:01,  2.09file/s]

5c5b231f-5f96-4a4e-8020-a48218ec780a.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:  40%|███▉      | 867/2192 [06:57<10:46,  2.05file/s]

62a23bb9-be3e-4e5b-8faf-ece06c60a4d9.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:  49%|████▉     | 1070/2192 [08:35<09:03,  2.06file/s]

7bd0a15f-202f-44c4-8947-8ba06fa2e9a2.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:  65%|██████▍   | 1420/2192 [11:23<06:14,  2.06file/s]

a935b4c0-dfcf-45fc-9c4e-efb9022c8eee.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:  76%|███████▌  | 1664/2192 [13:20<04:19,  2.04file/s]

c49f1842-78f8-476c-9c3e-800521e71eb6.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:  77%|███████▋  | 1677/2192 [13:26<04:05,  2.10file/s]

c639e759-ca72-4522-94a4-9dc59acf120f.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:  85%|████████▌ | 1874/2192 [15:00<02:30,  2.11file/s]

dce9a213-293c-475f-8cdd-68146abd8269.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:  86%|████████▌ | 1876/2192 [15:00<01:56,  2.71file/s]

dd262648-2f73-4b4f-aac5-5a2c5f2c6dc5.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files:  94%|█████████▍| 2070/2192 [16:33<00:57,  2.11file/s]

f1543bac-5876-406f-9c2d-9e21399ece17.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files: 100%|█████████▉| 2183/2192 [17:27<00:04,  2.06file/s]

ff02e459-d5eb-4a8e-980b-79cd4d96b29d.rna_seq.augmented_star_gene_counts.txt Series([], Name: days_to_last_known_disease_status, dtype: object) Series([], Name: cancer, dtype: object) Series([], Name: last_known_disease_status, dtype: object)


Processing files: 100%|██████████| 2192/2192 [17:31<00:00,  2.08file/s]

KIRC count: 461
LUAD count: 445
LUSC count: 204





In [10]:
df_list = []
for file in tqdm(files, desc='Processing files', unit='file'):
    data_df = pd.read_csv(os.path.join(path, file))
    df_list.append(data_df)
combined_df = pd.concat(df_list, axis=0)
combined_df

Processing files: 100%|██████████| 2192/2192 [15:49<00:00,  2.31file/s]


Unnamed: 0,bar,PFI,PFItime,gen_id,type,?|100130426,?|100133144,?|100134869,?|10357,?|10431,...,ZWILCH|55055,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009
0,000ead0d-abf5-4606-be04-1ea31b999840,1.0,1816.0,KIRC,Tumor free,0.5915,3.6510,3.6510,0.0724,0.8032,...,1.2917,1.3374,3.3717,3.1189,12.1263,11.2384,19.4542,5.7679,11.8458,13.0883
0,a1da2f64-5a24-4e6b-abc1-e70aae23082c,1.0,6.0,,Tumor free,0.4182,8.1859,8.1859,0.5531,0.7329,...,3.3659,4.5507,4.1481,4.3419,9.0435,0.0125,16.8398,25.8328,14.5501,13.1862
0,d8ef8a50-9276-4733-a15c-ad0fdc0b2910,1.0,1697.0,LUSC,Tumor free,0.6102,4.1890,4.1890,0.0560,1.0201,...,1.3741,1.3346,2.6259,3.4351,7.8449,0.0152,8.2575,28.7278,9.5081,5.6744
0,211bc56d-ce9d-43ab-add5-850b13c35f50,1.0,1745.0,,Tumor free,0.0555,14.1692,14.1692,0.9782,2.1231,...,6.0434,6.0359,4.4431,9.9480,7.1779,0.9029,12.4283,14.4363,6.6687,16.1776
0,a59a90d9-f1b0-49dd-9c97-bcaa6ba55d44,1.0,1577.0,,Tumor free,0.2864,6.6492,6.6492,0.0701,1.0785,...,4.9611,11.9825,0.9002,1.2866,6.4122,1.2924,5.4850,11.5992,7.4487,10.3251
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,5cb7b327-5f17-44a8-aed3-989a1a99e780,0.0,-8.0,KIRC,,0.1318,6.8913,6.8913,0.2906,1.8137,...,2.8320,7.1144,6.4632,4.0346,5.8547,0.0197,11.4827,30.1338,9.5814,16.9404
0,1827eb4b-38f3-45b5-a833-156ba748c784,0.0,116.0,,,0.3544,3.4630,3.4630,0.4464,1.2959,...,5.9700,11.9148,4.3145,4.3234,10.3920,0.0756,19.7002,25.9744,13.2197,15.0347
0,21b7f29a-1195-4625-b8b8-51352733d5d9,1.0,75.0,,Tumor free,0.5336,12.3004,12.3004,1.5684,2.9941,...,4.7807,4.8258,2.0311,5.3098,5.3079,1.8062,11.5780,13.0523,6.9531,14.2014
0,da1fdb3c-3031-4072-8ca8-96f1729ac276,0.0,68.0,,,0.0923,5.6654,5.6654,0.7626,1.1594,...,3.5996,4.5029,2.5394,4.8938,5.5769,0.1171,11.1636,17.4697,9.5596,11.6200


In [18]:
combined_df.to_csv('CPTAC3/Matched_GeneExp.csv', index=False)

In [9]:
import pandas as pd
import mygene

mg = mygene.MyGeneInfo()

df_get = pd.read_csv('gene_id_get.csv')
# Example ENSEMBL IDs to convert
ensembl_ids = df_get.gene_id.values.tolist()

new_ens = []
for ensembl_id in ensembl_ids:
    new_ens.append(ensembl_id.split('.')[0])

geneSyms = mg.querymany(new_ens , scopes='ensembl.gene', fields='symbol', species='human')

df = pd.DataFrame.from_dict(geneSyms)
df.to_csv('gene_dict.csv')

querying 1-1000...done.
querying 1001-2000...done.
querying 2001-3000...done.
querying 3001-4000...done.
querying 4001-5000...done.
querying 5001-6000...done.
querying 6001-7000...done.
querying 7001-8000...done.
querying 8001-9000...done.
querying 9001-10000...done.
querying 10001-11000...done.
querying 11001-12000...done.
querying 12001-13000...done.
querying 13001-14000...done.
querying 14001-15000...done.
querying 15001-16000...done.
querying 16001-17000...done.
querying 17001-18000...done.
querying 18001-19000...done.
querying 19001-20000...done.
querying 20001-21000...done.
querying 21001-22000...done.
querying 22001-23000...done.
querying 23001-24000...done.
querying 24001-25000...done.
querying 25001-26000...done.
querying 26001-27000...done.
querying 27001-28000...done.
querying 28001-29000...done.
querying 29001-30000...done.
querying 30001-31000...done.
querying 31001-32000...done.
querying 32001-33000...done.
querying 33001-34000...done.
querying 34001-35000...done.
queryin

In [None]:
from mygene import MyGeneInfo
import pandas as pd
def convert_symbol_to_gene_id(symbol):
    mg = MyGeneInfo()
    gene = mg.query(symbol, scopes='symbol', fields='entrezgene', species='human')
    #gene = mg.query(symbol, scopes='ensembl.gene', fields='symbol', species='human')
    for item in gene['hits']:
        if 'entrezgene' in item:
            return item

df = pd.read_csv('gene_dict_symbol.csv', index_col = 0)

i = 0
# Loop through DataFrame rows
for index, row in df.iterrows():
    try:
        symbol = row['symbol'].replace(" ", "")
        _id = row['_id']

        if symbol != 'nan' and 'ENSG' in _id:
            new_id = convert_symbol_to_gene_id(symbol)['entrezgene']
            df.at[index, '_id'] = new_id
            print(symbol, _id, new_id)
            i += 1
    except:
        continue

print('Replace total:', str(i))

df.to_csv('gene_dict_symbol_new.csv', index_label=False)

ZNF285CP ENSG00000018607 646915
C2orf83 ENSG00000042304 56918
BORCS8-MEF2B ENSG00000064489 4207
PMS2P4 ENSG00000067601 5382
VDAC1P1 ENSG00000073905 642585
PMS2P1 ENSG00000078319 5379
CROCCP3 ENSG00000080947 114819
HSD17B7P2 ENSG00000099251 158160
CRYBB2P1 ENSG00000100058 1416
LRP5L ENSG00000100068 91355
TPTEP1 ENSG00000100181 387590
GOLGA8UP ENSG00000103832 100507067
KIR3DX1 ENSG00000104970 90011
SIGLEC5 ENSG00000105501 8778
ELOCP28 ENSG00000105694 100287483
NHP2P1 ENSG00000105988 414200
NSUN5P2 ENSG00000106133 260294
STAG3L4 ENSG00000106610 64940
TVP23BP2 ENSG00000108442 100421473
TRIM16L ENSG00000108448 147166
HSD17B1P1 ENSG00000108785 643646
LINC02566 ENSG00000115934 123706510
DLEU2L ENSG00000116652 79469
BTF3P11 ENSG00000118903 690
OTUD4P1 ENSG00000118976 360227
GLRXP3 ENSG00000118990 100132510
DPPA5P4 ENSG00000119660 646701
SEPTIN7P9 ENSG00000120555 285961
LINC00544 ENSG00000122043 440131
ARL4AP1 ENSG00000122872 387684
NME2P1 ENSG00000123009 283458
TYRL ENSG00000123447 7300
ZNF137

In [1]:
import os
import pandas as pd
CPTAC3_GeneExp_pth = 'CPTAC3\GeneExp'
files = os.listdir(CPTAC3_GeneExp_pth)
for file in files:
    file_get = os.listdir(os.path.join(CPTAC3_GeneExp_pth, file))
    tsv_name = ''
    for tsv_file in file_get:
        if tsv_file.endswith('.tsv'):
            tsv_name = tsv_file
            break
    gen_df = pd.read_csv(os.path.join(CPTAC3_GeneExp_pth, file, tsv_name), sep='\t', header=1)
    #print(gen_df.columns)
    header = gen_df['gene_id'].values.tolist()
    #header_new = []
    #for item in header:
    #    header_new.append(item.split('.')[0])
    content = gen_df['fpkm_unstranded'].tolist()
    df_get = pd.DataFrame({'gene_id': header, 'fpkm_unstranded': content})
    filtered_df = df_get[~df_get['gene_id'].str.contains('PAR_Y')]
    filtered_df['gene_id_new'] = filtered_df['gene_id'].apply(lambda x: x.split('.')[0])
    filtered_df = filtered_df.dropna().reset_index()
    break
filtered_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['gene_id_new'] = filtered_df['gene_id'].apply(lambda x: x.split('.')[0])


Unnamed: 0,index,gene_id,fpkm_unstranded,gene_id_new
0,4,ENSG00000000003.15,9.6881,ENSG00000000003
1,5,ENSG00000000005.6,0.4414,ENSG00000000005
2,6,ENSG00000000419.13,25.0492,ENSG00000000419
3,7,ENSG00000000457.14,3.8763,ENSG00000000457
4,8,ENSG00000000460.17,1.3492,ENSG00000000460
...,...,...,...,...
60611,60659,ENSG00000288669.1,0.0000,ENSG00000288669
60612,60660,ENSG00000288670.1,3.2120,ENSG00000288670
60613,60661,ENSG00000288671.1,0.0000,ENSG00000288671
60614,60662,ENSG00000288674.1,0.0215,ENSG00000288674


In [2]:
# Read the main DataFrame from CSV
main_df = filtered_df#.iloc[:5]

# Read the mapping DataFrame from CSV
mapping_df = pd.read_csv('CPTAC3/gene_dict_symbol_new.csv')

# Create a dictionary mapping from the mapping DataFrame using '_id' as the key and 'MappedValue' as the value
mapping_dict = mapping_df.set_index('query')['_id'].to_dict()

# Create a new column in the main DataFrame by mapping the values from the mapping DataFrame
main_df['MappedColumn'] = main_df['gene_id_new'].map(mapping_dict)

# Print the updated DataFrame
gen_mapping_dict = main_df.set_index('MappedColumn')['fpkm_unstranded'].to_dict()
gen_mapping_dict

{'7105': 9.6881,
 '64102': 0.4414,
 '8813': 25.0492,
 '57147': 3.8763,
 '55732': 1.3492,
 '2268': 2.6706,
 '3075': 6.5051,
 '2519': 10.378,
 '2729': 4.2608,
 '4800': 7.3507,
 '90529': 1.3639,
 '57185': 10.0599,
 '81887': 3.2384,
 '22875': 1.2562,
 '6405': 25.1762,
 '1080': 0.1996,
 '54467': 12.2033,
 '1595': 2.3411,
 '889': 1.5417,
 '5893': 6.3723,
 '84176': 0.0892,
 '572': 4.6465,
 '51056': 13.4176,
 '4267': 32.5872,
 '9957': 1.1192,
 '26': 0.4537,
 '51384': 7.2852,
 '23072': 0.0956,
 '8379': 0.4915,
 '3927': 12.1075,
 '29916': 6.0758,
 '55365': 4.6757,
 '4074': 16.8424,
 '90293': 3.2483,
 '56603': 1.0156,
 '3382': 4.4998,
 '79007': 2.1635,
 '57679': 4.8938,
 '843': 4.8202,
 '8837': 9.9657,
 '7035': 7.7103,
 '55471': 4.4528,
 '10181': 14.7791,
 '9108': 0.8933,
 '6542': 11.6465,
 '381': 32.7928,
 '23098': 0.816,
 '26073': 22.104,
 '23129': 13.1904,
 '204': 12.5557,
 '952': 1.0796,
 '2288': 17.9579,
 '23028': 17.5607,
 '10180': 17.9606,
 '84254': 3.1752,
 '5965': 8.0947,
 '55610': 4.513

In [3]:
header_df = pd.read_csv('CPTAC3/gen_header.csv')
gen_header_get = header_df.values.tolist()
df_convert = pd.DataFrame(columns=gen_header_get)
df_convert.columns

MultiIndex([(          'bar',),
            (          'PFI',),
            (      'PFItime',),
            (       'gen_id',),
            (         'type',),
            (  '?|100130426',),
            (  '?|100133144',),
            (  '?|100134869',),
            (      '?|10357',),
            (      '?|10431',),
            ...
            ( 'ZWILCH|55055',),
            (  'ZWINT|11130',),
            (    'ZXDA|7789',),
            (  'ZXDB|158586',),
            (   'ZXDC|79364',),
            ('ZYG11A|440590',),
            ( 'ZYG11B|79699',),
            (     'ZYX|7791',),
            (  'ZZEF1|23140',),
            (   'ZZZ3|26009',)],
           length=20536)

In [17]:
def get_value_from_dict(dictionary, key):
    # Get the value from the dictionary if the key exists
    if key in dictionary:
        return dictionary[key]

    # Return the original value if the key is not in the dictionary
    return key

# Assign column values based on the header using the mapping dictionary
count_pos = 0
count_nag = 0
value_list = []
for column_name, column_data in df_convert.iteritems():
    try:
        column_name_get = str(column_name)[2: -3]
        #print(column_name_get)
        id = str(column_name_get.split('|')[1])
        new_id = get_value_from_dict(new_geneid_dict, id)
        value = gen_mapping_dict[new_id]
        #print(column_name_get, value)
        df_convert[column_name] = value
        count_pos += 1
        value_list.append([value])
    except:
        #print(column_name_get, 'None')
        df_convert[column_name] = None
        value_list.append(['NA'])
        count_nag += 1

# Print the updated DataFrame
df_convert_get = pd.DataFrame(value_list).transpose()
df_convert_get.columns = df_convert.columns
df_convert_get

Unnamed: 0,bar,PFI,PFItime,gen_id,type,?|100130426,?|100133144,?|100134869,?|10357,?|10431,...,ZWILCH|55055,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009
0,,,,,,0.0,35.6447,35.6447,0.4192,2.653,...,4.7816,12.871,6.2878,6.2764,21.0176,1.8602,23.4337,46.8844,20.498,24.2776


In [5]:
df_convert_new = df_convert_get.transpose()
df_convert_new.to_csv('gene_dict_sample.csv', index_label=False)

In [6]:
import pandas as pd

df = pd.read_csv('gene_dict_sample.csv')
df.columns = ['0', '1', '2']
# Split the index column by '|' into two separate columns
df[['Letter', 'Number']] = df['1'].str.split('|', expand=True)
filtered_df = df[df['2'].isna()]
# Drop the original index column
filtered_df = filtered_df.drop('0', axis=1)
# Print the modified DataFrame
filtered_df.to_csv('gene_dict_sample_split.csv', index_label=False)

ValueError: Length mismatch: Expected axis has 1 elements, new values have 3 elements

In [32]:
def write_list_to_txt(lst, filename):
    with open(filename, 'w') as file:
        for item in lst:
            file.write(str(item) + '\n')

def get_columns_with_na(df):
    na_columns = []
    for column in df.columns:
        if df.iloc[0][column] == 'NA':
            na_columns.append(column[0])
    return na_columns

# Assuming you have a dataframe named 'df'
columns_with_na = get_columns_with_na(df_convert_get)

# Specify the filename for the output text file
output_filename = 'CPTAC3/gene_miss.txt'

# Call the function to write the list to the text file
write_list_to_txt(columns_with_na, output_filename)

In [18]:
na_counts = (df_convert_get == 'NA').sum(axis=1)
na_counts

0    241
dtype: int64

In [13]:
import pandas as pd
import numpy as np
df_new_sample = pd.read_csv('CPTAC3/gene_dict_sample_new.csv')
geneid_dict = df_new_sample.set_index('GeneID')['ReplacementGeneID'].to_dict()
symbol_dict = df_new_sample.set_index('GeneID')['OfficialSymbol'].to_dict()

In [14]:
def remove_none_values(dictionary):
    # Create a new dictionary without None values
    new_dict = {str(key): value for key, value in dictionary.items() if not isinstance(value, float)}
    return new_dict

In [15]:
new_dict = remove_none_values(symbol_dict)
new_dict

{'100130426': 'VPS13A-AS1',
 '100133144': 'UBE2Q2P3',
 '10431': 'TIMM23',
 '136542': 'MOXD2',
 '155060': 'LOC155060',
 '317712': 'CSN1S2BP',
 '391343': 'LOC391343',
 '441362': 'REXO1L6P',
 '553137': 'LOC553137',
 '57714': 'RNF213',
 '645851': 'RSKR',
 '652919': 'RGPD6',
 '728603': 'FRMPD2B',
 '728788': 'ANKRD20A20P',
 '729884': 'TMPRSS11E',
 '728404': 'AGAP4',
 '11217': 'PALM2AKAP2',
 '244': 'ANXA8L1',
 '641522': 'ARL17B',
 '84726': 'PRRC2B',
 '414235': 'DIP2C-AS1',
 '387638': 'NEBL',
 '26148': 'LCOR',
 '100127889': 'CC2D2B',
 '255352': 'CFAP46',
 '374467': 'CFAP54',
 '338809': 'PLEKHG7',
 '79686': 'SYNE3',
 '414767': 'LINC00596',
 '196913': 'LINC01599',
 '80035': 'ANP32A-IT1',
 '414926': 'LINC00593',
 '81698': 'LINC00597',
 '284083': 'SEPTIN4',
 '149469': 'SZT2',
 '284836': 'LINC00319',
 '54094': 'CYP4F29P',
 '80215': 'RUNX1-IT1',
 '348738': 'RRM2',
 '100128378': 'LINC00696',
 '79614': 'NPR3',
 '85411': 'GFOD1',
 '63914': 'LINC01590',
 '90632': 'LINC00473',
 '79992': 'AGPAT4-IT1',
 '1

In [16]:
def remove_none_values_gene(dictionary):
    # Create a new dictionary without None values
    new_dict = {str(key): str(int(value)) for key, value in dictionary.items() if value > 0}
    return new_dict
new_geneid_dict = remove_none_values_gene(geneid_dict)
new_geneid_dict

{'100130426': '100286938',
 '100133144': '100134869',
 '10431': '100652748',
 '136542': '100289017',
 '317712': '100337616',
 '553137': '100422737',
 '57714': '57674',
 '645851': '124923',
 '652919': '729540',
 '728603': '728798',
 '729884': '28983',
 '728404': '119016',
 '11217': '445815',
 '244': '728113',
 '641522': '100506084',
 '387638': '10529',
 '26148': '84458',
 '100127889': '387707',
 '255352': '54777',
 '374467': '144535',
 '338809': '440107',
 '79686': '161176',
 '414767': '102724845',
 '284083': '5414',
 '149469': '23334',
 '54094': '54055',
 '348738': '6241',
 '79614': '4883',
 '85411': '54438',
 '129790': '100506380',
 '286223': '1903',
 '120329': '100506742',
 '221016': '79741',
 '728461': '102723547',
 '441520': '441519',
 '9142': '84631',
 '170063': '286464',
 '645090': '286464',
 '84663': '246126',
 '26222': '26220',
 '25787': '26220',
 '55747': '387680',
 '548321': '100132948',
 '642265': '647060',
 '83954': '387316',
 '200058': '5768',
 '643210': '79813',
 '392490'

In [4]:
import pandas as pd
cancer_type_list = ['LUAD', 'LUSC', 'KIRC']
for cancer in cancer_type_list:
    first_df = pd.read_csv('Analysis/Ana_CPTAC3_1000_First/Analysis_CPTAC3_Risk_{}_{}_validation.csv'.format(cancer, cancer))
    last_df = pd.read_csv('Analysis/Ana_CPTAC3_1000_Last/Analysis_CPTAC3_Risk_{}_{}_validation.csv'.format(cancer, cancer))
    first_df.columns = ['file_get', 'seed', 'eachCancer', 'cancer_group', 'test_layer', 'auc_roc', 'f1', 'accuracy', 'precision', 'recall', 'sensitivity', 'specificity', 'best_d', 'best_l', 'best_n']
    last_df.columns = ['file_get', 'seed', 'eachCancer', 'cancer_group', 'test_layer', 'auc_roc', 'f1', 'accuracy', 'precision', 'recall', 'sensitivity', 'specificity', 'best_d', 'best_l', 'best_n']
    # Combine the two dataframes vertically
    combined_df = pd.concat([first_df, last_df])

    # Sort the combined dataframe by 'auc_roc' column in descending order
    combined_df.sort_values(by='auc_roc', ascending=False, inplace=True)

    # Drop duplicate rows based on 'file_get' column and keep the first occurrence (highest auc_roc)
    combined_df.drop_duplicates(subset='file_get', keep='first', inplace=True)

    # Reset the index of the combined dataframe
    combined_df.sort_values(by='seed', ascending=True, inplace=True)
    combined_df.reset_index(drop=True, inplace=True)

    # Print the combined dataframe
    combined_df.to_csv('Analysis/Ana_CPTAC3_1000_Uniq/Analysis_CPTAC3_Risk_{}_{}_validation.csv'.format(cancer, cancer), index_label=False)
    # Calculate maximum, mean, and standard deviation of 'auc_roc' column
    auc_max = combined_df['auc_roc'].max()
    auc_mean = combined_df['auc_roc'].mean()
    auc_std = combined_df['auc_roc'].std()

    # Print the results
    print('Cancer:', cancer)
    print("Maximum AUC-ROC:", auc_max)
    print("Mean AUC-ROC:", auc_mean)
    print("Standard Deviation of AUC-ROC:", auc_std)

Cancer: LUAD
Maximum AUC-ROC: 0.6842105263157895
Mean AUC-ROC: 0.5716735659373151
Standard Deviation of AUC-ROC: 0.051889938075666255
Cancer: LUSC
Maximum AUC-ROC: 0.7125237191650853
Mean AUC-ROC: 0.6096537001897533
Standard Deviation of AUC-ROC: 0.04387686204567157
Cancer: KIRC
Maximum AUC-ROC: 0.6232110330470987
Mean AUC-ROC: 0.5580601092896174
Standard Deviation of AUC-ROC: 0.03564265398537981
