# Generate DKFZ Datasets Files

*GenerateDKFZ_Dataset.ipynb* notebook is offering detailed step-by-step instructions on how to generate DKFZ validation dataset from original DKFZ data files.

Required input files:
1. prostate_dkfz_2018_converted.txt # Preprocessed Gene Exp file from DKFZ_Preprocessing.ipynb
2. mart_export.txt # Downloaded Gene IDs file at https://miami.box.com/s/kz3qu3li4m36dqa43gqy706w63ihc58a
3. gene_dict_sample_new.csv # Gene IDs and Symbols mapping dictionary 1 at https://miami.box.com/s/bqoya8d6zr1ku7h6giawaupe4l9aixhw
4. gene_dict_symbol_new.csv # Gene IDs and Symbols mapping dictionary 2 at https://miami.box.com/s/ve3st08ha4u6868hzq3o8remmo7iw1or

Output files:
1. Matched_DKFZ_GeneExp_FPKM.csv #DKFZ matched gene exp dataset raw file, which contains expression values of those genes that are found in the DKFZ PRAD dataset.
2. CancerRNA_DKFZ_PRAD_Risk_{split_number}.txt #DKFZ Validation classifier task validation dataset files, contains normalized expression values, classifier labels and PFI/censor status.
3. CancerRNA_DKFZ_PRAD_WholeTimeSeq_{split_number}.txt #DKFZ Cox task validation dataset files, contains normalized expression values and PFI/censor status.

In [1]:
import pandas as pd
df_org = pd.read_csv('CPTAC-3&DKFZ/prostate_dkfz_2018/prostate_dkfz_2018_converted.txt')
df_org = df_org.drop(df_org.columns[0], axis=1)
df_org

Unnamed: 0,bar,PFI,PFItime,gen_id,type,?|100133144,?|10357,?|10431,?|155060,?|26823,...,ZWILCH|55055,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009
0,ICGC_PCA198_T01,0.0,1413.0,PRAD,,,,,,,...,6.923315,21.582012,1.427626,3.551453,2.410733,4.907148,7.493623,38.762346,2.235283,6.103063
1,ICGC_PCA196_T01,,,PRAD,,,,,,,...,9.489437,5.539151,1.400698,3.768381,1.891544,2.400883,6.260822,40.297573,1.250914,5.547623
2,ICGC_PCA192_T01,1.0,63.0,PRAD,,,,,,,...,3.599968,4.036992,1.692921,3.092681,3.948183,1.454601,6.886100,28.368410,2.576761,6.063853
3,ICGC_PCA187_T01,0.0,684.0,PRAD,,,,,,,...,2.990770,2.765634,1.935469,4.041836,1.990928,0.548249,6.431129,26.571751,3.357384,5.128143
4,ICGC_PCA184_T01,0.0,915.0,PRAD,,,,,,,...,3.628763,3.001283,1.589094,3.592727,1.961094,0.750531,4.427128,30.653946,3.209377,5.026754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,ICGC_PCA005_T01,,,PRAD,,,,,,,...,1.625334,0.740449,1.310961,2.643662,5.316592,0.611810,4.819813,43.841390,8.164817,2.468972
114,ICGC_PCA004_T01,1.0,15.0,PRAD,,,,,,,...,2.278294,1.637447,1.163010,2.229661,6.174050,1.147742,6.293188,39.581143,7.017233,5.860986
115,ICGC_PCA003_T01,1.0,231.0,PRAD,,,,,,,...,1.824603,1.298947,1.075595,2.221146,5.621287,1.471187,5.418815,55.138897,8.568954,4.161830
116,ICGC_PCA002_T01,,,PRAD,,,,,,,...,1.913815,1.274605,1.327418,3.363149,7.578699,0.656983,5.614364,52.193349,9.349462,4.307724


In [2]:
test_df = df_org.iloc[:, 5:]
upper_quartile = test_df.quantile(0.75, axis=1)
upper_quartile

0     1.163870
1     1.231330
2     1.552145
3     1.426210
4     1.472885
5     1.709435
6     1.272995
7     1.339455
8     1.241945
9     2.129945
10    1.920285
11    1.888790
12    1.459245
13    1.244810
14    1.963205
15    1.597950
16    2.263265
17    2.445435
18    1.754780
19    2.109390
20    1.155300
21    0.632282
22    1.292700
23    2.325305
24    1.161075
25    1.567155
26    1.761890
27    2.213600
28    1.211565
29    2.231050
30    1.814790
31    1.509360
32    1.518035
33    0.670114
34    1.919660
35    2.026960
36    2.128765
37    1.592420
38    1.896895
39    2.101295
40    1.557260
41    1.793805
42    1.498400
43    1.730295
44    2.230675
45    0.572507
46    2.610415
47    1.836145
48    1.501110
49    2.487145
50    1.754975
51    1.690215
52    2.176830
53    2.236855
54    1.928660
Name: 0.75, dtype: float64

In [3]:
length_df = pd.read_csv('CPTAC-3&DKFZ/mart_export.txt')
df_new_sample = pd.read_csv('CPTAC-3&DKFZ/gene_dict_sample_new.csv')
geneid_dict = df_new_sample.set_index('GeneID')['ReplacementGeneID'].to_dict()
symbol_dict = df_new_sample.set_index('GeneID')['OfficialSymbol'].to_dict()

def remove_none_values(dictionary):
    # Create a new dictionary without None values
    new_dict = {str(key): value for key, value in dictionary.items() if not isinstance(value, float)}
    return new_dict
def remove_none_values_gene(dictionary):
    # Create a new dictionary without None values
    new_dict = {str(key): str(int(value)) for key, value in dictionary.items() if value > 0}
    return new_dict

def get_value_from_dict(dictionary, key):
    # Get the value from the dictionary if the key exists
    if key in dictionary:
        return dictionary[key]

    # Return the original value if the key is not in the dictionary
    return key

new_geneid_dict = remove_none_values_gene(geneid_dict)
new_symbol_dict = remove_none_values(symbol_dict)

In [4]:
# Read the mapping DataFrame from CSV
mapping_df = pd.read_csv('CPTAC-3&DKFZ/gene_dict_symbol_new.csv')

# Create a dictionary mapping from the mapping DataFrame using '_id' as the key and 'MappedValue' as the value
mapping_dict = mapping_df.set_index('query')['_id'].to_dict()

# Create a new column in the main DataFrame by mapping the values from the mapping DataFrame
length_df['MappedColumn'] = length_df['Gene stable ID'].map(mapping_dict)

# Initialize an empty dictionary
mapped_length_dict = {}

# Iterate over each row in the dataframe
for index, row in length_df.iterrows():
    mapped_column = row['MappedColumn']
    transcript_length = row['Transcript length (including UTRs and CDS)']
    ensembl_canonical = row['Ensembl Canonical']

    # Check if the mapped_column is already present in the dictionary
    if mapped_column in mapped_length_dict:
        # If the current transcript_length is greater than the existing value, update the dictionary
        if transcript_length > mapped_length_dict[mapped_column]:
            mapped_length_dict[mapped_column] = transcript_length
    else:
        # If the mapped_column is not present, add it to the dictionary
        mapped_length_dict[mapped_column] = transcript_length

    # If the ensembl_canonical value is 1, update the dictionary regardless of the existing value
    if ensembl_canonical == 1:
        mapped_length_dict[mapped_column] = transcript_length
mapped_length_dict

{'4558': 71,
 '4549': 954,
 '4577': 69,
 '4550': 1559,
 '4567': 75,
 '4535': 956,
 '4565': 69,
 '4572': 72,
 '4569': 68,
 '4536': 1042,
 '4578': 68,
 '4553': 69,
 '4570': 73,
 '4511': 66,
 '4579': 66,
 '4512': 1542,
 '113219467': 69,
 '4555': 68,
 '4513': 684,
 '4566': 70,
 '4509': 207,
 '4508': 681,
 '4514': 784,
 '4563': 68,
 '4537': 346,
 '4573': 65,
 '4539': 297,
 '4538': 1378,
 '4564': 69,
 '4575': 59,
 '4568': 71,
 '4540': 1812,
 '4541': 525,
 '4556': 69,
 '4519': 1141,
 '4576': 66,
 '4571': 68,
 nan: 1052,
 '100874287': 443,
 '100419554': 1191,
 '64593': 1145,
 '359996': 1164,
 '377997': 868,
 'ENSG00000278478': 279,
 '643034': 1267,
 '83867': 1046,
 '100101120': 1367,
 'ENSG00000224518': 131,
 '386687': 783,
 '378953': 720,
 'ENSG00000275866': 385,
 '84672': 491,
 '106480250': 336,
 '100128190': 1337,
 '359799': 1064,
 '252955': 512,
 '359796': 298,
 '246122': 1367,
 '100873963': 5792,
 '643001': 1267,
 '9081': 1353,
 '642796': 509,
 'ENSG00000273589': 297,
 '100874520': 355,
 

In [5]:
length_list = []
count = 0
# Function to retrieve mapped value from dictionary
def get_mapped_value(name):
    last_name = name.split('|')[-1]
    return mapped_length_dict.get(last_name)

columns_to_normalize = df_org.columns[5:]
times_df = df_org[columns_to_normalize].copy()
# Iterate over columns
for column in times_df.columns:
    column_name, length_key = column.split('|')
    new_id = get_value_from_dict(new_geneid_dict, length_key)
    try:
        length_value = mapped_length_dict[new_id]
        # Apply length value to each value in the column
        length_list.append(length_value)
    except:
        count += 1
print(count)

117


In [6]:
import statistics
# Calculate the median
median = statistics.median(length_list)
print("Median:", median)

# Calculate the average (mean)
average = statistics.mean(length_list)
print("Average:", average)

'''
PRAD
Median: 3579.0
Average: 4254.6646272942935
'''

Median: 3579.0
Average: 4254.6646272942935


'\nPRAD\nMedian: 3579.0\nAverage: 4254.6646272942935\n'

In [7]:
Median_value = 4254.665

# Function to retrieve mapped value from dictionary
def get_mapped_value(name):
    last_name = name.split('|')[-1]
    return mapped_length_dict.get(last_name)

columns_to_normalize = df_org.columns[5:]
times_df = df_org[columns_to_normalize].copy()
# Iterate over columns
for column in times_df.columns:
    column_name, length_key = column.split('|')
    new_id = get_value_from_dict(new_geneid_dict, length_key)
    try:
        length_value = mapped_length_dict[new_id]
        # Apply length value to each value in the column
        times_df[column] = times_df[column].apply(lambda x: x * length_value)
    except:
        times_df[column] = times_df[column].apply(lambda x: x * Median_value)
        print(column)
times_df

?|155060
?|728788
BAT2L1|84726
C10orf108|414235
C14orf181|400223
C15orf28|80035
C15orf50|414926
C21orf125|284836
C3orf74|100128378
C6orf164|63914
C6orf59|79992
CA5BP|340591
CCL4L2|388372
CES4|51716
CRIPAK|285464
CTAGE4|100128553
DKFZP586I1420|222161
EPR1|8475
F8A1|8263
FAM153B|202134
FAM27A|548321
FAM27C|100132948
FBXO22OS|692224
FLJ14107|80094
FLJ37201|283011
FLJ42627|645644
FLJ45340|402483
GBAP1|2630
GOLGA6L5|374650
HCG26|352961
HCG2P7|80867
HIST2H2AA3|8337
HOMEZ|57594
IPW|3653
LILRA3|11026
LOC100128288|100128288
LOC100128842|100128842
LOC100129034|100129034
LOC100130872|100130872
LOC100130987|100130987
LOC100132287|100132287
LOC100133331|100133331
LOC100134868|100134868
LOC100190986|100190986
LOC100270710|100270710
LOC100270746|100270746
LOC100272146|100272146
LOC100272216|100272216
LOC100272217|100272217
LOC100302650|100302650
LOC143666|143666
LOC145845|145845
LOC146880|146880
LOC148696|148696
LOC202181|202181
LOC284441|284441
LOC284578|284578
LOC286359|286359
LOC341056|341056
LOC3

Unnamed: 0,?|100133144,?|10357,?|10431,?|155060,?|26823,?|390284,?|57714,?|645851,?|653553,?|728788,...,ZWILCH|55055,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009
0,0.0,,,2298.280685,,,,,,,...,828.902745,459.432885,1747.039397,1872.28349,5955.20442,322.704333,8570.10035,4716.83196,13949.5356,6837.94916
1,0.0,,,2393.517106,,,,,,,...,623.45928,98.067257,1124.373762,1975.145095,5660.35855,714.947135,7109.335723,10791.13976,16178.64066,6631.22628
2,0.0,,,2534.210369,,,,,,,...,591.6651,636.217035,1969.346342,2876.729933,7443.41455,935.29023,9064.54331,10830.77588,13796.34984,13104.01204
3,0.0,,,3650.379185,,,,,,,...,375.34676,250.63541,1220.809866,1914.43406,6347.57805,1183.130095,9474.54336,8011.48696,17680.80132,9965.33804
4,0.0,,,1370.261665,,,,,,,...,1554.460025,690.285727,1715.366755,3609.198593,5852.341,3508.202915,10579.54846,4161.92628,12744.68832,14530.74616
5,0.0,,,1047.962281,,,,,,,...,495.62108,302.5823,2349.252089,2836.66229,9368.54094,2362.210795,12529.87839,3814.82616,13305.37572,9813.37364
6,0.0,,,1940.280408,,,,,,,...,805.937885,140.704495,1766.793309,2895.65122,6925.41652,521.678535,9968.98632,8154.99244,10488.420306,9356.64688
7,35.959533,,,1655.983693,,,,,,,...,685.908025,127.503436,1655.229973,2825.225326,5268.89671,1471.681715,6942.054074,2081.092364,9813.829662,12195.3034
8,0.0,,,1195.790617,,,,,,,...,338.092853,566.089714,1108.235701,2522.326191,5727.12184,523.76324,7829.290925,5462.76636,12459.29958,9888.2658
9,0.0,,,1788.473961,,,,,,,...,1231.578695,69.883765,3935.368515,4632.090694,10469.34163,1041.734645,19051.28137,15978.61444,26194.99428,10274.46056


In [8]:
df_org[columns_to_normalize] = times_df
df_org.to_csv('CPTAC3/Validation/prostate_dkfz_2018/Matched_DKFZ_GeneExp_FPKM.csv', index=False)

In [9]:
import pandas as pd
df = pd.read_csv('CPTAC3/Validation/prostate_dkfz_2018/Matched_DKFZ_GeneExp_FPKM.csv')
df.dropna(subset=['PFI'], inplace=True)
df

Unnamed: 0,bar,PFI,PFItime,gen_id,type,?|100133144,?|10357,?|10431,?|155060,?|26823,...,ZWILCH|55055,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009
0,ICGC_PCA198_T01,0.0,1413.0,PRAD,,,,,,,...,24889.318359,42322.324748,7179.528800,19415.795970,8141.045020,21812.273458,61020.572115,86362.506975,25629.755267,39132.842227
2,ICGC_PCA192_T01,1.0,63.0,PRAD,,,,,,,...,12941.885411,7916.542222,8513.702100,16907.684396,13333.013248,6465.702249,56073.511645,63204.818071,29545.140432,38881.422391
3,ICGC_PCA187_T01,0.0,684.0,PRAD,,,,,,,...,10751.818429,5423.408955,9733.473112,22096.716055,6723.365488,2436.968771,52368.684587,59201.861218,38495.763586,32881.651047
4,ICGC_PCA184_T01,0.0,915.0,PRAD,,,,,,,...,13045.402200,5885.515686,7991.553362,19641.435811,6622.615023,3336.111604,36050.105540,68296.991460,36798.718274,32231.547125
5,ICGC_PCA176_T01,0.0,717.0,PRAD,,,,,,,...,10621.084284,4315.540045,6816.752324,20168.030584,11103.203635,491.859932,42183.745156,157134.917528,36116.735623,28019.530131
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,ICGC_PCA008_T01,0.0,2193.0,PRAD,,,,,,,...,1747.912615,5199.613249,7602.767401,9531.275183,30149.135682,649.394319,18066.539741,170949.011621,141138.153423,12705.215378
112,ICGC_PCA006_T01,1.0,132.0,PRAD,,,,,,,...,8453.060816,8429.822799,3492.098362,7201.323538,15350.761753,2962.121093,20356.578922,68303.601836,51299.834529,21800.158308
114,ICGC_PCA004_T01,1.0,15.0,PRAD,,,,,,,...,8190.466560,3211.033776,5848.775535,12189.558218,20849.766470,5101.711060,51245.428154,88186.787439,80459.596025,37580.641234
115,ICGC_PCA003_T01,1.0,231.0,PRAD,,,,,,,...,6559.448093,2547.235897,5409.166991,12143.005200,18983.084762,6539.425058,44125.409150,122849.461411,98251.622307,26685.654562


In [10]:
import numpy as np
given_value = 1000  # From TCGA

# Select columns from column 6 to the end
columns_to_normalize = df.columns[5:]
df_to_norm = df.iloc[:, 5:]
upper_quartile = df_to_norm.quantile(0.75, axis=1)
upper_quartile_rate = upper_quartile / given_value

# Normalize selected columns
normalized_df = df.copy()
normalized_df[columns_to_normalize] = normalized_df[columns_to_normalize].div(upper_quartile_rate, axis=0)
normalized_df[columns_to_normalize] = normalized_df[columns_to_normalize].applymap(lambda x: np.log2(x + 1))
normalized_df

Unnamed: 0,bar,PFI,PFItime,gen_id,type,?|100133144,?|10357,?|10431,?|155060,?|26823,...,ZWILCH|55055,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009
0,UTPC019,1,648,PRAD,,0.0,,,8.892365,,...,7.426457,6.581837,8.49768,8.5973,10.264096,6.078563,10.788899,9.928071,11.491417,10.463357
1,UTPC021,1,828,PRAD,,0.0,,,8.802965,,...,6.871342,4.267441,7.716596,8.526476,10.042861,7.067305,10.371404,10.973093,11.557097,10.271043
2,UTPC104,1,0,PRAD,,0.0,,,8.603454,,...,6.516897,6.620532,8.240698,8.785906,10.155434,7.171731,10.439478,10.696139,11.045102,10.97086
3,UTPC141,1,546,PRAD,,0.0,,,9.245326,,...,5.984169,5.412831,7.669841,8.316352,10.042475,7.624837,10.619874,10.378055,11.519504,10.692691
4,VA-PC-90-4,1,1080,PRAD,,0.0,,,7.825965,,...,8.007174,6.843029,8.148751,9.219243,9.91565,9.178366,10.769175,9.424494,11.037652,11.226781
5,VA-PC-95-42,1,355,PRAD,,0.0,,,7.201261,,...,6.131868,5.433023,8.360435,8.631673,10.352765,8.368347,10.771963,9.058163,10.858552,10.41964
6,VA-PC-96-43,1,1466,PRAD,,0.0,,,8.544626,,...,7.282529,4.807687,8.409873,9.120973,10.377478,6.660062,10.902692,10.613097,10.975934,10.811286
7,VA-PC-96-44,1,780,PRAD,,2.937048,,,8.265049,,...,7.000065,4.621121,8.264395,9.033785,9.931643,8.095414,10.329148,8.593744,10.828274,11.141556
8,VA-PC-97-46,1,83,PRAD,,0.0,,,7.839201,,...,6.032622,6.76733,7.729999,8.912674,10.094054,6.656284,10.54477,10.025939,11.214681,10.881405
9,VA-PC-91-65,1,21,PRAD,,0.0,,,7.614087,,...,7.079193,3.107056,8.747834,8.982494,10.157338,6.839615,11.020486,10.766874,11.4797,10.130254


In [11]:
test_df = normalized_df.iloc[:, 5:]
upper_quartile = test_df.quantile(0.75, axis=1)
upper_quartile

0     10.728568
1     10.876657
2     11.158916
3     11.042272
4     11.052027
5     11.293227
6     10.832788
7     10.884588
8     10.842352
9     11.648963
10    11.517333
11    11.437349
12    11.106361
13    10.868634
14    11.530470
15    11.233822
16    11.746057
17    11.821845
18    11.364739
19    11.647845
20    10.760241
21     9.859863
22    10.885978
23    11.797245
24    10.785829
25    11.176818
26    11.360411
27    11.700945
28    10.829970
29    11.705119
30    11.339997
31    11.146864
32    11.172108
33    10.033877
34    11.447612
35    11.557638
36    11.646858
37    11.205466
38    11.502602
39    11.618569
40    11.142149
41    11.383952
42    11.127531
43    11.342898
44    11.705824
45     9.764536
46    11.747509
47    11.365127
48    11.146241
49    11.828821
50    11.373038
51    11.306986
52    11.631569
53    11.720346
54    11.464103
Name: 0.75, dtype: float64

In [12]:
#Norm by VCP, RAB7A, GPI: 38.12376439497399, without log all: 21419.06700584365, without log risk: 21633.678655542713
#Norm by all 10: 117.54838057590766, without log all: 43416.288172294895, without log risk: 43753.68415198343

#'CPTAC3/Validation/prostate_dkfz_2018/Matched_DKFZ_GeneExp_FPKM.csv': 3 genes 757193.2278522917 ; all genes: 2142440.9625751544

import numpy as np

#house_keeper_list = ['C1orf43', 'CHMP2A', 'EMC7', 'GPI', 'PSMB2', 'PSMB4', 'RAB7A', 'REEP5', 'SNRPD3', 'VCP', 'VPS29']
house_keeper_list = ['VCP', 'RAB7A', 'GPI']

TCGA_3genes_mean = 21419.06700584365  # From TCGA
TCGA_10genes_mean = 43416.288172294895  # From TCGA

# Select columns from column 6 to the end
columns_to_normalize = df.columns[5:]

'''
DKFZ_3genes_mean = 757193.2278522917
DKFZ_10genes_mean = 2142440.9625751544
'''

# Normalize selected columns
normalized_df = df.copy()

value_df = normalized_df.iloc[:, 5:].mean()

total_value = 0.0
for index, row in value_df.to_frame().iterrows():
    # Access row elements using row['column_name'] or row[index]
    gene_name = index.split('|')[0]
    if gene_name in house_keeper_list:
        value = row[0]
        total_value += value

print(total_value)

norm_rate = total_value / TCGA_3genes_mean

normalized_df[columns_to_normalize] = normalized_df[columns_to_normalize].div(norm_rate, axis=0)
normalized_df[columns_to_normalize] = normalized_df[columns_to_normalize].applymap(lambda x: np.log2(x + 1))

normalized_df

759632.3025202297


Unnamed: 0,bar,PFI,PFItime,gen_id,type,?|100133144,?|10357,?|10431,?|155060,?|26823,...,ZWILCH|55055,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009
0,ICGC_PCA198_T01,0.0,1413.0,PRAD,,,,,,,...,9.456960,10.222006,7.668449,9.099242,7.848936,9.266863,10.749513,11.250376,9.499193,10.109065
2,ICGC_PCA192_T01,1.0,63.0,PRAD,,,,,,,...,8.515375,7.808770,7.913234,8.900081,8.558214,7.518149,10.627610,10.800222,9.704030,10.099775
3,ICGC_PCA187_T01,0.0,684.0,PRAD,,,,,,,...,8.248710,7.266054,8.105652,9.285524,7.574224,6.123383,10.529060,10.705885,10.085407,9.858217
4,ICGC_PCA184_T01,0.0,915.0,PRAD,,,,,,,...,8.526837,7.383287,7.822315,9.115882,7.552557,6.570874,9.990801,10.911950,10.020424,9.829439
5,ICGC_PCA176_T01,0.0,717.0,PRAD,,,,,,,...,8.231119,6.938799,7.594022,9.153984,8.294956,3.894215,10.217278,12.113636,9.993462,9.627636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,ICGC_PCA008_T01,0.0,2193.0,PRAD,,,,,,,...,5.652063,7.205662,7.750689,8.075478,9.733191,4.271331,8.995528,12.235172,11.958777,8.488821
112,ICGC_PCA006_T01,1.0,132.0,PRAD,,,,,,,...,7.902965,7.899010,6.636123,7.672800,8.761018,6.401252,9.167385,10.912089,10.499330,9.266062
114,ICGC_PCA004_T01,1.0,15.0,PRAD,,,,,,,...,7.857630,6.516335,7.374307,8.429216,9.201862,7.178426,10.497800,11.280521,11.148279,10.050729
115,ICGC_PCA003_T01,1.0,231.0,PRAD,,,,,,,...,7.538804,6.186331,7.262285,8.423712,9.066786,7.534417,10.282147,11.758615,11.436381,9.557359


In [13]:
#house_keeper_list = ['C1orf43', 'CHMP2A', 'EMC7', 'GPI', 'PSMB2', 'PSMB4', 'RAB7A', 'REEP5', 'SNRPD3', 'VCP', 'VPS29']
#house_keeper_list = ['VCP', 'RAB7A', 'GPI']

value_df = normalized_df.iloc[:, 5:].mean()

total_value = 0.0
for index, row in value_df.to_frame().iterrows():
    # Access row elements using row['column_name'] or row[index]
    gene_name = index.split('|')[0]
    if gene_name in house_keeper_list:
        value = row[0]
        total_value += value

print(total_value)

38.12376439497399


In [14]:
df_risk = normalized_df
df_risk = df_risk.fillna(0)

# Calculate the mean of each row
#row_means = df_risk[columns_to_normalize].mean(axis=1)

# Apply the fillna method to each row, replacing NaN values with the corresponding row mean
#df_risk[columns_to_normalize] = df_risk[columns_to_normalize].apply(lambda row: row.fillna(row_means[row.name]), axis=1)


df_risk['PFItime'] = df_risk['PFItime'].astype(float)
df_risk['PFItime'] = df_risk['PFItime'].astype(int)
df_risk = df_risk[df_risk.PFItime >= 0]
df_risk

Unnamed: 0,bar,PFI,PFItime,gen_id,type,?|100133144,?|10357,?|10431,?|155060,?|26823,...,ZWILCH|55055,ZWINT|11130,ZXDA|7789,ZXDB|158586,ZXDC|79364,ZYG11A|440590,ZYG11B|79699,ZYX|7791,ZZEF1|23140,ZZZ3|26009
0,ICGC_PCA198_T01,0.0,1413,PRAD,0.0,0.0,0.0,0.0,0.0,0.0,...,9.456960,10.222006,7.668449,9.099242,7.848936,9.266863,10.749513,11.250376,9.499193,10.109065
2,ICGC_PCA192_T01,1.0,63,PRAD,0.0,0.0,0.0,0.0,0.0,0.0,...,8.515375,7.808770,7.913234,8.900081,8.558214,7.518149,10.627610,10.800222,9.704030,10.099775
3,ICGC_PCA187_T01,0.0,684,PRAD,0.0,0.0,0.0,0.0,0.0,0.0,...,8.248710,7.266054,8.105652,9.285524,7.574224,6.123383,10.529060,10.705885,10.085407,9.858217
4,ICGC_PCA184_T01,0.0,915,PRAD,0.0,0.0,0.0,0.0,0.0,0.0,...,8.526837,7.383287,7.822315,9.115882,7.552557,6.570874,9.990801,10.911950,10.020424,9.829439
5,ICGC_PCA176_T01,0.0,717,PRAD,0.0,0.0,0.0,0.0,0.0,0.0,...,8.231119,6.938799,7.594022,9.153984,8.294956,3.894215,10.217278,12.113636,9.993462,9.627636
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,ICGC_PCA008_T01,0.0,2193,PRAD,0.0,0.0,0.0,0.0,0.0,0.0,...,5.652063,7.205662,7.750689,8.075478,9.733191,4.271331,8.995528,12.235172,11.958777,8.488821
112,ICGC_PCA006_T01,1.0,132,PRAD,0.0,0.0,0.0,0.0,0.0,0.0,...,7.902965,7.899010,6.636123,7.672800,8.761018,6.401252,9.167385,10.912089,10.499330,9.266062
114,ICGC_PCA004_T01,1.0,15,PRAD,0.0,0.0,0.0,0.0,0.0,0.0,...,7.857630,6.516335,7.374307,8.429216,9.201862,7.178426,10.497800,11.280521,11.148279,10.050729
115,ICGC_PCA003_T01,1.0,231,PRAD,0.0,0.0,0.0,0.0,0.0,0.0,...,7.538804,6.186331,7.262285,8.423712,9.066786,7.534417,10.282147,11.758615,11.436381,9.557359


In [15]:
# Count the number of samples for each class in the "gen_id" column
class_counts = df_risk['gen_id'].value_counts()
# Print the class counts
print(class_counts)

PRAD    105
Name: gen_id, dtype: int64


In [16]:
datasetname = 'DKFZ'

import numpy as np
cancer_type_list = ['PRAD']
for cancer_get in cancer_type_list:
    data_get = df_risk[df_risk.gen_id == cancer_get]
    n = 2
    div_point = [1] * n
    threshold = 3 * 365
    data_get = data_get[(data_get.PFI == 1) | (data_get.PFItime > threshold)]
    data_get.insert(4, 'predicted_label', 0, True)
    timelabel = []
    for item in data_get['PFItime'].tolist():
        i = 0
        if item < threshold:
            timelabel.append(0)
        else:
            timelabel.append(1)

    data_get['predicted_label'] = np.array(timelabel)
    data_get.to_csv('CPTAC-3&DKFZ/Validation/3Genes_Log_Norm_FZ/CancerRNA_{}_{}_Risk_{}.txt'.format(datasetname, cancer_get, n), index=None)

In [17]:
cancer_type_list = ['PRAD']
for cancer_get in cancer_type_list:
    data_get = df_risk[df_risk.gen_id == cancer_get]
    timelist = data_get['PFItime'].tolist()
    timelist.sort()
    n = 3
    div_point = []
    timelabel = []
    for i in range(n):
        div_point.append(timelist[int(len(timelist) / n * i)])
    print(div_point)
    for item in data_get['PFItime'].tolist():
        i = 0
        while item >= div_point[i] and i < (n - 1):
            i += 1
        if item >= div_point[i] and i == (n - 1):
            i += 1
        timelabel.append(i - 1)
    data_get['predicted_label'] = np.array(timelabel)
    data_get.to_csv('CPTAC-3&DKFZ/Validation/3Genes_Log_Norm_FZ/CancerRNA_{}_{}_WholeTimeSeq_{}.txt'.format(datasetname, cancer_get, n), index=None)

[15, 738, 1443]
