In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [2]:
mutation_data_path = 'mutation_data.csv'
df = pd.read_csv(mutation_data_path)
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

In [3]:
df

Unnamed: 0,Patient ID,image_path,IDH status,IDH-1P10Q Subtype,ATRX status,MGMT promoter status,TERT expression status,Cancer Type Detailed
0,0_0_TCGA-02-0006_BS1_001,data_small/0_0_TCGA-02-0006_BS1_001.jpg,0.0,0,,0.0,0,G
1,0_0_TCGA-02-0007_BS1_001,data_small/0_0_TCGA-02-0007_BS1_001.jpg,0.0,0,,0.0,0,G
2,0_0_TCGA-02-0010_BS1_001,data_small/0_0_TCGA-02-0010_BS1_001.jpg,1.0,0,,0.0,0,G
3,0_0_TCGA-02-0010_BS1_001_3,data_small/0_0_TCGA-02-0010_BS1_001_3.jpg,1.0,0,,0.0,0,G
4,0_0_TCGA-02-0016_BS1_001,data_small/0_0_TCGA-02-0016_BS1_001.jpg,0.0,0,,,0,G
...,...,...,...,...,...,...,...,...
6055,7_2_TCGA-HT-7603_BS1_001_3,data_small/7_2_TCGA-HT-7603_BS1_001_3.jpg,1.0,0,1.0,1.0,0,O
6056,7_2_TCGA-HT-7605_BS1_001_3,data_small/7_2_TCGA-HT-7605_BS1_001_3.jpg,1.0,1,0.0,1.0,0,O
6057,7_2_TCGA-HT-7616_BS1_001_3,data_small/7_2_TCGA-HT-7616_BS1_001_3.jpg,1.0,1,0.0,1.0,0,O
6058,7_2_TCGA-HT-7677_BS1_001_3,data_small/7_2_TCGA-HT-7677_BS1_001_3.jpg,1.0,1,0.0,1.0,1,O


In [4]:
# lazy grouping
groups = df.groupby('Cancer Type Detailed')
group_g = groups.get_group('G')
group_a = groups.get_group('A')
group_o = groups.get_group('O')

In [5]:
print("Number of Glioblastoma (G) samples: ", len(group_g))
print("Number of Astrocytoma (A) samples: ", len(group_a))
print("Number of Oligodendroglioma (O) samples: ", len(group_o))

Number of Glioblastoma (G) samples:  1824
Number of Astrocytoma (A) samples:  1980
Number of Oligodendroglioma (O) samples:  2256


Find the most common value for each mutation for each group

In [17]:
group_g_mutations = dict()
columns = group_g.columns
mutation_names = columns[2:-1]
for mutation in mutation_names:
    vals, counts = np.unique(group_g[mutation].values, return_counts=True)
    print("Mutation: ", mutation)
    print(f"Values: {vals}\tCounts: {counts}")
    most_common = np.argmax(counts)
    most_common_val = vals[most_common]
    group_g_mutations[mutation] = most_common_val
group_g_mutations

Mutation:  IDH status
Values: [ 0.  1. nan]	Counts: [1088  268  468]
Mutation:  IDH-1P10Q Subtype
Values: [0 1]	Counts: [1764   60]
Mutation:  ATRX status
Values: [0. 1.]	Counts: [ 323 1501]
Mutation:  MGMT promoter status
Values: [0. 1.]	Counts: [ 502 1322]
Mutation:  TERT expression status
Values: [0 1]	Counts: [1692  132]


{'IDH status': 0.0,
 'IDH-1P10Q Subtype': 0,
 'ATRX status': 1.0,
 'MGMT promoter status': 1.0,
 'TERT expression status': 0}

In [11]:
group_a_mutations = dict()
columns = group_a.columns
mutation_names = columns[2:-1]
for mutation in mutation_names:
    vals, counts = np.unique(group_a[mutation].values, return_counts=True)
    print("Mutation: ", mutation)
    print(f"Values: {vals}\tCounts: {counts}")
    most_common = np.argmax(counts)
    most_common_val = vals[most_common]
    group_a_mutations[mutation] = most_common_val
group_a_mutations

Mutation:  IDH status
Values: [ 0.  1. nan]	Counts: [ 567 1346   67]
Mutation:  IDH-1P10Q Subtype
Values: [0 1]	Counts: [1857  123]
Mutation:  ATRX status
Values: [ 0.  1. nan]	Counts: [ 783 1130   67]
Mutation:  MGMT promoter status
Values: [0. 1.]	Counts: [ 627 1353]
Mutation:  TERT expression status
Values: [0 1]	Counts: [1735  245]


{'IDH status': 1.0,
 'IDH-1P10Q Subtype': 0,
 'ATRX status': 1.0,
 'MGMT promoter status': 1.0,
 'TERT expression status': 0}

In [12]:
group_o_mutations = dict()
columns = group_o.columns
mutation_names = columns[2:-1]
for mutation in mutation_names:
    vals, counts = np.unique(group_o[mutation].values, return_counts=True)
    print("Mutation: ", mutation)
    print(f"Values: {vals}\tCounts: {counts}")
    most_common = np.argmax(counts)
    most_common_val = vals[most_common]
    group_o_mutations[mutation] = most_common_val
group_o_mutations

Mutation:  IDH status
Values: [0. 1.]	Counts: [ 128 2128]
Mutation:  IDH-1P10Q Subtype
Values: [0 1]	Counts: [ 439 1817]
Mutation:  ATRX status
Values: [0. 1.]	Counts: [1816  440]
Mutation:  MGMT promoter status
Values: [0. 1.]	Counts: [  26 2230]
Mutation:  TERT expression status
Values: [0 1]	Counts: [ 693 1563]


{'IDH status': 1.0,
 'IDH-1P10Q Subtype': 1,
 'ATRX status': 0.0,
 'MGMT promoter status': 1.0,
 'TERT expression status': 1}

We need to add the ATRX mutation status to 1 (mutated/lost) for NaN values in the ATRX column of all samples that have Glioblastoma (GBM)
Reference: [CNS 2021 WHO guidelines](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8328013/)

In [14]:
# Fill missing ATRX value of group G
group_g['ATRX status'] = group_g['ATRX status'].fillna(1)
group_g

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_g['ATRX status'] = group_g['ATRX status'].fillna(1)


Unnamed: 0,Patient ID,image_path,IDH status,IDH-1P10Q Subtype,ATRX status,MGMT promoter status,TERT expression status,Cancer Type Detailed
0,0_0_TCGA-02-0006_BS1_001,data_small/0_0_TCGA-02-0006_BS1_001.jpg,0.0,0,1.0,0.0,0,G
1,0_0_TCGA-02-0007_BS1_001,data_small/0_0_TCGA-02-0007_BS1_001.jpg,0.0,0,1.0,0.0,0,G
2,0_0_TCGA-02-0010_BS1_001,data_small/0_0_TCGA-02-0010_BS1_001.jpg,1.0,0,1.0,0.0,0,G
3,0_0_TCGA-02-0010_BS1_001_3,data_small/0_0_TCGA-02-0010_BS1_001_3.jpg,1.0,0,1.0,0.0,0,G
4,0_0_TCGA-02-0016_BS1_001,data_small/0_0_TCGA-02-0016_BS1_001.jpg,0.0,0,1.0,,0,G
...,...,...,...,...,...,...,...,...
5421,3_2_TCGA-02-0099_BS1_002_3,data_small/3_2_TCGA-02-0099_BS1_002_3.jpg,0.0,0,1.0,1.0,0,G
5422,3_2_TCGA-02-0099_BS1_003_3,data_small/3_2_TCGA-02-0099_BS1_003_3.jpg,0.0,0,1.0,1.0,0,G
5423,3_2_TCGA-06-6694_BS1_001_3,data_small/3_2_TCGA-06-6694_BS1_001_3.jpg,0.0,0,0.0,1.0,0,G
5424,3_2_TCGA-06-6701_BS1_001_3,data_small/3_2_TCGA-06-6701_BS1_001_3.jpg,1.0,0,1.0,1.0,0,G


For the MGMT status in Glioblastoma group, we need to replace NaN values with 1 values as most GBMs have MGMT status 1 (methylated)

In [16]:
# Fill missing MGMT value of Group G
group_g['MGMT promoter status'] = group_g['MGMT promoter status'].fillna(1)
group_g

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_g['MGMT promoter status'] = group_g['MGMT promoter status'].fillna(1)


Unnamed: 0,Patient ID,image_path,IDH status,IDH-1P10Q Subtype,ATRX status,MGMT promoter status,TERT expression status,Cancer Type Detailed
0,0_0_TCGA-02-0006_BS1_001,data_small/0_0_TCGA-02-0006_BS1_001.jpg,0.0,0,1.0,0.0,0,G
1,0_0_TCGA-02-0007_BS1_001,data_small/0_0_TCGA-02-0007_BS1_001.jpg,0.0,0,1.0,0.0,0,G
2,0_0_TCGA-02-0010_BS1_001,data_small/0_0_TCGA-02-0010_BS1_001.jpg,1.0,0,1.0,0.0,0,G
3,0_0_TCGA-02-0010_BS1_001_3,data_small/0_0_TCGA-02-0010_BS1_001_3.jpg,1.0,0,1.0,0.0,0,G
4,0_0_TCGA-02-0016_BS1_001,data_small/0_0_TCGA-02-0016_BS1_001.jpg,0.0,0,1.0,1.0,0,G
...,...,...,...,...,...,...,...,...
5421,3_2_TCGA-02-0099_BS1_002_3,data_small/3_2_TCGA-02-0099_BS1_002_3.jpg,0.0,0,1.0,1.0,0,G
5422,3_2_TCGA-02-0099_BS1_003_3,data_small/3_2_TCGA-02-0099_BS1_003_3.jpg,0.0,0,1.0,1.0,0,G
5423,3_2_TCGA-06-6694_BS1_001_3,data_small/3_2_TCGA-06-6694_BS1_001_3.jpg,0.0,0,0.0,1.0,0,G
5424,3_2_TCGA-06-6701_BS1_001_3,data_small/3_2_TCGA-06-6701_BS1_001_3.jpg,1.0,0,1.0,1.0,0,G


Now we need to combine the three groups into one dataframe and then write it into a csv file.

Unnamed: 0,Patient ID,image_path,IDH status,IDH-1P10Q Subtype,ATRX status,MGMT promoter status,TERT expression status,Cancer Type Detailed
0,0_0_TCGA-02-0006_BS1_001,data_small/0_0_TCGA-02-0006_BS1_001.jpg,0.0,0,1.0,0.0,0,G
1,0_0_TCGA-02-0007_BS1_001,data_small/0_0_TCGA-02-0007_BS1_001.jpg,0.0,0,1.0,0.0,0,G
2,0_0_TCGA-02-0010_BS1_001,data_small/0_0_TCGA-02-0010_BS1_001.jpg,1.0,0,1.0,0.0,0,G
3,0_0_TCGA-02-0010_BS1_001_3,data_small/0_0_TCGA-02-0010_BS1_001_3.jpg,1.0,0,1.0,0.0,0,G
4,0_0_TCGA-02-0016_BS1_001,data_small/0_0_TCGA-02-0016_BS1_001.jpg,0.0,0,1.0,1.0,0,G
...,...,...,...,...,...,...,...,...
5421,3_2_TCGA-02-0099_BS1_002_3,data_small/3_2_TCGA-02-0099_BS1_002_3.jpg,0.0,0,1.0,1.0,0,G
5422,3_2_TCGA-02-0099_BS1_003_3,data_small/3_2_TCGA-02-0099_BS1_003_3.jpg,0.0,0,1.0,1.0,0,G
5423,3_2_TCGA-06-6694_BS1_001_3,data_small/3_2_TCGA-06-6694_BS1_001_3.jpg,0.0,0,0.0,1.0,0,G
5424,3_2_TCGA-06-6701_BS1_001_3,data_small/3_2_TCGA-06-6701_BS1_001_3.jpg,1.0,0,1.0,1.0,0,G
