In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [2]:
mutation_data_path = 'mutation_data_big.csv'
df = pd.read_csv(mutation_data_path)
df.drop(df.columns[df.columns.str.contains('unnamed',case = False)],axis = 1, inplace = True)

In [None]:
df

In [3]:
# lazy grouping
groups = df.groupby('Cancer Type Detailed')
group_g = groups.get_group('G')
group_a = groups.get_group('A')
group_o = groups.get_group('O')

In [4]:
print("Number of Glioblastoma (G) samples: ", len(group_g))
print("Number of Astrocytoma (A) samples: ", len(group_a))
print("Number of Oligodendroglioma (O) samples: ", len(group_o))

Number of Glioblastoma (G) samples:  18973
Number of Astrocytoma (A) samples:  21159
Number of Oligodendroglioma (O) samples:  24968


Find the most common value for each mutation for each group

In [5]:
group_g_mutations = dict()
columns = group_g.columns
mutation_names = columns[2:-1]
for mutation in mutation_names:
    vals, counts = np.unique(group_g[mutation].values, return_counts=True)
    print("Mutation: ", mutation)
    print(f"Values: {vals}\tCounts: {counts}")
    most_common = np.argmax(counts)
    most_common_val = vals[most_common]
    group_g_mutations[mutation] = most_common_val
group_g_mutations

Mutation:  IDH status
Values: [ 0.  1. nan]	Counts: [10644  3755  4574]
Mutation:  IDH-1P10Q Subtype
Values: [0 1]	Counts: [18334   639]
Mutation:  ATRX status
Values: [ 0.  1. nan]	Counts: [ 3652  3358 11963]
Mutation:  MGMT promoter status
Values: [ 0.  1. nan]	Counts: [4544 7015 7414]
Mutation:  TERT expression status
Values: [0 1]	Counts: [17847  1126]


{'IDH status': 0.0,
 'IDH-1P10Q Subtype': 0,
 'ATRX status': nan,
 'MGMT promoter status': nan,
 'TERT expression status': 0}

In [6]:
group_a_mutations = dict()
columns = group_a.columns
mutation_names = columns[2:-1]
for mutation in mutation_names:
    vals, counts = np.unique(group_a[mutation].values, return_counts=True)
    print("Mutation: ", mutation)
    print(f"Values: {vals}\tCounts: {counts}")
    most_common = np.argmax(counts)
    most_common_val = vals[most_common]
    group_a_mutations[mutation] = most_common_val
group_a_mutations

Mutation:  IDH status
Values: [ 0.  1. nan]	Counts: [ 5878 14815   466]
Mutation:  IDH-1P10Q Subtype
Values: [0 1]	Counts: [19584  1575]
Mutation:  ATRX status
Values: [ 0.  1. nan]	Counts: [ 8827 11866   466]
Mutation:  MGMT promoter status
Values: [0. 1.]	Counts: [ 7222 13937]
Mutation:  TERT expression status
Values: [0 1]	Counts: [18037  3122]


{'IDH status': 1.0,
 'IDH-1P10Q Subtype': 0,
 'ATRX status': 1.0,
 'MGMT promoter status': 1.0,
 'TERT expression status': 0}

In [7]:
group_o_mutations = dict()
columns = group_o.columns
mutation_names = columns[2:-1]
for mutation in mutation_names:
    vals, counts = np.unique(group_o[mutation].values, return_counts=True)
    print("Mutation: ", mutation)
    print(f"Values: {vals}\tCounts: {counts}")
    most_common = np.argmax(counts)
    most_common_val = vals[most_common]
    group_o_mutations[mutation] = most_common_val
group_o_mutations

Mutation:  IDH status
Values: [0. 1.]	Counts: [ 1547 23421]
Mutation:  IDH-1P10Q Subtype
Values: [0 1]	Counts: [ 4979 19989]
Mutation:  ATRX status
Values: [0. 1.]	Counts: [20318  4650]
Mutation:  MGMT promoter status
Values: [0. 1.]	Counts: [  138 24830]
Mutation:  TERT expression status
Values: [0 1]	Counts: [ 8129 16839]


{'IDH status': 1.0,
 'IDH-1P10Q Subtype': 1,
 'ATRX status': 0.0,
 'MGMT promoter status': 1.0,
 'TERT expression status': 1}

We need to add the ATRX mutation status to 1 (mutated/lost) for NaN values in the ATRX column of all samples that have Glioblastoma (GBM)
Reference: [CNS 2021 WHO guidelines](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC8328013/)

In [8]:
# Fill missing ATRX value of group G
group_g['ATRX status'] = group_g['ATRX status'].fillna(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_g['ATRX status'] = group_g['ATRX status'].fillna(1)


For the MGMT status in Glioblastoma group, we need to replace NaN values with 1 values as most GBMs have MGMT status 1 (methylated)

In [9]:
# Fill missing MGMT value of Group G
group_g['MGMT promoter status'] = group_g['MGMT promoter status'].fillna(1)
group_g

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_g['MGMT promoter status'] = group_g['MGMT promoter status'].fillna(1)


Unnamed: 0,Patient ID,image_path,IDH status,IDH-1P10Q Subtype,ATRX status,MGMT promoter status,TERT expression status,Cancer Type Detailed
0,0_0_TCGA-02-0006_BS1_001,data_all/0_0_TCGA-02-0006_BS1_001.jpg,0.0,0,1.0,0.0,0,G
1,0_0_TCGA-02-0007_BS1_001,data_all/0_0_TCGA-02-0007_BS1_001.jpg,0.0,0,1.0,0.0,0,G
2,0_0_TCGA-02-0010_BS1_001,data_all/0_0_TCGA-02-0010_BS1_001.jpg,1.0,0,1.0,0.0,0,G
3,0_0_TCGA-02-0010_BS1_001_3,data_all/0_0_TCGA-02-0010_BS1_001_3.jpg,1.0,0,1.0,0.0,0,G
4,0_0_TCGA-02-0016_BS1_001,data_all/0_0_TCGA-02-0016_BS1_001.jpg,0.0,0,1.0,1.0,0,G
...,...,...,...,...,...,...,...,...
64736,51_23_TCGA-06-6701_BS1_001,data_all/51_23_TCGA-06-6701_BS1_001.jpg,1.0,0,1.0,1.0,0,G
64756,51_24_TCGA-06-6701_BS1_001,data_all/51_24_TCGA-06-6701_BS1_001.jpg,1.0,0,1.0,1.0,0,G
64776,51_25_TCGA-06-6701_BS1_001,data_all/51_25_TCGA-06-6701_BS1_001.jpg,1.0,0,1.0,1.0,0,G
64795,51_26_TCGA-06-6701_BS1_001,data_all/51_26_TCGA-06-6701_BS1_001.jpg,1.0,0,1.0,1.0,0,G


For Group G, the IDH status should be 0 (wildtype). We fill all Nan IDH status for Group G with 0

In [10]:
group_g['IDH status'] = group_g['IDH status'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_g['IDH status'] = group_g['IDH status'].fillna(0)


For Group A, there are two mutations having NaN values - IDH status and ATRX. We will missing IDH status with 1 and missing ATRX status with 1 as majority is 1 for both mutations

In [11]:
group_a['IDH status'] = group_a['IDH status'].fillna(1)
group_a['ATRX status'] = group_a['ATRX status'].fillna(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_a['IDH status'] = group_a['IDH status'].fillna(1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group_a['ATRX status'] = group_a['ATRX status'].fillna(1)


Now we need to combine the three groups into one dataframe and then write it into a csv file.

In [12]:
group_g = group_g.append(group_a)
group_g = group_g.append(group_o)
group_g

  group_g = group_g.append(group_a)
  group_g = group_g.append(group_o)


Unnamed: 0,Patient ID,image_path,IDH status,IDH-1P10Q Subtype,ATRX status,MGMT promoter status,TERT expression status,Cancer Type Detailed
0,0_0_TCGA-02-0006_BS1_001,data_all/0_0_TCGA-02-0006_BS1_001.jpg,0.0,0,1.0,0.0,0,G
1,0_0_TCGA-02-0007_BS1_001,data_all/0_0_TCGA-02-0007_BS1_001.jpg,0.0,0,1.0,0.0,0,G
2,0_0_TCGA-02-0010_BS1_001,data_all/0_0_TCGA-02-0010_BS1_001.jpg,1.0,0,1.0,0.0,0,G
3,0_0_TCGA-02-0010_BS1_001_3,data_all/0_0_TCGA-02-0010_BS1_001_3.jpg,1.0,0,1.0,0.0,0,G
4,0_0_TCGA-02-0016_BS1_001,data_all/0_0_TCGA-02-0016_BS1_001.jpg,0.0,0,1.0,1.0,0,G
...,...,...,...,...,...,...,...,...
65089,52_26_TCGA-HT-7616_BS1_001,data_all/52_26_TCGA-HT-7616_BS1_001.jpg,1.0,1,0.0,1.0,0,O
65092,52_27_TCGA-EZ-7264_BS1_001,data_all/52_27_TCGA-EZ-7264_BS1_001.jpg,1.0,1,0.0,1.0,1,O
65093,52_27_TCGA-FG-7641_BS1_001,data_all/52_27_TCGA-FG-7641_BS1_001.jpg,1.0,1,0.0,1.0,1,O
65095,52_27_TCGA-HT-7605_BS1_001,data_all/52_27_TCGA-HT-7605_BS1_001.jpg,1.0,1,0.0,1.0,0,O


In [13]:
group_g.isna().sum()

Patient ID                0
image_path                0
IDH status                0
IDH-1P10Q Subtype         0
ATRX status               0
MGMT promoter status      0
TERT expression status    0
Cancer Type Detailed      0
dtype: int64

In [14]:
group_g['image_path'] = group_g['image_path'].str.replace('data_all', 'data')
group_g

Unnamed: 0,Patient ID,image_path,IDH status,IDH-1P10Q Subtype,ATRX status,MGMT promoter status,TERT expression status,Cancer Type Detailed
0,0_0_TCGA-02-0006_BS1_001,data/0_0_TCGA-02-0006_BS1_001.jpg,0.0,0,1.0,0.0,0,G
1,0_0_TCGA-02-0007_BS1_001,data/0_0_TCGA-02-0007_BS1_001.jpg,0.0,0,1.0,0.0,0,G
2,0_0_TCGA-02-0010_BS1_001,data/0_0_TCGA-02-0010_BS1_001.jpg,1.0,0,1.0,0.0,0,G
3,0_0_TCGA-02-0010_BS1_001_3,data/0_0_TCGA-02-0010_BS1_001_3.jpg,1.0,0,1.0,0.0,0,G
4,0_0_TCGA-02-0016_BS1_001,data/0_0_TCGA-02-0016_BS1_001.jpg,0.0,0,1.0,1.0,0,G
...,...,...,...,...,...,...,...,...
65089,52_26_TCGA-HT-7616_BS1_001,data/52_26_TCGA-HT-7616_BS1_001.jpg,1.0,1,0.0,1.0,0,O
65092,52_27_TCGA-EZ-7264_BS1_001,data/52_27_TCGA-EZ-7264_BS1_001.jpg,1.0,1,0.0,1.0,1,O
65093,52_27_TCGA-FG-7641_BS1_001,data/52_27_TCGA-FG-7641_BS1_001.jpg,1.0,1,0.0,1.0,1,O
65095,52_27_TCGA-HT-7605_BS1_001,data/52_27_TCGA-HT-7605_BS1_001.jpg,1.0,1,0.0,1.0,0,O


In [15]:
filename = 'mutation_data_all.csv'
df.to_csv('./'+filename)