SDS-1473

EGFR data to assign variant grouping to S768I, L861Q and G719X mutations

In [1]:
import os
import warnings

import pandas as pd
warnings.filterwarnings('ignore')

In [2]:
# Load the full EGFR Q2 export into `data`.
# The location can be overridden with the EGFR_INPUT_CSV environment variable so
# the notebook can run on machines other than the author's; when the variable is
# unset, the original hard-coded path is used unchanged.
input_csv = os.environ.get(
    "EGFR_INPUT_CSV",
    "/Users/diwyanshu/Desktop/PROJECTS/PT360/EGFR/EGFR_Q2_allData.csv",
)
# low_memory=False makes pandas read the file in one pass instead of in chunks.
data = pd.read_csv(input_csv, sep=',', low_memory=False)

#data.head()

EGFR variant representations

S768I - NC_000007.13:g.55249005G>T, ENST00000275493.2:c.2303G>T, NM_005228.4:c.2303G>T, NP_005219.2:p.Ser768Ile,
        p.S768I, c.2303G>T

L861Q - NM_005228.4:c.2582T>A, NP_005219.2:p.Leu861Gln, NC_000007.13:g.55259524T>A, ENST00000275493.2:c.2582T>A,
        p.L861Q, c.2582T>A

G719 - c.2156G>, p.Gly719, g.55241708G>
        p.G719, c.2156G>

In [3]:
# Spellings of each EGFR variant as they appear in the PT360 dataset and in NCIt.
# The lists are matched verbatim against `molecular_variant`, so every string
# (including misspellings such as "p.L816Q", "S7681" or "Ser768IIe" and the
# irregular spacing) must stay exactly as written.
# Each spelling is listed once; entries keep their original order.

# NCIt Code: C98497
L861Q = [
    "p.L861Q", "c.2582T>A;p.L861Q", "c.2582T>A;p.L816Q",
    "L861Q", "c.2582T>A", "2582T>A", "p.Leu861Gln", "Leu861Gln",
    "Activating L861Q Mutation", "Exon 21 L861Q", "NP_005219.2:p.L861Q", "NP_005219.2:p.Leu861Gln",
]

# NCIt Code: C125210
S768I = [
    "p.S768I", "c.2303G>T;p.S768I", "c.2303C>T;p.S768I", "c.2305G>T;p.S768I", "c.2303G>T",
    "c.2303 G>T(p.S7681)", "_S768I_;p.Ser768IIe", "Exon  20 S7681",
    "S768I", "2303G>T", "p.Ser768Ile", "Ser768Ile",
    "Activating S768I Mutation", "Exon 20 S768I", "NP_005219.2:p.S768I",
    "NP_005219.2:p.Ser768Ile", "S768I Mutation",
]

# NCIt Code: C107154
G719X = [
    "p.G719A", "p.G719S", "p.G719C", "p.G719X", "G719X", "c.2156G>C;p.G719A", "p.G719D", "c.2155G>T;p.G719C",
    "c.2155G>A;p.G719S", "p.G719", "c.2156G>C", "Exon 18 G719", "c.2154_2155delinsTT;p.G719C",
    "Exon 18 G719X", "G719X 18", "G719", "c.2155G>A;p.G719X", "Codon G719", "c.2155G>C;p.G719A",
    "c.2155G>T;p.G719X", "EXON 18 - G719X", "2155>T (G719X)", "c.2156G>C;p.G719X", "Exon 18| G719",
    "c.2154_2156delinsTGC;p.G719A", "exon 18 (G719X)", "(mutations at codon G719 in exon 18", "exon 18   G719X",
    "Exons 18 (G719X)", "Exons 19(G719X)", "p.G719K", "p.G719*",
    "Activating G719X Mutation", "exon 18 G719X", "G719 Mutation", "Gly719Xxx",
    "NP_005219.2:p.G719X", "NP_005219.2:p.Gly719Xxx", "Exon 18 Gly719",
]


In [4]:
# Filter applied: biomarker name = EGFR, test result name = Positive or Equivocal,
# and molecular variant is one of the spellings in the L861Q / S768I / G719X lists.
# .copy() makes filtered_data an independent frame, so the column assignments in
# the next cell write to it explicitly rather than to a slice of `data`.
filtered_data = data[
    (data['biomarker_name_name'] == "EGFR") &
    (data['test_result_name'].isin(['Positive', 'Equivocal'])) &
    (data['molecular_variant'].isin(L861Q + S768I + G719X))
].copy()

# Total molecular variants post filter for which grouping is required.
# .sum() runs over every column, so the string column is concatenated;
# only the `count` entry of the result is the row total.
filtered_data.groupby('molecular_variant').size().reset_index(name='count').sum()

molecular_variant    Exon  20 S7681Exon 18 G719XG719X_S768I_;p.Ser7...
count                                                              306
dtype: object

In [5]:
# Label every filtered row with its variant grouping: the display name goes to
# variant_grouping_name, the NCIt concept code to variant_grouping_code, and the
# vocabulary (always NCIt) to variant_grouping_vocab.
variant_groups = (
    (L861Q, 'EGFR L861Q', 'C98497'),
    (S768I, 'EGFR S768I', 'C125210'),
    (G719X, 'EGFR G719X', 'C107154'),
)

for spellings, group_name, group_code in variant_groups:
    # One membership mask per group, reused for both target columns.
    in_group = filtered_data['molecular_variant'].isin(spellings)
    filtered_data.loc[in_group, 'variant_grouping_name'] = group_name
    filtered_data.loc[in_group, 'variant_grouping_code'] = group_code

filtered_data['variant_grouping_vocab'] = 'NCIt'

#filtered_data.head()

In [6]:
# Row count for each newly assigned (name, code, vocab) grouping.
grouping_columns = ['variant_grouping_name', 'variant_grouping_code', 'variant_grouping_vocab']
rows_per_group = filtered_data.groupby(grouping_columns).size()
rows_per_group.reset_index(name='count')

Unnamed: 0,variant_grouping_name,variant_grouping_code,variant_grouping_vocab,count
0,EGFR G719X,C107154,NCIt,155
1,EGFR L861Q,C98497,NCIt,80
2,EGFR S768I,C125210,NCIt,71


In [7]:
# Grand total across the groupings.
# .sum() runs over every column, so the three label columns come out as
# concatenated strings; the `count` entry is the total number of grouped rows.
grouping_keys = ['variant_grouping_name', 'variant_grouping_code', 'variant_grouping_vocab']
grouping_table = filtered_data.groupby(grouping_keys).size().reset_index(name='count')
grouping_table.sum()

variant_grouping_name     EGFR G719XEGFR L861QEGFR S768I
variant_grouping_code               C107154C98497C125210
variant_grouping_vocab                      NCItNCItNCIt
count                                                306
dtype: object

In [8]:
# Write the grouping assignments back onto the full dataset.
# DataFrame.update aligns on the row index and only overwrites columns that
# already exist in `data`, so any grouping column missing from the input CSV is
# created first (empty, object dtype); otherwise its values would be silently
# dropped and the saved file would contain no grouping at all.
for grouping_column in ('variant_grouping_name', 'variant_grouping_code', 'variant_grouping_vocab'):
    if grouping_column not in data.columns:
        data[grouping_column] = pd.Series(index=data.index, dtype=object)

data.update(filtered_data)

In [9]:
# Persist the updated dataset next to the notebook; index=False keeps the
# pandas row index out of the file.
output_csv = "EGFR_Q2_allData_Variant_Grouping.csv"
data.to_csv(output_csv, index=False)