# 7 Statistical Evaluation of Cancer Mutations and Natural Variants

Integration of cancer mutation data from the NCI Genomic Data Commons (GDC): https://portal.gdc.cancer.gov/

There is an API; however, since we are only dealing with 31 GAIN domains / 32 receptors, all data were downloaded directly as JSON. API documentation: https://docs.gdc.cancer.gov/

In [None]:
%matplotlib inline
import glob, re
import matplotlib.pyplot as plt
import pandas as pd
import gaingrn.scripts.io
import gaingrn.scripts.mutation_utils
from gaingrn.scripts.variant_classes import *

# Collect the JSON and CSV input files by glob pattern.
jsons = glob.glob("../data/gain_json/*.json")
csvs = glob.glob("../data/snp_mane/*csv")

# Three-letter -> one-letter amino acid codes (the 20 standard residues plus Pyl/O).
oneletter = dict(
    Ala='A', Arg='R', Asn='N', Asp='D', Cys='C',
    Gln='Q', Glu='E', Gly='G', His='H', Ile='I',
    Leu='L', Lys='K', Met='M', Phe='F', Pro='P',
    Pyl='O', Ser='S', Thr='T', Trp='W', Tyr='Y',
    Val='V',
)

We need to construct a new dictionary recording where each index itself is located. This will be appended to the GainDomain objects.
For each GAIN domain within this collection, map the mutations by residue onto the corresponding element. Then merge all of these to construct a global mutation map.

In [None]:
# Explicit imports for every name this cell relies on. Without them the cell
# only works if some other import happens to bring `np` and these gaingrn
# submodules into scope as a side effect.
import numpy as np
import gaingrn.scripts.alignment_utils
import gaingrn.scripts.assign

# Initialize the human GAIN collection.
# NOTE: unpickling can execute arbitrary code - only load trusted files here.
human_collection = pd.read_pickle("../data/human_collection.pkl")

# The accession is the part of each GAIN name before the first "-" and "_".
human_accessions = [gain.name.split("-")[0].split("_")[0] for gain in human_collection.collection]
# gain.sequence is joined into one plain string per GAIN domain.
human_sequences = ["".join(gain.sequence) for gain in human_collection.collection]
seq_file = '../data/all_query_sequences.fasta'

# Offsets are derived from the query FASTA file for the accessions/sequences above.
human_fasta_offsets = gaingrn.scripts.alignment_utils.find_offsets(
    seq_file,
    human_accessions,
    human_sequences,
)

# The indexing object is stored as a pickle, hence allow_pickle=True.
human_indexing = np.load("../data/human_indexing.pkl", allow_pickle=True)

# Attach the GRN labels from the indexing to the collection.
appended_human_collection = gaingrn.scripts.assign.add_grn_labels(human_collection, human_indexing)

### 1. Construct the MutationAnalysis object with information on all mutations at their respective GRN labels.
Also make a list of the resids to summarize all entry occurrences.

In [None]:
# Segment labels in fixed order: H1-H6, S1-S14, then GPS.
segments = [f"H{i}" for i in range(1, 7)] + [f"S{i}" for i in range(1, 15)] + ["GPS"]

# Build the mutation analysis object from the GRN-labelled collection, the
# segment list, both input file lists and the FASTA offsets.
GainMutations = MutationAnalysis(
    appended_human_collection,
    segments,
    jsons,
    csvs,
    human_fasta_offsets,
)


In L3, there is a mismatch between canonical (UniProtKB) and the GDC form (equiv. Isoform 4) - see MSA mapping

D2 (2/2)   | GDC ENST00000334810 = D2-201 (971 aa) - Uniprot Q7Z7M1 (963 aa) - Files OK - remap the four mutations in question : 496, 556, 460, 386, 507 -->  514, 529, 478, 404, X --> MSA mapping with manual curation of these four mutations

F4 (23/26) | GDC ENST00000283303 (695 aa) - Uniprot Q8IZF3 (695 aa) - Re-Download resolved all errors

E2 (6/20)  | GDC ENST00000315576 (823 aa) - Uniprot Q9UHX3 (823 aa) - Re-Download resolved all errors


### 2. Plot the cancer-enrichment scores of GRN-indexed positions.
For each SSE, count the mutations/SNPs at each position and generate a score according to
Wright et al. 2019 https://doi.org/10.1038/s41467-019-08630-2 - cancer-enriched positions will have a positive score.

In [None]:
# Enrichment score per position, keyed by f"{segment}{position}".
# NOTE(review): these keys have no "." separator, unlike the "H2.55"-style
# labels used elsewhere - confirm this is the format score2b expects.
score_dict = {}

for sse in segments:
    if sse == "GPS":
        # GPS uses three fixed labels, plotted at x = 1, 2, 3.
        x_range = (1, 2, 3)
        gps_labels = ("GPS.-2", "GPS.-1", "GPS.+1")
        mut_y = [GainMutations.mut_counts[k] for k in gps_labels]
        var_y = [GainMutations.snp_counts[k] for k in gps_labels]
    else:
        # Match on the full "<segment>." prefix. A plain substring test would
        # make "S1" also collect the keys of S10-S14 and widen its range.
        prefix = f"{sse}."
        varkeys = [k for k in GainMutations.mut_counts.keys() if k.startswith(prefix)] \
                + [k for k in GainMutations.snp_counts.keys() if k.startswith(prefix)]
        if not varkeys:
            # min()/max() below would fail on an empty segment.
            print(f"No mutations or SNPs found for segment {sse}; skipping.")
            continue

        # Cover every position between the lowest and highest one observed.
        x_positions = [int(x.split('.')[-1]) for x in varkeys]
        x_range = range(min(x_positions), max(x_positions)+1)
        # Debug output: keys, range and the full mutation count table.
        print(sorted(varkeys),"\n", x_range, GainMutations.mut_counts, sep="\n")
        mut_y = gaingrn.scripts.mutation_utils.compose_y(x_range, sse, GainMutations.mut_counts)
        var_y = gaingrn.scripts.mutation_utils.compose_y(x_range, sse, GainMutations.snp_counts)

    # Populate the score_dict with the score values (GPS is not stored).
    score_y = gaingrn.scripts.mutation_utils.score(mut_y,var_y)
    if sse != "GPS":
        score_dict.update({f"{sse}{x}": score_y[j] for j, x in enumerate(x_range)})

    # Plot the cancer enrichment score per segment
    gaingrn.scripts.mutation_utils.plot_segment_enrichment_score(sse=sse, res_range=x_range, mutations=mut_y, variants=var_y, scores=score_y, savename=f"../../TESTING/{sse}variants.svg", show=True)

#### Enrichment Score can now be written to B-factor for evaluation

In [None]:
l1_pdb = gaingrn.scripts.io.find_pdb(name="O94910", pdb_folder="../../all_pdbs/")

# Take the first entry whose name contains "AGRL1" from the GRN-labelled
# collection itself. This is the collection that was searched, and
# reverse_grn_labels is read from the result below.
l1_gain = next(
    (gain for gain in appended_human_collection.collection if "AGRL1" in gain.name),
    None,
)
if l1_gain is None:
    # Fail here with a clear message instead of a NameError further down.
    raise LookupError("No GAIN domain with 'AGRL1' in its name was found in the collection.")

# Write a copy of the PDB with the enrichment scores passed in via score_dict.
gaingrn.scripts.io.score2b(l1_pdb, "../../TESTING/l1_gain_score.pdb", l1_gain.reverse_grn_labels, score_dict)

### 3. Enable parsing data for every GRN label position
Provides a full list of mutations with corresponding info on receptor, impact etc. at each GRN label position.

In [None]:
# Positions to query. A longer earlier selection is kept here for reference:
# ['H2.55','H3.59','H4.37','H4.57','H5.50','H6.41','H6.55','S10.49','S10.50','S11.50','S14.48','S14.50']
enriched_positions = ['GPS.-2']

# a) Query via the dedicated function, writing one text file per position
for label in enriched_positions:
    out_path = f"../../TESTING/{label}.out"
    gaingrn.scripts.mutation_utils.query_position_variants(
        GainMutations.generalized_mutations,
        label,
        return_aa=True,
        text_out=out_path,
    )

# b) Address the object directly, e.g. with the SNPs stored for one position
for snp in GainMutations.snps["GPS.-2"]:
    print(*(snp[field] for field in ("resname", "receptor", "HGVS Consequence")))


# c) Define your own criteria for parsing the data
def poly(n):
    # > 0.446 for possibly damaging, > 0.908 for Probably Damaging
    return n > 0.446


def sift(n):
    # For deleterious impacts
    return n < 0.05


missense_dict = gaingrn.scripts.mutation_utils.query_by_criteria(
    ("consequence", "missense"),
    poly=poly,
    mutation_dict=GainMutations.generalized_mutations,
)

You can also directly parse the variants from the SNP CSV file.

In [None]:
variants = gaingrn.scripts.mutation_utils.retrieve_csv_vars(human_collection.collection[0].name, csvs, filter_str='missense_variant', with_resid=True)

# Patterns are compiled once, outside the loop. [A-Za-z] is used because
# [A-z] also matches the punctuation between "Z" and "a" in ASCII; raw
# strings keep "\d" from being read as an invalid string escape.
three_letter_code = re.compile(r'[A-Za-z]{3}')
residue_number = re.compile(r'\d+')

# Filter for VEP annotation to only get the missense mutation
for var in variants:
    if 'Protein Consequence' not in var:
        continue
    consequence = var['Protein Consequence']
    # First three-letter residue code in the annotation, as one-letter code.
    res = oneletter[three_letter_code.findall(consequence)[0]] # A, D, ...
    # First run of digits in the annotation, i.e. the residue number.
    resnum = int(residue_number.findall(consequence)[0]) # 1364
    print(res, resnum)

### 4. Collect and compile variant data for unindexed loops

In [None]:
loop_lengths = {}
loop_seqs = {}
loop_seq = {}
# Maps each loop key to a list of {'name', 'sequence'} entries, one per GAIN domain.
loop_info = {}

# Check for the indexing and collection to be in the same order
for i, n in enumerate(human_indexing.names):
    assert n == appended_human_collection.collection[i].name, \
        f"Indexing/collection order mismatch at position {i}: {n}"

for idx, gain in enumerate(human_collection.collection):
    # Use this domain's own intervals. Indexing with the leftover `i` from the
    # check loop above would give every domain the last domain's intervals.
    intervals = human_indexing.intervals[idx]

    i_loc, i_dir = gaingrn.scripts.mutation_utils.get_loop_stats(intervals, gain.sequence)
    for k, seq in i_dir.items():
        # The entry name carries the loop boundaries shifted by gain.start.
        loop_info.setdefault(k, []).append(
            {'name':f'{gain.name}_{i_loc[k][0]+gain.start}-{i_loc[k][1]+gain.start}', 'sequence':''.join(seq)}
        )

#for loop in loop_info.keys():
#    gaingrn.scripts.mutation_utils.loop2fasta(f"../loops_human/{loop}.fa", loop_info[loop])


Construct a labeled dict with each connector containing the corresponding mutations for this - independent of the receptor:

i.e. "H6-S1": [{mut1}, {mut2}]

We would rather not use the alignment to sort mutations, since the loops themselves show too much variability in the alignment (ALN). Subfamily-level.

In [None]:
# Collect the variants located in the loops, together with a count per loop.
loop_muts, loop_counts = gaingrn.scripts.mutation_utils.compose_loop_vars(
    human_collection,
    jsons,
    resid_key='x',
    aa_key='aa_change',
    fasta_offsets=human_fasta_offsets,
)

# Print each loop label next to its count.
for loop, n_entries in loop_counts.items():
    print(loop, n_entries)


### 5. Get the TOP10 cancer- and variance-enriched positions and plot them

The overall quality for S4 is very low, with a maximum of 2 variants per score; it was therefore excluded from the figure.

In [None]:
def _plot_enriched_positions(entries, color, savename, negate=False):
    """Print the given entries and save a bar plot of their values.

    Each entry is indexed at 0 and 1 to build the tick label
    ("<entry[0]>.<entry[1]>") and at 4 for the plotted value.
    NOTE(review): index 4 is presumably the enrichment score - confirm
    against MutationAnalysis.generate_data_array.

    Args:
        entries: sequence of indexable records as described above.
        color: bar color passed to matplotlib.
        savename: output path of the figure.
        negate: if True, plot the negated values so that negative values
            show up as positive bar heights.

    Returns:
        Tuple (fig, ax, pos, val): the figure, its axes, the tick labels
        and the un-negated values.
    """
    pos = []
    val = []
    for tup in entries:
        print(tup)
        pos.append(f"{tup[0]}.{tup[1]}")
        val.append(tup[4])

    heights = [-v for v in val] if negate else val

    fig, ax = plt.subplots(figsize=[3,2])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.tick_params(which='both', width=1)
    plt.tick_params(which='major', length=8)
    plt.tick_params(which='minor', length=6)
    plt.bar(range(len(val)), height=heights, color=color)
    plt.xticks(ticks=range(len(val)), labels=pos, rotation=90, fontname="FreeSans")
    plt.yticks(ticks=[0,0.1,0.2], fontname="FreeSans")
    plt.ylim(0, 0.25)
    plt.savefig(savename)
    return fig, ax, pos, val


vm_arr = GainMutations.generate_data_array(return_list=True)

print(vm_arr)

# Ascending sort on the value at index 4: lowest values first, highest last.
vm_arr.sort(key=lambda x: x[4])

# NOTE(review): the figures go to "../TESTING/" while other outputs in this
# notebook go to "../../TESTING/" - confirm which directory is intended.

# Ten lowest values, plotted with flipped sign.
fig, ax, pos, val = _plot_enriched_positions(
    vm_arr[:10], color="#9090f7", savename="../TESTING/var_enriched.elem.svg", negate=True
)

# Ten highest values, highest first. The stop index must be -11: a slice
# [:-10:-1] stops one element early and yields only nine entries.
fig, ax, pos, val = _plot_enriched_positions(
    vm_arr[:-11:-1], color='#f01717', savename="../TESTING/mut_enriched.elem.svg"
)