# 6 GAIN-GRN Statistics and Analysis
With the completed set of GAIN domains, we count how often each individual segment and each labeled position is present across the dataset; we define this count as **occurrence**.

We create the data for Figure 2 and Supp. Fig. 1 here.

In [None]:
# DEPENDENCIES
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy.ma import masked_array
# LOCAL IMPORTS
import gaingrn.scripts.io
import gaingrn.scripts.assign
import gaingrn.scripts.indexing_utils
import gaingrn.scripts.plotting_utils
from gaingrn.scripts import gain_classes as gain_classes

In [None]:
# Load the pre-computed indexing object and the two GAIN collections.
# NOTE(review): these are pickled objects (allow_pickle=True), and unpickling can
# execute arbitrary code - only load files from a trusted source.
stal_indexing, human_collection, valid_collection = [
    np.load(pickle_path, allow_pickle=True)
    for pickle_path in (
        "../data/stal_indexing.pkl",
        "../data/human_collection.pkl",
        "../data/valid_collection.pkl",
    )
]

#### Initialize data for every receptor protein. 
Here, we take the 33 common proteins in human (where ADGRE4 is present as a pseudogene) and collect data on each receptor type.

In [None]:
# Initialize Data
# Full receptor names used as plot labels (33 receptors + "unknown" + "PKD").
receptors = ["ADGRA1","ADGRA2","ADGRA3","ADGRB1","ADGRB2","ADGRB3","CELSR1","CELSR2","CELSR3","ADGRD1","ADGRD2","ADGRE1","ADGRE2",
             "ADGRE3","ADGRE4","ADGRE5","ADGRF1","ADGRF2","ADGRF3","ADGRF4","ADGRF5","ADGRG1","ADGRG2","ADGRG3","ADGRG4","ADGRG5",
             "ADGRG6","ADGRG7","ADGRL1","ADGRL2","ADGRL3","ADGRL4","ADGRV1","unknown","PKD"]
# Short receptor-type codes used as lookup keys; "X" is the last entry.
receptors_list = ["A1","A2","A3","B1","B2","B3","C1","C2","C3","D1","D2","E1","E2","E3","E4","E5","F1","F2","F3","F4","F5","G1","G2","G3","G4","G5","G6","G7","L1","L2","L3","L4","V1","X"]
# Segment labels: six helices (H1-H6) followed by fourteen strands (S1-S14).
elements = ["H1","H2","H3","H4","H5","H6","S1","S2","S3","S4","S5","S6","S7","S8","S9","S10","S11","S12","S13","S14"]

# Label -> row/column index lookups for the matrices below.
el_index = {element: column for column, element in enumerate(elements)}
fam_index = {family: row for row, family in enumerate("ABCDEFGLVX")}
rec_index = {receptor: row for row, receptor in enumerate(receptors_list)}

# Counts of how often each element occurs, per subfamily (10 x 20) and per receptor (34 x 20).
absolute_subfam_occupancy = np.zeros(shape=(len(fam_index), len(el_index)), dtype=int) # OCCURRENCE MATRIX PER SUBFAMILY
absolute_receptors_occupancy = np.zeros(shape=(len(rec_index), len(el_index)), dtype=int) # OCCURRENCE MATRIX PER RECEPTOR

# Number of proteins seen per subfamily / per receptor (used later for normalization).
n_fams = np.zeros(shape=(len(fam_index),), dtype=int)
n_receptors = np.zeros(shape=(len(rec_index),), dtype=int)

### 1. Construct a 2D Occupancy Matrix of each element in correspondence with each other element.

In [None]:
segments = ['H1','H2','H3','H4','H5','H6','S1','S2','S3','S4','S5','S6','S7','S8','S9','S10','S11','S12','S13','S14']
# for unique element assignments, use the keys below. These have a really low frequency though.
#segments = ['H1','H1.D1','H1.E1','H1.F4','H2','H3','H4','H5','H6','S1','S2','S3','S4','S5','S6','S7','S8','S9','S10','S11','S12','S13','S14']

# Accumulators that are merged with the per-protein results on every iteration.
loop_lengths = {}
sse_lengths = {}
center_residues = {}
# Co-occurrence counts; only the rows/columns addressed by `segments` are filled and plotted.
# NOTE(review): sized by stal_indexing.total_keys, which is assumed to have at least
# len(segments) entries - TODO confirm.
sse_matrix = np.zeros(shape=(len(stal_indexing.total_keys),len(stal_indexing.total_keys)))

for idx in range(stal_indexing.length):
    curr_intervals = stal_indexing.intervals[idx]

    loop_lengths = gaingrn.scripts.indexing_utils.match_dirs(gaingrn.scripts.indexing_utils.get_loops(curr_intervals), loop_lengths)
    sse_lengths =  gaingrn.scripts.indexing_utils.match_dirs(gaingrn.scripts.indexing_utils.get_sse_len(curr_intervals, stal_indexing.total_keys), sse_lengths, exclude=[0])
    # FIX: the merged result used to be bound to `center_res` while the never-updated
    # `center_residues` was passed in again, unlike the two accumulators above.
    # Assign back to the accumulator so the data carries over between iterations.
    center_residues = gaingrn.scripts.indexing_utils.match_dirs(gaingrn.scripts.indexing_utils.get_pos_res(stal_indexing.center_dirs[idx], valid_collection.collection[idx]), center_residues)

    # Count every pair (i <= j) of segments that are both present in this protein;
    # this fills the lower triangle including the diagonal.
    present_sse = curr_intervals.keys()
    for i, kk in enumerate(segments):
        if kk not in present_sse:
            continue
        for j in range(i,len(segments)):
            if segments[j] in present_sse:
                sse_matrix[j,i] += 1

plt.imshow(sse_matrix, cmap='gist_yarg')
plt.xticks(ticks= range(len(segments)), labels=segments, rotation=90)
plt.yticks(ticks= range(len(segments)), labels=segments)
# Restrict the view to the 20 segment rows/columns.
plt.xlim(-0.5,19.5)
plt.ylim(19.5,-0.5)
cbar = plt.colorbar(shrink=0.5)
plt.savefig("../../TESTING/stal_occ_matrix.png")

### 2. Plot statistics of occupancy and pLDDT values for each labeled position per segment
With the **all\_plddt.tsv** file containing info about the AlphaFold2 confidence values, read them in and construct a data matrix for evaluating the element quality and occupancy.

In [None]:
# Read the pLDDT table and collect the start attribute of every GAIN domain,
# in the same order as valid_collection.collection.
plddt_dir = gaingrn.scripts.indexing_utils.get_plddt_dir('../data/all_plddt.tsv')
all_starts = [entry.start for entry in valid_collection.collection]

# Build the per-segment dictionaries (pLDDT values, occupancy counts, residues)
# that the statistics and logo plots below index by segment name.
plddt_values, occ_values, label_seq = gaingrn.scripts.indexing_utils.construct_id_occupancy(
    stal_indexing.intervals,
    stal_indexing.center_dirs,
    stal_indexing.length,
    plddt_dir,
    stal_indexing.names,
    stal_indexing.sequences,
    all_starts,
    debug=False,
)

In [None]:
# Minimum normalized occupancy a labeled position needs to be included in the plot.
threshold = 0.01

#disregard H1 because of unique H1 indexes
for sse in segments[1:]:
    # Transform the values first: mean pLDDT scaled to 0-1, occupancy as a fraction.
    pp = plddt_values[sse]
    av_pp = {k:np.average(np.array(v))/100 for k,v in pp.items()}
    # NOTE(review): 14435 is presumably the total number of GAIN domains in the
    # dataset - TODO confirm it equals stal_indexing.length.
    norm_occ = {k:v/14435 for k,v in occ_values[sse].items()}
    xax = sorted(av_pp.keys())
    # Filter once, then derive both y-series from the surviving positions.
    x_vals = [x for x in xax if norm_occ[x]>=threshold]
    y_pp = [av_pp[x] for x in x_vals]
    y_occ = [norm_occ[x] for x in x_vals]

    gaingrn.scripts.plotting_utils.plot_segment_statistics(sse=sse, xvals=x_vals, y_plddt=y_pp, y_occupancy=y_occ, savename=f"../../TESTING/GAIN_{sse}_stats.svg", show=True)

# also plot a smaller version of the occupancy
for ki in segments:
    # Strands in orange, helices in blue.
    if "S" in ki:   c = 'xkcd:orange'
    else:           c = 'xkcd:denim'
    # FIX: the file name used the stale `sse` from the loop above, so every histogram
    # was written to the same file; use the current segment `ki` instead.
    gaingrn.scripts.plotting_utils.plot_segment_hist(sse_lengths[ki], c, ki, stal_indexing.length, savename=f"../../TESTING/GAIN_{ki}_hist_mini.svg")


### 3. Generate Logoplots from Supp. Fig. 1 showing segment residue composition.

In [None]:
# GENERATE A FULL DATAFRAME FOR THE LABELED POSITIONS AND THEIR RESPECTIVE AA FREQUENCIES FOR LOGOPLOTS
sse_aa_freqs = {}
aastr = 'ACDEFGHIKLMNPQRSTVWYX'
# Residue letter -> column index of the frequency matrix.
cols = dict(zip(aastr, range(len(aastr))))
for sse in segments:
    sse_dict = label_seq[sse]
    ordered_positions = sorted(sse_dict.keys())
    # One row per labeled position, one column per residue letter.
    aafreqs = np.zeros(shape=(len(ordered_positions), 21))
    for p_index, pos in enumerate(ordered_positions):
        aas, freq = np.unique(np.array(sse_dict[pos]), return_counts=True)
        for aa, count in zip(aas, freq):
            # Counts are divided by the fixed total 14435 (same constant as in the statistics cell).
            aafreqs[p_index, cols[aa]] = count/14435
    sse_aa_freqs[sse] = aafreqs

for sse in segments:

    # Rows are indexed by the sorted positions of plddt_values[sse].
    # NOTE(review): assumes these match the sorted keys of label_seq[sse] - TODO confirm.
    lframe = pd.DataFrame(data=sse_aa_freqs[sse], columns=list(aastr), index=sorted(plddt_values[sse].keys()))

    gaingrn.scripts.plotting_utils.plot_logo_segment(dataframe=lframe, sse=sse, threshold=0.05, savename=f"../../TESTING/stal_conslogo_{sse}.svg")

### 4. Generate the segment / position data of Figure 2.
Collect the elements present in the respective receptor groups. First, for the human group of receptors.

In [None]:
# extract the human proteins from the dataset
# The accession is the part of the GAIN name before the first "_" and the first "-".
human_ac = [gain.name.split("_")[0].split("-")[0] for gain in human_collection.collection]
# Set for O(1) membership tests instead of rescanning all accessions per human entry.
human_ac_set = set(human_ac)
# Dataset index -> receptor type for every entry whose accession is a human accession.
human_idx = {i: stal_indexing.receptor_types[i]
             for i, sac in enumerate(stal_indexing.accessions)
             if sac in human_ac_set}
# Boolean presence matrix: rows follow rec_index (first 33 receptors), columns follow el_index.
human_el_matrix = np.zeros(shape=(33,20), dtype=bool)

for idx, receptor_type in human_idx.items():
    # `receptor_type` is deliberately NOT called `receptors`: that name holds the
    # list of plot labels and must not be overwritten by a single string.
    dd = stal_indexing.indexing_dirs[idx]
    # Reduce position labels such as "H6.50" to their segment name.
    el_list = np.unique([label.split(".")[0] for label in dd.keys()])
    for el in el_list:
        # Skip everything that is not one of the 20 segments (e.g. "GPS"),
        # consistent with the occupancy cells below.
        if el not in el_index:
            continue
        human_el_matrix[rec_index[receptor_type], el_index[el]] = True

Construct an occurrence matrix for the whole aGPCR GAIN set.

For every receptor, count the number of occurrences of each segment. We define these occurrences as **occupancy**.

In [None]:
# Get a matrix : a) Receptors + OCC b) Families+ + OCC
# Reset the counters first so that re-running this cell does not double-count
# into the arrays created in the initialization cell.
absolute_receptors_occupancy.fill(0)
absolute_subfam_occupancy.fill(0)
n_receptors.fill(0)
n_fams.fill(0)

for i in range(len(stal_indexing.receptor_types)):
    dd = stal_indexing.indexing_dirs[i]
    # Unique segment names of this protein, restricted to the 20 known elements.
    el_list = [el for el in np.unique([label.split(".")[0] for label in dd.keys()]) if el in el_index]
    # `receptor_type` instead of `receptors`, so the list of plot labels is not overwritten.
    receptor_type = stal_indexing.receptor_types[i]

    # a) per-receptor counts
    if receptor_type in rec_index:
        rec_row = rec_index[receptor_type]
        n_receptors[rec_row] += 1
        for el in el_list:
            absolute_receptors_occupancy[rec_row, el_index[el]] += 1

    # b) per-subfamily counts, keyed by the first letter of the receptor type.
    # Slicing ([:1]) keeps an empty receptor type from raising an IndexError.
    family = receptor_type[:1]
    if family in fam_index:
        fam_row = fam_index[family]
        n_fams[fam_row] += 1
        for el in el_list:
            absolute_subfam_occupancy[fam_row, el_index[el]] += 1

In [None]:
# store the values somewhere.
import pickle as pkl

# Normalize matrices by frequency: divide every row by the number of proteins
# counted for that receptor / subfamily (column vector broadcast over the elements).
# Rows whose count is zero divide by zero here.
receptors_occ = absolute_receptors_occupancy / n_receptors.reshape(-1, 1)
subfam_occ = absolute_subfam_occupancy / n_fams.reshape(-1, 1)

# Bundle raw counts, normalized matrices and the index lookups into one dictionary.
data = dict(
    absolute_receptors_occupancy=absolute_receptors_occupancy,
    receptors_occ=receptors_occ,
    absolute_subfam_occupancy=absolute_subfam_occupancy,
    subfam_occ=subfam_occ,
    fam_index=fam_index,
    rec_index=rec_index,
    n_receptors=n_receptors,
    n_fams=n_fams,
)
with open("../data/element_occ.pkl", 'wb') as occfile:
    # Highest available pickle protocol (what a protocol argument of -1 selects).
    pkl.dump(data, occfile, protocol=pkl.HIGHEST_PROTOCOL)

We do the same for the group of PKD1/PKD1L1 proteins.


In [None]:
# GET PKD OCCUPANCY. THIS IS NOT AS HIGH-QUALITY AND LIKELY NOT FILTERED FOR GOOD AND BAD GAIN DOMAINS.
# NOTE(review): pickled objects - only load files from a trusted source.
pkd_indexing = np.load("../data/pkd/pkd_indexing.NEW.pkl", allow_pickle=True)
pkd_collection = np.load("../data/pkd/pkd_collection.pkl", allow_pickle=True)
# One counter per element, in el_index order.
absolute_pkd_occupancy = np.zeros(shape=(20), dtype=int)

# Number of PKD entries whose hasSubdomain flag is falsy.
invalid = sum(1 for pkd in pkd_collection.collection if not pkd.hasSubdomain)
print("Found INVALID PKD structures:",invalid)

n_pkd = 0
pkd_elements = []
for i in range(len(pkd_indexing.receptor_types)):
    dd = pkd_indexing.indexing_dirs[i]
    el_list = np.unique([label.split(".")[0] for label in dd.keys()])
    # `receptor_type` instead of `receptors`, so the list of plot labels is not overwritten.
    receptor_type = pkd_indexing.receptor_types[i]
    # Number of distinct indexed elements of this entry, for the match statistics below.
    pkd_elements.append(len(el_list))
    if receptor_type in receptors_list:
        n_pkd += 1
        for el in el_list:
            if el not in elements:
                continue
            absolute_pkd_occupancy[el_index[el]] += 1
print("MATCH STATISTICS (NUMBER OF INDEXED ELEMENTS):",np.unique(pkd_elements, return_counts=True))

# Fraction of counted PKD entries that contain each element.
pkd_occ = absolute_pkd_occupancy/n_pkd

Compact the aGPCR + PKD GAIN data into a single structure for plotting.

In [None]:
# Append the PKD occupancy as an extra last row below the aGPCR matrices:
# 34 receptor rows + PKD -> (35, 20), 10 subfamily rows + PKD -> (11, 20).
pkd_row = pkd_occ[None, :]
receptors_comb_occ = np.vstack([receptors_occ, pkd_row]).astype(float)
subfam_comb_occ = np.vstack([subfam_occ, pkd_row]).astype(float)

# Row labels for the combined per-receptor plot.
receptors = ["ADGRA1","ADGRA2","ADGRA3","ADGRB1","ADGRB2","ADGRB3","CELSR1","CELSR2","CELSR3","ADGRD1","ADGRD2","ADGRE1","ADGRE2",
             "ADGRE3","ADGRE4","ADGRE5","ADGRF1","ADGRF2","ADGRF3","ADGRF4","ADGRF5","ADGRG1","ADGRG2","ADGRG3","ADGRG4","ADGRG5",
             "ADGRG6","ADGRG7","ADGRL1","ADGRL2","ADGRL3","ADGRL4","ADGRV1","unknown","PKD"]

Plot per Receptor type.

In [None]:
def draw_circle(ax, center_coord, r=0.3, color='black'):
    """Draw a ring marker on `ax` centred at `center_coord`.

    The ring is built from two filled circles: an outer one of radius `r` in
    `color` and, on top of it, a white one whose radius is 0.05 smaller.
    The coordinate is used as given; no offset is applied.
    """
    for radius, fill in ((r, color), (r-0.05, 'white')):
        ax.add_patch(plt.Circle(center_coord, radius, color=fill))

plt.rcParams['font.family'] = 'FreeSans'


fig,ax = plt.subplots(facecolor='w', figsize=[8,8])

# split the plot so that strands are orange, helices are blue
# Columns 0-5 (helices) are flagged 1, columns 6+ (strands) are flagged 2.
splitter = np.ones(shape=receptors_comb_occ.shape, dtype=int)
splitter[:,6:] = 2

# Each masked array hides the columns of the other element type.
strands = masked_array(receptors_comb_occ, splitter == 1)
hels = masked_array(receptors_comb_occ, splitter == 2)
# Ticks sit on the cell borders (-0.5 offsets) so that the grid outlines every cell.
ax.set_xticks(ticks = np.arange(-0.5,19.5), labels = elements, rotation=90, size=14, horizontalalignment='left')
ax.set_yticks(ticks = np.arange(-0.5,34.5), labels = receptors, size=13, verticalalignment='top', horizontalalignment='right',style='italic')
# FIX: pin both colour scales to the occupancy range [0, 1]. Without vmin/vmax each
# image is autoscaled to its own data range, so the two colour maps are not comparable
# and the "0%"/"100%" colourbar labels are only right if the data happen to span 0 to 1.
him = ax.imshow(hels, cmap='Blues', aspect='equal',extent = (-0.5, 19.5, 34.5, -0.5), vmin=0, vmax=1)
sim = ax.imshow(strands, cmap='Oranges', vmin=0, vmax=1)
ax.xaxis.tick_top()

# Mark every (receptor, element) pair that is absent in the human proteins with a ring.
# Bounds come from the matrix itself instead of hard-coded 33 / 20.
n_human_receptors, n_elements = human_el_matrix.shape
for y in range(n_elements):
    for x in range(n_human_receptors):
        if not human_el_matrix[x,y]:
            draw_circle(ax, [y,x], r=0.18, color='black')
ax.grid(True,'both', color = 'black')

# One colourbar per colour map; only the first one carries tick labels.
cb1 = plt.colorbar(him,shrink=0.7)
cb2 = plt.colorbar(sim,shrink=0.7)
cb2.set_ticks([])
cb1.set_ticks(ticks=[0,1],labels=["0%","100%"], size=14)

cb1.set_label("Element Occupancy", size=14)

plt.savefig("../../TESTING/receptors_occ_withpkd_circ2.svg",dpi=600, bbox_inches='tight')

Plot the receptor subfamilies.

In [None]:
# masked_array is already imported in the dependency cell at the top of the notebook.
fig,ax = plt.subplots(facecolor='w', figsize=[8,4])

# split the plot so that strands are orange, helices are blue
# Columns 0-5 (helices) are flagged 1, columns 6+ (strands) are flagged 2.
splitter = np.ones(shape=subfam_comb_occ.shape, dtype=int)
splitter[:,6:] = 2

# Each masked array hides the columns of the other element type.
strands = masked_array(subfam_comb_occ, splitter == 1)
hels = masked_array(subfam_comb_occ, splitter == 2)

ax.grid(True,'both', color = 'black')
# Ticks sit on the cell borders (-0.5 offsets) so that the grid outlines every cell.
ax.set_xticks(ticks = np.arange(-0.5,19.5), labels = elements, rotation=90, size=14, horizontalalignment='left')
ax.set_yticks(ticks = np.arange(-0.5,10.5), labels = ["ADGRA","ADGRB","CELSR","ADGRD","ADGRE","ADGRF","ADGRG","ADGRL","ADGRV","unknown","PKD"], size=14, verticalalignment='top',  style='italic')
# FIX: pin both colour scales to the occupancy range [0, 1]. Without vmin/vmax each
# image is autoscaled to its own data range, so the two colour maps are not comparable
# and the "0%"/"100%" colourbar labels are only right if the data happen to span 0 to 1.
him = ax.imshow(hels, cmap='Blues', aspect='equal',extent = (-0.5, 19.5, 10.5, -0.5), vmin=0, vmax=1)
sim = ax.imshow(strands, cmap='Oranges', vmin=0, vmax=1)

# One colourbar per colour map; only the first one carries tick labels.
cb1 = plt.colorbar(him,shrink=0.7)
cb2 = plt.colorbar(sim,shrink=0.7)
cb2.set_ticks([])
cb1.set_ticks(ticks=[0,1],labels=["0%","100%"], size=14)
cb1.set_label("Element Occupancy", size=14)
plt.savefig("../../TESTING/subfam_occ_withpkd.svg",dpi=600, bbox_inches='tight')

### 5. Further Utilities
Map Label Position and Segment occurrences to the b-factor of a PDB file.

In [None]:
# gather occupancy for each element and position label
# (number of indexing dictionaries in which each label key appears)
label_occ_dict = {}
for indexing_dir in stal_indexing.indexing_dirs:
    for k in indexing_dir.keys():
        label_occ_dict[k] = label_occ_dict.get(k, 0) + 1

# (UniProt accession, input PDB, segment-level output, position-level output)
reference_structures = [
    ("O94910", "../all_pdbs/O94910_Q96IE7_Q9BU07_Q9HAR3.pdb", "../l1_segcons2.pdb", "../l1_poscons2.pdb"),
    ("Q96PE1", "../all_pdbs/Q96PE1_A6H8W3_D3DSW4_Q8N3R1_Q8TEM3_Q96KB2_Q9P1Z7_Q9UFY4.pdb", "../a2_segcons2.pdb", "../a2_poscons2.pdb"),
]

# Segment-level marking first for both structures, then position-level marking.
for uniprot_ac, pdb_in, seg_out, _ in reference_structures:
    gaingrn.scripts.indexing_utils.mark_seg_cons(stal_indexing, receptors_occ, elements, uniprot_ac, pdb_in, seg_out, fill_b="-1.000")
for uniprot_ac, pdb_in, _, pos_out in reference_structures:
    gaingrn.scripts.indexing_utils.mark_pos_cons(stal_indexing, label_occ_dict, uniprot_ac, pdb_in, pos_out, fill_b="-1.000")

Extract a specific segment from a specific UNIPROT entry.

In [None]:
# Query segment 'H6' of the UniProt entry "O14514". As the last expression of the
# cell, the return value of get_elem_seq is displayed as the cell output.
gaingrn.scripts.indexing_utils.get_elem_seq(uniprot="O14514", stal_indexing=stal_indexing, valid_collection=valid_collection, segment='H6')


Print the Sequence composition for any GAIN-GRN labeled position.

In [None]:
# GAIN-GRN labels whose residue composition should be printed.
grn_labels = ["H6.50", "S14.49", "S14.50", "GPS.-1"]

for label in grn_labels:
    # Print each label followed by whatever get_residue_composition returns for it.
    print(label, gaingrn.scripts.indexing_utils.get_residue_composition(label, segments, sse_aa_freqs, plddt_values))

Gather information about the Segment-containing loops.

In [None]:
# Loop key -> list of {'name', 'sequence'} records, one per protein containing that loop.
loop_info = {}

for idx in range(stal_indexing.length):
    curr_name = stal_indexing.names[idx]
    curr_gain = valid_collection.collection[idx]
    start = curr_gain.start
    # i_loc is indexed per loop key with [0] and [1]; i_dir holds the loop residues per key.
    i_loc, i_dir = gaingrn.scripts.indexing_utils.get_loop_stats(stal_indexing.indexing_dirs[idx], curr_gain.sequence)
    for k, seq in i_dir.items():
        # Both boundary values are shifted by the domain start for the record name.
        first_res = i_loc[k][0]+start
        last_res = i_loc[k][1]+start
        loop_info.setdefault(k, []).append({'name':f'{curr_name}_{first_res}-{last_res}', 'sequence':''.join(seq)})

# Write the collected loop sequences to a FASTA file for later alignment.
def loop2fasta(outfile, itemlist):
    """Write one FASTA record per entry of `itemlist` to `outfile`.

    Each entry is a mapping with a 'name' key (used as the ">" header line) and a
    'sequence' key (written on the following line). The file is overwritten, and a
    confirmation line is printed once writing has finished.
    """
    with open(outfile, 'w') as out:
        out.writelines(f">{entry['name']}\n{entry['sequence']}\n" for entry in itemlist)
    print("Done with", outfile)

# One FASTA file per loop key, holding all collected sequences of that loop.
for loop, loop_records in loop_info.items():
    loop2fasta(f"../../TESTING/loops/{loop}.fa", loop_records)