In [141]:
import pandas as pd
import numpy as np
import re

raw = pd.read_csv('./bioinf-edit1 - All.csv')

In [185]:
# Mating pathway
mating_pathway = {
    "Ste2",
    "Ste3",
    "Gαβγ",
    "Ste5",
    "Ste4","Ste1","Gpa1",
    "Cdc24",
    "Cdc42",
    "Ste20",
    "Ste50",
    "Ste11",
    "Ste7",
    "Fus3",
    "Ste12",
}

# Filamentous growth pathway
filamentous_growth_pathway = {
    "Msb2",
    "Sho1","Opy2",
    "Cdc24",
    "Cdc42",
    "Ste20",
    "Ste50",
    "Ste11",
    "Ste7",
    "Kss1",
    "Ste12","Tec1",
}

# High osmolarity pathway
high_osmolarity_pathway = {
    "Msb2","Hkr1",
    "Sho1","Opy2",
    "Cdc24",
    "Cdc42",
    "Ste20",
    "Ste50",
    "Ste11",
    "Pbs2",
    "Hog1",
    "Hot1",
}

# Combine into a dictionary for easy access
pathways_hm = {
    "Mating": mating_pathway,
    "Filamentous Growth": filamentous_growth_pathway,
    "High Osmolarity": high_osmolarity_pathway
}
pathways = set([x.lower() for x in mating_pathway.union(high_osmolarity_pathway).union(filamentous_growth_pathway)])

In [186]:
def extract_gene_name(info_field):
    if "ANN=" in info_field:
        ann_entries = info_field.split("ANN=")[1].split(";")[0]
        annotations = ann_entries.split(",")
        gene_info = set()
        for ann in annotations:
            ann_fields = ann.split("|")
            if len(ann_fields) > 4:
                gene_name = ann_fields[3]
                #gene_id = ann_fields[4]
                gene_info.add(gene_name.lower())
        return gene_info
    else:
        return set() 

def augument_data(data):
  data = data.replace('./.:.',  '')
  sorting_table = {"HIGH":0, "MODERATE":1, "LOW":2, 'MODIFIER':3}
  data['Severity'] = data['FORMAT'].apply(lambda x: sorting_table[x.split('|')[2]])
  data['GENES_AFFECTED'] = data['FORMAT'].map(extract_gene_name)
  #data['Gene'] = data['FORMAT'].apply(lambda x: x.split('|')[3])
  return data

data = augument_data(raw)

strains = list(map(lambda x : x+'.bam', ['G28c1', 'D11c1', 'I38c2', 'M59c1', 'WT']))

In [189]:
def analysis(strain):
  cur = strain
  others = strains.copy()
  others.remove(cur)
  promising_picks = data[
    (data[cur].apply(lambda x: x.startswith('1/1'))) &
    (data["QUAL"] > 100) &
    (data[others].applymap(lambda x: x.startswith('1/1')).sum(axis=1)==0)].sort_values(by='Severity', ascending=True)
  #return promising_picks['GENES_AFFECTED']
  #for ga in promising_picks['GENES_AFFECTED']:
  cross = pathways.intersection(promising_picks['GENES_AFFECTED'])
  if len(cross) > 0:
    print(f'FOUND cross between pathway and affected gene in {strain}\n{cross}')
  else:
    print(f'FOUND nothing in {strain}')


In [190]:
for strain in strains:
  print(analysis(strain))

FOUND nothing in G28c1.bam
FOUND nothing in G28c1.bam
None
FOUND cross between pathway and affected gene in D11c1.bam
{'ste12'}
FOUND nothing in D11c1.bam
FOUND nothing in D11c1.bam
None
FOUND cross between pathway and affected gene in I38c2.bam
{'kss1'}
FOUND nothing in I38c2.bam
None
FOUND nothing in M59c1.bam
FOUND nothing in M59c1.bam
FOUND nothing in M59c1.bam
FOUND nothing in M59c1.bam
None
None


In [175]:
pathways

{'Cdc24',
 'Cdc42',
 'Fus3',
 'Gpa1',
 'Gαβγ',
 'Hkr1',
 'Hog1',
 'Hot1',
 'Kss1',
 'Msb2',
 'Opy2',
 'Pbs2',
 'Sho1',
 'Ste1',
 'Ste11',
 'Ste12',
 'Ste2',
 'Ste20',
 'Ste3',
 'Ste4',
 'Ste5',
 'Ste50',
 'Ste7',
 'Tec1'}

In [173]:
print(strains[1])
analysis(strains[1])

D11c1.bam


216    {STE12}
393     {RRS1}
124    {RAD55}
Name: GENES_AFFECTED, dtype: object