In [141]:
import pandas as pd
import numpy as np
import re

raw = pd.read_csv('./bioinf-edit1 - All.csv')

In [180]:
# Mating pathway
mating_pathway = {
    "Ste2",
    "Ste3",
    "Gαβγ",
    "Ste5",
    "Ste4","Ste1","Gpa1",
    "Cdc24",
    "Cdc42",
    "Ste20",
    "Ste50",
    "Ste11",
    "Ste7",
    "Fus3",
    "Ste12",
}

# Filamentous growth pathway
filamentous_growth_pathway = {
    "Msb2",
    "Sho1","Opy2",
    "Cdc24",
    "Cdc42",
    "Ste20",
    "Ste50",
    "Ste11",
    "Ste7",
    "Kss1",
    "Ste12","Tec1",
}

# High osmolarity pathway
high_osmolarity_pathway = {
    "Msb2","Hkr1",
    "Sho1","Opy2",
    "Cdc24",
    "Cdc42",
    "Ste20",
    "Ste50",
    "Ste11",
    "Pbs2",
    "Hog1",
    "Hot1",
}

# Combine into a dictionary for easy access
pathways_hm = {
    "Mating": mating_pathway,
    "Filamentous Growth": filamentous_growth_pathway,
    "High Osmolarity": high_osmolarity_pathway
}
pathways = [x.lower() for x in mating_pathway.union(high_osmolarity_pathway).union(filamentous_growth_pathway)]

In [142]:
def extract_gene_name(info_field):
    if "ANN=" in info_field:
        ann_entries = info_field.split("ANN=")[1].split(";")[0]
        annotations = ann_entries.split(",")
        gene_info = set()
        for ann in annotations:
            ann_fields = ann.split("|")
            if len(ann_fields) > 4:
                gene_name = ann_fields[3]
                #gene_id = ann_fields[4]
                gene_info.add(gene_name)
        return gene_info
    else:
        return set() 

def augument_data(data):
  data = data.replace('./.:.',  '')
  sorting_table = {"HIGH":0, "MODERATE":1, "LOW":2, 'MODIFIER':3}
  data['Severity'] = data['FORMAT'].apply(lambda x: sorting_table[x.split('|')[2]])
  data['GENES_AFFECTED'] = data['FORMAT'].map(extract_gene_name)
  #data['Gene'] = data['FORMAT'].apply(lambda x: x.split('|')[3])
  return data

data = augument_data(raw)

strains = list(map(lambda x : x+'.bam', ['G28c1', 'D11c1', 'I38c2', 'M59c1', 'WT']))

In [170]:
def analysis(strain):
  cur = strain
  others = strains.copy()
  others.remove(cur)
  promising_picks = data[
    (data[cur].apply(lambda x: x.startswith('1/1'))) &
    (data["QUAL"] > 100) &
    (data[others].applymap(lambda x: x.startswith('1/1')).sum(axis=1)==0)].sort_values(by='Severity', ascending=True)
  #return promising_picks['GENES_AFFECTED']
  for ga in promising_picks['GENES_AFFECTED']:
    cross = pathways.intersection(ga)
    if len(cross) > 0:
      print(f'FOUND cross between pathway and affected gene in {strain}\n{cross}')


In [172]:
for strain in strains:
  print(analysis(strain))

388    {RGA1}
352    {ESC1}
Name: GENES_AFFECTED, dtype: object
216    {STE12}
393     {RRS1}
124    {RAD55}
Name: GENES_AFFECTED, dtype: object
193    {KSS1}
322    {IMD3}
Name: GENES_AFFECTED, dtype: object
370    {YNR021W}
389       {RGA1}
437      {Q0182}
299       {ALY1}
Name: GENES_AFFECTED, dtype: object
Series([], Name: GENES_AFFECTED, dtype: object)


In [175]:
pathways

{'Cdc24',
 'Cdc42',
 'Fus3',
 'Gpa1',
 'Gαβγ',
 'Hkr1',
 'Hog1',
 'Hot1',
 'Kss1',
 'Msb2',
 'Opy2',
 'Pbs2',
 'Sho1',
 'Ste1',
 'Ste11',
 'Ste12',
 'Ste2',
 'Ste20',
 'Ste3',
 'Ste4',
 'Ste5',
 'Ste50',
 'Ste7',
 'Tec1'}

In [173]:
print(strains[1])
analysis(strains[1])

D11c1.bam


216    {STE12}
393     {RRS1}
124    {RAD55}
Name: GENES_AFFECTED, dtype: object

In [None]:
print(strains[2])
analysis(strains[2])

In [135]:

print(strains[3])
analysis(strains[3])

M59c1.bam


  return data[
  return data[


Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,Unnamed: 9,D11c1.bam,G28c1.bam,I38c2.bam,M59c1.bam,WT.bam,Severity
370,XIV,668562,.,C,A,221.999,.,VDB=0.43412;SGB=-0.69312;MQSB=1;MQ0F=0;AF1=1;A...,ANN=A|stop_gained|HIGH|YNR021W|YNR021W|transcr...,GT:PL,,,,"1/1:255,96,0",,0
389,XV,563576,.,G,T,221.999,.,VDB=0.552091;SGB=-0.693141;MQSB=1;MQ0F=0;AF1=1...,ANN=T|stop_gained|HIGH|RGA1|YOR127W|transcript...,GT:PL,,,,"1/1:255,111,0",,0
437,Mito,65922,.,GGGGGC,G,214.458,.,INDEL;IDV=1;IMF=0.00571429;VDB=1.50203e-08;SGB...,ANN=G|frameshift_variant|HIGH|Q0182|Q0182|tran...,GT:PL,,,,"1/1:255,255,0",,0
299,XI,479905,.,T,A,221.999,.,VDB=0.00732327;SGB=-0.691153;MQSB=1;MQ0F=0;AF1...,ANN=A|missense_variant|MODERATE|ALY1|YKR021W|t...,GT:PL,,,,"1/1:255,54,0",,1


In [132]:
x = analysis(strains[2])
x

  return data[
  return data[


Unnamed: 0,#CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,FORMAT,Unnamed: 9,D11c1.bam,G28c1.bam,I38c2.bam,M59c1.bam,WT.bam,Severity
193,VII,576333,.,C,A,221.999,.,VDB=0.162935;SGB=-0.693127;MQSB=1;MQ0F=0;AF1=1...,ANN=A|stop_gained|HIGH|KSS1|YGR040W|transcript...,GT:PL,,,"1/1:255,99,0",,,0
322,XII,1003220,.,G,C,225.009,.,VDB=0.502815;SGB=-0.693143;MQSB=0.976313;MQ0F=...,ANN=C|missense_variant|MODERATE|IMD3|YLR432W|t...,GT:PL,,,"1/1:255,114,0",,"0/1:255,0,255",1


In [None]:
def parse_line(line):
  return line.search('')

In [19]:
m = re.search('LOF=\((.*)\)', x)
m.groups()[0].split('|')

TypeError: expected string or bytes-like object, got 'Series'

In [17]:
x = data["FORMAT"]
for xx in x: print(xx) if re.match('LOF=',xx) else None


In [27]:
import json
data['FORMAT'][1:100].to_json('test.json')


In [158]:
yy= data['FORMAT'][0]
def getLOFs(x):
  m = re.search('LOF=\(', x)
  if m != None:
    return True
  return False

In [164]:
loc_matches = data[data['FORMAT'].apply(getLOFs)]['FORMAT']
loc_matches.applymap(lambda x: x[x.find('LOF='),])

AttributeError: 'Series' object has no attribute 'applymap'

In [28]:
data['FORMAT'][1]

'ANN=C|frameshift_variant|HIGH|YAL069W|YAL069W|transcript|YAL069W_mRNA|protein_coding|1/1|c.120_126delTCTCAAA|p.Lys42fs|120/315|120/315|40/104||INFO_REALIGN_3_PRIME;LOF=(YAL069W|YAL069W|1|1.00)'