In [1]:
import pandas as pd
import dataframe_image as dfi
import os
import shutil

### In this file we generated the table with genes present in disease samples but absent in healthy samples. 
### We used metadata from 'SGB3587_metadata_.tsv' and .faa files from Prokkka output.

In [2]:
# loading metadata
meta = pd.read_csv('SGB3587_metadata_.tsv', sep='\t')
meta

Unnamed: 0,magID,sampleID,sex,bmi,age,smoking_state,study_group
0,M1695505146,BP01008,male,26.122449,69,non-smoker,mucositis
1,M1357088918,BP01039,male,26.234568,53,smoker,healthy
2,M1973003636,BP01095,female,19.53125,63,non-smoker,periimplantitis
3,M1970119015,BP02008,male,24.691358,73,smoker,periimplantitis
4,M1263131280,BP03007,female,24.023809,46,non-smoker,periimplantitis
5,M1080786199,BP03021,female,18.256319,80,ex-smoker,mucositis
6,M1267816792,BP07055,male,27.681661,61,ex-smoker,healthy
7,M1837800477,BP08025,male,21.612812,66,non-smoker,periimplantitis
8,M1026938161,BP08044,male,,33,smoker,mucositis
9,M1126320974,BP10003,male,26.54321,76,smoker,periimplantitis


In [3]:
# split metadata into groups
muc = meta[meta['study_group'] == 'mucositis']
peri = meta[meta['study_group'] == 'periimplantitis']
healthy = meta[meta['study_group'] == 'healthy']

In [4]:
# faa_seqs is a folder of .faa files obtained as an output of Prokka
# concatenate all the files in the folder faa_seqs with 'magID' of each group
with open('mucositis.faa', 'w') as f:
    for mag in muc['magID']:
        with open(f'faa_seqs/{mag}.faa') as g:
            shutil.copyfileobj(g, f)
with open('periimplantitis.faa', 'w') as f:
    for mag in peri['magID']:
        with open(f'faa_seqs/{mag}.faa') as g:
            shutil.copyfileobj(g, f)
with open('healthy.faa', 'w') as f:
    for mag in healthy['magID']:
        with open(f'faa_seqs/{mag}.faa') as g:
            shutil.copyfileobj(g, f)


In [5]:
# remove raws without \t (genes not identified)
with open("healthy.txt", 'r') as f:
    lines = f.readlines()
    lines = [line for line in lines if '\t' in line]
    with open("healthyproc.txt", 'w') as f:
        f.writelines(lines)
with open("mucos.txt", 'r') as f:
    lines = f.readlines()
    lines = [line for line in lines if '\t' in line]
    with open("mucos_proc.txt", 'w') as f:
        f.writelines(lines)
with open("periimplant.txt", 'r') as f:
    lines = f.readlines()
    lines = [line for line in lines if '\t' in line]
    with open("periimplant_proc.txt", 'w') as f:
        f.writelines(lines)

In [6]:
# creating dataframes fro each group
healthy = pd.read_csv('healthyproc.txt', sep='\t', header=None)
mucos = pd.read_csv('mucos_proc.txt', sep='\t', header=None)
periimplant = pd.read_csv('periimplant_proc.txt', sep='\t', header=None)
periimplant.head()

Unnamed: 0,0,1
0,AADIBEAE_00011,K00850
1,AADIBEAE_00012,K01489
2,AADIBEAE_00014,K01207
3,AADIBEAE_00018,K03650
4,AADIBEAE_00021,K01534


In [7]:
# finding the different values between columns 1 in mucos+periimplant and healthy
cont = pd.concat([mucos, periimplant]).drop_duplicates(keep=False)
differ = cont[~cont[1].isin(healthy[1])]

In [8]:
# returning string of genes present in mucos+periimplant but not in healthy
# order of IDs could be different for each run (values are the same)
se = set(differ[1])
print('\n'.join(se))

K19157
K25155
K00973
K23518
K01154
K21498
K00931
K04095
K25574
K02015
K01808
K11717
K05878
K25151
K03299
K03773
K05879
K02453
K07407
K09691
K12343
K02481
K06223


#### Then the list of KAGG gene IDs was used to get gene names from the KAGG database.
#### 'final_diff.txt' is the final output file containing the gene names.

In [9]:
diff = pd.read_csv('final_diff.txt', sep='    ', header=None)
diff.index = diff.index + 1
diff

  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1
1,ko:K11717,sufS; cysteine desulfurase / selenocysteine ly...
2,ko:K05879,dhaL; phosphoenolpyruvate---glycerone phosphot...
3,ko:K19157,yafQ; mRNA interferase YafQ [EC:3.1.-.-]
4,ko:K00931,proB; glutamate 5-kinase [EC:2.7.2.11]
5,ko:K03299,"TC.GNTP; gluconate:H+ symporter, GntP family"
6,ko:K00973,"rfbA, rmlA, rffH; glucose-1-phosphate thymidyl..."
7,ko:K06223,dam; DNA adenine methylase [EC:2.1.1.72]
8,ko:K04095,"fic, FICD, HYPE; cell filamentation protein, p..."
9,ko:K05878,dhaK; phosphoenolpyruvate---glycerone phosphot...
10,ko:K02015,ABC.FEV.P; iron complex transport system perme...


In [10]:
from pandas import option_context
# renaming columns
diff = diff.rename(columns={0: 'KEGG ID', 1: 'Protein names'})
# setting caption
diff = diff.style.set_caption('Genes present in peri-implantitis+mucositis and absent in healthy samples')
# exporting to png
with option_context('display.max.colwidth', 1000):
    dfi.export(diff, 'diff_kagg.png')