Copyright (C) 2019-2021  Martin Engqvist

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.

### This Jupyter notebook is designed to filter out duplicate genes from genome fasta files

Analysis performed by Martin Engqvist (Chalmers University of Technology) in January 2020.


For the relative paths to work (they rely on the python-dotenv module), an empty file named exactly ".env" must be placed in the project base folder. If this is not done, all the folder variables below will need to be defined manually.

In [1]:
import os
import sys
from dotenv import load_dotenv, find_dotenv # do 'pip install python-dotenv'
from os.path import join, dirname, basename, exists, isdir
import pandas as pd
import re
import subprocess

import Bio
from Bio import SeqIO


### Load environmental variables from the project root directory ###
# find_dotenv() walks up the directory tree until it meets a ".env" file;
# load_dotenv() then exports the entries of that file as environment variables.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

# Check whether a network drive has been specified.
# Mounting is not implemented: every outcome of this check is a no-op.
DATABASE = os.environ.get("NETWORK_URL")
if DATABASE != 'None':
    pass  # mount network drive here

# set up directory paths
CURRENT_DIR = os.getcwd()
PROJ = dirname(dotenv_path)  # project root directory (the folder holding ".env")

# data folders
DATA = join(PROJ, 'data')
RAW_EXTERNAL = join(DATA, 'raw_external')    # raw data obtained from external sources
RAW_INTERNAL = join(DATA, 'raw_internal')    # raw data generated in-house
INTERMEDIATE = join(DATA, 'intermediate')    # intermediate data
FINAL = join(DATA, 'final')                  # final data

# result folders
RESULTS = join(PROJ, 'results')
FIGURES = join(RESULTS, 'figures')
PICTURES = join(RESULTS, 'pictures')

# folders used by this notebook
tempo_fasta_path = join(
    RAW_EXTERNAL, 'tempo_in_genome', 'fasta', '332_genome_annotations', 'pep')
tempo_fasta_fixed_headers_path = join(
    INTERMEDIATE, 'tempo_in_genome', 'fasta', '332_genome_annotations', 'pep_fixed_headers')
tempo_clustered_path = join(
    INTERMEDIATE, 'tempo_in_genome', 'fasta', '332_genome_annotations', 'pep_clustered')

# create whichever of these folders is still missing
for _folder in (tempo_fasta_path, tempo_fasta_fixed_headers_path, tempo_clustered_path):
    if not exists(_folder):
        os.makedirs(_folder)

In [2]:
# Report the interpreter and library versions this notebook was run with.
version_report = (
    ('Python version: %s', sys.version),
    ('Pandas version: %s', pd.__version__),
    ('Re version: %s', re.__version__),
    ('BioPython version: %s', Bio.__version__),
)
for template, version in version_report:
    print(template % version)

Python version: 3.7.7 (default, Sep 11 2020, 20:43:12) 
[GCC 7.3.0]
Pandas version: 1.0.3
Re version: 2.2.1
BioPython version: 1.76


#### Download the Figshare files

In [3]:
target_url = 'https://ndownloader.figshare.com/files/13092791'
target_folder = join(RAW_EXTERNAL, 'tempo_in_genome')

if not exists(join(target_folder, '13092791')):
    !wget -P $target_folder $target_url
else:
    print('Zipfile already exists')

--2021-02-18 11:13:24--  https://ndownloader.figshare.com/files/13092791
Resolving ndownloader.figshare.com (ndownloader.figshare.com)... 54.72.238.132, 52.210.254.6, 52.48.193.78, ...
Connecting to ndownloader.figshare.com (ndownloader.figshare.com)|54.72.238.132|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/13092791/0_332yeast_genomes.zip [following]
--2021-02-18 11:13:24--  https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/13092791/0_332yeast_genomes.zip
Resolving s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)... 52.218.109.211
Connecting to s3-eu-west-1.amazonaws.com (s3-eu-west-1.amazonaws.com)|52.218.109.211|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2902966885 (2,7G) [application/zip]
Saving to: ‘/data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/13092791’


2021-02-18 11:28:46 (3,00 MB/s) - ‘/data/Work/projects/yeast_carbohyd

In [9]:
zip_location = join(RAW_EXTERNAL, 'tempo_in_genome')
path_in_zip = '0_332yeast_genomes/332_genome_annotations.zip'
target_folder = join(RAW_EXTERNAL, 'tempo_in_genome', 'fasta', '332_genome_annotations')

# extract the 332_genome_annotations.zip file
zipfile = join(zip_location, '13092791')
!unzip -j $zipfile $path_in_zip -d $target_folder

# remove the zipfile
!rm $zipfile

# unzip its contents
zipfile = join(target_folder, '332_genome_annotations.zip')
!unzip $zipfile -d $target_folder

# remove the zipfile
!rm $zipfile

Archive:  /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/13092791
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/332_genome_annotations.zip  
Archive:  /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/332_genome_annotations.zip
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/Candida_albicans_SC5314_A22_current_default_coding.fasta  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/Candida_albicans_SC5314_A22_current_default_protein.fasta  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/Saccharomyces_cerevisiae_S288C_coding.fasta  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/f

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/kluyveromyces_lactis.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/kluyveromyces_marxianus.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/komagataella_pastoris.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/kuraishia_capsulata.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/lachancea_cidri.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/lachancea_dasiensis.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/ogataea_methanolica.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/ogataea_parapolymorpha.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/ogataea_polymorpha.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/pachysolen_tannophilus.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/pichia_membranifaciens.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/priceomyces_haplophilus.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_exter

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHAB166_kazachstania_yakushimaensis_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000026124_ogataea_henricii_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000026137_ambrosiozyma_ambrosiae_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000026142_citeromyces_matritensis_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000026145_ambrosiozyma_vanderkliftii_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_an

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000034674_blastobotrys_muscicola_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000034675_blastobotrys_mokoenaii_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000034681_blastobotrys_americana_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000034709_kluyveromyces_aestuarii_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000034710_kluyveromyces_dobzhanskii_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_g

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000034973_danielozyma_ontarioensis_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000034974_deakozyma_indianensis_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000034978_cyberlindnera_mrakii_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000034979_cyberlindnera_misumaiensis_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000034986_candida_oregonensis_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genom

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000035643_yarrowia_bubula_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000035645_yarrowia_divulgata_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000035650_trigonopsis_variabilis_160613.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000035651_torulaspora_microellipsoides_160519.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/yHMPu5000035652_torulaspora_maleeae_160613.max.cds  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annota

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/cds/zygosaccharomyces_rouxii.max.cds  
   creating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/alloascoidea_hylecoeti.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/ambrosiozyma_kashinagacola.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/ambrosiozyma_monospora.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/arxula_adeninivorans.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fa

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/lachancea_kluyveri.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/lachancea_lanzarotensis.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/lachancea_meyersii.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/lachancea_mirantina.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/lachancea_nothofagi.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/lachancea_quebecensis.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/temp

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/saccharomyces_mikatae.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/saccharomyces_paradoxus.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/saccharomyces_uvarum.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/saccharomycopsis_malanga.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/saprochaete_clavata.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/scheffersomyces_lignosus.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000026274_komagataella_populi_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000034594_starmera_quercuum_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000034597_candida_stellimalicola_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000034604_sporopachydermia_lactativora_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000034605_spencermartinsiella_europaea_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_g

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000034757_lipomyces_doorenjongii_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000034758_lipomyces_japonicus_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000034760_lipomyces_kononenkoae_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000034761_lipomyces_lipofer_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000034862_zygotorulaspora_florentina_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_ann

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000035011_candida_pyralidae_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000035018_candida_canberraensis_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000035022_candida_emberorum_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000035031_candida_kruisii_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000035032_candida_gatunensis_160519.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMP

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000035665_middelhovenomyces_tepae_160613.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000035667_kurtzmaniella_cleridarum_160928.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000035670_phaffomyces_opuntiae_160613.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000035671_phaffomyces_antillensis_160613.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/gtf/yHMPu5000035672_phaffomyces_thermotolerans_160613.max.gtf  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/candida_apicola.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/candida_arabinofermentans.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/candida_auris.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/candida_boidinii_JCM9604.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/candida_carpophila.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/candida_dubliniensis.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_i

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/metschnikowia_aberdeeniae.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/metschnikowia_arizonensis.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/metschnikowia_bicuspidata.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/metschnikowia_borealis.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/metschnikowia_bowlesiae.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/metschnikowia_cerradonensis.max.pep  
  inflating: /data/Work/projects/yeast_carbohydra

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/spathaspora_gorwiae.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/spathaspora_hagerdaliae.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/spathaspora_passalidarum.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/sporopachydermia_quercuum.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/starmerella_bombicola_JCM9596.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/sugiyamaella_lignohabitans.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrat

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000034611_saturnispora_mendoncae_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000034612_saturnispora_saitoi_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000034613_saturnispora_serradocipensis_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000034614_saturnispora_silvae_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000034615_saturnispora_zaruensis_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genom

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000034876_tetrapisispora_iriomotensis_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000034877_tetrapisispora_namnaonensis_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000034881_torulaspora_pretoriensis_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000034883_peterozyma_xylosa_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000034884_peterozyma_toletana_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_g

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000035041_ambrosiozyma_pseudovanderkliftii_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000035044_barnettozyma_californica_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000035045_barnettozyma_hawaiiensis_160613.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000035046_barnettozyma_populi_160613.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000035047_barnettozyma_pratensis_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fast

  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000035677_kodamaea_ohmeri_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000035679_candida_rhagii_160613.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000035681_candida_gotoi_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000035684_kloeckera_hatyaiensis_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu5000035686_cyberlindnera_saturnus_160519.max.pep  
  inflating: /data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/pep/yHMPu50

#### Move the two genomes that are outside the folder structure

In [11]:
# move dna files
target = join(RAW_EXTERNAL, 'tempo_in_genome', 'fasta', '332_genome_annotations', 'cds')

filepath = join(RAW_EXTERNAL, 'tempo_in_genome', 'fasta', '332_genome_annotations', 'Candida_albicans_SC5314_A22_current_default_coding.fasta')
!mv $filepath $target

filepath = join(RAW_EXTERNAL, 'tempo_in_genome', 'fasta', '332_genome_annotations', 'Saccharomyces_cerevisiae_S288C_coding.fasta')
!mv $filepath $target


# move protein files
target = join(RAW_EXTERNAL, 'tempo_in_genome', 'fasta', '332_genome_annotations', 'pep')

filepath = join(RAW_EXTERNAL, 'tempo_in_genome', 'fasta', '332_genome_annotations', 'Candida_albicans_SC5314_A22_current_default_protein.fasta')
!mv $filepath $target

filepath = join(RAW_EXTERNAL, 'tempo_in_genome', 'fasta', '332_genome_annotations', 'Saccharomyces_cerevisiae_S288C_protein.fasta')
!mv $filepath $target

mv: cannot stat '/data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/Candida_albicans_SC5314_A22_current_default_coding.fasta': No such file or directory
mv: cannot stat '/data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/Saccharomyces_cerevisiae_S288C_coding.fasta': No such file or directory
mv: cannot stat '/data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/Candida_albicans_SC5314_A22_current_default_protein.fasta': No such file or directory
mv: cannot stat '/data/Work/projects/yeast_carbohydrate_enz/data/raw_external/tempo_in_genome/fasta/332_genome_annotations/Saccharomyces_cerevisiae_S288C_protein.fasta': No such file or directory


#### First, replace the spaces in the FASTA headers with underscores

In [12]:

files = os.listdir(tempo_fasta_path)

for fi in sorted(files):

    # only FASTA files are rewritten, anything else in the folder is ignored
    if not (fi.endswith('.pep') or fi.endswith('.fasta')):
        continue

    infile = join(tempo_fasta_path, fi)
    outfile = join(tempo_fasta_fixed_headers_path, fi)

    # an existing output file is taken to be finished work from an earlier run
    if exists(outfile):
        continue

    print(fi)
    try:
        with open(outfile, 'w') as fo:
            for record in SeqIO.parse(infile, "fasta"):
                # write the full header with its spaces replaced by underscores,
                # followed by the sequence on a single line
                fo.write('>%s\n' % record.description.replace(' ', '_'))
                fo.write('%s\n' % record.seq)
    except BaseException:
        # A failure or interrupt part-way through would leave a truncated file
        # that the exists() check above skips on every later run. Remove it so
        # that the file is regenerated next time.
        if exists(outfile):
            os.remove(outfile)
        raise


#### Cluster the sequences in each FASTA file at 98% identity using CD-HIT

In [13]:
filepath = tempo_fasta_fixed_headers_path

# CD-HIT settings, passed as -n, -c and -M below (see the CD-HIT user guide)
word = 5       # word length
cutoff = 0.98  # 98% identity
memory = 6000  # memory limit

files = os.listdir(filepath)

for fi in sorted(files):
    if fi.endswith('.pep'):
        file_end = '.pep'

    elif fi.endswith('.fasta'):
        file_end = '.fasta'

    else:
        raise ValueError('Unexpected file in %s: %s' % (filepath, fi))

    infile = join(filepath, fi)

    # Insert ".clustered" in front of the extension. Slicing touches only the
    # trailing extension, so names that contain it elsewhere (or a "%") are safe.
    outfile = join(tempo_clustered_path, fi[:-len(file_end)] + '.clustered' + file_end)

    # an existing output file is taken to be finished work from an earlier run
    if not exists(outfile):
        print(fi)

        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact.
        my_cmd = ['cd-hit',
                  '-i', infile,
                  '-o', outfile,
                  '-c', str(cutoff),
                  '-n', str(word),
                  '-T', '1',
                  '-M', str(memory)]
        try:
            # check=True turns a non-zero exit status into an exception instead
            # of silently continuing with the next genome
            subprocess.run(my_cmd, check=True)
        except BaseException:
            # do not keep a partial result that later runs would treat as done
            if exists(outfile):
                os.remove(outfile)
            raise

#### Print some stats

In [14]:
def norm_organism(orgname):
    '''
    Return abbreviated organism name.

    Square brackets are stripped first. Two variety names have fixed
    abbreviations. For every other name the abbreviation is built from two
    whitespace-separated words (the second and third when the name starts
    with "yH", otherwise the first two), joined by an underscore,
    lower-cased and with dots removed.
    '''
    cleaned = orgname.replace('[', '').replace(']', '')

    # two organisms occur twice, these get hand-picked abbreviations
    fixed_abbreviations = {
        'Metschnikowia matae var. maris': 'metschnikowia_matae_maris',
        'Nadsonia fulvescens var. elongata': 'nadsonia_fulvescens_var_elongata',
    }
    if cleaned in fixed_abbreviations:
        return fixed_abbreviations[cleaned]

    # names starting with "yH" carry a leading identifier that is skipped
    words = cleaned.split()
    chosen = words[1:3] if cleaned.startswith('yH') else words[:2]

    return '_'.join(chosen).lower().replace('.', '')


def get_fasta_gene_number(path):
    '''
    Count the sequence records in every FASTA file of a folder.

    Files are visited in sorted order and only names ending in ".pep" or
    ".fasta" are considered. The genome label is the file name up to its
    first dot; the organism label is derived from it with norm_organism().

    Returns a pandas DataFrame with one row per file and the columns
    "genome", "organism" and "genes_in_fasta".
    '''
    genomes, organisms, gene_counts = [], [], []

    for filename in sorted(os.listdir(path)):
        if not filename.endswith(('.pep', '.fasta')):
            continue

        genome = filename.split('.')[0]
        organism = norm_organism(genome.replace('_', ' '))

        # every parsed record counts as one gene
        n_records = sum(1 for _ in SeqIO.parse(join(path, filename), "fasta"))

        genomes.append(genome)
        organisms.append(organism)
        gene_counts.append(n_records)

    return pd.DataFrame({'genome': genomes,
                         'organism': organisms,
                         'genes_in_fasta': gene_counts})


# get gene numbers from the original fasta files
path = tempo_fasta_fixed_headers_path
original_df = get_fasta_gene_number(path).rename(
    columns={'genes_in_fasta': 'genes_before_filter'})
display(original_df.describe())

# get gene numbers for the clustered data
path = tempo_clustered_path
clustered_df = get_fasta_gene_number(path).rename(
    columns={'genes_in_fasta': 'genes_after_filter'})
display(clustered_df.describe())

# The merge below keeps only rows present in both tables. Report anything it
# is about to drop so that a missing clustering result does not go unnoticed.
before_keys = set(zip(original_df.genome, original_df.organism))
after_keys = set(zip(clustered_df.genome, clustered_df.organism))
unmatched = sorted(before_keys ^ after_keys)
if unmatched:
    print('Warning: %s genome(s) present in only one of the two tables: %s'
          % (len(unmatched), ', '.join(genome for genome, _ in unmatched)))

# combine
stats_df = original_df.merge(clustered_df, on=['genome', 'organism'])
stats_df['difference'] = stats_df.genes_before_filter - stats_df.genes_after_filter
display(stats_df.describe())

# save to disk (make sure the output folder exists first; to_csv does not create it)
os.makedirs(FINAL, exist_ok=True)
stats_df.to_csv(join(FINAL, 'de-duplication_stats.tsv'), sep='\t', index=False)

Unnamed: 0,genes_before_filter
count,332.0
mean,5700.903614
std,1043.299566
min,4162.0
25%,5060.0
50%,5556.0
75%,5983.5
max,12786.0


Unnamed: 0,genes_after_filter
count,332.0
mean,5606.608434
std,962.598863
min,4128.0
25%,4984.5
50%,5464.5
75%,5901.75
max,11757.0


Unnamed: 0,genes_before_filter,genes_after_filter,difference
count,332.0,332.0,332.0
mean,5700.903614,5606.608434,94.295181
std,1043.299566,962.598863,193.860456
min,4162.0,4128.0,1.0
25%,5060.0,4984.5,12.0
50%,5556.0,5464.5,26.5
75%,5983.5,5901.75,75.25
max,12786.0,11757.0,1565.0
