In [24]:
import os
import csv

from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO

In [25]:
def createFolder(folder):
    """Create ``folder`` (including missing parents) unless it already exists.

    A one-line status message is printed in either case.
    """
    if os.path.exists(folder):
        print(f"Folder '{folder}' already exists.")
        return

    os.makedirs(folder)
    print(f"Folder '{folder}' created.")

In [26]:
# Output directory for the per-dataset CSV files written by the cells below.
createFolder("../csvFiles")

Folder '../csvFiles' created.


In [27]:
# Output directory for the per-dataset amino-acid FASTA files.
createFolder("../aaFastaFiles")

Folder '../aaFastaFiles' created.


In [28]:
# Output directory for the per-dataset nucleotide FASTA files.
createFolder("../nuclFastaFiles")

Folder '../nuclFastaFiles' created.


In [29]:
def processFeatureSource(feature, record, infoFile):
    """Append one CSV row describing a ``source`` feature to ``infoFile``.

    The row is ``[record.id, organism, strain, molecule, db_xref, location]``.
    Each value is the first entry of the corresponding qualifier list, or
    ``"N/A"`` when the qualifier is absent or empty.

    Parameters
    ----------
    feature : object with a ``qualifiers`` mapping of name -> list of values.
    record : object with an ``id`` attribute.
    infoFile : path of the CSV file to append to (created if missing).
    """
    qualifiers = feature.qualifiers

    def firstValue(*names):
        # Return the first value of the first listed qualifier that is
        # present and non-empty; fall back to the "N/A" placeholder.
        for name in names:
            values = qualifiers.get(name)
            if values:
                return values[0]
        return "N/A"

    row = [
        record.id,
        firstValue("organism"),
        firstValue("strain"),
        # "mol_prod" is kept for compatibility, but it is not a standard
        # INSDC source qualifier; "mol_type" is, so use it as a fallback
        # instead of always emitting "N/A".
        firstValue("mol_prod", "mol_type"),
        firstValue("db_xref"),
        # Older GenBank records carry the location under "country" rather
        # than "geo_loc_name".
        firstValue("geo_loc_name", "country"),
    ]
    with open(infoFile, 'a', newline='') as csvfile:
        csv.writer(csvfile).writerow(row)

In [30]:
def processCSVdatasets(filename, collectionDates):
    """Append ``[ID, Collection_Date]`` rows for spike-like proteins.

    ``filename`` is read as a CSV with a header row; the columns
    ``Protein``, ``Nucleotide``, ``Accession`` and ``Collection_Date`` are
    used (a missing column raises ``KeyError``). A row is kept when its
    lower-cased ``Protein`` value starts with one of the accepted prefixes
    or equals one of the accepted names. The ID written is the
    ``Nucleotide`` value, or ``Accession`` when ``Nucleotide`` is empty.

    Matching rows are appended to ``collectionDates`` in input order. The
    output file is opened once, and only if at least one row matched.
    """
    acceptedValues = ["peplomer protein", "s", "s1", "glycoprotein s"]
    acceptedPrefixes = ("spike", "e2", "surface", "s ")

    matches = []
    # newline='' lets the csv module handle line endings itself, including
    # newlines embedded in quoted fields.
    with open(filename, mode='r', newline='') as file:
        for row in csv.DictReader(file):
            # Short rows get None for missing fields; treat that as "no protein"
            # instead of crashing on None.lower().
            protein = (row["Protein"] or "").lower()
            if not (protein.startswith(acceptedPrefixes) or protein in acceptedValues):
                continue
            matches.append([row["Nucleotide"] or row["Accession"], row["Collection_Date"]])

    if matches:
        with open(collectionDates, 'a', newline='') as csvfile:
            csv.writer(csvfile).writerows(matches)

In [31]:
def writeToAAfasta(feature, record, aaFasta):
    """Append the feature's ``translation`` qualifier to ``aaFasta`` as FASTA.

    The entry is named after ``record.id`` and has an empty description.
    When the feature has no ``translation`` qualifier, the literal text
    ``N/A`` is written as the sequence.
    """
    translation = feature.qualifiers.get("translation", ["N/A"])[0]
    entry = SeqRecord(Seq(translation), record.id, description="")

    with open(aaFasta, "a") as f:
        SeqIO.write([entry], f, "fasta")

In [32]:
def writeToNuclFasta(feature, record, nuclFasta):
    """Append the part of ``record.seq`` covered by ``feature`` to ``nuclFasta``.

    The entry is named after ``record.id`` and has an empty description.

    ``feature.extract`` is used rather than slicing ``start:end`` so that
    reverse-strand features are reverse-complemented and compound (``join``)
    locations are spliced instead of including the gaps between their parts.
    For a simple forward-strand location the result is the same slice.
    """
    subsequence = feature.extract(record.seq)
    entry = SeqRecord(subsequence, record.id, description="")
    with open(nuclFasta, "a") as f:
        SeqIO.write([entry], f, "fasta")

In [33]:
def processGBformat(sequence, aaFasta, nuclFasta, infoFile):
    """Process one GenBank file holding a single record.

    Every ``source`` feature met before the loop stops is passed to
    ``processFeatureSource``. The first ``CDS`` feature whose ``product`` or
    ``gene`` qualifier looks like a spike protein is written to both FASTA
    files, after which the scan stops.
    """
    record = SeqIO.read(sequence, "genbank")
    acceptedValues = ["s", "s1", "glycoprotein s"]

    for feature in record.features:
        if feature.type == "source":
            processFeatureSource(feature, record, infoFile)
            continue
        if feature.type != "CDS":
            continue

        # Compare case-insensitively; "N/A" stands in for a missing qualifier.
        gene = feature.qualifiers.get("gene", ["N/A"])[0].lower()
        prod = feature.qualifiers.get("product", ["N/A"])[0].lower()

        looksLikeSpike = (
            "spike" in prod
            or prod.startswith(("e2", "surface", "s "))
            or "peplomer" in prod
            or prod in acceptedValues
            or gene in ("spike", "s")
        )
        if looksLikeSpike:
            writeToAAfasta(feature, record, aaFasta)
            writeToNuclFasta(feature, record, nuclFasta)
            break

In [34]:
def processGPformat(sequence, aaFasta, infoFile):
    """Process one GenPept-style file holding a single record.

    Every ``source`` feature met before the loop stops is passed to
    ``processFeatureSource``. The first ``Protein`` feature is written to
    ``aaFasta``, after which the scan stops.
    """
    record = SeqIO.read(sequence, "genbank")

    for feature in record.features:
        kind = feature.type
        if kind == "source":
            processFeatureSource(feature, record, infoFile)
        elif kind == "Protein":
            # NOTE(review): the nucleotide writer is reused with the amino-acid
            # FASTA path; presumably because the record's own sequence is
            # already protein here -- confirm.
            writeToNuclFasta(feature, record, aaFasta)
            break

In [35]:
def makeSourceCSVfile(filename):
    """Ensure ``filename`` exists and starts with the source-info header row.

    The header is written only when the file is missing or empty, so calling
    this again (for example when the notebook is re-run) does not insert a
    second header row in the middle of existing data. Existing content is
    never truncated.
    """
    headers = ["ID", "Organism", "Strain", "Molecule product", "DB xref", "location"]
    needsHeader = not os.path.exists(filename) or os.path.getsize(filename) == 0
    with open(filename, mode="a", newline="") as file:
        if needsHeader:
            csv.writer(file).writerow(headers)

In [36]:
def makeCollectionDateCSVfile(filename):
    """Ensure ``filename`` exists and starts with the collection-date header row.

    The header is written only when the file is missing or empty, so calling
    this again (for example when the notebook is re-run) does not insert a
    second header row in the middle of existing data. Existing content is
    never truncated.
    """
    headers = ["ID", "Collection date"]
    needsHeader = not os.path.exists(filename) or os.path.getsize(filename) == 0
    with open(filename, mode="a", newline="") as file:
        if needsHeader:
            csv.writer(file).writerow(headers)

In [37]:
# For every dataset folder under ../processedDatasets: start its source-info
# CSV, then feed each .gb / .gp file in the folder to the matching parser.
# Hidden entries (names starting with ".") are ignored at both levels.
baseFolder = "../processedDatasets"
for folder in os.listdir(baseFolder):
    if not folder.startswith("."):
        newFolder = f"{baseFolder}/{folder}"
        print(newFolder)

        csvSourceFile = f"../csvFiles/{folder}Source.csv"

        # One FASTA file per dataset folder, named after the folder.
        aaFastaFile = f"../aaFastaFiles/{folder}.fasta"
        nuclFastaFile = f"../nuclFastaFiles/{folder}.fasta"

        makeSourceCSVfile(csvSourceFile)
        print(csvSourceFile)
        for file in os.listdir(newFolder):
            path = f"{newFolder}/{file}"
            if file.startswith("."):
                continue
            if file.endswith(".gb"):
                processGBformat(path, aaFastaFile, nuclFastaFile, csvSourceFile)
            elif file.endswith(".gp"):
                processGPformat(path, aaFastaFile, csvSourceFile)

../processedDatasets/IBV
../csvFiles/IBVSource.csv
../processedDatasets/feline-CoV
../csvFiles/feline-CoVSource.csv
../processedDatasets/HCoV-229E
../csvFiles/HCoV-229ESource.csv
../processedDatasets/dolphin-CoV
../csvFiles/dolphin-CoVSource.csv
../processedDatasets/bovine-CoV
../csvFiles/bovine-CoVSource.csv
../processedDatasets/hedgehog-CoV
../csvFiles/hedgehog-CoVSource.csv
../processedDatasets/HCoV-OC43
../csvFiles/HCoV-OC43Source.csv
../processedDatasets/SARS-CoV
../csvFiles/SARS-CoVSource.csv
../processedDatasets/ferret-CoV
../csvFiles/ferret-CoVSource.csv
../processedDatasets/MERS-CoV
../csvFiles/MERS-CoVSource.csv
../processedDatasets/porcine-CoV
../csvFiles/porcine-CoVSource.csv
../processedDatasets/rabbit-CoV
../csvFiles/rabbit-CoVSource.csv
../processedDatasets/turkey-CoV
../csvFiles/turkey-CoVSource.csv
../processedDatasets/HCoV-HKU1
../csvFiles/HCoV-HKU1Source.csv
../processedDatasets/SARS-CoV2
../csvFiles/SARS-CoV2Source.csv
../processedDatasets/rat-CoV
../csvFiles/rat-Co

In [38]:
# For every dataset folder under ../datasets (except those listed in notOk):
# start its collection-date CSV, then scan each file in the folder for
# spike-like protein rows. Hidden entries are ignored at both levels.
baseFolder = "../datasets"
notOk = ["SARS-CoV2"]

for folder in os.listdir(baseFolder):
    if not (folder.startswith(".") or folder in notOk):
        newFolder = f"{baseFolder}/{folder}"
        csvCollectionDateFile = f"../csvFiles/{folder}CollectionDate.csv"

        makeCollectionDateCSVfile(csvCollectionDateFile)
        print(csvCollectionDateFile)

        for file in os.listdir(newFolder):
            path = f"{newFolder}/{file}"
            if not file.startswith("."):
                processCSVdatasets(path, csvCollectionDateFile)

../csvFiles/IBVCollectionDate.csv
../csvFiles/feline-CoVCollectionDate.csv
../csvFiles/HCoV-229ECollectionDate.csv
../csvFiles/dolphin-CoVCollectionDate.csv
../csvFiles/bovine-CoVCollectionDate.csv
../csvFiles/hedgehog-CoVCollectionDate.csv
../csvFiles/HCoV-OC43CollectionDate.csv
../csvFiles/SARS-CoVCollectionDate.csv
../csvFiles/ferret-CoVCollectionDate.csv
../csvFiles/MERS-CoVCollectionDate.csv
../csvFiles/porcine-CoVCollectionDate.csv
../csvFiles/rabbit-CoVCollectionDate.csv
../csvFiles/turkey-CoVCollectionDate.csv
../csvFiles/HCoV-HKU1CollectionDate.csv
../csvFiles/rat-CoVCollectionDate.csv
../csvFiles/equine-CoVCollectionDate.csv
../csvFiles/HCoV-NL63CollectionDate.csv
../csvFiles/canine-CoVCollectionDate.csv
../csvFiles/bat-CoVCollectionDate.csv
