In [1]:
import requests
import os
import csv
import time
from multiprocessing import Pool
from Bio import Entrez
from Bio import SeqIO

In [5]:
# Contact e-mail that Biopython attaches to Entrez requests.
Entrez.email = "andja13damnjanovic@gmail.com"


def downloadSARSCOV2(protein_id):
    """Fetch one protein record from NCBI as GenBank text and save it locally.

    The record is requested from the Entrez ``protein`` database and written
    to ``../processedDatasets/SARS-CoV2/<protein_id>.gb``. A confirmation line
    is printed once the file has been written.

    Args:
        protein_id: Identifier passed to ``Entrez.efetch``; also used as the
            stem of the output file name.
    """
    target_path = f"../processedDatasets/SARS-CoV2/{protein_id}.gb"

    # The remote handle is opened first and closed as soon as its content
    # has been copied into the local file.
    with Entrez.efetch(db="protein", id=protein_id, rettype="gb", retmode="text") as ncbi_handle:
        with open(target_path, "w") as gb_file:
            gb_file.write(ncbi_handle.read())

    print(f"GenBank file for {protein_id} has been saved as {protein_id}.gb")


In [9]:
# Download the GenBank record for every sequence in the SARS-CoV-2 FASTA file.
for record in SeqIO.parse("../aaFastaFiles/SARS-CoV2.fasta", "fasta"):
    # Drop only a trailing ".<digits>" version suffix (e.g. "QHR63290.1").
    # Chained str.replace(".1", "")/(".2", "") removed those substrings
    # anywhere in the id, turned ".10" into "0", and left ".3"+ untouched.
    stem, dot, version = record.id.rpartition(".")
    accession = stem if (stem and dot and version.isdigit()) else record.id
    downloadSARSCOV2(accession)

GenBank file for QIG55955 has been saved as QIG55955.gb
GenBank file for QIC53213 has been saved as QIC53213.gb
GenBank file for QHR63290 has been saved as QHR63290.gb
GenBank file for QHR63280 has been saved as QHR63280.gb
GenBank file for QHR63270 has been saved as QHR63270.gb
GenBank file for QHR63260 has been saved as QHR63260.gb
GenBank file for QHR63250 has been saved as QHR63250.gb
GenBank file for UMA71172 has been saved as UMA71172.gb
GenBank file for UMA71171 has been saved as UMA71171.gb
GenBank file for XHS93831 has been saved as XHS93831.gb
GenBank file for XHS93830 has been saved as XHS93830.gb
GenBank file for XHS93829 has been saved as XHS93829.gb
GenBank file for XHS93828 has been saved as XHS93828.gb
GenBank file for XHS93827 has been saved as XHS93827.gb
GenBank file for XHS93826 has been saved as XHS93826.gb
GenBank file for XHS93825 has been saved as XHS93825.gb
GenBank file for XHS93824 has been saved as XHS93824.gb
GenBank file for XHS93823 has been saved as XHS93823.gb

In [2]:
def createFolder(folder):
    """Create ``folder`` (including missing parents) unless the path already exists.

    Prints one line saying whether the folder was created or was already
    present.

    Args:
        folder: Path of the directory to ensure.
    """
    # Guard clause: nothing to do when the path is already there.
    if os.path.exists(folder):
        print(f"Folder '{folder}' already exists.")
        return

    os.makedirs(folder)
    print(f"Folder '{folder}' created.")

In [3]:
# Make sure the root output directory exists.
createFolder("../processedDatasets")

Folder '../processedDatasets' created.


In [3]:
# Give every entry of ../datasets a same-named folder under ../processedDatasets.
folderToProcess = '../datasets'

for folder in os.listdir(folderToProcess):
    # Dot-prefixed entries are skipped.
    if not folder.startswith("."):
        newPath = f"../processedDatasets/{folder}"
        createFolder(newPath)

Folder '../processedDatasets/IBV' already exists.
Folder '../processedDatasets/feline-CoV' already exists.
Folder '../processedDatasets/HCoV-229E' already exists.
Folder '../processedDatasets/dolphin-CoV' already exists.
Folder '../processedDatasets/bovine-CoV' already exists.
Folder '../processedDatasets/hedgehog-CoV' already exists.
Folder '../processedDatasets/HCoV-OC43' already exists.
Folder '../processedDatasets/SARS-CoV' already exists.
Folder '../processedDatasets/ferret-CoV' already exists.
Folder '../processedDatasets/MERS-CoV' already exists.
Folder '../processedDatasets/porcine-CoV' already exists.
Folder '../processedDatasets/rabbit-CoV' created.
Folder '../processedDatasets/turkey-CoV' already exists.
Folder '../processedDatasets/HCoV-HKU1' already exists.
Folder '../processedDatasets/SARS-CoV2' already exists.
Folder '../processedDatasets/rat-CoV' already exists.
Folder '../processedDatasets/equine-CoV' already exists.
Folder '../processedDatasets/HCoV-NL63' already exists.

In [2]:
def downloadNucleotide(nucleotide, folderPath, max_retries = 5, delay = 0.4):
    """Download a nucleotide record from NCBI in GenBank format.

    The record is fetched from the ``nuccore`` database through the EFetch
    endpoint and saved as ``<folderPath>/<nucleotide>.gb``.

    Args:
        nucleotide: Identifier placed in the ``id`` query parameter; also the
            stem of the output file name.
        folderPath: Directory the ``.gb`` file is written into.
        max_retries: Maximum number of request attempts (at least one attempt
            is always made).
        delay: Seconds to wait between a failed attempt and the next one.

    Raises:
        requests.RequestException: If every attempt fails (connection error,
            timeout or an HTTP error status). No file is written in that case.
    """
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id={nucleotide}&rettype=gb&retmode=text"
    fileName = f"{nucleotide}.gb"
    filePath = os.path.join(folderPath, fileName)
    print(filePath)

    attempts = max(1, max_retries)
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, timeout=30)
            # Without this check an HTTP error page would be saved to disk
            # as if it were a GenBank record.
            response.raise_for_status()
            break
        except requests.RequestException as error:
            if attempt == attempts:
                raise
            print(f"Attempt {attempt}/{attempts} for {nucleotide} failed ({error}); retrying.")
            time.sleep(delay)

    with open(filePath, "w") as f:
        f.write(response.text)
    print(f"Downloaded: {nucleotide}.gb")

In [3]:
def downloadAccession(accession, folderPath):
    """Download a protein record from NCBI in GenPept format.

    The record is fetched from the ``protein`` database through the EFetch
    endpoint and saved as ``<folderPath>/<accession>.gp``.

    Args:
        accession: Identifier placed in the ``id`` query parameter; also the
            stem of the output file name.
        folderPath: Directory the ``.gp`` file is written into.

    Raises:
        requests.RequestException: On connection failure, timeout or an HTTP
            error status. No file is written in that case.
    """
    # The return type used to be interpolated from the builtin ``format``,
    # which produced "rettype=<built-in function format>". The file is saved
    # with a .gp extension, so GenPept ("gp") is requested explicitly.
    url = (
        f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
        f"?db=protein&id={accession}&rettype=gp&retmode=text"
    )
    fileName = f"{accession}.gp"
    response = requests.get(url, timeout=30)
    # Do not save an HTTP error page as if it were a record.
    response.raise_for_status()
    filePath = os.path.join(folderPath, fileName)
    print(filePath)
    with open(filePath, "wb") as f:
        f.write(response.content)
    print(f"Downloaded: {accession}.gp")

In [4]:
def processFiles(filename, folderToWriteTo):
    """Download the record behind every spike-like row of a dataset CSV.

    Each row's ``Protein`` value is lower-cased and accepted when it starts
    with one of the known prefixes or equals one of the known exact names.
    Accepted rows are downloaded by ``Nucleotide`` id when that column is
    non-empty, otherwise by ``Accession``.

    Args:
        filename: Path of the CSV file; it must provide the ``Protein``,
            ``Nucleotide`` and ``Accession`` columns.
        folderToWriteTo: Directory handed on to the download helpers.
    """
    prefixes = ("spike", "e2", "surface", "s ")
    exactNames = ["peplomer protein", "s", "s1", "glycoprotein s"]

    with open(filename, mode='r') as file:
        for row in csv.DictReader(file):
            proteinName = row["Protein"].lower()
            isWanted = proteinName.startswith(prefixes) or proteinName in exactNames
            if not isWanted:
                continue

            if row["Nucleotide"] == "":
                # No nucleotide id on this row: fall back to the accession.
                downloadAccession(row["Accession"], folderToWriteTo)
            else:
                downloadNucleotide(row["Nucleotide"], folderToWriteTo)

In [7]:
# Run processFiles on every file of every dataset folder, writing the
# downloads into the same-named folder under ../processedDatasets.
folderToProcess = '../datasets'
outputRoot = '../processedDatasets'

for baseFolder in os.listdir(folderToProcess):
    if baseFolder.startswith("."):
        continue
    newFolder = os.path.join(folderToProcess, baseFolder)
    # A stray plain file next to the dataset folders would make os.listdir
    # below raise, so only directories are processed.
    if not os.path.isdir(newFolder):
        continue
    # Build the output path from the folder name. Deriving it with
    # str.replace("datasets", ...) rewrote every occurrence of "datasets",
    # including any inside the dataset folder's own name.
    outputFolder = os.path.join(outputRoot, baseFolder)
    for file in os.listdir(newFolder):
        if file.startswith("."):
            continue
        filepath = os.path.join(newFolder, file)
        processFiles(filepath, outputFolder)

In [5]:
def processFiles2(filename, folderToWriteTo):
    """Download a FASTA file for every accession listed in a dataset CSV.

    Each row with a non-empty ``Accession`` is fetched from the NCBI
    ``protein`` database through the EFetch endpoint and saved as
    ``<folderToWriteTo>/<accession>.fasta``.

    Args:
        filename: Path of the CSV file; it must provide an ``Accession`` column.
        folderToWriteTo: Existing directory the ``.fasta`` files are written into.

    Raises:
        requests.RequestException: On connection failure, timeout or an HTTP
            error status for any accession.
    """
    with open(filename, mode='r') as file:
        csvReader = csv.DictReader(file)
        for row in csvReader:
            accession = row["Accession"]
            if not accession:
                continue
            # The return type used to be interpolated from the builtin
            # ``format``; the file is saved as .fasta, so request FASTA.
            url = (
                f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
                f"?db=protein&id={accession}&rettype=fasta&retmode=text"
            )
            fileName = f"{accession}.fasta"
            response = requests.get(url, timeout=30)
            # Do not save an HTTP error page as if it were a sequence.
            response.raise_for_status()
            # The misspelt ``folderTWriteTo`` raised NameError here, and the
            # fetched response was never written to disk.
            filePath = os.path.join(folderToWriteTo, fileName)
            with open(filePath, "wb") as f:
                f.write(response.content)
                

In [3]:
# Write one identifier per CSV row to accessions.txt: the Nucleotide id when
# the row has one, otherwise the protein Accession. `i` counts the rows read.
i = 0
with open("../datasets/SARS-CoV2/SARS-CoV2.csv", "r") as inFile, open("../processedDatasets/SARS-CoV2/accessions.txt", "w") as out:
    for row in csv.DictReader(inFile):
        i += 1
        identifier = row["Accession"] if row["Nucleotide"] == "" else row["Nucleotide"]
        out.write(identifier + "\n")
print(i)

4112783
