<a href="https://colab.research.google.com/github/GDAmitha/plasmidInteractions/blob/main/GeneBank_Parsing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81


In [None]:
from Bio import SeqIO
from Bio.SeqUtils import GC
import collections

def analyze_gene_sequence(genbank_file):
    """Summarize one GenBank record as a dictionary of plain-Python data.

    Args:
        genbank_file: Either an already-parsed record (any object exposing
            ``seq`` and ``features``, e.g. the records yielded by
            ``SeqIO.parse``) or a path/handle to a GenBank file, which is
            loaded with ``SeqIO.read``.

    Returns:
        dict with the keys:
            ``full_sequence``: the record's sequence as a ``str``.
            ``feature_list``: one ``{"type", "location"}`` dict per feature,
                with the location rendered via ``str()``.
            ``gene_structure``: ``{"coding_sequences", "introns", "exons"}``;
                only ``coding_sequences`` (locations of ``CDS`` features) is
                populated, the other two lists are always empty.
            ``kmer_distribution``: ``collections.Counter`` of overlapping
                3-mers of the sequence.
            ``physicochemical_properties``: ``{"GC_content": GC(record.seq)}``.
    """
    # Use the argument itself instead of relying on a global named `record`
    # happening to exist in the notebook namespace.
    if hasattr(genbank_file, "seq") and hasattr(genbank_file, "features"):
        record = genbank_file
    else:
        record = SeqIO.read(genbank_file, "genbank")

    full_sequence = str(record.seq)

    # Walk the features once, collecting both the flat feature list and the
    # (simplified) gene structure.
    feature_list = []
    gene_structure = {"coding_sequences": [], "introns": [], "exons": []}
    for feature in record.features:
        location = str(feature.location)
        feature_list.append({"type": feature.type, "location": location})
        if feature.type == "CDS":
            gene_structure["coding_sequences"].append(location)
        # Introns and exons can be added similarly

    # K-mer distribution (k=3): count every overlapping window. The range is
    # empty when the sequence is shorter than k, giving an empty Counter.
    kmer_size = 3
    kmer_distribution = collections.Counter(
        full_sequence[i:i + kmer_size]
        for i in range(len(full_sequence) - kmer_size + 1)
    )

    # Physicochemical properties
    physicochemical_properties = {"GC_content": GC(record.seq)}

    # Compressed sequence and sketching are more complex and may require
    # additional libraries or custom algorithms.

    return {
        "full_sequence": full_sequence,
        "feature_list": feature_list,
        "gene_structure": gene_structure,
        "kmer_distribution": kmer_distribution,
        "physicochemical_properties": physicochemical_properties,
        # Add other representations as needed
    }



In [None]:
import requests
import gzip
import os
from Bio import SeqIO

# Function to download and decompress the file
def download_and_decompress(url, local_filename):
    """Download a gzip archive and leave only its decompressed contents on disk.

    The response body is streamed to ``local_filename``; the archive is then
    decompressed to ``local_filename[:-3]`` (i.e. the name without a trailing
    ``.gz``) and the compressed file is deleted.

    Args:
        url: Address of the gzip-compressed file.
        local_filename: Path for the downloaded archive. The last three
            characters are stripped to form the output path, so the name is
            expected to end in ``.gz``.

    Raises:
        Whatever ``r.raise_for_status()`` raises for an error response, and
        any error from gzip/file I/O.
    """
    import shutil  # local import so the function does not depend on a later cell

    # Download the file in chunks so the archive never sits fully in memory.
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)

    # Decompress by streaming as well: the decompressed data can be far larger
    # than the archive, so avoid reading it into memory in one piece.
    try:
        with gzip.open(local_filename, 'rb') as f_in:
            with open(local_filename[:-3], 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    finally:
        # Remove the compressed file even when decompression fails.
        os.remove(local_filename)

# Base URL for GenBank sequence files
base_url = "https://ftp.ncbi.nih.gov/genbank/"

# Files to download and parse
files = ["gbbct1001.seq.gz", "gbbct1002.seq.gz", "gbbct1003.seq.gz", "gbbct1004.seq.gz", "gbbct1005.seq.gz"]

# Maps each record's name to the dictionary returned by analyze_gene_sequence.
files_to_genedata = {}

for file in files:
    url = base_url + file
    local_filename = file
    decompressed_filename = local_filename[:-3]  # name without the ".gz" suffix
    print(f"Downloading and decompressing {file}...")
    download_and_decompress(url, local_filename)

    # Parse the decompressed file
    print(f"Parsing {decompressed_filename}...")
    try:
        with open(decompressed_filename, "r") as handle:
            for record in SeqIO.parse(handle, "genbank"):
                # Analyze each record exactly once; the analysis is the
                # expensive step of this loop.
                gene_data = analyze_gene_sequence(record)
                files_to_genedata[record.name] = gene_data
    finally:
        # Remove the decompressed file even if parsing or analysis fails,
        # so a failed run does not leave large files behind.
        if os.path.exists(decompressed_filename):
            os.remove(decompressed_filename)



# Example usage
#genbank_file = "path_to_your_genbank_file.gb"
#print(gene_data)

Downloading and decompressing gbbct1001.seq.gz...
Parsing gbbct1001.seq...




Downloading and decompressing gbbct1002.seq.gz...
Parsing gbbct1002.seq...
Downloading and decompressing gbbct1003.seq.gz...
Parsing gbbct1003.seq...
Downloading and decompressing gbbct1004.seq.gz...
Parsing gbbct1004.seq...
Downloading and decompressing gbbct1005.seq.gz...
Parsing gbbct1005.seq...


In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
def write_gene_data_to_file(files_to_genedata, filename, indent=4):
    """Serialize the collected gene data to a JSON file.

    Args:
        files_to_genedata: JSON-serializable mapping to write.
        filename: Path of the output file; it is overwritten if it exists.
        indent: Indentation passed to ``json.dump``. The default of 4 keeps
            the pretty-printed layout; pass ``None`` for a compact,
            considerably smaller file.
    """
    import json  # local import: the function must not rely on a later cell

    with open(filename, 'w') as out:
        json.dump(files_to_genedata, out, indent=indent)

In [None]:
# Serialize every parsed record to gene_data_output.json (a path relative to
# the current working directory).
import json
write_gene_data_to_file(files_to_genedata, 'gene_data_output.json')

The output file is currently about 1 GB!

# New Section

In [None]:
# Import under an alias: a plain `from google.colab import files` would rebind
# the global `files` list of GenBank archive names used by the download loop,
# so re-running that loop afterwards would fail.
from google.colab import files as colab_files

# Trigger a browser download of the generated JSON file.
colab_files.download('gene_data_output.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:

# Gzip the JSON export stored on Google Drive, writing the archive next to it.
# NOTE(review): the input is read from Drive, while the earlier cell writes
# 'gene_data_output.json' via a relative path -- confirm the file has been
# copied to Drive before running this cell.
import gzip
import shutil

from google.colab import drive

drive.mount('/content/drive')

# Stream the bytes through gzip so the file is never held in memory at once.
with open('/content/drive/My Drive/gene_data_output.json', 'rb') as f_in, \
        gzip.open('/content/drive/My Drive/gene_data_output.json.gz', 'wb') as f_out:
    shutil.copyfileobj(f_in, f_out)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Leftover path, kept as a comment because a bare path is not valid Python:
# /content/gene_data_output.json