In [1]:
import pandas as pd
import csv
from Bio import SeqIO
from Bio.SeqFeature import FeatureLocation
from Bio.Seq import UndefinedSequenceError

## Obtaining valid accession numbers

In [2]:
accession_values = pd.read_csv('../data/processed/model_data.csv')
# List of 'Accession' values from the first DataFrame
accession_values = accession_values['Accession'].tolist()


## Processing full genbank file

In [3]:
genbank_file_path = "/Users/Alvaro/Downloads/8Sep2023_phages_downloaded_from_genbank.gb"
# Lists to hold data
ids = []
genome_lengths = []
gc_contents = []
sequences = []
reverse_complements = []
cds_numbers = []
positive_strands = []
negative_strands = []
coding_capacities = []
molecule_types = []
topologies = []
trna_counts = []

# Read the GenBank file
for record in SeqIO.parse(genbank_file_path, "genbank"):
    try:
        # Attempt to access the sequence, which may raise UndefinedSequenceError
        sequence = str(record.seq)
        print(record.id)
    except UndefinedSequenceError:
        print(f"Skipping record {record.id} as sequence is undefined.")
        continue  # Skip this record

    # Calculate genome length and GC content
    total_length = len(sequence)
    gc_content = round((sequence.count('C') + sequence.count('G')) / total_length * 100, 3)

    # Initialize counters
    plus = 0
    minus = 0
    coding_count = 0
    trna_count = 0

    for feature in record.features:
        if feature.type == 'CDS':
            feat_length = len(FeatureLocation(feature.location.start, feature.location.end))
            coding_count += feat_length

            if feature.location.strand == 1:
                plus += 1
            elif feature.location.strand == -1:
                minus += 1
        if feature.type == 'tRNA':
            trna_count += 1

    # Calculate total number of CDS
    total_CDS = plus + minus

    # Calculate strand usage as a percentage
    per_plus = round((plus / total_CDS) * 100, 2) if total_CDS != 0 else 0
    per_minus = round((minus / total_CDS) * 100, 2) if total_CDS != 0 else 0

    # Calculate coding capacity as a percentage
    coding_capacity = (coding_count / total_length) * 100

    # Extract molecule_type and topology
    molecule_type = record.annotations.get('molecule_type', 'N/A')
    topology = record.annotations.get('topology', 'N/A')

    # Append data to lists
    ids.append(record.id)
    genome_lengths.append(total_length)
    gc_contents.append(gc_content)
    sequences.append(sequence)
    reverse_complements.append(str(sequence[::-1]))
    cds_numbers.append(total_CDS)
    positive_strands.append(per_plus)
    negative_strands.append(per_minus)
    coding_capacities.append(coding_capacity)
    molecule_types.append(molecule_type)
    topologies.append(topology)
    trna_counts.append(trna_count)

# Convert lists to pandas DataFrame
df = pd.DataFrame({
    'id': ids,
    'genome_length': genome_lengths,
    'gc_%': gc_contents,
    'sequence': sequences,
    'reverse_complement': reverse_complements,
    'cds_number': cds_numbers,
    'positive_strand_%': positive_strands,
    'negative_strand_%': negative_strands,
    'coding_capacity': coding_capacities,
    'molecule_type': molecule_types,
    'topology': topologies,
    'trna_count': trna_counts
})

df

FileNotFoundError: [Errno 2] No such file or directory: '/Users/Alvaro/Downloads/8Sep2023_phages_downloaded_from_genbank.gb'

In [None]:
# Before changing coding capacity
df = df[['id', 'genome_length',  'molecule_type', 'topology', 'gc_%', 'trna_count','cds_number', 'coding_capacity','positive_strand_%','negative_strand_%' ,'sequence', 'reverse_complement']]
print(df['coding_capacity'].mean())
df.head()

76.2919810438814


Unnamed: 0,id,genome_length,molecule_type,topology,gc_%,trna_count,cds_number,coding_capacity,positive_strand_%,negative_strand_%,sequence,reverse_complement
0,GU339467.1,53332,DNA,linear,64.53,1,95,92.627316,38.95,61.05,TGCGGCTGCCCCATCCTGTACGGGTTTCCAAGTCGATCTCGGGGGC...,AATGGCTGGCCAGTCCTTCTGTGGGGGACCCCCATAGGGGGGACCC...
1,KT462701.1,3580,RNA,linear,48.296,0,1,38.072626,100.0,0.0,GGAGACAGATATGACTAATCTGACATTCGCTTTCACTAATGAATGG...,ATCTCGGAAGACACGCCTACTAGGCCGTGGACTGAATAGTACCAGT...
2,KT462700.1,3609,RNA,linear,41.701,0,3,94.846218,100.0,0.0,ACGTCTGTCGTCGCACGATTATTAAGGAGGTGCTATATGGAAATTT...,AGTACAGTCCCTTCTTGCTGTTCTATGTAAGACAGTTCTTATTAGA...
3,KT462698.1,3776,RNA,linear,44.147,0,3,98.013771,100.0,0.0,CAGTAATCGCAATCCAAGGAAAGGGAAATTTACTTATCCCATTCTT...,GAACGGGGGCCGAAACCGCATTTCGGACGGTCGTTGATAAGCTAGT...
4,MF417929.1,32618,DNA,linear,39.218,0,42,88.635109,16.67,83.33,ATGTTGTCTAGTCCATCATAGGTGCAACGGATATACGAGCATTTTT...,AATTTTTATTAATTATTATTAGTTTTTCAATTATATTTTTTATATT...


In [98]:
genbank_file_path = "../data/raw/inphared_8Sep2023/8Sep2023_phages_downloaded_from_genbank.gb"
# Lists to hold data
ids = []
genome_lengths = []
gc_contents = []
sequences = []
reverse_complements = []
cds_numbers = []
positive_strands = []
negative_strands = []
coding_capacities = []
molecule_types = []
topologies = []
trna_counts = []

# Read the GenBank file
for record in SeqIO.parse(genbank_file_path, "genbank"):
    try:
        # Attempt to access the sequence, which may raise UndefinedSequenceError
        sequence = str(record.seq)
        print(record.id)
    except UndefinedSequenceError:
        print(f"Skipping record {record.id} as sequence is undefined.")
        continue  # Skip this record

    # Calculate genome length and GC content
    total_length = len(sequence)
    gc_content = round((sequence.count('C') + sequence.count('G')) / total_length * 100, 3)

    # Initialize counters
    plus = 0
    minus = 0
    coding_count = 0
    trna_count = 0
    seen = set()  # Store seen barcodes

    for feature in record.features:
        start = feature.location.start
        end = feature.location.end
        length = len(FeatureLocation(start, end))
        barcode = f"{start}_{end}_{length}"

        if feature.type != 'source' and barcode not in seen:
            coding_count += length
            seen.add(barcode)

        if feature.type == 'CDS':
            if feature.location.strand == 1:
                plus += 1
            elif feature.location.strand == -1:
                minus += 1
        elif feature.type == 'tRNA':
            trna_count += 1


    # Calculate total number of CDS
    total_CDS = plus + minus

    # Calculate strand usage as a percentage
    per_plus = round((plus / total_CDS) * 100, 2) if total_CDS != 0 else 0
    per_minus = round((minus / total_CDS) * 100, 2) if total_CDS != 0 else 0

    # Calculate coding capacity as a percentage
    coding_capacity = (coding_count / total_length) * 100

    # Extract molecule_type and topology
    molecule_type = record.annotations.get('molecule_type', 'N/A')
    topology = record.annotations.get('topology', 'N/A')

    # Append data to lists
    ids.append(record.id)
    genome_lengths.append(total_length)
    gc_contents.append(gc_content)
    sequences.append(sequence)
    reverse_complements.append(str(sequence[::-1]))
    cds_numbers.append(total_CDS)
    positive_strands.append(per_plus)
    negative_strands.append(per_minus)
    coding_capacities.append(coding_capacity)
    molecule_types.append(molecule_type)
    topologies.append(topology)
    trna_counts.append(trna_count)

# Convert lists to pandas DataFrame
df = pd.DataFrame({
    'id': ids,
    'genome_length': genome_lengths,
    'gc_%': gc_contents,
    'sequence': sequences,
    'reverse_complement': reverse_complements,
    'cds_number': cds_numbers,
    'positive_strand_%': positive_strands,
    'negative_strand_%': negative_strands,
    'coding_capacity': coding_capacities,
    'molecule_type': molecule_types,
    'topology': topologies,
    'trna_count': trna_counts
})

df

GU339467.1
KT462701.1
KT462700.1
KT462698.1
Skipping record NZ_CP014526.1 as sequence is undefined.
Skipping record NC_022901.1 as sequence is undefined.
Skipping record NZ_CP038625.1 as sequence is undefined.
Skipping record NZ_CP023686.1 as sequence is undefined.
Skipping record NZ_CP023680.1 as sequence is undefined.
Skipping record NZ_CP019275.1 as sequence is undefined.
MF417929.1
MH616963.1
MH552500.1
BK010471.1
NR_074910.1
AC171169.12
BK014221.1
BK014220.1
BK014219.1
BK014218.1
BK014217.1
BK014216.1
BK014215.1
BK014214.1
BK014213.1
BK014212.1
BK014211.1
BK014210.1
BK014209.1
BK014208.1
BK014207.1
BK014206.1
BK014205.1
BK014204.1
BK014203.1
BK014202.1
BK014201.1
BK014200.1
BK014199.1
BK014198.1
BK014197.1
BK014196.1
BK014195.1
BK014194.1
BK014193.1
BK014192.1
BK014191.1
BK014190.1
BK014189.1
BK014188.1
BK014187.1
BK014186.1
BK014185.1
BK014184.1
BK014183.1
BK014182.1
BK014181.1
BK014180.1
BK014179.1
BK014178.1
BK014177.1
BK014176.1
BK014175.1
BK014174.1
BK014173.1
BK014172.1
BK01

Unnamed: 0,id,genome_length,gc_%,sequence,reverse_complement,cds_number,positive_strand_%,negative_strand_%,coding_capacity,molecule_type,topology,trna_count
0,GU339467.1,53332,64.530,TGCGGCTGCCCCATCCTGTACGGGTTTCCAAGTCGATCTCGGGGGC...,AATGGCTGGCCAGTCCTTCTGTGGGGGACCCCCATAGGGGGGACCC...,95,38.95,61.05,92.857946,DNA,linear,1
1,KT462701.1,3580,48.296,GGAGACAGATATGACTAATCTGACATTCGCTTTCACTAATGAATGG...,ATCTCGGAAGACACGCCTACTAGGCCGTGGACTGAATAGTACCAGT...,1,100.00,0.00,99.748603,RNA,linear,0
2,KT462700.1,3609,41.701,ACGTCTGTCGTCGCACGATTATTAAGGAGGTGCTATATGGAAATTT...,AGTACAGTCCCTTCTTGCTGTTCTATGTAAGACAGTTCTTATTAGA...,3,100.00,0.00,94.846218,RNA,linear,0
3,KT462698.1,3776,44.147,CAGTAATCGCAATCCAAGGAAAGGGAAATTTACTTATCCCATTCTT...,GAACGGGGGCCGAAACCGCATTTCGGACGGTCGTTGATAAGCTAGT...,3,100.00,0.00,98.013771,RNA,linear,0
4,MF417929.1,32618,39.218,ATGTTGTCTAGTCCATCATAGGTGCAACGGATATACGAGCATTTTT...,AATTTTTATTAATTATTATTAGTTTTTCAATTATATTTTTTATATT...,42,16.67,83.33,88.635109,DNA,linear,0
...,...,...,...,...,...,...,...,...,...,...,...,...
35754,AF466696.1,3128,44.757,TTATCCAAAACTCGGTTTACAGGAAACGGTAAATCAGGCTAGGAAC...,AATAGGTTTTGGAGCCAAATGTCCTTTGCCATTTAGTCCGAAGACC...,0,0.00,0.00,100.000000,DNA,linear,0
35755,AF109874.2,38347,36.222,CATGAAGAACTACATAGTAAAGGATAGATGTGCTTTAGATAATGGC...,TTGCTATTATTTGTTATTATCTTATTTATATTTTATTTTTGCCGAA...,56,94.64,5.36,92.346207,DNA,circular,0
35756,J02473.1,2468,42.666,GAATTCTCGTACATCATTGGAATAGCGAGAGACGTTTTAATTAATC...,GCTATAACCACAACATCCCCTTTGGTATCAACAATCCAAATTAATC...,0,0.00,0.00,0.000000,DNA,linear,0
35757,X13010.1,3408,41.050,GTACCCTTCTTTTGCTTGTTCTTCTGTTGAAACCCTAACGTATATC...,CTTAAGTATAAGGTAAAGTAGAAAAACCTGCCCGAAACAGGGCTGG...,0,0.00,0.00,151.995305,DNA,linear,0


In [100]:
# After changing coding capacity
df = df[['id', 'genome_length',  'molecule_type', 'topology', 'gc_%', 'trna_count','cds_number', 'coding_capacity','positive_strand_%','negative_strand_%' ,'sequence', 'reverse_complement']]
print(df['coding_capacity'].mean())
df.head()

79.98662119883382


Unnamed: 0,id,genome_length,molecule_type,topology,gc_%,trna_count,cds_number,coding_capacity,positive_strand_%,negative_strand_%,sequence,reverse_complement
0,GU339467.1,53332,DNA,linear,64.53,1,95,92.857946,38.95,61.05,TGCGGCTGCCCCATCCTGTACGGGTTTCCAAGTCGATCTCGGGGGC...,AATGGCTGGCCAGTCCTTCTGTGGGGGACCCCCATAGGGGGGACCC...
1,KT462701.1,3580,RNA,linear,48.296,0,1,99.748603,100.0,0.0,GGAGACAGATATGACTAATCTGACATTCGCTTTCACTAATGAATGG...,ATCTCGGAAGACACGCCTACTAGGCCGTGGACTGAATAGTACCAGT...
2,KT462700.1,3609,RNA,linear,41.701,0,3,94.846218,100.0,0.0,ACGTCTGTCGTCGCACGATTATTAAGGAGGTGCTATATGGAAATTT...,AGTACAGTCCCTTCTTGCTGTTCTATGTAAGACAGTTCTTATTAGA...
3,KT462698.1,3776,RNA,linear,44.147,0,3,98.013771,100.0,0.0,CAGTAATCGCAATCCAAGGAAAGGGAAATTTACTTATCCCATTCTT...,GAACGGGGGCCGAAACCGCATTTCGGACGGTCGTTGATAAGCTAGT...
4,MF417929.1,32618,DNA,linear,39.218,0,42,88.635109,16.67,83.33,ATGTTGTCTAGTCCATCATAGGTGCAACGGATATACGAGCATTTTT...,AATTTTTATTAATTATTATTAGTTTTTCAATTATATTTTTTATATT...


In [101]:
df2 = df.copy(deep=True)
df2['id'] = df2['id'].str[:-2]
# Filter sequences_df based on 'id' column
df2 = df2[df2['id'].isin(accession_values)]
print(df2.shape)
df2.head()

(11154, 12)


Unnamed: 0,id,genome_length,molecule_type,topology,gc_%,trna_count,cds_number,coding_capacity,positive_strand_%,negative_strand_%,sequence,reverse_complement
903,MN335248,7045,ss-DNA,linear,60.298,0,14,104.22995,85.71,14.29,AGTACCGCCCGAATTTCGCAGCAACCCAACCGACGCAAGCCCAACC...,CACCCCAGTTCGATTCCGGGGGGAATGGGGGGTACGCCCCCCACGC...
907,MK250029,540217,DNA,circular,25.796,0,0,0.0,0.0,0.0,GGACAAAGTTTAAAATCAAGAATTGATAGAAAAACATTTAATAGCA...,TAGATTGAATAATTAAGCAAGGAGTTCATTTAGATGAGATAGTTAG...
908,MK250028,550053,DNA,circular,26.012,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TTTATTTTAAAATTGTCATATATAACGTTCTCTATTATCTTGTATA...
909,MK250027,551627,DNA,circular,26.022,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TGCATTACTTATACCAATATAATTACTTACCCATTCTATTCAATAC...
910,MK250026,550702,DNA,circular,26.02,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TTTATTTTAAAATTGTCATATATAACGTTCTCTATTATCTTGTATA...


### Figuring out what the different versions (.1) mean in the ids

Counting the number of entries with different versions

In [91]:
# Extract the base ID from the 'id' column
df['base_id'] = df['id'].str.split('.').str[0]

# Group by the base ID and count the number of versions for each base ID
version_counts = df.groupby('base_id').size()

# Filter the results to only show base IDs that have more than one version
multi_version_entries = version_counts[version_counts > 1]

print(multi_version_entries)

base_id
AF059242    2
AF334111    2
AM040673    2
BK013355    2
BK013356    2
           ..
EF489910    2
JX045649    2
JX045650    2
JX625144    2
LT821717    2
Length: 424, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['base_id'] = df['id'].str.split('.').str[0]


In [24]:
# Write the results to a text file to check the names
'''with open("multi_version_ids.txt", "w") as file:
    for base_id, count in multi_version_entries.items():
        file.write(f"{base_id}: {count}\n")
'''

In [42]:
# Checking if the data in a random entry with two versions is different
df[df['base_id'] == "LT821717"]

Unnamed: 0,id,genome_length,gc_%,cds_number,positive_strand_%,negative_strand_%,coding_capacity,sequence,reverse_complement,base_id
934,LT821717.1,3669,54.811,3,100.0,0.0,93.131643,GGTGTACTCCCGCCTGAACTAGGCGGGAGGACGCTCCAGTCGTTGT...,CCAGCTACCGACGACTCGTTCGTCGGTTGCCCAGGCTTCTGGTCCA...,LT821717
30371,LT821717.1,3669,54.811,3,100.0,0.0,93.131643,GGTGTACTCCCGCCTGAACTAGGCGGGAGGACGCTCCAGTCGTTGT...,CCAGCTACCGACGACTCGTTCGTCGGTTGCCCAGGCTTCTGGTCCA...,LT821717


Conclusion regarding the different versions: I am just looking at entries that have been parsed twice and thus are repeated

## Filtering out unwanted sequences

In [93]:
df2 = df.copy(deep=True)
df2['id'] = df2['id'].str[:-2]
# Filter sequences_df based on 'id' column
df2 = df2[df2['id'].isin(accession_values)]
print(df2.shape)
df2.head()

(11154, 13)


Unnamed: 0,id,genome_length,molecule_type,topology,gc_%,trna_count,cds_number,coding_capacity,positive_strand_%,negative_strand_%,sequence,reverse_complement,base_id
903,MN335248,7045,ss-DNA,linear,60.298,0,14,95.940383,85.71,14.29,AGTACCGCCCGAATTTCGCAGCAACCCAACCGACGCAAGCCCAACC...,CACCCCAGTTCGATTCCGGGGGGAATGGGGGGTACGCCCCCCACGC...,MN335248
907,MK250029,540217,DNA,circular,25.796,0,0,0.0,0.0,0.0,GGACAAAGTTTAAAATCAAGAATTGATAGAAAAACATTTAATAGCA...,TAGATTGAATAATTAAGCAAGGAGTTCATTTAGATGAGATAGTTAG...,MK250029
908,MK250028,550053,DNA,circular,26.012,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TTTATTTTAAAATTGTCATATATAACGTTCTCTATTATCTTGTATA...,MK250028
909,MK250027,551627,DNA,circular,26.022,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TGCATTACTTATACCAATATAATTACTTACCCATTCTATTCAATAC...,MK250027
910,MK250026,550702,DNA,circular,26.02,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TTTATTTTAAAATTGTCATATATAACGTTCTCTATTATCTTGTATA...,MK250026


In [94]:
df2[df2['cds_number'] == 0]

Unnamed: 0,id,genome_length,molecule_type,topology,gc_%,trna_count,cds_number,coding_capacity,positive_strand_%,negative_strand_%,sequence,reverse_complement,base_id
907,MK250029,540217,DNA,circular,25.796,0,0,0.0,0.0,0.0,GGACAAAGTTTAAAATCAAGAATTGATAGAAAAACATTTAATAGCA...,TAGATTGAATAATTAAGCAAGGAGTTCATTTAGATGAGATAGTTAG...,MK250029
908,MK250028,550053,DNA,circular,26.012,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TTTATTTTAAAATTGTCATATATAACGTTCTCTATTATCTTGTATA...,MK250028
909,MK250027,551627,DNA,circular,26.022,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TGCATTACTTATACCAATATAATTACTTACCCATTCTATTCAATAC...,MK250027
910,MK250026,550702,DNA,circular,26.020,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TTTATTTTAAAATTGTCATATATAACGTTCTCTATTATCTTGTATA...,MK250026
911,MK250025,546689,DNA,circular,26.029,0,0,0.0,0.0,0.0,TAACCATATTCATTACGTGCATTCTCTGGATAGCGGTTATCAACAT...,ATAATTACTTACCCATTCTATTCAATACTCATTTTTATTTTAAAAT...,MK250025
...,...,...,...,...,...,...,...,...,...,...,...,...,...
33229,HG803181,33828,DNA,linear,48.170,0,0,0.0,0.0,0.0,TGCGCGCTTTCGCACGTGCATTTTTTTCAGTTTTTATTTTCTGAAA...,CCGCCCACTCCTTCTCACGCCTAAGGCACTCGCTTCCTCAGCTAAA...,HG803181
33251,KJ021043,35560,DNA,linear,48.765,0,0,0.0,0.0,0.0,GGACAGCATCGAACTGAAGCGACCCAAGCTGTGGGCCGCGCTTTAC...,AACATTGACATGAAGAGAAGCTGTTGGCCGCCTCCACAATGGTCGT...,KJ021043
33305,KF854250,47283,DNA,linear,59.336,0,0,0.0,0.0,0.0,GTCCCAGAGACTACCGAGGGCTGACAACATGACGAAAAACAACGGG...,GCGAGGTAGCAGAACAGCGAGCGTCCGAGATCGGTGACCACGACGC...,KF854250
34443,HQ906664,21272,DNA,linear,35.836,0,0,0.0,0.0,0.0,GTGTTAGGAAAGTATTAATTGTGTAATTTTTAAGACACTAATTTGT...,ATACCGGAAAGCTGTGAAGTAGTCACCAAGTAAAGGTAAGCGGAGA...,HQ906664


In [95]:
pearl_data = pd.read_csv("../data/processed/model_data.csv")
pearl_data[pearl_data['Accession'] == "HQ906664"]

Unnamed: 0,Accession,staining,Genome Length (bp),Jumbophage,molGC (%),Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs,Molecule_DNA,Molecule_RNA,Molecule_ss-DNA,Molecule_ss-RNA
10954,HQ906664,negative,21272,0,35.836,28,10.714286,89.285714,91.444152,0,1,0,0,0


# Figure out what is wrong with genbank file for specific entries

It turns out that these entries simply do not have CDS in the genbank files that I am working with

In [49]:
# Read the GenBank file
for record in SeqIO.parse(genbank_file_path, "genbank"):
    
    # Only process the record with the specified ID
    if record.id == "MK250028.1":
        print(record.id)

        for feature in record.features:
            print(feature)


        break

MK250028.1
type: source
location: [0:550053](+)
qualifiers:
    Key: country, Value: ['Kenya']
    Key: db_xref, Value: ['taxon:2491291']
    Key: environmental_sample, Value: ['']
    Key: host, Value: ['Prevotella sp.']
    Key: isolation_source, Value: ['baboon feces']
    Key: metagenome_source, Value: ['gut metagenome']
    Key: mol_type, Value: ['genomic DNA']
    Key: note, Value: ['metagenomic']
    Key: organism, Value: ['Prevotella phage Lak-B9']



## How many of my entries lack the CDS number

In [53]:
original_shape = df2.shape[0]
filtered_shape = df2[df2['cds_number'] != 0].shape[0]
print("I lose "+str(original_shape-filtered_shape)+" entries")

I lose 885 entries


# Extracting other features from genbank file

In [55]:
df2.head()

Unnamed: 0,id,genome_length,gc_%,cds_number,positive_strand_%,negative_strand_%,coding_capacity,sequence,reverse_complement,base_id
903,MN335248,7045,60.298,14,85.71,14.29,95.940383,AGTACCGCCCGAATTTCGCAGCAACCCAACCGACGCAAGCCCAACC...,CACCCCAGTTCGATTCCGGGGGGAATGGGGGGTACGCCCCCCACGC...,MN335248
907,MK250029,540217,25.796,0,0.0,0.0,0.0,GGACAAAGTTTAAAATCAAGAATTGATAGAAAAACATTTAATAGCA...,TAGATTGAATAATTAAGCAAGGAGTTCATTTAGATGAGATAGTTAG...,MK250029
908,MK250028,550053,26.012,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TTTATTTTAAAATTGTCATATATAACGTTCTCTATTATCTTGTATA...,MK250028
909,MK250027,551627,26.022,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TGCATTACTTATACCAATATAATTACTTACCCATTCTATTCAATAC...,MK250027
910,MK250026,550702,26.02,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TTTATTTTAAAATTGTCATATATAACGTTCTCTATTATCTTGTATA...,MK250026


In [None]:
df2[df2['']].head()

In [62]:
sequence = df2[df2['id'] == "MK250029"]['sequence'].to_list()

### Fixing the barcodes

Current issue: the coding capacities are not being correctly computed because the genes and CDS are being calculated twice.
- in the pearl, they add this barcoding thing to avoid counting twice the same feature with the same barcode
- the problem is that I think that they might also be counting as cds things that are not cds (regulatory items)

In [80]:
# Step 1: Parse the GenBank file to get the DNA sequence for the desired record ID
for record in SeqIO.parse(genbank_file_path, "genbank"):
    if record.id == "MN335248.1":
        for feature in record.features:
            print(feature)
        break


type: source
location: [0:7045](+)
qualifiers:
    Key: collected_by, Value: ['Lopez-Vielma, C., Quinones-Aguilar, E., Rincon-Enriquez, E.']
    Key: country, Value: ['Mexico']
    Key: db_xref, Value: ['taxon:3071318']
    Key: environmental_sample, Value: ['']
    Key: host, Value: ['Xanthomonas vesicatoria']
    Key: isolation_source, Value: ['soil']
    Key: mol_type, Value: ['genomic DNA']
    Key: organism, Value: ['Vibrio phage XacF13']

type: regulatory
location: [7:27](+)
qualifiers:
    Key: inference, Value: ['alignment:phiSITE-PromoterHunter:']
    Key: regulatory_class, Value: ['minus_35_signal']

type: regulatory
location: [45:57](+)
qualifiers:
    Key: inference, Value: ['alignment:phiSITE-PromoterHunter:']
    Key: regulatory_class, Value: ['minus_10_signal']

type: gene
location: [65:362](+)
qualifiers:
    Key: locus_tag, Value: ['XaF13_p01']

type: CDS
location: [65:362](+)
qualifiers:
    Key: codon_start, Value: ['1']
    Key: inference, Value: ['similar to sequen

What I see in the code below is that there are other features besides the gene and the cds (Regulatory) which is also being counted as number of cds

In [123]:
# Read the GenBank file
for record in SeqIO.parse(genbank_file_path, "genbank"):
    if record.id == "MN335248.1":
        print(f"Processing record: {record.id}\n")
        
        sequence = str(record.seq)

        # Calculate genome length and GC content
        total_length = len(sequence)
        gc_content = round((sequence.count('C') + sequence.count('G')) / total_length * 100, 3)
        print(f"Total Length: {total_length}, GC Content: {gc_content}%\n")

        # Initialize counters
        plus = 0
        minus = 0
        coding_count = 0
        seen = set()  # Set to store seen barcodes

        for feature in record.features:
            # If the feature isn't a source, compute its barcode and check against seen features
            if feature.type != 'source':
                start = feature.location.start
                end = feature.location.end
                length = len(FeatureLocation(start, end))
                barcode = f"{start}_{end}_{length}"
                
                if barcode not in seen:
                    # print(f"New Barcode Detected: {barcode} for feature type {feature.type}")
                    coding_count += length
                    seen.add(barcode)
                else:
                    # print(f"Duplicate Barcode Detected: {barcode} for feature type {feature.type}")
                    pass
            if feature.type == 'CDS':
                if feature.location.strand == 1:
                    plus += 1
                elif feature.location.strand == -1:
                    minus += 1

        # Calculate total number of CDS
        total_CDS = plus + minus

        # Calculate strand usage as a percentage
        per_plus = round((plus / total_CDS) * 100, 2) if total_CDS != 0 else 0
        per_minus = round((minus / total_CDS) * 100, 2) if total_CDS != 0 else 0

        # Calculate coding capacity as a percentage
        coding_capacity = (coding_count / total_length) * 100

        print(f"\nCDS on + strand: {plus}")
        print(f"CDS on - strand: {minus}")
        print(f"Total CDS: {total_CDS}")
        print(f"Total count: {coding_count}")
        print(f"Positive Strand %: {per_plus}")
        print(f"Negative Strand %: {per_minus}")
        print(f"Coding Capacity: {coding_capacity}%\n")
        break

Processing record: MN335248.1

Total Length: 7045, GC Content: 60.298%


CDS on + strand: 12
CDS on - strand: 2
Total CDS: 14
Total count: 7343
Positive Strand %: 85.71
Negative Strand %: 14.29
Coding Capacity: 104.22995031937545%



I am trying now to avoid counting the regulatory type as number of cds

In [119]:
# Read the GenBank file
for record in SeqIO.parse(genbank_file_path, "genbank"):
    if record.id == "MN335248.1":
        print(f"Processing record: {record.id}\n")
        
        sequence = str(record.seq)

        # Calculate genome length and GC content
        total_length = len(sequence)
        gc_content = round((sequence.count('C') + sequence.count('G')) / total_length * 100, 3)
        print(f"Total Length: {total_length}, GC Content: {gc_content}%\n")

        # Initialize counters
        plus = 0
        minus = 0
        coding_count = 0
        seen = set()  # Set to store seen barcodes

        for feature in record.features:
            # Only consider 'gene' and 'CDS' for coding capacity
            if feature.type in ['gene', 'CDS']:
                start = feature.location.start
                end = feature.location.end
                length = len(FeatureLocation(start, end))
                barcode = f"{start}_{end}_{length}"
                
                if barcode not in seen:
                    # print(f"New Barcode Detected: {barcode} for feature type {feature.type}")
                    coding_count += length
                    seen.add(barcode)
                else:
                    # print(f"Duplicate Barcode Detected: {barcode} for feature type {feature.type}")
                    pass
                
            if feature.type == 'CDS':
                if feature.location.strand == 1:
                    plus += 1
                elif feature.location.strand == -1:
                    minus += 1

        # Calculate total number of CDS
        total_CDS = plus + minus

        # Calculate strand usage as a percentage
        per_plus = round((plus / total_CDS) * 100, 2) if total_CDS != 0 else 0
        per_minus = round((minus / total_CDS) * 100, 2) if total_CDS != 0 else 0

        # Calculate coding capacity as a percentage
        coding_capacity = (coding_count / total_length) * 100

        print(f"\nCDS on + strand: {plus}")
        print(f"CDS on - strand: {minus}")
        print(f"Total CDS: {total_CDS}")
        print(f"Total count: {coding_count}")
        print(f"Positive Strand %: {per_plus}")
        print(f"Negative Strand %: {per_minus}")
        print(f"Coding Capacity: {coding_capacity}%\n")
        break

Processing record: MN335248.1

Total Length: 7045, GC Content: 60.298%


CDS on + strand: 12
CDS on - strand: 2
Total CDS: 14
Total count: 7128
Positive Strand %: 85.71
Negative Strand %: 14.29
Coding Capacity: 101.17814052519518%



I want to check now the total count coming from genes and cds

In [129]:
from Bio import SeqIO
from Bio.SeqFeature import FeatureLocation

# Read the GenBank file
for record in SeqIO.parse(genbank_file_path, "genbank"):
    if record.id == "MN335248.1":
        print(f"Processing record: {record.id}\n")
        
        sequence = str(record.seq)

        # Calculate genome length and GC content
        total_length = len(sequence)
        gc_content = round((sequence.count('C') + sequence.count('G')) / total_length * 100, 3)
        print(f"Total Length: {total_length}, GC Content: {gc_content}%\n")

        # Initialize counters
        plus = 0
        minus = 0
        coding_count = 0
        gene_count = 0  # New counter for 'gene' feature
        cds_count = 0   # New counter for 'CDS' feature
        seen = set()  # Set to store seen barcodes

        for feature in record.features:
            # Only consider 'gene' and 'CDS' for coding capacity
            if feature.type in ['gene', 'CDS']:
                start = feature.location.start
                end = feature.location.end
                length = len(FeatureLocation(start, end))
                barcode = f"{start}_{end}_{length}"
                
                if barcode not in seen:
                    coding_count += length
                    seen.add(barcode)
                    if feature.type == 'gene':
                        gene_count += length
                    elif feature.type == 'CDS':
                        cds_count += length
                else:
                    pass
                
            if feature.type == 'CDS':
                if feature.location.strand == 1:
                    plus += 1
                elif feature.location.strand == -1:
                    minus += 1

        # Calculate total number of CDS
        total_CDS = plus + minus

        # Calculate strand usage as a percentage
        per_plus = round((plus / total_CDS) * 100, 2) if total_CDS != 0 else 0
        per_minus = round((minus / total_CDS) * 100, 2) if total_CDS != 0 else 0

        # Calculate coding capacity as a percentage
        coding_capacity = (coding_count / total_length) * 100

        print(f"\nCDS on + strand: {plus}")
        print(f"CDS on - strand: {minus}")
        print(f"Total CDS: {total_CDS}")
        print(f"Total count from 'gene': {gene_count}")
        print(f"Total count from 'CDS': {cds_count}")
        print(f"Total count (coding_count): {coding_count}")
        print(f"Positive Strand %: {per_plus}")
        print(f"Negative Strand %: {per_minus}")
        print(f"Coding Capacity: {coding_capacity}%\n")
        break


Processing record: MN335248.1

Total Length: 7045, GC Content: 60.298%


CDS on + strand: 12
CDS on - strand: 2
Total CDS: 14
Total count from 'gene': 6789
Total count from 'CDS': 339
Total count (coding_count): 7128
Positive Strand %: 85.71
Negative Strand %: 14.29
Coding Capacity: 101.17814052519518%



Seeing the same output in the phage with 255%/175% coding capacity in my code depending of code but which has 60 something in peral

In [137]:
from Bio import SeqIO
from Bio.SeqFeature import FeatureLocation

genbank_file_path2 =  "/Users/Alvaro/Downloads/sequence.gb"

# Read the GenBank file
for record in SeqIO.parse(genbank_file_path2, "genbank"):
    if record.id == "EF380009.1":
        print(f"Processing record: {record.id}\n")
        
        sequence = str(record.seq)

        # Calculate genome length and GC content
        total_length = len(sequence)
        gc_content = round((sequence.count('C') + sequence.count('G')) / total_length * 100, 3)
        print(f"Total Length: {total_length}, GC Content: {gc_content}%\n")

        # Initialize counters
        plus = 0
        minus = 0
        coding_count = 0
        gene_count = 0  # Counter for 'gene' feature
        cds_count = 0   # Counter for 'CDS' feature
        seen = set()  # Set to store seen barcodes

        for feature in record.features:
            # Only consider 'gene' and 'CDS' for coding capacity
            if feature.type in ['gene', 'CDS']:
                start = feature.location.start
                end = feature.location.end
                length = len(FeatureLocation(start, end))
                barcode = f"{start}_{end}_{length}"
                
                if barcode not in seen:
                    coding_count += length
                    seen.add(barcode)
                    if feature.type == 'gene':
                        gene_count += length
                        print(f"'gene' contribution: {length}, Barcode: {barcode}")
                    elif feature.type == 'CDS':
                        cds_count += length
                        print(f"'CDS' contribution: {length}, Barcode: {barcode}")
                else:
                    pass
                
            if feature.type == 'CDS':
                if feature.location.strand == 1:
                    plus += 1
                elif feature.location.strand == -1:
                    minus += 1

        # Calculate total number of CDS
        total_CDS = plus + minus

        # Calculate strand usage as a percentage
        per_plus = round((plus / total_CDS) * 100, 2) if total_CDS != 0 else 0
        per_minus = round((minus / total_CDS) * 100, 2) if total_CDS != 0 else 0

        # Calculate coding capacity as a percentage
        coding_capacity = (coding_count / total_length) * 100

        print(f"\nCDS on + strand: {plus}")
        print(f"CDS on - strand: {minus}")
        print(f"Total CDS: {total_CDS}")
        print(f"Total count from 'gene': {gene_count}")
        print(f"Total count from 'CDS': {cds_count}")
        print(f"Total count (coding_count): {coding_count}")
        print(f"Positive Strand %: {per_plus}")
        print(f"Negative Strand %: {per_minus}")
        print(f"Coding Capacity: {coding_capacity}%\n")
        break


Processing record: EF380009.1

Total Length: 5386, GC Content: 44.653%

'CDS' contribution: 5386, Barcode: 0_5386_5386
'CDS' contribution: 171, Barcode: 50_221_171
'CDS' contribution: 237, Barcode: 132_369_237
'CDS' contribution: 459, Barcode: 389_848_459
'CDS' contribution: 276, Barcode: 567_843_276
'CDS' contribution: 117, Barcode: 847_964_117
'CDS' contribution: 1284, Barcode: 1000_2284_1284
'CDS' contribution: 528, Barcode: 2394_2922_528
'CDS' contribution: 987, Barcode: 2930_3917_987

CDS on + strand: 11
CDS on - strand: 0
Total CDS: 11
Total count from 'gene': 0
Total count from 'CDS': 9445
Total count (coding_count): 9445
Positive Strand %: 100.0
Negative Strand %: 0.0
Coding Capacity: 175.3620497586335%



In [136]:
## Obtener coding count manualmente para esa entry
cds1 = 5386-3981+136-1
cds2 = 5386-4497+136-1
cds3 = 5386-5075+51-1
cds4 = 221-51
cds5 = 369-133
cds6 = 848-390
cds7 = 843-568
cds8 = 964-848
cds9= 2284-1001
cds10 = 2922-2395
cds11 = 3917-2931

total = cds1+cds2+cds3+cds4+cds5+cds6+cds7+cds8+cds9+cds10+cds11
print(total)

6976


Quiero ver como se procesan los valores de inicio y final de secuencia cuando se trata de un join() (Ver genbank file)

In [8]:

genbank_file_path2 =  "/Users/Alvaro/Desktop/phage/sequence.gb"

# Read the GenBank file
for record in SeqIO.parse(genbank_file_path2, "genbank"):
    print(record.annotations.get('topology', 'N/A'))
    if record.id == "EF380009.1":
        total_length = 0
        feature_no = 1
        for feature in record.features:
            
            if feature.type == 'CDS':
                start = feature.location.start
                end = feature.location.end
                length = len(FeatureLocation(start, end))
                length2 = len(feature.location)
                total_length += length2
                barcode = f"{start}_{end}_{length2}"
                # print(feature)
                print(start, end, length, barcode)
                print("Feature number ", str(feature_no), 'has length ',str(length2), "according to compound location and", str(length), "according to pythoned perl")
                feature_no += 1
                print()
                print()



circular
0 5386 5386 0_5386_1542
Feature number  1 has length  1542 according to compound location and 5386 according to pythoned perl

['__abstractmethods__', '__add__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_flip', '_get_strand', '_set_strand', '_shift', 'end', 'extract', 'fromstring', 'nofuzzy_end', 'nofuzzy_start', 'operator', 'parts', 'ref', 'ref_db', 'start', 'strand']
0 5386 5386 0_5386_1026
Feature number  2 has length  1026 according to compound location and 5386 according to pythoned perl

['__abstractmethods__', '__add__', '__class__', '__contains__', '__delattr__', '__dict__', '__di

**Seeing the output with the new length measurement**
I have managed now to achieve a coding capacity of 120 but it is still way off the 60 something % from pearl

In [28]:
from Bio import SeqIO
from Bio.SeqFeature import FeatureLocation

genbank_file_path2 =  "/Users/Alvaro/Desktop/phage/sequence.gb"

# Read the GenBank file
for record in SeqIO.parse(genbank_file_path2, "genbank"):
    if record.id == "EF380009.1":
        print(f"Processing record: {record.id}\n")
        
        sequence = str(record.seq)

        # Calculate genome length and GC content
        total_length = len(sequence)
        gc_content = round((sequence.count('C') + sequence.count('G')) / total_length * 100, 3)
        print(f"Total Length: {total_length}, GC Content: {gc_content}%\n")

        # Initialize counters
        plus = 0
        minus = 0
        coding_count = 0
        gene_count = 0  # Counter for 'gene' feature
        cds_count = 0   # Counter for 'CDS' feature
        seen = set()  # Set to store seen barcodes

        for feature in record.features:
            # Only consider 'gene' and 'CDS' for coding capacity
            if feature.type in ['gene', 'CDS']:
                start = feature.location.start
                end = feature.location.end
                length = len(feature.location)
                barcode = f"{start}_{end}_{length}"
                
                if barcode not in seen:
                    coding_count += length
                    seen.add(barcode)
                    if feature.type == 'gene':
                        gene_count += length
                        print(f"'gene' contribution: {length}, Barcode: {barcode}")
                    elif feature.type == 'CDS':
                        cds_count += length
                        print(f"'CDS' contribution: {length}, Barcode: {barcode}")
                else:
                    pass
                
            if feature.type == 'CDS':
                if feature.location.strand == 1:
                    plus += 1
                elif feature.location.strand == -1:
                    minus += 1

        # Calculate total number of CDS
        total_CDS = plus + minus

        # Calculate strand usage as a percentage
        per_plus = round((plus / total_CDS) * 100, 2) if total_CDS != 0 else 0
        per_minus = round((minus / total_CDS) * 100, 2) if total_CDS != 0 else 0

        # Calculate coding capacity as a percentage
        coding_capacity = (coding_count / total_length) * 100

        print(f"\nCDS on + strand: {plus}")
        print(f"CDS on - strand: {minus}")
        print(f"Total CDS: {total_CDS}")
        print(f"Total count from 'gene': {gene_count}")
        print(f"Total count from 'CDS': {cds_count}")
        print(f"Total count (coding_count): {coding_count}")
        print(f"Total sequence length: {total_length}")
        print(f"Positive Strand %: {per_plus}")
        print(f"Negative Strand %: {per_minus}")
        print(f"Coding Capacity: {coding_capacity}%\n")
        break


Processing record: EF380009.1

Total Length: 5386, GC Content: 44.653%

'CDS' contribution: 1542, Barcode: 0_5386_1542
'CDS' contribution: 1026, Barcode: 0_5386_1026
'CDS' contribution: 363, Barcode: 0_5386_363
'CDS' contribution: 171, Barcode: 50_221_171
'CDS' contribution: 237, Barcode: 132_369_237
'CDS' contribution: 459, Barcode: 389_848_459
'CDS' contribution: 276, Barcode: 567_843_276
'CDS' contribution: 117, Barcode: 847_964_117
'CDS' contribution: 1284, Barcode: 1000_2284_1284
'CDS' contribution: 528, Barcode: 2394_2922_528
'CDS' contribution: 987, Barcode: 2930_3917_987

CDS on + strand: 11
CDS on - strand: 0
Total CDS: 11
Total count from 'gene': 0
Total count from 'CDS': 6990
Total count (coding_count): 6990
Total sequence length: 5386
Positive Strand %: 100.0
Negative Strand %: 0.0
Coding Capacity: 129.78091347939102%



Investigate the feature.location

In [25]:

genbank_file_path2 =  "/Users/Alvaro/Desktop/phage/sequence.gb"

# Read the GenBank file
for record in SeqIO.parse(genbank_file_path2, "genbank"):

    if record.id == "EF380009.1":
        total_length = 0
        feature_no = 1
        for feature in record.features:
            
            if feature.type == 'CDS':
                print("Supposed start of sequence: ", str(feature.location.start))

                print("Supposed end of sequence: ", str(feature.location.end))


                #print(feature.extract)
                print("Real start and end: ",str(feature.location))

                print(len(list(feature.location)))
                
                # print(feature.location.operator)
                # print(feature.location.parts)
                # print(feature.location.ref)
                # print(feature.location.ref_db)
                # print(feature.location.strand)
                print("Feature number ", str(feature_no), 'has length ',str(len(feature.location)), "according to compound location and", str(len(FeatureLocation(feature.location.start, feature.location.end))), "according to pythoned perl")
                feature_no += 1
                print(dir(feature.location))
                print()
                break



Supposed start of sequence:  0
Supposed end of sequence:  5386
Real start and end:  join{[3980:5386](+), [0:136](+)}
1542
Feature number  1 has length  1542 according to compound location and 5386 according to pythoned perl
['__abstractmethods__', '__add__', '__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__nonzero__', '__radd__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_flip', '_get_strand', '_set_strand', '_shift', 'end', 'extract', 'fromstring', 'nofuzzy_end', 'nofuzzy_start', 'operator', 'parts', 'ref', 'ref_db', 'start', 'strand']



Try to get the correct barcode myself

In [35]:
from Bio import SeqIO
from Bio.SeqFeature import FeatureLocation

genbank_file_path2 =  "/Users/Alvaro/Desktop/phage/sequence.gb"

# Read the GenBank file
for record in SeqIO.parse(genbank_file_path2, "genbank"):
    if record.id == "EF380009.1":
        print(f"Processing record: {record.id}\n")
        
        sequence = str(record.seq)

        # Calculate genome length and GC content
        total_length = len(sequence)
        gc_content = round((sequence.count('C') + sequence.count('G')) / total_length * 100, 3)
        print(f"Total Length: {total_length}, GC Content: {gc_content}%\n")

        # Initialize counters
        plus = 0
        minus = 0
        coding_count = 0
        gene_count = 0  # Counter for 'gene' feature
        cds_count = 0   # Counter for 'CDS' feature
        seen = set()  # Set to store seen barcodes

        for feature in record.features:
            # Only consider 'gene' and 'CDS' for coding capacity
            if feature.type in ['gene', 'CDS']:
                sequence_positions = list(feature.location)
                start = sequence_positions[0]
                end = sequence_positions[-1]
                length = len(feature.location)
                barcode = f"{start}_{end}_{length}"
                
                if barcode not in seen:
                    coding_count += length
                    seen.add(barcode)
                    if feature.type == 'gene':
                        gene_count += length
                        print(f"'gene' contribution: {length}, Barcode: {barcode}")
                    elif feature.type == 'CDS':
                        cds_count += length
                        print(f"'CDS' contribution: {length}, Barcode: {barcode}")
                else:
                    pass
                
            if feature.type == 'CDS':
                if feature.location.strand == 1:
                    plus += 1
                elif feature.location.strand == -1:
                    minus += 1

        # Calculate total number of CDS
        total_CDS = plus + minus

        # Calculate strand usage as a percentage
        per_plus = round((plus / total_CDS) * 100, 2) if total_CDS != 0 else 0
        per_minus = round((minus / total_CDS) * 100, 2) if total_CDS != 0 else 0

        # Calculate coding capacity as a percentage
        coding_capacity = (coding_count / total_length) * 100

        print(f"\nCDS on + strand: {plus}")
        print(f"CDS on - strand: {minus}")
        print(f"Total CDS: {total_CDS}")
        print(f"Total count from 'gene': {gene_count}")
        print(f"Total count from 'CDS': {cds_count}")
        print(f"Total count (coding_count): {coding_count}")
        print(f"Total sequence length: {total_length}")
        print(f"Positive Strand %: {per_plus}")
        print(f"Negative Strand %: {per_minus}")
        print(f"Coding Capacity: {coding_capacity}%\n")
        break


Processing record: EF380009.1

Total Length: 5386, GC Content: 44.653%

'CDS' contribution: 1542, Barcode: 3980_135_1542
'CDS' contribution: 1026, Barcode: 4496_135_1026
'CDS' contribution: 363, Barcode: 5074_50_363
'CDS' contribution: 171, Barcode: 50_220_171
'CDS' contribution: 237, Barcode: 132_368_237
'CDS' contribution: 459, Barcode: 389_847_459
'CDS' contribution: 276, Barcode: 567_842_276
'CDS' contribution: 117, Barcode: 847_963_117
'CDS' contribution: 1284, Barcode: 1000_2283_1284
'CDS' contribution: 528, Barcode: 2394_2921_528
'CDS' contribution: 987, Barcode: 2930_3916_987

CDS on + strand: 11
CDS on - strand: 0
Total CDS: 11
Total count from 'gene': 0
Total count from 'CDS': 6990
Total count (coding_count): 6990
Total sequence length: 5386
Positive Strand %: 100.0
Negative Strand %: 0.0
Coding Capacity: 129.78091347939102%



## Code to avoid overlap in coding capacity

In [43]:
def calculate_coding_capacity_from_features(features, total_length):
    covered_positions = set()  # Set to store covered sequence positions

    for feature in features:
        if feature.type in ['gene', 'CDS']:
            # Get the positions covered by this feature
            positions = list(feature.location)
            # Add these positions to the set (automatically takes care of duplicates)
            covered_positions.update(positions)

    # Calculate coding capacity
    coding_capacity_percentage = (len(covered_positions) / total_length) * 100
    return coding_capacity_percentage

In [67]:
genbank_file_path2 = "/Users/Alvaro/Desktop/phage/sequence.gb"

# Read the GenBank file
for record in SeqIO.parse(genbank_file_path2, "genbank"):
    if record.id == "EF380009.1":
        print(f"Processing record: {record.id}\n")
        
        sequence = str(record.seq)

        # Calculate genome length and GC content
        total_length = len(sequence)
        gc_content = round((sequence.count('C') + sequence.count('G')) / total_length * 100, 3)
        print(f"Total Length: {total_length}, GC Content: {gc_content}%\n")

        # Initialize counters
        plus = 0
        minus = 0
        seen = set()  # Set to store seen barcodes
        covered_positions = set()  # Set to store covered sequence positions

        for feature in record.features:
            # Only consider 'gene' and 'CDS' for coding capacity
            if feature.type in ['gene', 'CDS']:
                sequence_positions = list(feature.location)
                start = sequence_positions[0]
                end = sequence_positions[-1]
                length = len(feature.location)
                barcode = f"{start}_{end}_{length}"

                # Get the positions covered by this feature
                positions = list(feature.location)
                # Add these positions to the set (automatically takes care of duplicates)
                covered_positions.update(positions)

                if barcode not in seen:
                    seen.add(barcode)
                    if feature.type == 'gene':
                        print(f"'gene' contribution: {length}, Barcode: {barcode}")
                    elif feature.type == 'CDS':
                        print(f"'CDS' contribution: {length}, Barcode: {barcode}")
                
            if feature.type == 'CDS':
                if feature.location.strand == 1:
                    plus += 1
                elif feature.location.strand == -1:
                    minus += 1

        # Calculate total number of CDS
        total_CDS = plus + minus

        # Calculate strand usage as a percentage
        per_plus = round((plus / total_CDS) * 100, 2) if total_CDS != 0 else 0
        per_minus = round((minus / total_CDS) * 100, 2) if total_CDS != 0 else 0

        # Calculate coding capacity using the updated function
        coding_capacity = (len(covered_positions) / total_length) * 100

        print(f"\nCDS on + strand: {plus}")
        print(f"CDS on - strand: {minus}")
        print(f"Total CDS: {total_CDS}")
        print(f"Total sequence length: {total_length}")
        print(f"Total count (coding_count): {len(covered_positions)}")
        print(f"Positive Strand %: {per_plus}")
        print(f"Negative Strand %: {per_minus}")
        print(f"Coding Capacity: {coding_capacity}%\n")
        break

Processing record: EF380009.1

Total Length: 5386, GC Content: 44.653%

'CDS' contribution: 1542, Barcode: 3980_135_1542
'CDS' contribution: 1026, Barcode: 4496_135_1026
'CDS' contribution: 363, Barcode: 5074_50_363
'CDS' contribution: 171, Barcode: 50_220_171
'CDS' contribution: 237, Barcode: 132_368_237
'CDS' contribution: 459, Barcode: 389_847_459
'CDS' contribution: 276, Barcode: 567_842_276
'CDS' contribution: 117, Barcode: 847_963_117
'CDS' contribution: 1284, Barcode: 1000_2283_1284
'CDS' contribution: 528, Barcode: 2394_2921_528
'CDS' contribution: 987, Barcode: 2930_3916_987

CDS on + strand: 11
CDS on - strand: 0
Total CDS: 11
Total sequence length: 5386
Total count (coding_count): 5149
Positive Strand %: 100.0
Negative Strand %: 0.0
Coding Capacity: 95.59970293353139%



# Code for Powerpoint
I want to obtain
- The dataframe from Pearl (other jupyter notebook)
- The dataframe that I had originally (well, actually, not originally, but after including the barcodes to avoid counting a same gene/cds twice)
- The dataframe that I have that avoids the overlaps


## Pearl dataframe

In [65]:
df = pd.read_csv("../data/processed/model_data.csv")
df.head()

Unnamed: 0,Accession,staining,Genome Length (bp),Jumbophage,molGC (%),Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs,Molecule_DNA,Molecule_RNA,Molecule_ss-DNA,Molecule_ss-RNA
0,MN335248,negative,7045,0,60.298,13,84.615385,15.384615,88.828957,0,0,0,1,0
1,MK250029,negative,540217,1,25.796,830,47.108434,52.891566,68.324951,30,1,0,0,0
2,MK250028,negative,550053,1,26.012,859,52.270081,47.729919,69.188424,29,1,0,0,0
3,MK250027,negative,551627,1,26.022,860,53.023256,46.976744,69.318761,33,1,0,0,0
4,MK250026,negative,550702,1,26.02,859,53.201397,46.798603,69.363285,33,1,0,0,0


In [66]:
df[df['Accession'] == "EF380009"]

Unnamed: 0,Accession,staining,Genome Length (bp),Jumbophage,molGC (%),Number CDS,Positive Strand (%),Negative Strand (%),Coding Capacity (%),tRNAs,Molecule_DNA,Molecule_RNA,Molecule_ss-DNA,Molecule_ss-RNA
11151,EF380009,negative,5386,0,44.653,6,100.0,0.0,65.837356,0,0,0,1,0


In [46]:
accession_values = pd.read_csv('../data/processed/model_data.csv')
# List of 'Accession' values from the first DataFrame
accession_values = accession_values['Accession'].tolist()

## Original dataframe (with barcodes)

This avoids a gene and a cds from being counted twice if they are the same

In [47]:
genbank_file_path = "../data/raw/inphared_8Sep2023/8Sep2023_phages_downloaded_from_genbank.gb"
# Lists to hold data
ids = []
genome_lengths = []
gc_contents = []
sequences = []
reverse_complements = []
cds_numbers = []
positive_strands = []
negative_strands = []
coding_capacities = []
molecule_types = []
topologies = []
trna_counts = []

# Read the GenBank file
for record in SeqIO.parse(genbank_file_path, "genbank"):
    try:
        # Attempt to access the sequence, which may raise UndefinedSequenceError
        sequence = str(record.seq)
        print(record.id)
    except UndefinedSequenceError:
        print(f"Skipping record {record.id} as sequence is undefined.")
        continue  # Skip this record

    # Calculate genome length and GC content
    total_length = len(sequence)
    gc_content = round((sequence.count('C') + sequence.count('G')) / total_length * 100, 3)

    # Initialize counters
    plus = 0
    minus = 0
    coding_count = 0
    trna_count = 0
    seen = set()  # Store seen barcodes

    for feature in record.features:
        start = feature.location.start
        end = feature.location.end
        length = len(FeatureLocation(start, end))
        barcode = f"{start}_{end}_{length}"

        if feature.type != 'source' and barcode not in seen:
            coding_count += length
            seen.add(barcode)

        if feature.type == 'CDS':
            if feature.location.strand == 1:
                plus += 1
            elif feature.location.strand == -1:
                minus += 1
        elif feature.type == 'tRNA':
            trna_count += 1


    # Calculate total number of CDS
    total_CDS = plus + minus

    # Calculate strand usage as a percentage
    per_plus = round((plus / total_CDS) * 100, 2) if total_CDS != 0 else 0
    per_minus = round((minus / total_CDS) * 100, 2) if total_CDS != 0 else 0

    # Calculate coding capacity as a percentage
    coding_capacity = (coding_count / total_length) * 100

    # Extract molecule_type and topology
    molecule_type = record.annotations.get('molecule_type', 'N/A')
    topology = record.annotations.get('topology', 'N/A')

    # Append data to lists
    ids.append(record.id)
    genome_lengths.append(total_length)
    gc_contents.append(gc_content)
    sequences.append(sequence)
    reverse_complements.append(str(sequence[::-1]))
    cds_numbers.append(total_CDS)
    positive_strands.append(per_plus)
    negative_strands.append(per_minus)
    coding_capacities.append(coding_capacity)
    molecule_types.append(molecule_type)
    topologies.append(topology)
    trna_counts.append(trna_count)

# Convert lists to pandas DataFrame
df = pd.DataFrame({
    'id': ids,
    'genome_length': genome_lengths,
    'gc_%': gc_contents,
    'sequence': sequences,
    'reverse_complement': reverse_complements,
    'cds_number': cds_numbers,
    'positive_strand_%': positive_strands,
    'negative_strand_%': negative_strands,
    'coding_capacity': coding_capacities,
    'molecule_type': molecule_types,
    'topology': topologies,
    'trna_count': trna_counts
})

df = df[['id', 'genome_length',  'molecule_type', 'topology', 'gc_%', 'trna_count','cds_number', 'coding_capacity','positive_strand_%','negative_strand_%' ,'sequence', 'reverse_complement']]
print(df['coding_capacity'].mean())
df['id'] = df['id'].str[:-2]
# Filter sequences_df based on 'id' column
df = df[df['id'].isin(accession_values)]
print(df.shape)
df.head()

GU339467.1
KT462701.1
KT462700.1
KT462698.1
Skipping record NZ_CP014526.1 as sequence is undefined.
Skipping record NC_022901.1 as sequence is undefined.
Skipping record NZ_CP038625.1 as sequence is undefined.
Skipping record NZ_CP023686.1 as sequence is undefined.
Skipping record NZ_CP023680.1 as sequence is undefined.
Skipping record NZ_CP019275.1 as sequence is undefined.
MF417929.1
MH616963.1
MH552500.1
BK010471.1
NR_074910.1
AC171169.12
BK014221.1
BK014220.1
BK014219.1
BK014218.1
BK014217.1
BK014216.1
BK014215.1
BK014214.1
BK014213.1
BK014212.1
BK014211.1
BK014210.1
BK014209.1
BK014208.1
BK014207.1
BK014206.1
BK014205.1
BK014204.1
BK014203.1
BK014202.1
BK014201.1
BK014200.1
BK014199.1
BK014198.1
BK014197.1
BK014196.1
BK014195.1
BK014194.1
BK014193.1
BK014192.1
BK014191.1
BK014190.1
BK014189.1
BK014188.1
BK014187.1
BK014186.1
BK014185.1
BK014184.1
BK014183.1
BK014182.1
BK014181.1
BK014180.1
BK014179.1
BK014178.1
BK014177.1
BK014176.1
BK014175.1
BK014174.1
BK014173.1
BK014172.1
BK01

Unnamed: 0,id,genome_length,molecule_type,topology,gc_%,trna_count,cds_number,coding_capacity,positive_strand_%,negative_strand_%,sequence,reverse_complement
903,MN335248,7045,ss-DNA,linear,60.298,0,14,104.22995,85.71,14.29,AGTACCGCCCGAATTTCGCAGCAACCCAACCGACGCAAGCCCAACC...,CACCCCAGTTCGATTCCGGGGGGAATGGGGGGTACGCCCCCCACGC...
907,MK250029,540217,DNA,circular,25.796,0,0,0.0,0.0,0.0,GGACAAAGTTTAAAATCAAGAATTGATAGAAAAACATTTAATAGCA...,TAGATTGAATAATTAAGCAAGGAGTTCATTTAGATGAGATAGTTAG...
908,MK250028,550053,DNA,circular,26.012,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TTTATTTTAAAATTGTCATATATAACGTTCTCTATTATCTTGTATA...
909,MK250027,551627,DNA,circular,26.022,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TGCATTACTTATACCAATATAATTACTTACCCATTCTATTCAATAC...
910,MK250026,550702,DNA,circular,26.02,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TTTATTTTAAAATTGTCATATATAACGTTCTCTATTATCTTGTATA...


There are entries where the cds number is 0 but the coding capacity is not 0. This means that this data is being obtained from genes

In [51]:
df[(df['cds_number'] == 0) & (df['coding_capacity'] != 0)].head()

Unnamed: 0,id,genome_length,molecule_type,topology,gc_%,trna_count,cds_number,coding_capacity,positive_strand_%,negative_strand_%,sequence,reverse_complement
4638,LC710219,4189,RNA,linear,47.362,0,0,105.609931,0.0,0.0,CCTTAGGGGGTCACCTCACACAGCAGTATTTCACTGAGTATGAGAG...,GGGAGAGAGTGAGTATTCTCATTAACACTATTAAATAAGTGTTAAT...
4639,LC710218,3605,RNA,linear,51.928,0,0,95.755895,0.0,0.0,TACCAGATCTATCGCTGTGACTGGAGTTCAGACGTGTGCTCTTCCG...,ATGCTCTAGATAGCGACACTGACCTCAAGTCTGCACACGAGAAGGC...
4640,LC710217,3576,RNA,linear,51.846,0,0,96.224832,0.0,0.0,GGCCGATTCTATTTTTCGCCGTTTACCTGCTTATCTTTTTCCTGCT...,GCGGTTCGTCGATCAATGGTTTAGCCCTCTTAGGGCCCAGGAGAGA...
5048,OQ031496,138717,DNA,linear,30.288,0,0,0.245824,0.0,0.0,AGAAAGATATAAATGGATGGAAAAGAACTAATTAAGATAGCACAAG...,GGAATCTTACAAAGACTTAAAAGGAGTAAGTAATGAAAAAGGAAAA...
5049,OQ031495,169098,DNA,linear,36.861,0,0,2.144319,0.0,0.0,ATGATTATTGAAACGGCTAAAGAAACGATTATTGGTTCAGGCGGTA...,TAAAAAGAGAGTAGTGTTCAATGCAAATTTTTAAAGACGATGAAGT...


In [55]:
(df['coding_capacity'] == 0).sum()

862

In [59]:
'''df.to_pickle("../data/interim/genbank_engineering/genbank_eng_overlap.pkl")'''

In [None]:
df = df[df['coding_capacity'] != 0]

## New dataframe avoiding overlapping with barcodes

This avoid that two different cds occupying the same space are counted as two different contributions to the total count

In [48]:
genbank_file_path = "../data/raw/inphared_8Sep2023/8Sep2023_phages_downloaded_from_genbank.gb"
# Lists to hold data
ids = []
genome_lengths = []
gc_contents = []
sequences = []
reverse_complements = []
cds_numbers = []
positive_strands = []
negative_strands = []
coding_capacities = []
molecule_types = []
topologies = []
trna_counts = []

# Read the GenBank file
for record in SeqIO.parse(genbank_file_path, "genbank"):
    try:
        # Attempt to access the sequence, which may raise UndefinedSequenceError
        sequence = str(record.seq)
        print(record.id)
    except UndefinedSequenceError:
        print(f"Skipping record {record.id} as sequence is undefined.")
        continue  # Skip this record

    # Calculate genome length and GC content
    total_length = len(sequence)
    gc_content = round((sequence.count('C') + sequence.count('G')) / total_length * 100, 3)

    # Initialize counters
    plus = 0
    minus = 0
    coding_count = 0
    trna_count = 0
    seen = set()  # Store seen barcodes
    covered_positions = set()  # Set to store covered sequence positions

    for feature in record.features:
        # Only consider 'gene' and 'CDS' for coding capacity
        if feature.type in ['gene', 'CDS']:
            start = sequence_positions[0]
            end = sequence_positions[-1]
            length = len(feature.location)
            barcode = f"{start}_{end}_{length}"

            # Get the positions covered by this feature
            positions = list(feature.location)
            # Add these positions to the set (automatically takes care of duplicates)
            covered_positions.update(positions)

            if barcode not in seen:
                seen.add(barcode)

        if feature.type == 'CDS':
            if feature.location.strand == 1:
                plus += 1
            elif feature.location.strand == -1:
                minus += 1
        elif feature.type == 'tRNA':
            trna_count += 1


    # Calculate total number of CDS
    total_CDS = plus + minus

    # Calculate strand usage as a percentage
    per_plus = round((plus / total_CDS) * 100, 2) if total_CDS != 0 else 0
    per_minus = round((minus / total_CDS) * 100, 2) if total_CDS != 0 else 0

    # Calculate coding capacity as a percentage
    coding_capacity = (len(covered_positions) / total_length) * 100

    # Extract molecule_type and topology
    molecule_type = record.annotations.get('molecule_type', 'N/A')
    topology = record.annotations.get('topology', 'N/A')

    # Append data to lists
    ids.append(record.id)
    genome_lengths.append(total_length)
    gc_contents.append(gc_content)
    sequences.append(sequence)
    reverse_complements.append(str(sequence[::-1]))
    cds_numbers.append(total_CDS)
    positive_strands.append(per_plus)
    negative_strands.append(per_minus)
    coding_capacities.append(coding_capacity)
    molecule_types.append(molecule_type)
    topologies.append(topology)
    trna_counts.append(trna_count)

# Convert lists to pandas DataFrame
df2 = pd.DataFrame({
    'id': ids,
    'genome_length': genome_lengths,
    'gc_%': gc_contents,
    'sequence': sequences,
    'reverse_complement': reverse_complements,
    'cds_number': cds_numbers,
    'positive_strand_%': positive_strands,
    'negative_strand_%': negative_strands,
    'coding_capacity': coding_capacities,
    'molecule_type': molecule_types,
    'topology': topologies,
    'trna_count': trna_counts
})

df2 = df2[['id', 'genome_length',  'molecule_type', 'topology', 'gc_%', 'trna_count','cds_number', 'coding_capacity','positive_strand_%','negative_strand_%' ,'sequence', 'reverse_complement']]
print(df2['coding_capacity'].mean())
df2['id'] = df2['id'].str[:-2]
# Filter sequences_df based on 'id' column
df2 = df2[df2['id'].isin(accession_values)]
print(df2.shape)
df2.head()

GU339467.1
KT462701.1
KT462700.1
KT462698.1
Skipping record NZ_CP014526.1 as sequence is undefined.
Skipping record NC_022901.1 as sequence is undefined.
Skipping record NZ_CP038625.1 as sequence is undefined.
Skipping record NZ_CP023686.1 as sequence is undefined.
Skipping record NZ_CP023680.1 as sequence is undefined.
Skipping record NZ_CP019275.1 as sequence is undefined.
MF417929.1
MH616963.1
MH552500.1
BK010471.1
NR_074910.1
AC171169.12
BK014221.1
BK014220.1
BK014219.1
BK014218.1
BK014217.1
BK014216.1
BK014215.1
BK014214.1
BK014213.1
BK014212.1
BK014211.1
BK014210.1
BK014209.1
BK014208.1
BK014207.1
BK014206.1
BK014205.1
BK014204.1
BK014203.1
BK014202.1
BK014201.1
BK014200.1
BK014199.1
BK014198.1
BK014197.1
BK014196.1
BK014195.1
BK014194.1
BK014193.1
BK014192.1
BK014191.1
BK014190.1
BK014189.1
BK014188.1
BK014187.1
BK014186.1
BK014185.1
BK014184.1
BK014183.1
BK014182.1
BK014181.1
BK014180.1
BK014179.1
BK014178.1
BK014177.1
BK014176.1
BK014175.1
BK014174.1
BK014173.1
BK014172.1
BK01

Unnamed: 0,id,genome_length,molecule_type,topology,gc_%,trna_count,cds_number,coding_capacity,positive_strand_%,negative_strand_%,sequence,reverse_complement
903,MN335248,7045,ss-DNA,linear,60.298,0,14,94.918382,85.71,14.29,AGTACCGCCCGAATTTCGCAGCAACCCAACCGACGCAAGCCCAACC...,CACCCCAGTTCGATTCCGGGGGGAATGGGGGGTACGCCCCCCACGC...
907,MK250029,540217,DNA,circular,25.796,0,0,0.0,0.0,0.0,GGACAAAGTTTAAAATCAAGAATTGATAGAAAAACATTTAATAGCA...,TAGATTGAATAATTAAGCAAGGAGTTCATTTAGATGAGATAGTTAG...
908,MK250028,550053,DNA,circular,26.012,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TTTATTTTAAAATTGTCATATATAACGTTCTCTATTATCTTGTATA...
909,MK250027,551627,DNA,circular,26.022,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TGCATTACTTATACCAATATAATTACTTACCCATTCTATTCAATAC...
910,MK250026,550702,DNA,circular,26.02,0,0,0.0,0.0,0.0,TTACTCATAACTTATCTTACCCATTCATTAATATAACCATATTCAT...,TTTATTTTAAAATTGTCATATATAACGTTCTCTATTATCTTGTATA...


In [52]:
df2[(df2['cds_number'] == 0) & (df2['coding_capacity'] != 0)].head()

Unnamed: 0,id,genome_length,molecule_type,topology,gc_%,trna_count,cds_number,coding_capacity,positive_strand_%,negative_strand_%,sequence,reverse_complement
4638,LC710219,4189,RNA,linear,47.362,0,0,96.013368,0.0,0.0,CCTTAGGGGGTCACCTCACACAGCAGTATTTCACTGAGTATGAGAG...,GGGAGAGAGTGAGTATTCTCATTAACACTATTAAATAAGTGTTAAT...
4639,LC710218,3605,RNA,linear,51.928,0,0,90.208044,0.0,0.0,TACCAGATCTATCGCTGTGACTGGAGTTCAGACGTGTGCTCTTCCG...,ATGCTCTAGATAGCGACACTGACCTCAAGTCTGCACACGAGAAGGC...
4640,LC710217,3576,RNA,linear,51.846,0,0,90.771812,0.0,0.0,GGCCGATTCTATTTTTCGCCGTTTACCTGCTTATCTTTTTCCTGCT...,GCGGTTCGTCGATCAATGGTTTAGCCCTCTTAGGGCCCAGGAGAGA...
30350,KY695241,11696,DNA,linear,34.986,0,0,93.75855,0.0,0.0,GCAGAGGAGCCGTATGAGGTGAAAGTCTCACGTACGGTTCTGCAGC...,TATATGTAGAGAACTAAGAAGTTCATTGGTGAGAAGATGCGCGGAC...
30352,KY695239,11689,DNA,linear,34.118,0,0,90.589443,0.0,0.0,TAAATTTCGAAAATTAAAGATTTTTTGGACATAATGGAATTACTCT...,AGACGAGGATCGAAGTCAGTCATTGAAATGATAAAGGTTAAAAACA...


In [56]:
(df2['coding_capacity'] == 0).sum()

876

In [58]:
# '''df2.to_pickle("../data/interim/genbank_engineering/genbank_eng_no_overlap.pkl")'''

## Checking the difference in the rows with 0 coding capcaity between df and df2

In [60]:
# Filter both dataframes to only rows where coding_capacity is 0
df_zero = df[df['coding_capacity'] == 0]
df2_zero = df2[df2['coding_capacity'] == 0]

# Use merge with the indicator parameter
only_in_df2 = pd.merge(df_zero, df2_zero, how='outer', indicator=True)

# Filter the result to only rows that are in df2 but not in df
only_in_df2 = only_in_df2[only_in_df2['_merge'] == 'right_only'].drop(columns=['_merge'])
only_in_df2

Unnamed: 0,id,genome_length,molecule_type,topology,gc_%,trna_count,cds_number,coding_capacity,positive_strand_%,negative_strand_%,sequence,reverse_complement
862,OQ031496,138717,DNA,linear,30.288,0,0,0.0,0.0,0.0,AGAAAGATATAAATGGATGGAAAAGAACTAATTAAGATAGCACAAG...,GGAATCTTACAAAGACTTAAAAGGAGTAAGTAATGAAAAAGGAAAA...
863,OQ031495,169098,DNA,linear,36.861,0,0,0.0,0.0,0.0,ATGATTATTGAAACGGCTAAAGAAACGATTATTGGTTCAGGCGGTA...,TAAAAAGAGAGTAGTGTTCAATGCAAATTTTTAAAGACGATGAAGT...
864,OQ031492,149246,DNA,linear,37.575,0,0,0.0,0.0,0.0,AATGATAATTATTCTCATTCGCATGATATTTGGGAATTGTCGCTGC...,GCATAAGTTACTGAATACGTTCCGAGTACGGTTGTATCGCTGTTCC...
865,OQ031491,44989,DNA,linear,31.461,0,0,0.0,0.0,0.0,AGCTTTTTACGTTCATAATTGATCTCGCAGTTTGCAAAAGCACATC...,CCGAAGTAGCTTGCATTGTTCAACCACCTCCGTCGGATTACTTTCT...
866,MW373745,210101,DNA,linear,27.565,0,0,0.0,0.0,0.0,ACTAAAAACTTAAGCATTTTTTATACTTTTTAATATTAGTTTAAGC...,TATCTCTAATGTATGGGGTAGGGAATTCGAGTAAAAATCAGAAAAG...
867,ON528744,167055,DNA,linear,35.524,0,0,0.0,0.0,0.0,TAGTGTTTGCCCAGATGAAGCATTATAAGTTTTCCAGGCACCGGCT...,TGCCGTCTTACCCCCTTTAGTATGCGATAATTATGAAGCAGACCCT...
868,ON528733,40156,DNA,linear,52.465,0,0,0.0,0.0,0.0,AGGCCAAGGCTGAGGGTGCTGCCATCGACGCTCAGAACAGACAGGC...,CAACTCTACGCCTGTATCACCGGGGCCGATGTCGGTCATATGGGTA...
869,MT080595,132999,DNA,linear,29.973,0,0,0.0,0.0,0.0,CAAACTATAGACCCTGAAAAACCTATAGAACCTGAAAAACCTATAG...,CCAAGATATCCAAAAAGTCCAAGATATCCAAAAAGTCCAAGATATC...
870,MN935200,110012,DNA,linear,30.464,0,0,0.0,0.0,0.0,AAATATCCAAGTACCTGGGAATTTTTTTACGTGTCCGTCTCGTTCA...,CAATGGCCGTATGATTAAGGAGGAATAATTTAGAGATCAAAGAGTA...
871,KX452699,59230,DNA,linear,55.756,0,0,0.0,0.0,0.0,CCCTTTACACGTTTCCTGACGTAAACACGTCCAGTGGTACCGTCCA...,CACGCATCTACACTAGTTCTCTGTCGAGTCGCTAGGCCCATAAGTC...


In [64]:
df[df['id'] == "HQ906663"]

Unnamed: 0,id,genome_length,molecule_type,topology,gc_%,trna_count,cds_number,coding_capacity,positive_strand_%,negative_strand_%,sequence,reverse_complement
34444,HQ906663,40003,DNA,linear,33.31,0,0,3.889708,0.0,0.0,TTATCCGGCAAGCCAACAATATTGACTGCTAGTACAACATTGCATA...,AAACTAAAAGATCTCCTTCTATTGATTGTAGCTTGTCTGTTTTAAG...


# Future: obtan number of regulatory items as feature

In [120]:
# Step 1: Parse the GenBank file to get the DNA sequence for the desired record ID
for record in SeqIO.parse(genbank_file_path, "genbank"):
    if record.id == "MN335248.1":
        for feature in record.features:
            if feature.type == "regulatory":
                length = len(FeatureLocation(start, end))
                print(length)
                print(feature)
        break


279
type: regulatory
location: [7:27](+)
qualifiers:
    Key: inference, Value: ['alignment:phiSITE-PromoterHunter:']
    Key: regulatory_class, Value: ['minus_35_signal']

279
type: regulatory
location: [45:57](+)
qualifiers:
    Key: inference, Value: ['alignment:phiSITE-PromoterHunter:']
    Key: regulatory_class, Value: ['minus_10_signal']

279
type: regulatory
location: [841:863](+)
qualifiers:
    Key: inference, Value: ['alignment:phiSITE-PromoterHunter:']
    Key: regulatory_class, Value: ['minus_35_signal']

279
type: regulatory
location: [878:899](+)
qualifiers:
    Key: inference, Value: ['alignment:phiSITE-PromoterHunter:']
    Key: regulatory_class, Value: ['minus_10_signal']

279
type: regulatory
location: [4862:4901](-)
qualifiers:
    Key: inference, Value: ['alignment:phiSITE-PromoterHunter:']
    Key: regulatory_class, Value: ['terminator']

279
type: regulatory
location: [5812:5834](+)
qualifiers:
    Key: inference, Value: ['alignment:phiSITE-PromoterHunter:']
    K