##### -------------------------------------------------------------------------------- Glimmer --------------------------------------------------------------------------------

> Installation: https://ccb.jhu.edu/software/glimmer/index.shtml

* Ecoli / Halobacterium / Mycobacterium / Natronomonas / Roseobacter (run manually, one genome folder at a time)

In [6]:
# Glimmer-from-scratch
# Run the g3-from-scratch.csh pipeline inside Ecoli/ on sequence.fasta, with "from-scratch" as the output tag.
# Each `!` line is executed in its own shell, which is why every line repeats `cd Ecoli`.
!cd Ecoli && csh g3-from-scratch.csh sequence.fasta  from-scratch
# Create the destination directory (-p: no error if it already exists).
!cd Ecoli && mkdir -p from-scratch
# Move the five from-scratch.* files (presumably the pipeline's outputs) into that directory.
!cd Ecoli && mv from-scratch.detail from-scratch.icm from-scratch.longorfs from-scratch.predict from-scratch.train from-scratch/

Step 1 of 4:  Finding long orfs for training
Starting at Wed Apr 23 12:44:07 2025

Sequence file = sequence.fasta
Excluded regions file = none
Circular genome = true
Initial minimum gene length = 90 bp
Determine optimal min gene length to maximize number of genes
Maximum overlap bases = 30
Start codons = atg,gtg,ttg
Stop codons = taa,tag,tga
Sequence length = 4641652
Final minimum gene length = 463
Number of genes = 2795
Total bases = 3094821
Step 2 of 4:  Extracting training sequences
Step 3 of 4:  Building ICM
Step 4 of 4:  Running Glimmer3
Starting at Wed Apr 23 12:44:15 2025

Sequence file = sequence.fasta
Number of sequences = 1
ICM model file = from-scratch.icm
Excluded regions file = none
List of orfs file = none
Input is NOT separate orfs
Independent (non-coding) scores are used
Circular genome = true
Truncated orfs = false
Minimum gene length = 110 bp
Maximum overlap bases = 50
Threshold score = 30
Use first start codon = false
Start codons = atg,gtg,ttg
Start probs = 0.600,0.

In [7]:
# Glimmer-iterated
# Run the g3-iterated.csh pipeline inside Ecoli/ on sequence.fasta, with "iterated" as the output tag.
# Each `!` line is executed in its own shell, which is why every line repeats `cd Ecoli`.
!cd Ecoli && csh g3-iterated.csh sequence.fasta iterated
# Create the destination directory (-p: no error if it already exists).
!cd Ecoli && mkdir -p iterated
# Move the ten iterated.* files (presumably the pipeline's outputs) into that directory.
!cd Ecoli && mv iterated.coords iterated.longorfs iterated.run1.detail iterated.upstream iterated.detail iterated.motif iterated.run1.predict iterated.icm iterated.predict iterated.train iterated/

Step 1 of 8:  Finding long orfs for training
Starting at Wed Apr 23 12:46:32 2025

Sequence file = sequence.fasta
Excluded regions file = none
Circular genome = true
Initial minimum gene length = 90 bp
Determine optimal min gene length to maximize number of genes
Maximum overlap bases = 30
Start codons = atg,gtg,ttg
Stop codons = taa,tag,tga
Sequence length = 4641652
Final minimum gene length = 463
Number of genes = 2795
Total bases = 3094821
Step 2 of 8:  Extracting training sequences
Step 3 of 8:  Building ICM
Step 4 of 8:  Running first Glimmer3
Starting at Wed Apr 23 12:46:41 2025

Sequence file = sequence.fasta
Number of sequences = 1
ICM model file = iterated.icm
Excluded regions file = none
List of orfs file = none
Input is NOT separate orfs
Independent (non-coding) scores are used
Circular genome = true
Truncated orfs = false
Minimum gene length = 110 bp
Maximum overlap bases = 50
Threshold score = 30
Use first start codon = false
Start codons = atg,gtg,ttg
Start probs = 0.600,

* Utils

In [5]:
import pandas as pd
from pathlib import Path

def process_gff(gff_path: str, output_path: str, portion: str = "3.3"):
    """Write the leading fraction of a GFF file's records as a headerless TSV.

    The file at ``gff_path`` is read as tab-separated text without a header,
    its 2nd and 3rd columns are dropped, and the first
    ``numerator / denominator`` share of the rows (rounded down) is written
    to ``output_path``. A confirmation line is printed afterwards.

    Args:
        gff_path: Path of the input GFF file.
        output_path: Path of the TSV to write. Missing parent directories
            are created.
        portion: Fraction of rows to keep, encoded as
            ``"<numerator>.<denominator>"``: ``"1.3"`` keeps the first third,
            ``"1.2"`` the first half, ``"2.3"`` the first two thirds and
            ``"3.3"`` (the default) everything. Any other pair of integers
            with ``1 <= numerator <= denominator`` is accepted as well.

    Raises:
        ValueError: If ``portion`` is not of the form described above.
            Previously such values silently produced the full file.
    """
    # Validate the requested fraction first so a typo fails before any I/O.
    try:
        numerator_text, denominator_text = str(portion).split(".")
        numerator = int(numerator_text)
        denominator = int(denominator_text)
    except ValueError:
        raise ValueError(
            f"portion must look like '<numerator>.<denominator>' (e.g. '1.3'), got {portion!r}"
        ) from None
    if denominator <= 0 or not 1 <= numerator <= denominator:
        raise ValueError(
            f"portion must satisfy 1 <= numerator <= denominator, got {portion!r}"
        )

    gff_path = Path(gff_path)
    output_path = Path(output_path)

    # NOTE(review): pandas' `comment="#"` also truncates a data line at any
    # later '#', not only whole comment lines; confirm no record contains '#'.
    df = pd.read_csv(gff_path, sep="\t", comment="#", header=None)
    # Drop the 2nd and 3rd columns (labels 1 and 2 of the header-less frame).
    # NOTE(review): rows keep the input's start/end order, with the remaining
    # columns after them; confirm the downstream consumer handles strand this way.
    df = df.drop(columns=[1, 2])

    # Integer arithmetic reproduces the original cut points exactly:
    # total // 3, total // 2, (2 * total) // 3, and total for "3.3".
    keep = (numerator * len(df)) // denominator
    df = df.iloc[:keep]

    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, sep="\t", index=False, header=False)
    print(f"✅ Saved {portion} GFF to: {output_path}")


* Ecoli

In [27]:
# Glimmer-from-training
folder = "Ecoli"
portion = "1.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Ecoli && csh g3-from-training.csh sequence.fasta coords-1.3.nh training
!cd Ecoli && mkdir -p training-1.3
!cd Ecoli && mv training.detail training.icm training.motif training.predict training.train training.upstream training-1.3/

✅ Saved 1.3 GFF to: Ecoli/coords-1.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]39: alignProb=646.54 Info/param=0.05 diff=9.23     
[2]35: alignProb=680.03 Info/param=0.05 diff=14.55     
[3]21: alignProb=636.09 Info/param=0.05 diff=12.54    
[4]56: alignProb=658.05 Info/param=0.05 diff=11.94    
[5]15: alignProb=676.54 Info/param=0.05 diff=6.28     
[6]46: alignProb=661.84 Info/param=0.05 diff=4.94     
[7]40: alignProb=681.31 Info/param=0.05 diff=16.32    
[8]26: alignProb=642.74 Info/param=0.05 diff=30.49    
[9]23: alignProb=620.49 Info/param=0.05 diff=11.20    
[10]44: alignProb=652.84 Info/param=0.05 diff=32.82    
Optimizing...
New align prob=803.92
New align prob=806.45
New align prob=806.35
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:16:04 2025

Sequence file = sequence.fasta
Number of sequences = 1
ICM model file = training.icm
Excluded regions

In [28]:
# Glimmer-from-training
folder = "Ecoli"
portion = "1.2"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Ecoli && csh g3-from-training.csh sequence.fasta coords-1.2.nh training
!cd Ecoli && mkdir -p training-1.2
!cd Ecoli && mv training.detail training.icm training.motif training.predict training.train training.upstream training-1.2/

✅ Saved 1.2 GFF to: Ecoli/coords-1.2.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]33: alignProb=937.85 Info/param=0.05 diff=22.75    
[2]37: alignProb=942.41 Info/param=0.04 diff=11.44    
[3]30: alignProb=999.25 Info/param=0.06 diff=37.56    
[4]42: alignProb=969.59 Info/param=0.05 diff=57.65    
[5]17: alignProb=924.68 Info/param=0.05 diff=90.23    
[6]18: alignProb=1012.51 Info/param=0.05 diff=55.05    
[7]24: alignProb=950.57 Info/param=0.05 diff=44.89    
[8]38: alignProb=919.25 Info/param=0.04 diff=12.02    
[9]27: alignProb=931.07 Info/param=0.05 diff=41.04    
[10]41: alignProb=947.13 Info/param=0.05 diff=28.49    
Optimizing...
New align prob=1212.50
New align prob=1218.54
New align prob=1219.95
New align prob=1220.62
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:16:23 2025

Sequence file = sequence.fasta
Number of sequences = 1
ICM model file = tra

In [29]:
# Glimmer-from-training
folder = "Ecoli"
portion = "2.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Ecoli && csh g3-from-training.csh sequence.fasta coords-2.3.nh training
!cd Ecoli && mkdir -p training-2.3
!cd Ecoli && mv training.detail training.icm training.motif training.predict training.train training.upstream training-2.3/

✅ Saved 2.3 GFF to: Ecoli/coords-2.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]51: alignProb=1287.22 Info/param=0.05 diff=18.14    
[2]34: alignProb=1135.11 Info/param=0.04 diff=27.94    
[3]27: alignProb=1254.56 Info/param=0.05 diff=20.38    
[4]65: alignProb=1175.11 Info/param=0.04 diff=4.11     
[5]16: alignProb=1166.24 Info/param=0.04 diff=29.56    
[6]28: alignProb=1138.63 Info/param=0.04 diff=30.89    
[7]40: alignProb=1286.43 Info/param=0.05 diff=1.26     
[8]49: alignProb=1251.24 Info/param=0.05 diff=10.49    
[9]33: alignProb=1150.55 Info/param=0.04 diff=27.65    
[10]23: alignProb=1283.72 Info/param=0.05 diff=18.83    
Optimizing...
New align prob=1600.34
New align prob=1609.05
New align prob=1612.15
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:16:43 2025

Sequence file = sequence.fasta
Number of sequences = 1
ICM model file = training.icm
Excl

In [30]:
# Glimmer-from-training
folder = "Ecoli"
portion = "3.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Ecoli && csh g3-from-training.csh sequence.fasta coords-3.3.nh training
!cd Ecoli && mkdir -p training-3.3
!cd Ecoli && mv training.detail training.icm training.motif training.predict training.train training.upstream training-3.3/

✅ Saved 3.3 GFF to: Ecoli/coords-3.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]56: alignProb=1859.65 Info/param=0.04 diff=38.47     
[2]33: alignProb=1543.60 Info/param=0.04 diff=12.60     
[3]57: alignProb=1630.46 Info/param=0.03 diff=3.16      
[4]29: alignProb=1567.92 Info/param=0.04 diff=6.29      
[5]49: alignProb=1622.05 Info/param=0.04 diff=40.74    
[6]41: alignProb=1622.61 Info/param=0.04 diff=29.99     
[7]48: alignProb=1829.97 Info/param=0.04 diff=50.67     
[8]47: alignProb=1770.18 Info/param=0.04 diff=45.75     
[9]52: alignProb=1379.91 Info/param=0.02 diff=6.91     
[10]30: alignProb=1284.92 Info/param=0.03 diff=100.86    
Optimizing...
New align prob=2342.71
New align prob=2352.49
New align prob=2353.63
New align prob=2355.09
New align prob=2355.27
New align prob=2355.30
New align prob=2356.46
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:1

* Halobacterium

In [31]:
# Glimmer-from-training
folder = "Halobacterium"
portion = "1.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Halobacterium && csh g3-from-training.csh sequence.fasta coords-1.3.nh training
!cd Halobacterium && mkdir -p training-1.3
!cd Halobacterium && mv training.detail training.icm training.motif training.predict training.train training.upstream training-1.3/

✅ Saved 1.3 GFF to: Halobacterium/coords-1.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]69: alignProb=437.38 Info/param=0.03 diff=5.99     
[2]24: alignProb=386.53 Info/param=0.03 diff=4.99     
[3]38: alignProb=443.49 Info/param=0.03 diff=14.24    
[4]28: alignProb=419.12 Info/param=0.02 diff=23.79    
[5]42: alignProb=430.07 Info/param=0.03 diff=3.32     
[6]24: alignProb=387.38 Info/param=0.03 diff=11.73    
[7]110: alignProb=455.38 Info/param=0.04 diff=3.20    
[8]21: alignProb=420.25 Info/param=0.03 diff=0.81     
[9]42: alignProb=429.99 Info/param=0.03 diff=10.24    
[10]39: alignProb=439.61 Info/param=0.04 diff=16.67    
Optimizing...
New align prob=507.28
New align prob=510.89
New align prob=511.21
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:17:22 2025

Sequence file = sequence.fasta
Number of sequences = 5
ICM model file = training.icm
Excluded 

In [32]:
# Glimmer-from-training
folder = "Halobacterium"
portion = "1.2"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Halobacterium && csh g3-from-training.csh sequence.fasta coords-1.2.nh training
!cd Halobacterium && mkdir -p training-1.2
!cd Halobacterium && mv training.detail training.icm training.motif training.predict training.train training.upstream training-1.2/

✅ Saved 1.2 GFF to: Halobacterium/coords-1.2.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]76: alignProb=645.07 Info/param=0.03 diff=36.15    
[2]43: alignProb=586.16 Info/param=0.02 diff=11.45    
[3]46: alignProb=605.81 Info/param=0.03 diff=20.31    
[4]15: alignProb=526.23 Info/param=0.02 diff=25.99    
[5]43: alignProb=607.41 Info/param=0.03 diff=15.23    
[6]30: alignProb=544.83 Info/param=0.02 diff=4.01     
[7]36: alignProb=622.22 Info/param=0.03 diff=6.49     
[8]49: alignProb=586.05 Info/param=0.02 diff=3.89     
[9]30: alignProb=552.42 Info/param=0.02 diff=12.46    
[10]23: alignProb=565.64 Info/param=0.03 diff=18.20    
Optimizing...
New align prob=738.31
New align prob=742.22
New align prob=742.61
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:17:36 2025

Sequence file = sequence.fasta
Number of sequences = 5
ICM model file = training.icm
Excluded 

In [33]:
# Glimmer-from-training
folder = "Halobacterium"
portion = "2.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Halobacterium && csh g3-from-training.csh sequence.fasta coords-2.3.nh training
!cd Halobacterium && mkdir -p training-2.3
!cd Halobacterium && mv training.detail training.icm training.motif training.predict training.train training.upstream training-2.3/

✅ Saved 2.3 GFF to: Halobacterium/coords-2.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]58: alignProb=752.39 Info/param=0.02 diff=9.02     
[2]15: alignProb=659.93 Info/param=0.03 diff=33.28    
[3]46: alignProb=709.59 Info/param=0.02 diff=4.90     
[4]48: alignProb=701.08 Info/param=0.02 diff=24.02    
[5]53: alignProb=702.34 Info/param=0.03 diff=18.47    
[6]32: alignProb=737.40 Info/param=0.02 diff=42.51    
[7]24: alignProb=712.43 Info/param=0.02 diff=48.73     
[8]61: alignProb=798.34 Info/param=0.03 diff=6.88     
[9]57: alignProb=810.66 Info/param=0.03 diff=1.45      
[10]24: alignProb=670.79 Info/param=0.02 diff=2.32     
Optimizing...
New align prob=969.25
New align prob=973.60
New align prob=974.33
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:17:51 2025

Sequence file = sequence.fasta
Number of sequences = 5
ICM model file = training.icm
Exclude

In [34]:
# Glimmer-from-training
folder = "Halobacterium"
portion = "3.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Halobacterium && csh g3-from-training.csh sequence.fasta coords-3.3.nh training
!cd Halobacterium && mkdir -p training-3.3
!cd Halobacterium && mv training.detail training.icm training.motif training.predict training.train training.upstream training-3.3/

✅ Saved 3.3 GFF to: Halobacterium/coords-3.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]31: alignProb=1007.76 Info/param=0.02 diff=7.62     
[2]58: alignProb=1087.23 Info/param=0.02 diff=18.21    
[3]18: alignProb=957.44 Info/param=0.02 diff=1.68     
[4]61: alignProb=1078.81 Info/param=0.02 diff=39.10    
[5]50: alignProb=1019.84 Info/param=0.02 diff=10.28    
[6]41: alignProb=996.23 Info/param=0.02 diff=4.20     
[7]19: alignProb=989.31 Info/param=0.02 diff=7.99      
[8]42: alignProb=1061.92 Info/param=0.02 diff=77.04    
[9]49: alignProb=1051.23 Info/param=0.02 diff=5.31     
[10]28: alignProb=951.01 Info/param=0.02 diff=0.49      
Optimizing...
New align prob=1391.58
New align prob=1398.76
New align prob=1402.70
New align prob=1405.36
New align prob=1408.24
New align prob=1409.40
New align prob=1409.50
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:18:

* Mycobacterium

In [7]:
# Glimmer-from-training
folder = "Mycobacterium"
portion = "1.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Mycobacterium && csh g3-from-training.csh sequence.fasta coords-1.3.nh training
!cd Mycobacterium && mkdir -p training-1.3
!cd Mycobacterium && mv training.detail training.icm training.motif training.predict training.train training.upstream training-1.3/

✅ Saved 1.3 GFF to: Mycobacterium/coords-1.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]72: alignProb=491.46 Info/param=0.01 diff=4.65     
[2]59: alignProb=517.74 Info/param=0.02 diff=0.40     
[3]63: alignProb=475.25 Info/param=0.02 diff=27.62    
[4]20: alignProb=477.64 Info/param=0.02 diff=15.89    
[5]59: alignProb=502.05 Info/param=0.02 diff=8.98     
[6]83: alignProb=515.07 Info/param=0.02 diff=6.06     
[7]45: alignProb=544.40 Info/param=0.02 diff=18.60    
[8]50: alignProb=517.28 Info/param=0.02 diff=25.59    
[9]71: alignProb=469.45 Info/param=0.02 diff=25.98    
[10]37: alignProb=511.32 Info/param=0.02 diff=15.74    
Optimizing...
New align prob=643.26
New align prob=647.38
New align prob=648.08
New align prob=648.55
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 16:38:32 2025

Sequence file = sequence.fasta
Number of sequences = 1
ICM model file = 

In [6]:
# Glimmer-from-training
folder = "Mycobacterium"
portion = "1.2"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Mycobacterium && csh g3-from-training.csh sequence.fasta coords-1.2.nh training
!cd Mycobacterium && mkdir -p training-1.2
!cd Mycobacterium && mv training.detail training.icm training.motif training.predict training.train training.upstream training-1.2/

✅ Saved 1.2 GFF to: Mycobacterium/coords-1.2.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]58: alignProb=711.55 Info/param=0.02 diff=1.82     
[2]46: alignProb=668.50 Info/param=0.02 diff=16.42    
[3]38: alignProb=630.56 Info/param=0.02 diff=49.27    
[4]44: alignProb=680.60 Info/param=0.02 diff=13.35    
[5]35: alignProb=646.38 Info/param=0.02 diff=42.64    
[6]34: alignProb=722.46 Info/param=0.02 diff=67.93    
[7]25: alignProb=618.21 Info/param=0.02 diff=21.31    
[8]62: alignProb=675.70 Info/param=0.02 diff=4.53     
[9]36: alignProb=662.46 Info/param=0.02 diff=25.77    
[10]33: alignProb=686.35 Info/param=0.02 diff=2.07     
Optimizing...
New align prob=918.30
New align prob=921.57
New align prob=923.39
New align prob=926.21
New align prob=928.82
New align prob=931.62
New align prob=932.77
New align prob=933.64
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23

In [37]:
# Glimmer-from-training
folder = "Mycobacterium"
portion = "2.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Mycobacterium && csh g3-from-training.csh sequence.fasta coords-2.3.nh training
!cd Mycobacterium && mkdir -p training-2.3
!cd Mycobacterium && mv training.detail training.icm training.motif training.predict training.train training.upstream training-2.3/

✅ Saved 2.3 GFF to: Mycobacterium/coords-2.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]56: alignProb=898.79 Info/param=0.01 diff=12.01    
[2]48: alignProb=825.22 Info/param=0.02 diff=19.15    
[3]43: alignProb=764.30 Info/param=0.02 diff=37.25    
[4]60: alignProb=884.42 Info/param=0.02 diff=3.29     
[5]41: alignProb=813.71 Info/param=0.01 diff=20.10     
[6]87: alignProb=919.83 Info/param=0.02 diff=13.72     
[7]42: alignProb=868.47 Info/param=0.02 diff=47.05     
[8]89: alignProb=864.76 Info/param=0.01 diff=3.89     
[9]58: alignProb=821.65 Info/param=0.02 diff=32.97    
[10]36: alignProb=803.35 Info/param=0.02 diff=14.14    
Optimizing...
New align prob=1229.15
New align prob=1232.70
New align prob=1233.05
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:19:09 2025

Sequence file = sequence.fasta
Number of sequences = 1
ICM model file = training.icm
Exc

In [38]:
# Glimmer-from-training
folder = "Mycobacterium"
portion = "3.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Mycobacterium && csh g3-from-training.csh sequence.fasta coords-3.3.nh training
!cd Mycobacterium && mkdir -p training-3.3
!cd Mycobacterium && mv training.detail training.icm training.motif training.predict training.train training.upstream training-3.3/

✅ Saved 3.3 GFF to: Mycobacterium/coords-3.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]28: alignProb=1053.70 Info/param=0.02 diff=1.12      
[2]33: alignProb=1018.16 Info/param=0.01 diff=15.44     
[3]74: alignProb=1129.16 Info/param=0.02 diff=5.33     
[4]40: alignProb=1095.45 Info/param=0.01 diff=15.55    
[5]32: alignProb=1009.71 Info/param=0.01 diff=15.55    
[6]35: alignProb=950.86 Info/param=0.01 diff=6.22      
[7]39: alignProb=1041.56 Info/param=0.01 diff=42.11    
[8]57: alignProb=994.57 Info/param=0.01 diff=10.24    
[9]22: alignProb=1029.65 Info/param=0.02 diff=5.61     
[10]34: alignProb=1122.30 Info/param=0.01 diff=6.04     
Optimizing...
New align prob=1751.25
New align prob=1783.92
New align prob=1805.86
New align prob=1809.08
New align prob=1809.64
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:19:34 2025

Sequence file = sequence.fasta
Num

* Natronomonas

In [39]:
# Glimmer-from-training
folder = "Natronomonas"
portion = "1.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Natronomonas && csh g3-from-training.csh sequence.fasta coords-1.3.nh training
!cd Natronomonas && mkdir -p training-1.3
!cd Natronomonas && mv training.detail training.icm training.motif training.predict training.train training.upstream training-1.3/

✅ Saved 1.3 GFF to: Natronomonas/coords-1.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]39: alignProb=244.05 Info/param=0.03 diff=7.65     
[2]24: alignProb=251.42 Info/param=0.03 diff=1.24     
[3]37: alignProb=262.66 Info/param=0.04 diff=5.29     
[4]11: alignProb=283.39 Info/param=0.04 diff=36.40    
[5]40: alignProb=283.67 Info/param=0.04 diff=4.54     
[6]18: alignProb=254.42 Info/param=0.03 diff=22.76    
[7]7: alignProb=263.57 Info/param=0.04 diff=8.86     
[8]41: alignProb=269.98 Info/param=0.03 diff=2.84     
[9]24: alignProb=278.13 Info/param=0.04 diff=16.73    
[10]91: alignProb=276.59 Info/param=0.03 diff=3.51     
Optimizing...
New align prob=310.44
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:19:55 2025

Sequence file = sequence.fasta
Number of sequences = 3
ICM model file = training.icm
Excluded regions file = none
List of orfs file = none
I

In [40]:
# Glimmer-from-training
folder = "Natronomonas"
portion = "1.2"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Natronomonas && csh g3-from-training.csh sequence.fasta coords-1.2.nh training
!cd Natronomonas && mkdir -p training-1.2
!cd Natronomonas && mv training.detail training.icm training.motif training.predict training.train training.upstream training-1.2/

✅ Saved 1.2 GFF to: Natronomonas/coords-1.2.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]73: alignProb=389.29 Info/param=0.04 diff=1.95     
[2]34: alignProb=399.73 Info/param=0.04 diff=6.70     
[3]54: alignProb=393.43 Info/param=0.03 diff=3.96     
[4]11: alignProb=360.35 Info/param=0.03 diff=43.70    
[5]21: alignProb=398.10 Info/param=0.04 diff=0.87     
[6]51: alignProb=398.43 Info/param=0.03 diff=11.91    
[7]40: alignProb=350.25 Info/param=0.03 diff=16.41    
[8]47: alignProb=404.71 Info/param=0.03 diff=12.72    
[9]32: alignProb=397.48 Info/param=0.03 diff=4.10     
[10]48: alignProb=359.70 Info/param=0.03 diff=4.24     
Optimizing...
New align prob=448.36
New align prob=451.19
New align prob=452.74
New align prob=453.72
New align prob=454.51
New align prob=455.13
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:20:09 2025

Sequence file = sequence.fast

In [41]:
# Glimmer-from-training
folder = "Natronomonas"
portion = "2.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Natronomonas && csh g3-from-training.csh sequence.fasta coords-2.3.nh training
!cd Natronomonas && mkdir -p training-2.3
!cd Natronomonas && mv training.detail training.icm training.motif training.predict training.train training.upstream training-2.3/

✅ Saved 2.3 GFF to: Natronomonas/coords-2.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]47: alignProb=532.12 Info/param=0.03 diff=3.22     
[2]27: alignProb=577.55 Info/param=0.04 diff=32.93    
[3]44: alignProb=531.01 Info/param=0.03 diff=14.69    
[4]28: alignProb=532.08 Info/param=0.03 diff=6.60     
[5]44: alignProb=527.05 Info/param=0.03 diff=0.75     
[6]53: alignProb=521.06 Info/param=0.04 diff=38.79    
[7]46: alignProb=542.59 Info/param=0.03 diff=6.30     
[8]28: alignProb=477.61 Info/param=0.03 diff=27.72    
[9]24: alignProb=528.62 Info/param=0.04 diff=10.86    
[10]33: alignProb=524.78 Info/param=0.04 diff=1.08     
Optimizing...
New align prob=624.48
New align prob=627.25
New align prob=627.21
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:20:23 2025

Sequence file = sequence.fasta
Number of sequences = 3
ICM model file = training.icm
Excluded r

In [42]:
# Glimmer-from-training
folder = "Natronomonas"
portion = "3.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Natronomonas && csh g3-from-training.csh sequence.fasta coords-3.3.nh training
!cd Natronomonas && mkdir -p training-3.3
!cd Natronomonas && mv training.detail training.icm training.motif training.predict training.train training.upstream training-3.3/

✅ Saved 3.3 GFF to: Natronomonas/coords-3.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]37: alignProb=672.86 Info/param=0.03 diff=6.26     
[2]23: alignProb=664.98 Info/param=0.02 diff=21.07    
[3]35: alignProb=692.80 Info/param=0.02 diff=28.14    
[4]45: alignProb=732.35 Info/param=0.03 diff=17.99     
[5]52: alignProb=746.74 Info/param=0.03 diff=21.34    
[6]34: alignProb=657.73 Info/param=0.03 diff=32.47    
[7]48: alignProb=642.41 Info/param=0.03 diff=8.13     
[8]15: alignProb=695.19 Info/param=0.03 diff=20.18    
[9]37: alignProb=756.26 Info/param=0.03 diff=18.93    
[10]63: alignProb=755.39 Info/param=0.03 diff=20.57    
Optimizing...
New align prob=889.04
New align prob=893.48
New align prob=893.86
New align prob=893.92
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:20:40 2025

Sequence file = sequence.fasta
Number of sequences = 3
ICM model file = 

* Roseobacter

In [43]:
# Glimmer-from-training
folder = "Roseobacter"
portion = "1.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Roseobacter && csh g3-from-training.csh sequence.fasta coords-1.3.nh training
!cd Roseobacter && mkdir -p training-1.3
!cd Roseobacter && mv training.detail training.icm training.motif training.predict training.train training.upstream training-1.3/

✅ Saved 1.3 GFF to: Roseobacter/coords-1.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]36: alignProb=398.78 Info/param=0.03 diff=8.93     
[2]58: alignProb=411.43 Info/param=0.04 diff=19.05    
[3]12: alignProb=407.70 Info/param=0.03 diff=1.62     
[4]22: alignProb=368.73 Info/param=0.03 diff=6.50     
[5]21: alignProb=387.68 Info/param=0.03 diff=7.09     
[6]30: alignProb=404.63 Info/param=0.04 diff=6.37     
[7]13: alignProb=402.90 Info/param=0.04 diff=42.31    
[8]35: alignProb=368.42 Info/param=0.02 diff=2.37     
[9]32: alignProb=390.62 Info/param=0.04 diff=13.46    
[10]48: alignProb=362.08 Info/param=0.03 diff=2.14     
Optimizing...
New align prob=494.90
New align prob=506.04
New align prob=506.30
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:20:54 2025

Sequence file = sequence.fasta
Number of sequences = 5
ICM model file = training.icm
Excluded re

In [44]:
# Glimmer-from-training
folder = "Roseobacter"
portion = "1.2"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Roseobacter && csh g3-from-training.csh sequence.fasta coords-1.2.nh training
!cd Roseobacter && mkdir -p training-1.2
!cd Roseobacter && mv training.detail training.icm training.motif training.predict training.train training.upstream training-1.2/

✅ Saved 1.2 GFF to: Roseobacter/coords-1.2.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]45: alignProb=508.83 Info/param=0.04 diff=21.23    
[2]10: alignProb=483.22 Info/param=0.03 diff=58.43    
[3]29: alignProb=587.31 Info/param=0.03 diff=19.12    
[4]29: alignProb=493.95 Info/param=0.03 diff=3.51     
[5]17: alignProb=468.97 Info/param=0.03 diff=3.29     
[6]65: alignProb=586.59 Info/param=0.02 diff=9.76     
[7]49: alignProb=490.06 Info/param=0.02 diff=8.42     
[8]36: alignProb=489.16 Info/param=0.03 diff=4.86     
[9]70: alignProb=541.21 Info/param=0.02 diff=2.88     
[10]113: alignProb=577.43 Info/param=0.02 diff=1.44    
Optimizing...
New align prob=714.30
New align prob=720.42
New align prob=722.34
New align prob=722.68
New align prob=723.08
New align prob=723.10
New align prob=723.16
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:21:15 2025

Sequence

In [45]:
# Glimmer-from-training
folder = "Roseobacter"
portion = "2.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Roseobacter && csh g3-from-training.csh sequence.fasta coords-2.3.nh training
!cd Roseobacter && mkdir -p training-2.3
!cd Roseobacter && mv training.detail training.icm training.motif training.predict training.train training.upstream training-2.3/

✅ Saved 2.3 GFF to: Roseobacter/coords-2.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]34: alignProb=634.18 Info/param=0.02 diff=16.45    
[2]43: alignProb=657.13 Info/param=0.02 diff=10.24    
[3]47: alignProb=730.68 Info/param=0.03 diff=14.91    
[4]62: alignProb=667.92 Info/param=0.03 diff=7.07     
[5]37: alignProb=675.44 Info/param=0.03 diff=27.52    
[6]52: alignProb=703.15 Info/param=0.02 diff=0.05     
[7]72: alignProb=722.43 Info/param=0.03 diff=13.42    
[8]31: alignProb=680.59 Info/param=0.03 diff=40.52    
[9]44: alignProb=675.19 Info/param=0.03 diff=7.22     
[10]65: alignProb=714.88 Info/param=0.03 diff=8.55     
Optimizing...
New align prob=980.66
New align prob=985.42
New align prob=987.18
New align prob=987.27
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:21:37 2025

Sequence file = sequence.fasta
Number of sequences = 5
ICM model file = tr

In [46]:
# Glimmer-from-training
folder = "Roseobacter"
portion = "3.3"
process_gff(
    gff_path=f"Verified-genome-ALL/{folder}/verified.gff",
    output_path=f"{folder}/coords-{portion}.nh",
    portion=portion
)

# script
!cd Roseobacter && csh g3-from-training.csh sequence.fasta coords-3.3.nh training
!cd Roseobacter && mkdir -p training-3.3
!cd Roseobacter && mv training.detail training.icm training.motif training.predict training.train training.upstream training-3.3/

✅ Saved 3.3 GFF to: Roseobacter/coords-3.3.nh
Step 1 of 5:  Extracting training sequences
Step 2 of 5:  Building ICM
Step 3 of 5:  Making PWM from upstream regions
[1]64: alignProb=1013.93 Info/param=0.02 diff=7.74     
[2]37: alignProb=1000.99 Info/param=0.03 diff=57.75    
[3]97: alignProb=1067.75 Info/param=0.03 diff=12.56    
[4]9: alignProb=808.30 Info/param=0.03 diff=184.86    
[5]74: alignProb=1142.90 Info/param=0.03 diff=11.46    
[6]33: alignProb=979.82 Info/param=0.03 diff=1.13     
[7]42: alignProb=1021.22 Info/param=0.03 diff=7.70     
[8]39: alignProb=1097.34 Info/param=0.03 diff=32.51    
[9]28: alignProb=938.39 Info/param=0.03 diff=0.71     
[10]56: alignProb=1099.08 Info/param=0.03 diff=8.22     
Optimizing...
New align prob=1472.60
New align prob=1487.75
New align prob=1494.58
New align prob=1494.66
New align prob=1494.94
Step 4 of 5:  Getting start-codon usage
Step 5 of 5:  Running Glimmer3
Starting at Wed Apr 23 13:22:00 2025

Sequence file = sequence.fasta
Number of

##### -------------------------------------------------------------------------------- GeneMarkS --------------------------------------------------------------------------------

> Installation: https://genemark.bme.gatech.edu/

In [4]:
import os
# Append the GeneMark suite directory to PATH so that later `!` shell lines
# can resolve commands from it.
os.environ["PATH"] = os.pathsep.join(
    (os.environ["PATH"], "/home/biolab-office-1/DATALAB/GeneLM-Benchmarck/genemark_suite_linux_64/gmsuite")
)

In [5]:
# !cd Ecoli/GeneMarkS && gmsn.pl ../sequence.fasta --species Ecoli --clean
# !cd Ecoli/GeneMarkS && gc ../sequence.fasta
# !cd Ecoli/GeneMarkS && gmhmmp -m ../../../../genemark_suite_linux_64/gmsuite/heuristic_mod/heu_11_50.mod -f G -o genemark-hmm.gff ../sequence.fasta

In [13]:
# Ecoli: run gmsn.pl, print the genome GC% with `gc`, then run gmhmmp with the
# heu_11_50 heuristic model, writing genemark-hmm-2.5m.gff in Ecoli/GeneMark-HMM.
# NOTE(review): the model suffix appears to follow the GC% printed by `gc` (50.8 here) — confirm.
!cd Ecoli/GeneMark-HMM && gmsn.pl ../sequence.fasta --species Ecoli --clean
!cd Ecoli/GeneMark-HMM && gc ../sequence.fasta
!cd Ecoli/GeneMark-HMM && gmhmmp -m ../../../../genemark-2.5m/linux_64/heuristic_mod/heu_11_50.mod -f G -o genemark-hmm-2.5m.gff ../sequence.fasta

../sequence.fasta: GC% = 50.8


In [9]:
# !cd Halobacterium/GeneMarkS && gmsn.pl ../sequence.fasta --species Halobacterium --clean
# !cd Halobacterium/GeneMarkS && gc ../sequence.fasta
# !cd Halobacterium/GeneMarkS && gmhmmp -m ../../../../genemark_suite_linux_64/gmsuite/heuristic_mod/heu_11_65.mod -f G -o genemark-hmm.gff ../sequence.fasta

In [10]:
# Halobacterium: run gmsn.pl, print the genome GC% with `gc`, then run gmhmmp with the
# heu_11_65 heuristic model, writing genemark-hmm-2.5m.gff in Halobacterium/GeneMark-HMM.
# NOTE(review): the model suffix appears to follow the GC% printed by `gc` (65.7 here) — confirm.
!cd Halobacterium/GeneMark-HMM && gmsn.pl ../sequence.fasta --species Halobacterium --clean
!cd Halobacterium/GeneMark-HMM && gc ../sequence.fasta
!cd Halobacterium/GeneMark-HMM && gmhmmp -m ../../../../genemark-2.5m/linux_64/heuristic_mod/heu_11_65.mod -f G -o genemark-hmm-2.5m.gff ../sequence.fasta

../sequence.fasta: GC% = 65.7


In [11]:
# !cd Mycobacterium/GeneMarkS && gmsn.pl ../sequence.fasta --species Mycobacterium --clean
# !cd Mycobacterium/GeneMarkS && gc ../sequence.fasta
# !cd Mycobacterium/GeneMarkS && gmhmmp -m ../../../../genemark_suite_linux_64/gmsuite/heuristic_mod/heu_11_65.mod -f G -o genemark-hmm.gff ../sequence.fasta

In [12]:
# Mycobacterium: run gmsn.pl, print the genome GC% with `gc`, then run gmhmmp with the
# heu_11_65 heuristic model, writing genemark-hmm-2.5m.gff in Mycobacterium/GeneMark-HMM.
# NOTE(review): the model suffix appears to follow the GC% printed by `gc` (65.6 here) — confirm.
!cd Mycobacterium/GeneMark-HMM && gmsn.pl ../sequence.fasta --species Mycobacterium --clean
!cd Mycobacterium/GeneMark-HMM && gc ../sequence.fasta
!cd Mycobacterium/GeneMark-HMM && gmhmmp -m ../../../../genemark-2.5m/linux_64/heuristic_mod/heu_11_65.mod -f G -o genemark-hmm-2.5m.gff ../sequence.fasta

../sequence.fasta: GC% = 65.6


In [13]:
# !cd Natronomonas/GeneMarkS && gmsn.pl ../sequence.fasta --species Natronomonas --clean
# !cd Natronomonas/GeneMarkS && gc ../sequence.fasta
# !cd Natronomonas/GeneMarkS && gmhmmp -m ../../../../genemark_suite_linux_64/gmsuite/heuristic_mod/heu_11_63.mod -f G -o genemark-hmm.gff ../sequence.fasta

In [None]:
# Natronomonas: run gmsn.pl, print the genome GC% with `gc`, then run gmhmmp with the
# heu_11_63 heuristic model, writing genemark-hmm-2.5m.gff in Natronomonas/GeneMark-HMM.
# NOTE(review): the model suffix appears to follow the GC% printed by `gc` (63.1 here) — confirm.
!cd Natronomonas/GeneMark-HMM && gmsn.pl ../sequence.fasta --species Natronomonas --clean
!cd Natronomonas/GeneMark-HMM && gc ../sequence.fasta
!cd Natronomonas/GeneMark-HMM && gmhmmp -m ../../../../genemark-2.5m/linux_64/heuristic_mod/heu_11_63.mod -f G -o genemark-hmm-2.5m.gff ../sequence.fasta

../sequence.fasta: GC% = 63.1


In [15]:
# !cd Roseobacter/GeneMarkS && gmsn.pl ../sequence.fasta --species Roseobacter --clean
# !cd Roseobacter/GeneMarkS && gc ../sequence.fasta
# !cd Roseobacter/GeneMarkS && gmhmmp -m ../../../../genemark_suite_linux_64/gmsuite/heuristic_mod/heu_11_58.mod -f G -o genemark-hmm.gff ../sequence.fasta

In [17]:
!cd Roseobacter/GeneMark-HMM && gmsn.pl ../sequence.fasta --species Roseobacter --clean
!cd Roseobacter/GeneMark-HMM && gc ../sequence.fasta
!cd Roseobacter/GeneMark-HMM && gmhmmp -m ../../../../genemark-2.5m/linux_64/heuristic_mod/heu_11_63.mod -f G -o genemark-hmm-2.5m.gff ../sequence.fasta

../sequence.fasta: GC% = 58.9


##### -------------------------------------------------------------------------------- Convert output to suitable format --------------------------------------------------------------------------------

> NC_008209.1	verified	CDS	3553	4539	.	+	0	gene_id=NC008209_+_4536 start_codon=ATG type=equal


In [15]:
import os
import re

def parse_genemark_lst_to_gff(lst_file_path, gff_file_path):
    """Convert a GeneMark ``.lst`` gene table into a GFF3 file.

    The input is scanned line by line. The sequence ID is taken from the
    ``FASTA definition line:`` header, and every numbered row that follows a
    ``Predicted genes`` marker is written out as a ``gene`` feature plus a
    matching ``CDS`` feature.

    Args:
        lst_file_path: Path of the GeneMark ``.lst`` file to read.
        gff_file_path: Path of the GFF3 file to create (overwritten if present).

    Returns:
        None. The result is written to ``gff_file_path``.

    Raises:
        OSError: If either file cannot be opened.
    """
    with open(lst_file_path, 'r') as infile, open(gff_file_path, 'w') as outfile:
        outfile.write("##gff-version 3\n")

        seq_id = None
        in_table = False

        for line in infile:
            line = line.strip()

            # Get FASTA sequence ID.
            # Only the first whitespace-delimited token after the first colon is
            # used: GFF column 1 must be a single token, and the free-text
            # description may itself contain spaces or further colons.
            if line.startswith("FASTA definition line:"):
                definition_tokens = line.split(":", 1)[1].split()
                seq_id = definition_tokens[0] if definition_tokens else ""

            # Look for start of the prediction table
            if line.startswith("Predicted genes"):
                in_table = True
                continue

            # Table rows start with the numeric gene index.
            if in_table and re.match(r"^\s*\d+\s", line):
                parts = re.split(r'\s+', line)
                if len(parts) < 6:
                    continue

                gene_id = parts[0]
                strand = parts[1]
                start = parts[2].replace('<', '')  # Remove '<' if present
                end = parts[3].replace('>', '')    # Remove '>' if present
                length = parts[4]
                class_id = parts[5]

                # Anything that is not an explicit '+' is treated as minus strand.
                strand = '+' if strand == '+' else '-'

                # Gene feature
                outfile.write(f"{seq_id}\tGeneMark.hmm\tgene\t{start}\t{end}\t.\t{strand}\t.\tID=gene{gene_id};length={length};class={class_id}\n")
                # CDS feature
                outfile.write(f"{seq_id}\tGeneMark.hmm\tCDS\t{start}\t{end}\t.\t{strand}\t0\tID=cds{gene_id};Parent=gene{gene_id}\n")

# Genome directories handled by the conversion loop.
folders = [
    "Ecoli",
    "Halobacterium",
    "Mycobacterium",
    "Natronomonas",
    "Roseobacter",
]

# Glimmer run directory -> name of the prediction file inside that directory.
# The training-* runs are listed but currently disabled.
glimmer_methods = dict([
    ("from-scratch", "from-scratch.predict"),
    ("iterated", "iterated.predict"),
    # ("training-1.2", "training.predict"),
    # ("training-1.3", "training.predict"),
    # ("training-2.3", "training.predict"),
    # ("training-3.3", "training.predict"),
])

In [16]:
# For every genome folder, collect the predictions of each tool as GFF files
# under <folder>/All/:
#   * GeneMark-HMM: keep only the CDS rows of the GFF written by gmhmmp.
#   * Glimmer: convert each enabled run's .predict file into GFF rows.
for folder in folders:
    # -----------------------------------------------------------------
    # GeneMarkS
    # -----------------------------------------------------------------
    # input_path = os.path.join(folder, "GeneMarkS", "sequence.fasta.lst")
    # output_root = os.path.join(folder, "All")
    # output_path = os.path.join(folder, "All", "genemark-s.gff")
    # os.makedirs(output_root, exist_ok=True)

    # if not os.path.exists(input_path):
    #     print(f"File not found: {input_path}")
    #     continue

    # with open(input_path, "r") as f:
    #     lines = f.readlines()

    # current_seq_id = None
    # gff_lines = []
    # for line in lines:
    #     line = line.strip()
    #     if line.startswith("FASTA definition line:"):
    #         match = re.search(r"FASTA definition line:\s*(\S+)", line)
    #         if match:
    #             current_seq_id = match.group(1)
    #     elif re.match(r"^\d", line):
    #         parts = line.split()
    #         if len(parts) >= 6:
    #             gene_id = parts[0]
    #             strand = parts[1]
    #             left = parts[2].replace('<', '')
    #             right = parts[3].replace('>', '')
    #             gene_length = parts[4]
    #             gff_line = f"{current_seq_id}\tGeneMarkS\tCDS\t{left}\t{right}\t.\t{strand}\t0\tGeneID={gene_id};Length={gene_length}"
    #             gff_lines.append(gff_line)

    # with open(output_path, "w") as out:
    #     out.write("\n".join(gff_lines))
    # print(f"\nGeneMark: GFF written for {folder} to {output_path}")

    # -----------------------------------------------------------------
    # GeneMark-hmm
    # -----------------------------------------------------------------
    input_path = os.path.join(folder, "GeneMark-HMM", "genemark-hmm-2.5m.gff")
    output_root = os.path.join(folder, "All")
    output_path = os.path.join(output_root, "genemark-hmm-2.5m.gff")
    os.makedirs(output_root, exist_ok=True)

    if not os.path.exists(input_path):
        print(f"File not found: {input_path}")
    else:
        gff_lines = []

        with open(input_path, "r") as f:
            for line in f:
                line = line.strip()
                if line.startswith("#"):
                    continue  # Skip header/comments
                parts = line.split("\t")
                # Keep only well-formed 9-column CDS rows.
                if len(parts) == 9 and parts[2] == "CDS":
                    gff_lines.append(line)

        with open(output_path, "w") as out:
            out.write("##gff-version 2\n")
            out.write("\n".join(gff_lines))
        print(f"GeneMark-HMM: GFF saved to {output_path}")

    # -----------------------------------------------------------------
    # Glimmer (one GFF per enabled run in glimmer_methods)
    # -----------------------------------------------------------------
    for method_folder, predict_file in glimmer_methods.items():
        input_path = os.path.join(folder, method_folder, predict_file)
        output_path = os.path.join(folder, "All", f"glimmer-{method_folder}.gff")

        if not os.path.exists(input_path):
            print(f"File not found: {input_path}")
            continue

        with open(input_path, "r") as f:
            lines = f.readlines()

        gff_lines = []
        current_seq_id = None

        for line in lines:
            line = line.strip()
            if line.startswith(">"):
                # Header row: the sequence ID is the first token after '>'.
                current_seq_id = line[1:].split()[0]
            elif line:
                parts = line.split()
                if len(parts) >= 5:
                    gene_id = parts[0]
                    start = int(parts[1])
                    end = int(parts[2])
                    strand_code = parts[3]
                    score = parts[4]
                    strand = '+' if '-' not in strand_code else '-'
                    # Glimmer lists coordinates in reading direction, so a
                    # minus-strand gene has start > end. GFF expects the
                    # left-most coordinate first, with direction carried only
                    # by the strand column, so swap for '-' genes.
                    # NOTE(review): genes wrapping around the origin of a
                    # circular genome will still come out with start > end.
                    if strand == '-':
                        start, end = end, start
                    gff_line = f"{current_seq_id}\tGlimmer\tCDS\t{start}\t{end}\t.\t{strand}\t0\tGeneID={gene_id};Score={score}"
                    gff_lines.append(gff_line)

        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "w") as out:
            out.write("\n".join(gff_lines))

        print(f"Glimer\t: GFF written for {folder}/{method_folder} to {output_path}")

GeneMark-HMM: GFF saved to Ecoli/All/genemark-hmm-2.5m.gff
GeneMark-HMM: GFF saved to Halobacterium/All/genemark-hmm-2.5m.gff
GeneMark-HMM: GFF saved to Mycobacterium/All/genemark-hmm-2.5m.gff
GeneMark-HMM: GFF saved to Natronomonas/All/genemark-hmm-2.5m.gff
GeneMark-HMM: GFF saved to Roseobacter/All/genemark-hmm-2.5m.gff


In [None]:
# END