# In [11]:
import pandas as pd
import re
import gzip

# Define the input and output file paths
gtf_file = r"C:\Users\brschroy\OneDrive - Vrije Universiteit Brussel\Documenten\PhD_Haloferax_mediterranei_2023\8. Research\TetR - ChipSeq\Genome and GTF\Haloferax_mediterranei_atcc_33500_gca_000306765.ASM30676v2.59.gtf.gz"
output_bed_file = r"C:\Users\brschroy\OneDrive - Vrije Universiteit Brussel\Documenten\PhD_Haloferax_mediterranei_2023\8. Research\TetR - ChipSeq\Genome and GTF\output.bed"

# Read the gzipped GTF file. There is no header row, so the nine column names
# are supplied explicitly.
# NOTE: comment='#' drops lines that start with '#', but pandas also cuts any
# data line at the first '#', so an attribute value containing '#' would be
# truncated.
with gzip.open(gtf_file, 'rt') as f:
    gtf_df = pd.read_csv(f, sep='\t', comment='#', header=None, names=[
        'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Attributes'
    ])

# Keep only the rows where Feature is 'gene'. The explicit copy makes gtf_df an
# independent DataFrame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
gtf_df = gtf_df[gtf_df['Feature'] == 'gene'].copy()

# Function to extract gene_name (or, failing that, gene_id) from the attribute field
def extract_gene_name_or_id(attribute):
    """Return the gene_name from a GTF attribute string, else its gene_id.

    Args:
        attribute: The attribute text to search, e.g.
            'gene_id "X"; gene_name "Y";'. Non-string values (such as the
            NaN pandas produces for an empty field) are accepted.

    Returns:
        The quoted value of gene_name if present and non-empty, otherwise the
        quoted value of gene_id, otherwise None (also for non-string input).
    """
    # An empty column is parsed by pandas as float NaN; re.search would raise
    # TypeError on it, so treat any non-string as "nothing found".
    if not isinstance(attribute, str):
        return None

    # Patterns are tried in priority order; gene_id is only searched when
    # gene_name is absent.
    patterns = (r'gene_name "([^"]+)"', r'gene_id "([^"]+)"')
    for pattern in patterns:
        match = re.search(pattern, attribute)
        if match:
            return match.group(1)
    return None

# Derive a name for every gene row (gene_name, falling back to gene_id).
# assign() returns a new DataFrame, so nothing is written into a filtered
# slice of the parsed table (which would raise SettingWithCopyWarning).
gtf_df = gtf_df.assign(Name=gtf_df['Attributes'].apply(extract_gene_name_or_id))

# Select the six output columns in BED6 order
bed6_df = gtf_df[['Seqname', 'Start', 'End', 'Name', 'Score', 'Strand']].copy()

# Ensure start is 0-based for BED format (GTF is 1-based); End is left as is
bed6_df['Start'] = bed6_df['Start'] - 1

# A BED field must not be empty: when neither gene_name nor gene_id was found,
# write the '.' placeholder instead of a blank column.
bed6_df['Name'] = bed6_df['Name'].fillna('.')

# Every row gets '.' as its score. The whole column is replaced (rather than
# set through .loc) so this also works if Score was parsed as a numeric dtype.
bed6_df['Score'] = '.'

# Save as a tab-separated BED6 file without header or index
bed6_df.to_csv(output_bed_file, sep='\t', header=False, index=False)
