In [2]:
import re
import pandas as pd

def parse_genbank_v2(file_path):
    # Regex patterns for various fields
    locus_pattern = re.compile(r"LOCUS\s+(\S+)")
    accession_pattern = re.compile(r"ACCESSION\s+(\S+)")
    version_pattern = re.compile(r"VERSION\s+(\S+)")
    project_pattern = re.compile(r"BioProject:\s*(PRJNA\d+)")
    source_pattern = re.compile(r"\/isolation_source=\"([^\"]+)\"")

    data = []
    with open(file_path, 'r') as file:
        content = file.read()
        records = content.split('//')  # Split records, each ends with '//'

        for record in records:
            locus = locus_pattern.search(record)
            accession = accession_pattern.search(record)
            version = version_pattern.search(record)
            project = project_pattern.search(record)
            source = source_pattern.search(record)

            data.append({
                'Locus': locus.group(1) if locus else None,
                'Accession': accession.group(1) if accession else None,
                'Version': version.group(1) if version else None,
                'Project': project.group(1) if project else None,
                'Isolation Source': source.group(1) if source else None
            })

    return data

def simplify_source(text):
    if text is None:
        return None
    if "feces" in text:
        return "human feces"
    elif "water" in text or "seawater" in text:
        return "water"
    elif "soil" in text:
        return "soil"
    else:
        # Default to the raw text if no simplifications apply
        return text.split(',')[0].split(';')[0].strip()

# Load and parse the GenBank file
file_path = "C:\\Users\\DuYih\\Desktop\\sequence-PVC.gb"  # Update this path to your file location
genbank_data = parse_genbank_v2(file_path)

# Convert to DataFrame
df = pd.DataFrame(genbank_data)

# Apply the simplification function
df['Isolation Source Simplified'] = df['Isolation Source'].apply(simplify_source)

# Drop rows where all key fields (Locus, Accession, Version) are missing
df.dropna(subset=['Locus', 'Accession', 'Version'], how='all', inplace=True)

# Display the DataFrame
print(df.head())


      Locus Accession     Version Project  \
0  MT193413  MT193413  MT193413.1    None   
1  MT193412  MT193412  MT193412.1    None   
2  KT122326  KT122326  KT122326.1    None   
3  KT122322  KT122322  KT122322.1    None   
4  KT122301  KT122301  KT122301.1    None   

                                    Isolation Source  \
0                     patina on cave quartzitic rock   
1                     patina on cave quartzitic rock   
2  inundated soil of 155m of Shibaozhai,\n       ...   
3  inundated soil of 155m of Shibaozhai,\n       ...   
4  inundated soil of 155m of Shibaozhai,\n       ...   

      Isolation Source Simplified  
0  patina on cave quartzitic rock  
1  patina on cave quartzitic rock  
2                           water  
3                           water  
4                           water  


In [4]:
df.to_csv('C:\\Users\\DuYih\\Desktop\\sequence-PVC.csv', index=False)  # Save the DataFrame to a CSV file