In [3]:
import pandas as pd

# Cell to parse a FASTA file and produce a two-column table: sequence, identifier

def parse_sequence_txt(path):
    """Yield (identifier, sequence) pairs from a FASTA file."""

    f = open(path, 'r')
    f.readline()
    data = dict()
    data['genus'] = []
    data['species'] = []
    data['identifier'] = []
    data['is_complete'] = []
    data['sequence'] = []
    for line in f:
        line = line.strip().split('\t')
        long = line[0].split(' ')
        data['genus'].append(long[1])
        data['species'].append(long[2])
        data['identifier'].append(long[0])
        data['is_complete'].append(line[1].strip(' '))
        data['sequence'].append(line[2].strip())
    f.close()
    df = pd.DataFrame(data)
    df = df[['genus','species','sequence','identifier','is_complete']]
    return df

# Path to your FASTA file (you wrote "seqence.txt")
input_path = 'lossers/data/sequence.txt'
df = parse_sequence_txt(input_path)
df.head()

Unnamed: 0,genus,species,sequence,identifier,is_complete
0,Alitibacter,langaaensis,ATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGGCT...,NR_118751.1,partial sequence
1,Alitibacter,langaaensis,ATTGAACGCTGGCGGCAGGCTTAACACATGCAAGTCGAACGGTAAC...,NR_042885.1,partial sequence
2,Roseovarius,maritimus,CAACTTGAGAGTTTGATCCTGGCTCAGAACGAACGCTGGCGGCAGG...,NR_200035.1,complete sequence
3,Roseovarius,roseus,CAACTTGAGAGTTTGATCCTGGCTCAGAACGAACGCTGGCGGCAGG...,NR_200034.1,complete sequence
4,Planosporangium,spinosum,TTGTTGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTG...,NR_200033.1,complete sequence


In [4]:
df['full_name'] = df['genus'] + '_' + df['species']
# df['label'] = df['full_name'].astype('category').cat.codes
num_labels = df['full_name'].nunique()
print(f'Number of unique labels: {num_labels}')

Number of unique labels: 20797


In [5]:
df['length'] = df['sequence'].str.len()
df = df[df['is_complete'].isin(['partial sequence', 'complete sequence'])]

# keep only full_name values that appear at least twice
counts = df['full_name'].value_counts()
valid_full_names = counts[counts >= 2].index
df = df[df['full_name'].isin(valid_full_names)].reset_index(drop=True)

print(f"Rows after filtering: {len(df)}")
print(f"Unique full_name after filtering: {df['full_name'].nunique()}")
# sanity check: every remaining full_name appears at least twice
assert df['full_name'].value_counts().min() >= 2

df = df[(df['length'] > 1300) & (df['length'] < 1600)].reset_index(drop=True)

Rows after filtering: 11135
Unique full_name after filtering: 4203


In [7]:
# filter out genera with fewer than 10 observations
genus_counts = df['genus'].value_counts()
valid_genus = genus_counts[genus_counts >= 10].index

print(f"Genera before: {len(genus_counts)}, genera kept (>=10): {len(valid_genus)}")
df = df[df['genus'].isin(valid_genus)].reset_index(drop=True)
print(f"Rows after genus filter: {len(df)}")
# sanity check: every remaining genus appears at least 10 times
assert df['genus'].value_counts().min() >= 10

Genera before: 1349, genera kept (>=10): 222
Rows after genus filter: 6779


In [8]:
df['genus_label'] = df['genus'].astype('category').cat.codes
num_genus_labels = df['genus_label'].nunique()
print(f'Number of unique genus labels: {num_genus_labels}')
df.head()

Number of unique genus labels: 222


Unnamed: 0,genus,species,sequence,identifier,is_complete,full_name,length,genus_label
0,Kitasatospora,hibisci,TTCACGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTG...,NR_200017.1,complete sequence,Kitasatospora_hibisci,1517,99
1,Peterkaempfera,podocarpi,TTCACGGAGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTG...,NR_200001.1,complete sequence,Peterkaempfera_podocarpi,1516,166
2,Streptomyces,citrinus,AGAGTTTGATCCTGGCTCAGGACGAACGCTGGCGGCGTGCTTAACA...,NR_199987.1,partial sequence,Streptomyces_citrinus,1489,205
3,Dickeya,ananatis,AAATTGAAGAGTTTGATCATGGCTCAGATTGAACGCTGGCGGCAGG...,NR_199979.1,complete sequence,Dickeya_ananatis,1542,63
4,Microbacterium,wangruii,AGAGTTTGATCATGGCTCAGGATGAACGCTGGCGGCGTGCTTAACA...,NR_199966.1,partial sequence,Microbacterium_wangruii,1487,130


In [9]:
df['species_label'] = df['species'].astype('category').cat.codes
num_species_labels = df['species_label'].nunique()
print(f'Number of unique species labels: {num_species_labels}')

Number of unique species labels: 2198


In [12]:
df['genus'].value_counts().head(20)

genus
Streptomyces       1010
Nocardia            196
Paenibacillus       189
Vibrio              148
Pseudomonas         147
Streptococcus       129
Clostridium         114
Corynebacterium      90
Enterococcus         89
Bacillus             84
Halorubrum           70
Mycobacterium        68
Shewanella           67
Haloarcula           66
Flavobacterium       64
Campylobacter        63
Kitasatospora        62
Lactobacillus        61
Acetobacter          59
Sphingomonas         59
Name: count, dtype: int64

In [14]:
df.to_csv('lossers/data/sequence-cleaner.tsv',sep='\t',header=True,index=False)