In [4]:
# Import functions
import pandas as pd
from sklearn.model_selection import train_test_split
import regex as re

# Preprocessing

## Load data

In [21]:
data_dir='~/PROJECTS/GaTech/FCGR_classifier/final_project/salmonella_data/data/'
kmc5_arrays='~/PROJECTS/GaTech/FCGR_classifier/final_project/salmonella_kmc5_arrays/'
kmc7_arrays='~/PROJECTS/GaTech/FCGR_classifier/final_project/salmonella_kmc7_arrays/'
mlst_results = data_dir + 'mlst_results_combined.tsv'
seqsero_results = data_dir + 'seqsero2_results.tsv'
quast_result = data_dir + 'quast_results_combined.tsv'
mlst_df = pd.read_csv(mlst_results, sep='\t', header=None)
seqsero_df = pd.read_csv(seqsero_results, sep='\t', header=None)
quast_df = pd.read_csv(quast_result, sep='\t')
duplicates_df = pd.read_csv('../assets/dups.tsv', header=None)

In [9]:
mlst_df.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,/scratch/salmonella/ncbi_dataset/data/GCA_0007...,senterica_achtman_2,62,aroC(32),dnaN(30),hemD(34),hisD(30),purE(24),sucA(31),thrA(32)
1,/scratch/salmonella/ncbi_dataset/data/GCA_0002...,senterica_achtman_2,-,aroC(32),dnaN(29),hemD(33),hisD(29),purE(24),sucA(30),"thrA(52,950)"


In [10]:
seqsero_df.tail(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
379871,GCA_021088405.1_PDT001197945.1_genomic.fna,$HOME/PROJECTS/GaTech/FCGR_classifier/seqsero2...,GCA_021088405.1_PDT001197945.1_genomic.fna,4,d,17,Salmonella enterica subspecies enterica (subsp...,"4:d:1,7",Schwarzengrund,
379872,GCA_021088425.1_ASM2108842v1_genomic.fna,$HOME/PROJECTS/GaTech/FCGR_classifier/seqsero2...,GCA_021088425.1_ASM2108842v1_genomic.fna,9,"g,m",-,Salmonella enterica subspecies enterica (subsp...,"9:g,m:-",Enteritidis,Detected Sdf I that is characteristic of commo...


In [11]:
quast_df.head(2)

Unnamed: 0,Assembly,# contigs (>= 0 bp),# contigs (>= 1000 bp),# contigs (>= 5000 bp),# contigs (>= 10000 bp),# contigs (>= 25000 bp),# contigs (>= 50000 bp),Total length (>= 0 bp),Total length (>= 1000 bp),Total length (>= 5000 bp),...,Total length (>= 50000 bp),# contigs,Largest contig,Total length,N50,N90,auN,L50,L90,# N's per 100 kbp
0,GCA_000006945.2_ASM694v2_genomic,2,2,2,2,2,2,4951383,4951383,4951383,...,4951383,2,4857450,4951383,4857450,4857450,4767081.0,1,1,0.0
1,GCA_000007545.1_ASM754v1_genomic,1,1,1,1,1,1,4791961,4791961,4791961,...,4791961,1,4791961,4791961,4791961,4791961,4791961.0,1,1,0.23


In [12]:
!ls $kmc5_arrays | head -2
!ls $kmc7_arrays | head -2

GCA_000006945.2_ASM694v2_genomic_k5_k5.npy
GCA_000007545.1_ASM754v1_genomic_k5_k5.npy
ls: write error: Broken pipe
GCA_000006945.2_ASM694v2_genomic_k7_k7.npy
GCA_000007545.1_ASM754v1_genomic_k7_k7.npy
ls: write error: Broken pipe


In [22]:
duplicates_df.head(2)

Unnamed: 0,0,1
0,0002b8807bca3d539de5e248fb83d39a,$HOME/PROJECTS/GaTech/FCGR_classifier/salmonel...
1,0002b8807bca3d539de5e248fb83d39a,$HOME/PROJECTS/GaTech/FCGR_classifier/salmonel...


## Remove duplicates

In [36]:
# Basename file path
duplicates_df.iloc[:, 1] = duplicates_df.iloc[:, 1].apply(lambda x: x.split('/')[-1].replace('_k5_k5.npy', ''))

# Randomly sort dataframe
duplicates_df = duplicates_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Iterate and randomly select samples to drop based on hash value
file_hashes = set()
samples_to_drop = []

for idx, row in duplicates_df.iterrows():
    hash_value = row[0]
    if hash_value in file_hashes:
        samples_to_drop.append(idx)
    else:
        file_hashes.add(hash_value)

# Drop from quast dataframe
original_count = quast_df.shape[0]
print(f"{len(samples_to_drop)} total samples dropped of {original_count}")
quast_df = quast_df.drop(index=samples_to_drop).reset_index(drop=True)
print(quast_df.shape[0]/original_count*100, "% of samples retained after duplicate filtering")

23826 total samples dropped of 568313
95.80759194317217 % of samples retained after duplicate filtering


In [30]:
duplicates_df.head(2)

Unnamed: 0,0,1
0,d24dd81492c4bda5cb79d4c92a9f06d0,GCF_016409265.1_ASM1640926v1_genomic
1,9a63952faa8c309848691f0395141f5a,GCA_049672895.1_FUJO3088_genomic


## Remove poor qualtiy genomes

In [None]:
# Remove very low quality assemblies
quast_df_filt=quast_df[quast_df['N50'] >= 5000]['Assembly']
print(quast_df_filt.shape[0]/quast_df.shape[0]*100, "% of samples retained after N50 filtering")

99.9735530875852 % of samples retained after N50 filtering


## MLST Pre-processing

In [38]:
# Remove samples with no MLST calls
mlst_df_filt = mlst_df[mlst_df.iloc[:, 2] != '-'].iloc[:, [0, 2]]
print(mlst_df_filt.shape[0]/mlst_df.shape[0]*100, "% of samples retained after mlst filtering")

# Basename the first column of mlst_df_filt
mlst_df_filt.iloc[:, 0] = mlst_df_filt.iloc[:, 0].apply(lambda x: x.split('/')[-1].replace('.fna', ''))

# Keep only samples present in both quast and mlst filtered dataframes
common_samples = set(quast_df_filt).intersection(set(mlst_df_filt.iloc[:, 0]))
mlst_df_filt = mlst_df_filt[mlst_df_filt.iloc[:, 0].isin(common_samples)]

# Keep only samples with 100 replicates of the same sequence type
mlst_counts = mlst_df_filt.iloc[:, 1].value_counts()
mlst_df_filt = mlst_df_filt[mlst_df_filt.iloc[:, 1].isin(mlst_counts[mlst_counts >= 100].index)]

print(mlst_df_filt.shape[0]/mlst_df.shape[0]*100, "% of samples retained after merging quast and mlst filtered dataframes")
print(mlst_df_filt.shape[0], "# samples retained after merging quast and mlst filtered dataframes")

# Randomly sample 100 samples from each sequence type
mlst_df_filt = mlst_df_filt.groupby(mlst_df_filt.columns[1]).apply(lambda x: x.sample(100, random_state=42)).reset_index(drop=True)
print(mlst_df_filt.shape[0]/mlst_df.shape[0]*100, "% of samples retained after sampling 50 samples from each sequence type")
print(mlst_df_filt.shape[0], "# samples retained after sampling 50 samples from each sequence type")

# Print number of unique sequence types
print(mlst_df_filt.iloc[:, 1].nunique(), "# unique sequence types")

# Split data into train, validation, and test sets
train_val_df, test_df = train_test_split(mlst_df_filt, test_size=0.2, stratify=mlst_df_filt.iloc[:, 1], random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, stratify=train_val_df.iloc[:, 1], random_state=42)
print(train_df.shape[0], "# samples in training set")
print(val_df.shape[0], "# samples in validation set")
print(test_df.shape[0], "# samples in test set")

# Save dataframes to csv
train_df.to_csv('../assets/mlst_train_set.csv', index=False, header=False)
val_df.to_csv('../assets/mlst_val_set.csv', index=False, header=False)
test_df.to_csv('../assets/mlst_test_set.csv', index=False, header=False)

98.45705018821108 % of samples retained after mlst filtering
78.07436137375038 % of samples retained after merging quast and mlst filtered dataframes
368778 # samples retained after merging quast and mlst filtered dataframes
4.911695339393914 % of samples retained after sampling 50 samples from each sequence type
23200 # samples retained after sampling 50 samples from each sequence type
232 # unique sequence types
14848 # samples in training set
3712 # samples in validation set
4640 # samples in test set


  mlst_df_filt = mlst_df_filt.groupby(mlst_df_filt.columns[1]).apply(lambda x: x.sample(100, random_state=42)).reset_index(drop=True)


## Serotype Pre-processing

In [39]:
# Filter for salmonella enterica and a full serotype call
seqsero2_df_filt = seqsero_df[
    (seqsero_df.iloc[:, 6] == 'Salmonella enterica subspecies enterica (subspecies I)')
    &
    (seqsero_df.iloc[:, 8].apply(lambda x: isinstance(x, str) and bool(re.fullmatch(r"[A-Za-z]+(?:[-_][A-Za-z]+)*", x))))
].iloc[:, [0, 8]]

# Basename the first column
seqsero2_df_filt.iloc[:, 0] = seqsero2_df_filt.iloc[:, 0].str.replace('.fna', '', regex=False)

# Keep only samples present in both quast and mlst filtered dataframes
common_samples = set(quast_df_filt).intersection(set(seqsero2_df_filt.iloc[:, 0]))
seqsero2_df_filt = seqsero2_df_filt[seqsero2_df_filt.iloc[:, 0].isin(common_samples)]

# Keep only samples with 100 replicates of the same sequence type
serotype_counts = seqsero2_df_filt.iloc[:, 1].value_counts()
seqsero2_df_filt = seqsero2_df_filt[seqsero2_df_filt.iloc[:, 1].isin(serotype_counts[serotype_counts >= 100].index)]

print(seqsero2_df_filt.shape[0]/seqsero_df.shape[0]*100, "% of samples retained after merging quast and serotype filtered dataframes")
print(seqsero2_df_filt.shape[0], "# samples retained after merging quast and mlst filtered dataframes")

# Randomly sample 100 samples from each sequence type
seqsero2_df_filt = seqsero2_df_filt.groupby(seqsero2_df_filt.columns[1]).apply(lambda x: x.sample(100, random_state=42)).reset_index(drop=True)
print(seqsero2_df_filt.shape[0]/seqsero_df.shape[0]*100, "% of samples retained after sampling 50 samples from each serotype")
print(seqsero2_df_filt.shape[0], "# samples retained after sampling 100 samples from each serotype type")

# Print number of unique sequence types
print(seqsero2_df_filt.iloc[:, 1].nunique(), "# unique serotypes")

# Split data into train, validation, and test sets
train_val_df, test_df = train_test_split(seqsero2_df_filt, test_size=0.2, stratify=seqsero2_df_filt.iloc[:, 1], random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, stratify=train_val_df.iloc[:, 1], random_state=42)
print(train_df.shape[0], "# samples in training set")
print(val_df.shape[0], "# samples in validation set")
print(test_df.shape[0], "# samples in test set")

# Save dataframes to csv
train_df.to_csv('../assets/serotype_train_set.csv', index=False, header=False)
val_df.to_csv('../assets/serotype_val_set.csv', index=False, header=False)
test_df.to_csv('../assets/serotype_test_set.csv', index=False, header=False)

64.14459569382399 % of samples retained after merging quast and serotype filtered dataframes
243668 # samples retained after merging quast and mlst filtered dataframes
2.737757092501968 % of samples retained after sampling 50 samples from each serotype
10400 # samples retained after sampling 100 samples from each serotype type
104 # unique serotypes
6656 # samples in training set
1664 # samples in validation set
2080 # samples in test set


  seqsero2_df_filt = seqsero2_df_filt.groupby(seqsero2_df_filt.columns[1]).apply(lambda x: x.sample(100, random_state=42)).reset_index(drop=True)


## Subspecies Pre-processing

In [None]:
# Filter for salm
subspecies_df_filt = seqsero_df[(seqsero_df.iloc[:, 6] != '-')].iloc[:, [0, 6]]

# Basename the first column
subspecies_df_filt.iloc[:, 0] = subspecies_df_filt.iloc[:, 0].str.replace('.fna', '', regex=False)

# Keep only samples present in both quast and mlst filtered dataframes
common_samples = set(quast_df_filt).intersection(set(subspecies_df_filt.iloc[:, 0]))
subspecies_df_filt = subspecies_df_filt[subspecies_df_filt.iloc[:, 0].isin(common_samples)]

# Keep only samples with 100 replicates of the same sequence type
serotype_counts = subspecies_df_filt.iloc[:, 1].value_counts()
subspecies_df_filt = subspecies_df_filt[subspecies_df_filt.iloc[:, 1].isin(serotype_counts[serotype_counts >= 100].index)]

print(subspecies_df_filt.shape[0]/seqsero_df.shape[0]*100, "% of samples retained after merging quast and serotype filtered dataframes")
print(subspecies_df_filt.shape[0], "# samples retained after merging quast and subspecies filtered dataframes")

# Randomly sample 100 samples from each sequence type
subspecies_df_filt = subspecies_df_filt.groupby(subspecies_df_filt.columns[1]).apply(lambda x: x.sample(100, random_state=42)).reset_index(drop=True)
print(subspecies_df_filt.shape[0]/seqsero_df.shape[0]*100, "% of samples retained after sampling 50 samples from each serotype")
print(subspecies_df_filt.shape[0], "# samples retained after sampling 100 samples from each serotype type")

# Print number of unique sequence types
print(subspecies_df_filt.iloc[:, 1].nunique(), "# unique subspecies")

# Split data into train, validation, and test sets
train_val_df, test_df = train_test_split(subspecies_df_filt, test_size=0.2, stratify=subspecies_df_filt.iloc[:, 1], random_state=42)
train_df, val_df = train_test_split(train_val_df, test_size=0.2, stratify=train_val_df.iloc[:, 1], random_state=42)
print(train_df.shape[0], "# samples in training set")
print(val_df.shape[0], "# samples in validation set")
print(test_df.shape[0], "# samples in test set")

# Save dataframes to csv
train_df.to_csv('../assets/subspecies_train_set.csv', index=False, header=False)
val_df.to_csv('../assets/subspecies_val_set.csv', index=False, header=False)
test_df.to_csv('../assets/subspecies_test_set.csv', index=False, header=False)

81.09368130927967 % of samples retained after merging quast and serotype filtered dataframes
308053 # samples retained after merging quast and mlst filtered dataframes
0.15794752456742123 % of samples retained after sampling 50 samples from each serotype
600 # samples retained after sampling 100 samples from each serotype type
6 # unique subspecies
384 # samples in training set
96 # samples in validation set
120 # samples in test set


  subspecies_df_filt = subspecies_df_filt.groupby(subspecies_df_filt.columns[1]).apply(lambda x: x.sample(100, random_state=42)).reset_index(drop=True)
