In [28]:
# Filtering readbased KMA detection of AMR gene and VF
# Author: Dr. Haley Sanderson haley.sanderson@agr.gc.ca
# Copyright: Government of Canada
# License: MIT
# Version 0.1
import os
import pandas as pd

# Set the directory path
directory_path = './'

# Get a list of all files ending with "AMR.res"
amr_files = [f for f in os.listdir(directory_path) if f.endswith('AMR.res')]

# Initialize an empty DataFrame to store the concatenated data
combined_data = pd.DataFrame()

# Loop through each file and concatenate the data
for amr_file in amr_files:
    file_path = os.path.join(directory_path, amr_file)

    # Specify column names while reading the file
    gene_data = pd.read_csv(file_path, sep='\t', names=["#Template", "Score", "Expected", "Template_length", "Template_Identity", "Template_Coverage", "Query_Identity", "Query_Coverage", "Depth", "q_value", "p_value"])

    # Add a new column with the file name
    gene_data['FileName'] = amr_file

    # Filter out rows where '#Template' column contains 'RequiresSNPConfirmation'
    gene_data_noSNP = gene_data[~gene_data['#Template'].str.contains('RequiresSNPConfirmation', case=False)]
    
    # Concatenate the data to the combined DataFrame
    combined_data = pd.concat([combined_data, gene_data_noSNP], ignore_index=True)
    combined_data = combined_data[~combined_data['#Template'].str.contains('#template', case=False)]
# Display the combined DataFrame
print(combined_data)


                                            #Template     Score  Expected  \
0   MEG_997|Drugs|Aminoglycosides|Aminoglycoside_O...      5296       648   
1   MEG_6997|Drugs|Tetracyclines|Tetracycline_resi...     11826      1452   
2   MEG_7112|Drugs|Tetracyclines|Tetracycline_resi...      8327      1450   
3   MEG_3641|Drugs|MLS|MLS_resistance_ABC_efflux_p...      6424      1344   
4   MEG_3648|Drugs|Multi-drug_resistance|Multi-dru...      4183       728   
5   MEG_7112|Drugs|Tetracyclines|Tetracycline_resi...      7660      1738   
6   MEG_3896|Metals|Mercury_resistance|Mercury_res...      1349       297   
7   MEG_2566|Drugs|Trimethoprim|Dihydrofolate_redu...       736       332   
8   MEG_988|Drugs|Aminoglycosides|Aminoglycoside_O...      3691       609   
9   MEG_997|Drugs|Aminoglycosides|Aminoglycoside_O...     10080       602   
10  MEG_1000|Drugs|Aminoglycosides|Aminoglycoside_...      6402       607   
11  MEG_1004|Drugs|Aminoglycosides|Aminoglycoside_...      4091       569   

In [29]:
combined_data.to_csv("MG_AMR_read_mapping_80%_cutoff_nosnp.csv")

In [30]:
unique_file_names = combined_data['FileName'].unique()
unique_file_names

array(['SN005-20171106_AMR.res', 'SN005-20191104_AMR.res',
       'SN018-20210712_AMR.res', 'SN021-20211018_AMR.res',
       'SN005-20180430_AMR.res', 'SN010-20170704_AMR.res',
       'SN020-20201019_AMR.res', 'SN005-20160718_AMR.res',
       'SN019-20201102_AMR.res', 'SN020-20191021_AMR.res',
       'SN019-20181113_AMR.res', 'SN019-20190429_AMR.res',
       'SN019-20170704_AMR.res', 'SN024-20181113_AMR.res',
       'SN005-20191021_AMR.res', 'SN021-20180807_AMR.res',
       'SN006-20201102_AMR.res', 'SN006-20211004_AMR.res',
       'SN005-20201102_AMR.res', 'SN005-20181015_AMR.res'], dtype=object)

In [23]:
import os
import pandas as pd

# Set the directory path
directory_path = './'

# Get a list of all files ending with "AMR.res"
amr_files = [f for f in os.listdir(directory_path) if f.endswith('VFDB.res')]

# Initialize an empty DataFrame to store the concatenated data
combined_data = pd.DataFrame()

# Loop through each file and concatenate the data
for amr_file in amr_files:
    file_path = os.path.join(directory_path, amr_file)

    # Specify column names while reading the file
    gene_data = pd.read_csv(file_path, sep='\t', names=["#Template", "Score", "Expected", "Template_length", "Template_Identity", "Template_Coverage", "Query_Identity", "Query_Coverage", "Depth", "q_value", "p_value"])

    # Add a new column with the file name
    gene_data['FileName'] = amr_file

    # Filter out rows where '#Template' column contains 'RequiresSNPConfirmation'
    gene_data_noSNP = gene_data[~gene_data['#Template'].str.contains('RequiresSNPConfirmation', case=False)]
    
    # Concatenate the data to the combined DataFrame
    combined_data = pd.concat([combined_data, gene_data_noSNP], ignore_index=True)
    combined_data = combined_data[~combined_data['#Template'].str.contains('#template', case=False)]
# Display the combined DataFrame
print(combined_data)

                                            #Template     Score  Expected  \
0   VFG038340(gb|WP_005320774) (acrG) Type III sec...       844       955   
1   VFG010906(gb|WP_011213314) (csrA) carbon stora...       856       492   
2   VFG010906(gb|WP_011213314) (csrA) carbon stora...       489       381   
3   VFG038663(gb|WP_005298795) (fliO) flagellar bi...      1108       862   
4   VFG000863(gb|BAA94855) (east1) heat-stable ent...       187       430   
5   VFG001855(gb|WP_197535493) (htpB) Hsp60, 60K h...      8829      6075   
6   VFG010906(gb|WP_011213314) (csrA) carbon stora...       539       718   
7   VFG010906(gb|WP_011213314) (csrA) carbon stora...       732       348   
8   VFG038659(gb|WP_011705280) (fliN) flagellar mo...      2756      1364   
9   VFG001225(gb|NP_249099) (pilG) twitching motil...      3256      1416   
10  VFG038346(gb|WP_005320760) (acr1) Type III sec...      1241       751   
11  VFG010906(gb|WP_011213314) (csrA) carbon stora...      1117       661   

In [24]:
combined_data.to_csv("MG_VFDB_read_mapping_80%_cutoff_nosnp.csv")

In [25]:
unique_file_names = combined_data['FileName'].unique()

In [26]:
unique_file_names

array(['SN021-20210628_VFDB.res', 'SN018-20181113_VFDB.res',
       'SN024-20161107_VFDB.res', 'SN018-20190624_VFDB.res',
       'SN019-20201102_VFDB.res', 'SN018-20210712_VFDB.res',
       'SN020-20201019_VFDB.res', 'SN019-20210531_VFDB.res',
       'SN010-20190513_VFDB.res', 'SN024-20210517_VFDB.res',
       'SN005-20191021_VFDB.res', 'SN020-20180528_VFDB.res',
       'SN018-20191021_VFDB.res', 'SN018-20180807_VFDB.res',
       'SN021-20170704_VFDB.res'], dtype=object)