In [None]:
import pandas as pd
import os

In [None]:
# This script filters AMGs in every .tsv file of a specified folder. The input files must be in the format
# generated by VIBRANT (VIBRANT_annotations.tsv). Note: after running this script, combine all filtered output
# files (e.g. with cat) and remove the rows whose 'AMG' column was set to 'lowscoreamg'.

def mark_amg_lowscores(df, *, flank_threshold=2, max_flank_vscore=0.25):
    """Relabel AMGs that are flanked by low v-score genes as 'lowscoreamg'.

    A row whose 'AMG' column equals 'AMG' is relabelled when the
    ``flank_threshold`` rows immediately before it AND the ``flank_threshold``
    rows immediately after it all
      * carry the same 'scaffold' value as the AMG row, and
      * have a non-missing 'KO v-score' that is <= ``max_flank_vscore``.

    Rows are addressed by position, so the frame may carry any index (the
    previous label-based lookup only worked with a default RangeIndex).

    Args:
        df: Table with at least the columns 'scaffold', 'AMG' and
            'KO v-score', ordered so that neighbouring genes are adjacent rows.
        flank_threshold: Number of flanking rows required on each side.
        max_flank_vscore: Highest 'KO v-score' a flanking row may have.

    Returns:
        The same DataFrame object; its 'AMG' column is updated in place.
    """
    if df.empty:
        return df

    n_rows = len(df)
    amg_col = df.columns.get_loc('AMG')
    scaffolds = df['scaffold'].tolist()

    # Non-numeric v-scores are treated as missing, i.e. never "low".
    vscores = pd.to_numeric(df['KO v-score'], errors='coerce')
    is_low = [bool(pd.notna(v) and v <= max_flank_vscore) for v in vscores]

    amg_positions = [pos for pos, label in enumerate(df['AMG']) if label == 'AMG']

    for pos in amg_positions:
        first, last = pos - flank_threshold, pos + flank_threshold
        if first < 0 or last >= n_rows:
            continue  # Too close to the table edge to have a full flank.

        flank = [*range(first, pos), *range(pos + 1, last + 1)]
        # Every flanking row must be low-scoring and on the AMG's scaffold;
        # a single exception on either side keeps the AMG label.
        if all(is_low[j] and scaffolds[j] == scaffolds[pos] for j in flank):
            df.iloc[pos, amg_col] = 'lowscoreamg'

    return df

def remove_edge_located_proteins(df):
    """Return a copy of *df* without the first and last row of every scaffold.

    Rows are grouped by the 'scaffold' column; for each group the index labels
    of its first and last row are collected and dropped. A scaffold with a
    single row therefore loses that row. The input frame is not modified.
    """
    edge_labels = [
        label
        for _scaffold, members in df.groupby('scaffold')
        for label in (members.index[0], members.index[-1])
    ]
    return df.drop(edge_labels)

def add_file_name(df, file_name):
    """Tag every row of *df* with its source file.

    Sets (or overwrites) the 'file_name' column with the given value on the
    passed frame itself and returns that same frame.
    """
    column = 'file_name'
    df[column] = file_name
    return df

def process_files_in_directory(directory):
    """Filter every annotation TSV in *directory* and write the result next to it.

    For each '*.tsv' file the table is read, AMGs are relabelled with
    mark_amg_lowscores, the first and last row of every scaffold are removed
    with remove_edge_located_proteins, a 'file_name' column is added, and the
    result is written to 'filtered_<original name>' in the same directory.

    Files whose name already starts with 'filtered_' are skipped: they are
    outputs of an earlier run, and processing them again would strip further
    edge rows and create 'filtered_filtered_*' files.

    Args:
        directory: Path of the folder containing the input TSV files.
    """
    output_prefix = "filtered_"

    # Sorted so that files are processed in a reproducible order.
    for file_name in sorted(os.listdir(directory)):
        if not file_name.endswith('.tsv'):  # Process only TSV files
            continue
        if file_name.startswith(output_prefix):  # Output of a previous run
            continue

        file_path = os.path.join(directory, file_name)
        df = pd.read_csv(file_path, sep='\t')
        updated_df = mark_amg_lowscores(df)
        filtered_df = remove_edge_located_proteins(updated_df)
        df_with_file_name = add_file_name(filtered_df, file_name)

        output_file_path = os.path.join(directory, f"{output_prefix}{file_name}")
        df_with_file_name.to_csv(output_file_path, sep='\t', index=False)


# Edit this path so it points at the folder holding the VIBRANT annotation TSV files.
directory_path = '/path/AMG_annotations'  # Directory containing your TSV files
# Writes one 'filtered_<name>' TSV into that same directory for every '*.tsv' file found.
process_files_in_directory(directory_path)


In [None]:
# Additional filtering (use the file with the combined outputs from the previous step)
file_path_combined_AMGs = "combined_AMG_filtered_annotations.tsv"
combined_AMGs_df = pd.read_csv(file_path_combined_AMGs, sep='\t', low_memory=False)

# Filter out non-AMG rows. The match is case-sensitive, so rows that were
# relabelled 'lowscoreamg' are removed here as well.
combined_AMGs_df = combined_AMGs_df[combined_AMGs_df['AMG'].notna()]
filtered_combined_AMGs_df = combined_AMGs_df[combined_AMGs_df['AMG'].str.contains('AMG')]

# Drop repeated header lines (rows whose 'protein' value is the column name itself)
filtered_combined_AMGs_df = filtered_combined_AMGs_df[filtered_combined_AMGs_df['protein'] != 'protein']
# Resetting the index after dropping rows
filtered_combined_AMGs_df = filtered_combined_AMGs_df.reset_index(drop=True)

# Convert 'KO v-score' and 'Pfam v-score' columns to numeric.
# The copy is taken from the AMG-only table: copying combined_AMGs_df here
# would silently discard the AMG and repeated-header filters applied above.
combined_AMGs_df_copy = filtered_combined_AMGs_df.copy()
combined_AMGs_df_copy['KO v-score'] = pd.to_numeric(combined_AMGs_df_copy['KO v-score'], errors='coerce')
combined_AMGs_df_copy['Pfam v-score'] = pd.to_numeric(combined_AMGs_df_copy['Pfam v-score'], errors='coerce')

# Filter out rows whose 'Pfam name' contains 'methylase' (case-insensitive);
# rows with a missing 'Pfam name' are kept.
is_methylase = combined_AMGs_df_copy['Pfam name'].str.contains('methylase', case=False, na=False)
filtered_combined_AMGs_df = combined_AMGs_df_copy[~is_methylase]

# Keep only AMGs whose KEGG and Pfam v-scores are both <= 1.
# Rows where either v-score is missing (or was not numeric) fail the comparison and are dropped too.
filtered_combined_AMGs_df = filtered_combined_AMGs_df[(filtered_combined_AMGs_df['KO v-score'] <= 1) &
                                                      (filtered_combined_AMGs_df['Pfam v-score'] <= 1)]

# Resetting the index after filtering rows
filtered_combined_AMGs_df = filtered_combined_AMGs_df.reset_index(drop=True)