In [None]:
import pandas as pd
import numpy as np
from os import listdir

In [None]:
# Column schema used when building the combined results table.
# NOTE(review): "indipendent_evalue" presumably mirrors the header spelling of
# the upstream TSV files -- confirm before "fixing" the typo here.
cols = [
    "HMM_model",
    "orf_id",
    "HMM_length",
    "hmm_start",
    "hmm_end",
    "identical_match",
    "conserved_match",
    "perc_identical",
    "perc_conserved",
    "bitscore",
    "indipendent_evalue",
    "conditional_evalue",
    "dels",
    "sequence_match",
    "contig",
    "mags",
    "sample",
]


In [None]:
# Rows are kept only when their perc_conserved is strictly greater than this
# value; it is also embedded in every output file name.
threshold = 80  # choose 70 or 80 or 90

# When True, the "*_cutGA" result folders are read and "_cutGA" is added to
# the output file names.
cutGA = True

In [None]:
# Gather the per-MAG "*_presence_results.tsv" tables of every sample, keep the
# rows whose perc_conserved exceeds `threshold`, and write the combined table
# (sorted by perc_identical, best first) to a single TSV.
samples = ["CC4", "CD4", "MC4", "MD4", "FC4", "FD4"]

master_folder = "/mnt/thor/bigdata/ideARG_2020/results_coassembly_megahit_MAGs_90_10/"

# Filtered tables are collected here and concatenated once at the end;
# concatenating inside the loop re-copies all accumulated rows for every file.
frames = []

for s in samples:
    folder_sample_result = f"{master_folder}/{s}/mags_hmms_AMRFinderPlus_CONSERVEDMATCHES"
    if cutGA:
        folder_sample_result += "_cutGA"

    # listdir() returns entries in arbitrary order; sorting makes the row order
    # of the combined table reproducible between runs.
    for mag_file in sorted(listdir(folder_sample_result)):
        if not mag_file.endswith("_presence_results.tsv"):
            continue

        df = pd.read_csv(f"{folder_sample_result}/{mag_file}", sep="\t")
        frames.append(df[df["perc_conserved"] > threshold])

if frames:
    final = pd.concat(frames, ignore_index=True)
    # Expected columns first (created as NaN when absent from the files),
    # followed by any additional columns found in the input tables.
    extra_cols = [c for c in final.columns if c not in cols]
    final = final.reindex(columns=cols + extra_cols)
else:
    # Nothing matched: keep an empty table with the expected schema.
    final = pd.DataFrame(None, columns=cols)


percCons = f"_percConserved{threshold}"

filename = f"conserved_matches{percCons}_cutGA.tsv" if cutGA else f"conserved_matches{percCons}.tsv"

# mergesort is stable, so rows with equal perc_identical keep their load order.
final = final.sort_values(by="perc_identical", ascending=False, kind="mergesort")

final.to_csv(f"/mnt/thor/bigdata/ideARG_2020/report_scripts_MAGs_90_10/AMRFPlus_conserved_matches_onMAGs/{filename}",
        sep="\t", index=False, header=True)

In [None]:
# Sanity check: (rows, columns) of the combined table.
final.shape

In [None]:
# Preview the first rows of the combined table.
final.head()

In [None]:
# Load the tab-separated NCBIfam-AMRFinder table.
AMRFinder  = pd.read_csv("../HMMS_AMRFinderPlus/NCBIfam-AMRFinder.tsv", sep="\t")
# Print (rows, columns), then preview the first three records.
print(AMRFinder.shape)
AMRFinder.head(3)

In [None]:
# Header-less two-column mapping: raw HMM file reference -> HMM accession.
AMR_filenames = pd.read_csv("../HMMS_AMRFinderPlus/mapping_HMMsfile2ACC.tsv", sep="\t", names=["hmm_raw_filename", "#hmm_accession"])

# Keep the text between "HMM/" and ".HMM:ACC" of each raw reference.
# Computed once with Series.map instead of running the same row-wise
# DataFrame.apply twice for two identical columns.
hmm_names = AMR_filenames["hmm_raw_filename"].map(
    lambda raw: raw.split("HMM/")[1].split(".HMM:ACC")[0]
)

# "name" and "filename" hold the same value; the raw column is dropped, so the
# resulting columns are: "#hmm_accession", "name", "filename".
AMR_filenames = (
    AMR_filenames
    .assign(name=hmm_names, filename=hmm_names)
    .drop(columns="hmm_raw_filename")
)
AMR_filenames

In [None]:
# Attach the name/filename columns to every AMRFinder row (left join on the
# accession), then strip the leading "#" from the accession column name.
AMR_mapping = (
    AMRFinder
    .merge(AMR_filenames, how="left", on="#hmm_accession")
    .rename(columns={"#hmm_accession": "hmm_accession"})
)
AMR_mapping

In [None]:
# Save the accession/name mapping as a tab-separated file (header, no index).
AMR_mapping.to_csv("./AMRFPlus_conserved_matches_onMAGs/AMRFinderPlus_mapping.tsv", sep="\t", header=True, index=False)

In [None]:
# List the distinct values of the "class" column.
AMR_mapping["class"].unique()

In [None]:
def get_sample_results(final_df, my_sample, amr_mapping=None):
    """Build a MAG x HMM-accession hit-count matrix for one sample.

    Parameters
    ----------
    final_df : pandas.DataFrame
        Hit table containing at least the columns "HMM_model", "mags" and
        "sample".
    my_sample : str
        Sample to extract; one of "CC4", "CD4", "MC4", "MD4", "FC4", "FD4".
    amr_mapping : pandas.DataFrame, optional
        Table with "name" and "hmm_accession" columns used to translate each
        HMM_model into its accession. Defaults to the notebook-level
        ``AMR_mapping`` when omitted.

    Returns
    -------
    pandas.DataFrame
        One row per MAG (column "MAGs") and one column per hmm_accession; each
        cell holds the number of hits, with 0 where a MAG has none.

    Raises
    ------
    AssertionError
        If ``my_sample`` is not one of the six known samples.
    """
    assert my_sample in ["CC4", "CD4", "MC4", "MD4", "FC4", "FD4"]

    # Fall back to the notebook-level mapping so existing two-argument calls
    # keep working.
    if amr_mapping is None:
        amr_mapping = AMR_mapping

    # A repeated (name, accession) row would multiply rows in the merge below
    # and make pivot() fail with "duplicate entries".
    name_to_accession = amr_mapping.loc[:, ["name", "hmm_accession"]].drop_duplicates()

    sample_hits = final_df.loc[final_df["sample"] == my_sample, ["HMM_model", "mags", "sample"]]

    # After count(), the "sample" column holds the number of hits per
    # (HMM_model, mags) pair.
    hit_counts = sample_hits.groupby(by=["HMM_model", "mags"]).count().reset_index()
    hit_counts = pd.merge(hit_counts, name_to_accession, how="left", left_on="HMM_model", right_on="name")
    hit_counts = hit_counts.loc[:, ["mags", "hmm_accession", "sample"]]

    matrix = hit_counts.pivot(index="mags", columns="hmm_accession", values="sample").reset_index()
    matrix.index.name = None
    matrix.columns.name = None

    # MAG/accession combinations without hits come out of the pivot as NaN.
    matrix = matrix.fillna(0)

    return matrix.rename(columns={"mags": "MAGs"})

In [None]:
# CC4: per-MAG hit matrix, left-joined on "MAGs" with the CC4 table from the
# gtdbtk folder.
CC4 = get_sample_results(final, my_sample="CC4").merge(
    pd.read_csv("./COASSEMBLY_MEGAHIT_binning_based_gtdbtk/CC4.tsv", sep="\t"),
    how="left",
    on="MAGs",
)

# The output name carries the threshold tag and, when cutGA is set, "_cutGA".
CC4.to_csv(
    f"./AMRFPlus_conserved_matches_onMAGs/CC4{percCons}_cutGA.tsv"
    if cutGA
    else f"./AMRFPlus_conserved_matches_onMAGs/CC4{percCons}.tsv",
    sep="\t",
    header=True,
    index=False,
)

CC4

In [None]:
# CD4: per-MAG hit matrix, left-joined on "MAGs" with the CD4 table from the
# gtdbtk folder.
CD4 = get_sample_results(final, my_sample="CD4").merge(
    pd.read_csv("./COASSEMBLY_MEGAHIT_binning_based_gtdbtk/CD4.tsv", sep="\t"),
    how="left",
    on="MAGs",
)

# The output name carries the threshold tag and, when cutGA is set, "_cutGA".
CD4.to_csv(
    f"./AMRFPlus_conserved_matches_onMAGs/CD4{percCons}_cutGA.tsv"
    if cutGA
    else f"./AMRFPlus_conserved_matches_onMAGs/CD4{percCons}.tsv",
    sep="\t",
    header=True,
    index=False,
)

CD4

In [None]:
# MC4: per-MAG hit matrix, left-joined on "MAGs" with the MC4 table from the
# gtdbtk folder.
MC4 = get_sample_results(final, my_sample="MC4").merge(
    pd.read_csv("./COASSEMBLY_MEGAHIT_binning_based_gtdbtk/MC4.tsv", sep="\t"),
    how="left",
    on="MAGs",
)

# The output name carries the threshold tag and, when cutGA is set, "_cutGA".
MC4.to_csv(
    f"./AMRFPlus_conserved_matches_onMAGs/MC4{percCons}_cutGA.tsv"
    if cutGA
    else f"./AMRFPlus_conserved_matches_onMAGs/MC4{percCons}.tsv",
    sep="\t",
    header=True,
    index=False,
)

MC4

In [None]:
# MD4: per-MAG hit matrix, left-joined on "MAGs" with the MD4 table from the
# gtdbtk folder.
MD4 = get_sample_results(final, my_sample="MD4").merge(
    pd.read_csv("./COASSEMBLY_MEGAHIT_binning_based_gtdbtk/MD4.tsv", sep="\t"),
    how="left",
    on="MAGs",
)

# The output name carries the threshold tag and, when cutGA is set, "_cutGA".
MD4.to_csv(
    f"./AMRFPlus_conserved_matches_onMAGs/MD4{percCons}_cutGA.tsv"
    if cutGA
    else f"./AMRFPlus_conserved_matches_onMAGs/MD4{percCons}.tsv",
    sep="\t",
    header=True,
    index=False,
)

MD4

In [None]:
# FC4: per-MAG hit matrix, left-joined on "MAGs" with the FC4 table from the
# gtdbtk folder.
FC4 = get_sample_results(final, my_sample="FC4").merge(
    pd.read_csv("./COASSEMBLY_MEGAHIT_binning_based_gtdbtk/FC4.tsv", sep="\t"),
    how="left",
    on="MAGs",
)

# The output name carries the threshold tag and, when cutGA is set, "_cutGA".
FC4.to_csv(
    f"./AMRFPlus_conserved_matches_onMAGs/FC4{percCons}_cutGA.tsv"
    if cutGA
    else f"./AMRFPlus_conserved_matches_onMAGs/FC4{percCons}.tsv",
    sep="\t",
    header=True,
    index=False,
)

FC4

In [None]:
# FD4: per-MAG hit matrix, left-joined on "MAGs" with the FD4 table from the
# gtdbtk folder.
FD4 = get_sample_results(final, my_sample="FD4").merge(
    pd.read_csv("./COASSEMBLY_MEGAHIT_binning_based_gtdbtk/FD4.tsv", sep="\t"),
    how="left",
    on="MAGs",
)

# The output name carries the threshold tag and, when cutGA is set, "_cutGA".
FD4.to_csv(
    f"./AMRFPlus_conserved_matches_onMAGs/FD4{percCons}_cutGA.tsv"
    if cutGA
    else f"./AMRFPlus_conserved_matches_onMAGs/FD4{percCons}.tsv",
    sep="\t",
    header=True,
    index=False,
)

FD4