In [None]:
import pandas as pd
from os import listdir
import numpy as np
from Bio.SeqIO.FastaIO import SimpleFastaParser
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

In [None]:
# Identifiers of the samples whose HMM presence tables are loaded below.
samples = ["CC4", "CD4", "MC4", "MD4", "FC4", "FD4"]

# One filtered DataFrame per sample, in the same order as `samples`.
hmms = []

for s in samples:
    # Tab-separated presence table of the current sample.
    base = f"../../results_coassembly_megahit_MAGs_90_10/{s}/ASSEMBLY_hmms_AMRFinderPlus_CONSERVEDMATCHES_cutGA/presence_results.tsv"

    rawdf = pd.read_csv(base, sep="\t")
    # Keep only the rows whose `perc_identical` value is at least 50.
    rawdf = rawdf.loc[rawdf["perc_identical"].ge(50)]

    hmms.append(rawdf)



In [None]:
# Flatten the distinct HMM_model values of every per-sample table into one
# list; a model found in several samples appears once per sample here.
all_hits = [hit for table in hmms for hit in table["HMM_model"].unique()]

# Deduplicate across samples and sort, then report how many models remain.
unique_all_hit = sorted(set(all_hits))
print(len(unique_all_hit))

# Single-column frame listing every distinct model.
m_hits = pd.DataFrame(data=unique_all_hit, columns=["HMM_model"])
m_hits

In [None]:
# Stack the per-sample hit tables row-wise into one frame with a fresh index.
master = pd.concat(objs=hmms, axis=0, ignore_index=True)
master

In [None]:
# Persist the combined hit table as a tab-separated file: column names are
# written, the row index is not.
master.to_csv(
    "./AMRFinderPlus_tables_data/conserved_matches_percIdentity50_cutGA.tsv",
    sep="\t",
    index=False,
    header=True,
)

In [None]:
# For every (HMM_model, sample) pair, count the rows with a non-null orf_id.
# The count column keeps the name "orf_id".
grp = (
    master
    .groupby(["HMM_model", "sample"])["orf_id"]
    .count()
    .reset_index()
)

In [None]:
# Display the per-(HMM_model, sample) orf_id counts held in `grp`.
grp

In [None]:
# Load the tab-separated NCBIfam-AMRFinder table, then show its dimensions
# and its first three rows.
AMRFinder = pd.read_csv(
    "../../HMMS_AMRFinderPlus/NCBIfam-AMRFinder.tsv",
    sep="\t",
)
print(AMRFinder.shape)
AMRFinder.head(n=3)

In [None]:
# Load the header-less, tab-separated mapping file; its two columns are
# labelled as the raw HMM file reference and the HMM accession.
AMR_filenames = pd.read_csv(
    "../../HMMS_AMRFinderPlus/mapping_HMMsfile2ACC.tsv",
    sep="\t",
    names=["hmm_raw_filename", "#hmm_accession"],
)

# Extract the text between "HMM/" and ".HMM:ACC" from each raw reference.
# This is computed once with Series.map; the previous code ran the same
# row-wise DataFrame.apply twice to produce two identical columns.
hmm_file_stems = AMR_filenames["hmm_raw_filename"].map(
    lambda raw: raw.split("HMM/")[1].split(".HMM:ACC")[0]
)

# NOTE(review): "name" and "filename" hold the same value, as they always did
# in this cell — confirm whether both columns are really needed.
AMR_filenames["name"] = hmm_file_stems
AMR_filenames["filename"] = hmm_file_stems

# The raw reference is no longer needed once the stem has been extracted.
del AMR_filenames["hmm_raw_filename"]
AMR_filenames

In [None]:
# Attach the file-derived names to the AMRFinder table by accession (left
# join keeps every AMRFinder row), then drop the leading "#" from the
# accession column name.
AMR_mapping = (
    AMRFinder
    .merge(AMR_filenames, how="left", on="#hmm_accession")
    .rename(columns={"#hmm_accession": "hmm_accession"})
)
AMR_mapping

In [None]:
# Look up the accession of every counted HMM model by matching HMM_model
# against the mapping's "name" column. The left join keeps every row of `grp`;
# models without a match get a missing hmm_accession.
merged = grp.merge(
    AMR_mapping[["name", "hmm_accession"]],
    how="left",
    left_on="HMM_model",
    right_on="name",
)
merged

In [None]:
# Keep only the three columns needed to build the sample-by-accession matrix.
merged = merged[["sample", "hmm_accession", "orf_id"]]
merged

In [None]:
# Reshape the long (sample, hmm_accession, count) table into a wide one:
# one row per sample, one column per accession, cells holding the orf_id count.
matrix = (
    merged
    .pivot(index="sample", columns="hmm_accession", values="orf_id")
    # Accessions absent from a sample come out of the pivot as NaN; record
    # them as a count of 0. fillna is the direct call for this (the former
    # replace(np.nan, 0, regex=True) passed a regex flag that has no meaning
    # for a NaN target).
    .fillna(0)
    # Turn the "sample" index back into an ordinary column.
    .reset_index()
)
# Drop the "hmm_accession" label that pivot leaves on the column axis.
matrix.columns.name = None

matrix

In [None]:
# Sampling site, keyed by the first character of the sample identifier.
macro_sample_location = {
    "C": "Suna Canottieri",
    "M": "Teatro Maggiore",
    "F": "Fondo Toce",
}

# Both annotations depend only on the "sample" column, so map over that Series
# directly instead of running a row-wise DataFrame.apply(axis=1).
# An unknown first character still raises KeyError, and an identifier shorter
# than two characters still raises IndexError.
matrix["macro_sample_location"] = matrix["sample"].map(
    lambda sample_id: macro_sample_location[sample_id[0]]
)
# Second character "C" is labelled "coast"; anything else is labelled "deep".
matrix["coast_deep_shapes"] = matrix["sample"].map(
    lambda sample_id: "coast" if sample_id[1] == "C" else "deep"
)

In [None]:
# Persist the annotated sample-by-accession matrix as a tab-separated file:
# column names are written, the row index is not.
matrix.to_csv(
    "./AMRFinderPlus_tables_data/SAMPLES_percIdentity50.tsv",
    sep="\t",
    index=False,
    header=True,
)