## Imports

In [None]:
import pandas as pd

## Functions

In [None]:
def data_filter(df_raw: pd.DataFrame, column1: str, df_filter: pd.DataFrame, column2: str, name: str) -> pd.DataFrame:
    """
    Keep only the rows of ``df_raw`` whose key also appears in ``df_filter``.

    Keys from both frames are compared as strings, so the two columns can be
    matched even when they were loaded with different dtypes. Missing keys
    never match. The filtered frame is written to
    ``../data/interim/filtered_{name}_dataset.csv`` and returned.

    Neither input DataFrame is modified.

    Parameters:
        df_raw (DataFrame): Dataframe with all the information.
        column1 (str): Column of ``df_raw`` holding the key to match on.
        df_filter (DataFrame): Dataframe whose keys define what is kept.
        column2 (str): Column of ``df_filter`` holding the key to match on.
        name (str): Label inserted into the output file name.

    Returns:
        DataFrame: Independent copy of the matching rows of ``df_raw``
        (original index preserved), with ``column1`` converted to strings.

    Raises:
        KeyError: If ``column1`` / ``column2`` is not a column of its frame.
        OSError: If the ``../data/interim`` directory does not exist.
    """

    raw_keys = df_raw[column1]

    # Drop missing filter keys *before* casting: astype(str) would otherwise
    # turn them into literal text (e.g. "nan") that matches missing keys in
    # df_raw and lets unidentified rows slip through the filter.
    wanted_keys = df_filter[column2].dropna().astype(str)

    # The string casts live only in temporaries, so the caller's frames keep
    # their original dtypes (no hidden state change between notebook cells).
    keep = raw_keys.notna() & raw_keys.astype(str).isin(wanted_keys)

    # .copy() detaches the result from df_raw, so the assignment below (and any
    # later edit by the caller) cannot touch or warn about the source frame.
    df_filtered = df_raw.loc[keep].copy()
    df_filtered[column1] = df_filtered[column1].astype(str)

    # The index is written as the first CSV column (pandas default).
    df_filtered.to_csv(f'../data/interim/filtered_{name}_dataset.csv')

    return df_filtered

## Extraction

In [5]:
# Full dataset that will be filtered; rows are keyed by the "BioSample" column.
raw = pd.read_csv(filepath_or_buffer=r'../data/raw/microbigge.csv')

# Interim AST dataset whose "accession" column provides the keys to keep.
ast_dataset = pd.read_csv(filepath_or_buffer=r'../data/interim/ast_dataset_abg.csv')

## Visualization

In [6]:
# Preview the first five rows of the raw dataset.
raw.head(n=5)

Unnamed: 0,#Scientific name,Protein,BioSample,Isolate,Contig,Start,Stop,Strand,Element symbol,Element name,Type,Scope,Subtype,Class,Subclass,Method,% Coverage of reference,% Identity to reference
0,Salmonella enterica,HAC6049176.1,SAMN02147121,PDT000000005.4,DAAMCG010000001.1,236896,237147,-,asr,acid resistance repetitive basic protein Asr,STRESS,plus,ACID,,,HMM,100.0,55.88
1,Salmonella enterica,HAC6044684.1,SAMN02147122,PDT000000006.4,DAAMCH010000001.1,578563,579027,-,golS,Au(I) sensor transcriptional regulator GolS,STRESS,plus,METAL,GOLD,GOLD,EXACTP,100.0,100.0
2,Salmonella enterica,HAC6076697.1,SAMN02147123,PDT000000007.4,DAAMCI010000001.1,13878,14993,+,iroB,salmochelin biosynthesis C-glycosyltransferase...,VIRULENCE,plus,VIRULENCE,,,BLASTP,100.0,86.79
3,Listeria monocytogenes,MOA88727.1,SAMN02265470,PDT000000011.3,SATG01000001.1,383828,385654,-,inlK,class 1 internalin InlK,VIRULENCE,plus,VIRULENCE,,,BLASTP,100.0,90.62
4,Listeria monocytogenes,MOA97367.1,SAMN02265455,PDT000000012.3,SATD01000001.1,37892,39721,+,inl-lmo0514,lmo0514 family class 1 internalin Inl-lmo0514,VIRULENCE,plus,VIRULENCE,,,BLASTP,100.0,94.14


In [7]:
# Preview the first five rows of the dataset used as the filter.
ast_dataset.head(n=5)

Unnamed: 0,uid,title,accession,date,publicationdate,modificationdate,organization,taxonomy,organism,sourcesample,sampledata,identifiers,infraspecies,package,sortkey,BioSample ID
0,46923997,Pathogen: clinical or host-associated sample f...,SAMN46923997,2025/02/20,2025/02/20,2025/02/20,Rhode Island Department of Health State Health...,485,Neisseria gonorrhoeae,BioSample:SAMN46923997,"<BioSample access=""public"" publication_date=""2...",BioSample: SAMN46923997; Sample name: RISHL25N...,isolate: RISHL25NGS004,Pathogen: clinical or host-associated; version...,20250220,46923997
1,46841726,Pathogen: clinical or host-associated sample f...,SAMN46841726,2025/02/14,2025/02/14,2025/02/14,Instituto de Pesquisa Pele Pequeno Principe,573,Klebsiella pneumoniae,BioSample:SAMN46841726,"<BioSample access=""public"" publication_date=""2...",BioSample: SAMN46841726; Sample name: Kp181,isolate: K181 winter,Pathogen: clinical or host-associated; version...,20250214,46841726
2,46841725,Pathogen: clinical or host-associated sample f...,SAMN46841725,2025/02/14,2025/02/14,2025/02/14,Instituto de Pesquisa Pele Pequeno Principe,573,Klebsiella pneumoniae,BioSample:SAMN46841725,"<BioSample access=""public"" publication_date=""2...",BioSample: SAMN46841725; Sample name: K174,isolate: K174 autumn,Pathogen: clinical or host-associated; version...,20250214,46841725
3,46841724,Pathogen: clinical or host-associated sample f...,SAMN46841724,2025/02/14,2025/02/14,2025/02/14,Instituto de Pesquisa Pele Pequeno Principe,573,Klebsiella pneumoniae,BioSample:SAMN46841724,"<BioSample access=""public"" publication_date=""2...",BioSample: SAMN46841724; Sample name: K159,isolate: K159 spring,Pathogen: clinical or host-associated; version...,20250214,46841724
4,46841723,Pathogen: clinical or host-associated sample f...,SAMN46841723,2025/02/14,2025/02/14,2025/02/14,Instituto de Pesquisa Pele Pequeno Principe,573,Klebsiella pneumoniae,BioSample:SAMN46841723,"<BioSample access=""public"" publication_date=""2...",BioSample: SAMN46841723; Sample name: K158,isolate: K158 spring,Pathogen: clinical or host-associated; version...,20250214,46841723


## Filter

In [None]:
# Keep only the raw rows whose BioSample appears among the AST accessions;
# data_filter also saves the result as filtered_genetic_dataset.csv.
filtered_df = data_filter(
    df_raw=raw,
    column1="BioSample",
    df_filter=ast_dataset,
    column2="accession",
    name="genetic",
)
filtered_df.head(n=5)

Unnamed: 0,#Scientific name,Protein,BioSample,Isolate,Contig,Start,Stop,Strand,Element symbol,Element name,Type,Scope,Subtype,Class,Subclass,Method,% Coverage of reference,% Identity to reference
2729,"Salmonella enterica subsp. enterica serovar 4,...",ECH9088530.1,SAMN02640777,PDT000003687.3,AAITMG010000001.1,536274,539927,-,iroC,salmochelin/enterobactin export ABC transporte...,VIRULENCE,plus,VIRULENCE,,,BLASTP,98.85,80.0
2730,Salmonella enterica subsp. enterica serovar Ke...,EBM9789732.1,SAMN02640778,PDT000003688.4,AAGEIK010000001.1,306785,307147,-,arsD,arsenite efflux transporter metallochaperone ArsD,STRESS,plus,METAL,ARSENIC,ARSENITE,HMM,99.17,57.5
2731,Salmonella enterica subsp. enterica serovar Ke...,EBO3121721.1,SAMN02640780,PDT000003689.4,AAGICK010000001.1,306785,307147,-,arsD,arsenite efflux transporter metallochaperone ArsD,STRESS,plus,METAL,ARSENIC,ARSENITE,HMM,99.17,57.5
2732,Salmonella enterica subsp. enterica serovar Hadar,EBV6716087.1,SAMN02640788,PDT000003690.3,AAHFZG010000001.1,348371,349057,-,gtgA,type III secretion system effector protease GtgA,VIRULENCE,plus,VIRULENCE,,,EXACTP,100.0,100.0
2733,Salmonella enterica subsp. enterica,ECE0280831.1,SAMN02640789,PDT000003691.3,AAIGQB010000006.1,15091,16206,+,iroB,salmochelin biosynthesis C-glycosyltransferase...,VIRULENCE,plus,VIRULENCE,,,BLASTP,100.0,86.52
