### Beebiome curation script
Using the new file with all non Apoidea genera (generaFilter)

Two text files were downloaded from NCBI: one listing all genera and one listing the Apoidea genera only. The entries in the Apoidea genera file were then subtracted from the all-genera file. The resulting file is used to automatically identify records in the XMLs that are non-Apoidea and set their flag to "autoNoG". This reduces the number of rows to check and simplifies the next curation steps by focusing only on records that might have an Apoidea host.

The curation works in 2 steps: 

    - drop1 converts all biosample XML files to CSV, then deletes all records with hosts in non-Apoidea genera and saves the result as *Drop.csv;
    
    - drop2 checks whether the host is on the white list and updates the taxid, saves the result to *Drop2.csv, and then combines all the result files into one file.
    
LM, 14/MAR/2022

In [1]:
#init
import sys, os, re, datetime
import xmltodict, pathlib, glob
import pandas as pd

# Data directory that holds the downloaded biosample XML files
dnIn = os.path.normpath('\\Apoidea2Mar22\\')
print(f'Start time: {datetime.datetime.now()}')
print(f"-Apoidea folder: {dnIn}")

# Working directory (the curation lists below are read from here),
# also split into its parent path and folder name
dnWork = os.getcwd()
dn, dName = os.path.split(dnWork)

# Output folder for the generated CSV files; keeps its trailing separator
# because file names are appended to it by plain string concatenation
dnOut = dnIn + '\\biosample_XMLtoCSVs\\'
if not os.path.exists(dnOut):
    os.makedirs(dnOut)

# White list of hosts (host -> taxid); host names are lower-cased for matching
whiteL = pd.read_csv(dnWork + '\\host_white_list.csv').assign(
    host=lambda table: table["host"].str.lower()
)

# Genera that are not Apoidea (file has no header row)
nonApoideaGenera = pd.read_csv(dnWork + '\\genera_not_apoidea.txt', header=None)

# Records already present in the beta data set
beta_data = pd.read_csv(dnWork + '\\in-beta_9Mar22.csv')

# Black list: every entry is case-folded and wrapped in its own group, and the
# groups are joined into a single alternation pattern "(a)|(b)|...".
# Entries are inserted verbatim, so regex metacharacters in them stay active.
auto_no = pd.read_csv(dnWork + '\\auto-no.csv')
auto_no = "(" + ")|(".join(
    str(entry).casefold() for entry in auto_no['autoNo if host in'].tolist()
) + ")"

useCSV = 1

def dfInit(df):
    """Prepend the six curation flag columns to *df* and return it.

    The columns ``host``, ``taxid``, ``in_beta``, ``load``, ``has_sra`` and
    ``has_proj`` (in that left-to-right order) are placed in front of the
    existing columns, each filled with the empty string. A column that
    already carries one of these names is reset to empty strings and moved
    to the front.

    The frame is modified in place; the same object is returned.
    """
    # Each insert goes to position 0, so iterate in reverse of the final order.
    for name in ('has_proj', 'has_sra', 'load', 'in_beta', 'taxid', 'host'):
        if name in df.columns:
            df.pop(name)
        # A scalar is broadcast to every row regardless of the frame's index;
        # assigning a default-indexed Series would leave NaN on any frame
        # whose index is not 0..n-1.
        df.insert(0, name, '')
    return df

def getHost(listItems):
    """Return the host name found in a BioSample attribute collection.

    listItems is either a single attribute mapping or a list/tuple of them.
    The first mapping whose ``'@attribute_name'`` equals ``'host'`` supplies
    the result through its ``'#text'`` entry.

    Returns the string ``'None'`` (not the ``None`` object) when no host
    attribute is present or when the host attribute carries no text.
    """
    # A lone attribute arrives as a bare mapping; wrap it so both shapes
    # share one code path. (The former test `type(x) != 'list'` compared a
    # type with a string and was therefore always true.)
    items = listItems if isinstance(listItems, (list, tuple)) else [listItems]
    for item in items:
        # Entries that are not mappings cannot name a host; skip them.
        if isinstance(item, dict) and item.get('@attribute_name') == 'host':
            return item.get('#text', 'None')
    return 'None'

def getProj(listItems):
    """Return the BioProject label found in a BioSample link collection.

    listItems is either a single link mapping or a list/tuple of them.
    The first mapping whose ``'@target'`` equals ``'bioproject'`` supplies
    the result through its ``'@label'`` entry.

    Returns the string ``'None'`` (not the ``None`` object) when no
    bioproject link is present or when the link carries no label.
    """
    # A lone link arrives as a bare mapping; wrap it so both shapes share
    # one code path. (The former test `type(x) != 'list'` compared a type
    # with a string and was therefore always true.)
    items = listItems if isinstance(listItems, (list, tuple)) else [listItems]
    for item in items:
        # Entries that are not mappings cannot describe a link; skip them.
        if isinstance(item, dict) and item.get('@target') == 'bioproject':
            return item.get('@label', 'None')
    return 'None'

def getSRA(listItems):
    """Return the SRA accession found in a BioSample id collection.

    listItems is either a single id mapping or a list/tuple of them.
    The first mapping whose ``'@db'`` equals ``'SRA'`` supplies the result
    through its ``'#text'`` entry.

    Returns the string ``'None'`` (not the ``None`` object) when no SRA id
    is present or when the SRA id carries no text.
    """
    # A lone id arrives as a bare mapping; wrap it so both shapes share one
    # code path. (The former test `type(x) != 'list'` compared a type with
    # a string and was therefore always true.)
    items = listItems if isinstance(listItems, (list, tuple)) else [listItems]
    for item in items:
        # Entries that are not mappings cannot describe an id; skip them.
        if isinstance(item, dict) and item.get('@db') == 'SRA':
            return item.get('#text', 'None')
    return 'None'

Start time: 2022-03-14 17:35:38.632920
-Apoidea folder: D:\arh2021\_poGc21\bPortal\Apoidea2Mar22


In [10]:
# Step Drop1, save XML files to CSV
#
# For every non-empty 'Apoidea_biosample.9*' XML file in dnIn:
#   1. parse the XML and build one data-frame row per BioSample;
#   2. compute the six flag columns (host, taxid, in_beta, load, has_sra,
#      has_proj) for every row;
#   3. write all rows to '<name>L.csv';
#   4. drop every row whose load flag is not 'Load' and write the rest to
#      '<name>Drop.csv'.

# Loop-invariant lookups, built once instead of once per record.
non_apoidea_names = set(nonApoideaGenera.values.ravel().tolist())
beta_accessions = set(beta_data.loc[:, "BioSample acc"].values)
auto_no_re = re.compile(auto_no)
flag_columns = ['host', 'taxid', 'in_beta', 'load', 'has_sra', 'has_proj']

listF = list(pathlib.Path(dnIn).glob('*.xml'))
for f_name in listF:
    dn0, fn0 = os.path.split(f_name)
    if not (fn0.startswith('Apoidea_biosample.9') and os.path.getsize(f_name) > 0):
        continue
    fnOut = fn0.replace('.', '_')

    # Read raw bytes so the XML parser applies the encoding declared in the
    # document instead of the platform's locale encoding; the context
    # manager guarantees the handle is closed.
    with open(f_name, 'rb') as xml_file:
        dsdocs = xmltodict.parse(xml_file.read())  # convert xml to python dict

    samples = dsdocs['BioSampleSet']['BioSample']
    if not isinstance(samples, list):
        # A file with exactly one BioSample yields a mapping, not a list.
        samples = [samples]
    df = pd.DataFrame(samples)
    s0 = len(df.index)
    print('--Processing: ' + fn0 + '  biosamples:' + str(s0))
    dfIndDrop = []
    df = dfInit(df)  # add 6 columns with flags

    # Flags are collected here and assigned to the frame after the loop:
    # writing into the Series yielded by iterrows() is not guaranteed to
    # reach the frame.
    records = []
    for index, row in df.iterrows():
        strSkip = ''
        load = ''
        taxid = ''
        in_beta = ''

        host = getHost(row['Attributes']['Attribute'])
        if not isinstance(host, str):
            # Report the oddity, then continue with its text form so the
            # string operations below cannot fail.
            print(index, host)
            host = str(host)
        host_folded = host.casefold()
        words = host.split()
        value = words[0] if words else ''  # first word of the host name

        if value in non_apoidea_names or len(host) < 3 or host.lower() == 'n/a':
            # first word is a non-Apoidea genus (or host is unusable): drop
            load = 'autoNoG'
            taxid = 'NotApoidea'
            in_beta = 'inBetaN/A'
            strSkip = 'In nonApoideaGenera, skip'

        if auto_no_re.match(host_folded):
            # host starts with an entry of autoNo (black list): drop
            # NOTE(review): in_beta can only be '' or 'inBetaN/A' at this
            # point, so this warning never fires; the beta check runs below.
            if in_beta == 'inBeta':
                print('Warning!!!')
            load = 'autoNoSkip'
            taxid = 'NotApoidea'
            in_beta = 'inBetaN/A'
            strSkip = 'On black list, skip'

        has_sra = getSRA(row['Ids']['Id'])
        try:
            has_proj = getProj(row['Links']['Link'])
        except (KeyError, TypeError):
            # no usable Links entry for this record
            has_proj = 'None'

        if row['@accession'] in beta_accessions:
            # already in beta: skip (overrides the flags set above)
            in_beta = 'inBetaYes'
            load = 'inBeta'
            strSkip = 'In beta, skip'
        elif load == '':
            in_beta = 'inBetaNo'
            load = 'Load'
            taxid = 'Load2'
            strSkip = 'Load2'
            if has_sra == 'None':
                load = 'autoNoSRA'
                strSkip = 'autoNoSRA, skip'

        # set taxid if the host is in the host_white_list (first match wins)
        white_taxids = whiteL.loc[whiteL['host'] == host_folded, 'taxid'].tolist()
        if white_taxids:
            taxid = str(white_taxids[0])

        if load != 'Load':
            dfIndDrop.append(index)

        records.append((host, taxid, in_beta, load, has_sra, has_proj))
        print(fn0 + ' ==> ' + str(index + 1) + '/' + str(s0) + '    ', host, in_beta, load, has_sra, has_proj, strSkip)

    # Write the collected flags into the frame, one column at a time.
    for column, values in zip(flag_columns, zip(*records)):
        df[column] = list(values)

    df.to_csv(dnOut + fnOut + 'L.csv', encoding='utf-8', index=False)
    df.drop(dfIndDrop, inplace=True)
    df.to_csv(dnOut + fnOut + 'Drop.csv', encoding='utf-8', index=False)
print('Drop1 done.')

--Processing: Apoidea_biosample.9.xml  biosamples:161
Apoidea_biosample.9.xml ==> 1/161     Oryza sativa cv. BPT5209 inBetaN/A autoNoSkip SRS11686923 PRJNA797556 On black list, skip
Apoidea_biosample.9.xml ==> 2/161     Oryza sativa cv. BPT5208 inBetaN/A autoNoSkip SRS11686922 PRJNA797556 On black list, skip
Apoidea_biosample.9.xml ==> 3/161     Oryza sativa cv. BPT5207 inBetaN/A autoNoSkip SRS11686921 PRJNA797556 On black list, skip
Apoidea_biosample.9.xml ==> 4/161     Oryza sativa cv. BPT5206 inBetaN/A autoNoSkip SRS11686920 PRJNA797556 On black list, skip
Apoidea_biosample.9.xml ==> 5/161     Oryza sativa cv. BPT5205 inBetaN/A autoNoSkip SRS11686919 PRJNA797556 On black list, skip
Apoidea_biosample.9.xml ==> 6/161     Oryza sativa cv. BPT5204 inBetaN/A autoNoSkip SRS11686918 PRJNA797556 On black list, skip
Apoidea_biosample.9.xml ==> 7/161     Hymenoptera: Apidae inBetaN/A autoNoSkip SRS9464940 PRJNA744643 On black list, skip
Apoidea_biosample.9.xml ==> 8/161     Meliponula sp. inB

In [11]:
# Step Drop2, run on *Drop.csv files and get all in one csv
#
# For every 'Apoidea_biosample_*Drop.csv' file in dnOut, each row whose
# taxid is 'Load3' is re-checked:
#   - host found in the white list -> taxid set from the list, load 'Load56';
#   - otherwise, host matches the black list -> flagged 'autoNoSkip'.
# The result is written next to the input as '<input name>Drop2.csv', and
# finally all '*Drop2.csv' files are concatenated into one CSV.

auto_no_re = re.compile(auto_no)  # compiled once, used for every row

listF = list(pathlib.Path(dnOut).glob('*Drop.csv'))
for f_name in listF:
    dn0, fn0 = os.path.split(f_name)
    if not fn0.startswith('Apoidea_biosample_'):
        continue
    fn = str(f_name)
    df = pd.read_csv(fn)
    s0 = len(df.index)
    print('--', fn, str(s0))

    for index, row in df.iterrows():
        str1 = row['host']
        if not isinstance(str1, str):
            # e.g. NaN produced by read_csv for an empty/NA host cell; the
            # string methods below would fail on it.
            print('Not string host:', str1)  # error host
            continue
        if row['taxid'] != 'Load3':
            continue

        # Reset per row so the progress line never shows a stale or unbound value.
        s1 = ''
        strSkip = ''

        # if host is in the updated white list, update taxid (first match wins)
        white_taxids = whiteL.loc[whiteL['host'] == str1.lower(), 'taxid'].tolist()
        if white_taxids:
            s1 = str(white_taxids[0])
            df.at[index, 'taxid'] = s1
            df.at[index, 'load'] = 'Load56'
            strSkip = 'Add host, taxid found'
        elif auto_no_re.match(str1.casefold()):
            # host starts with an entry of autoNo (black list): drop
            # NOTE(review): 'inBeta' is a value written to the 'load' column
            # by the Drop1 step, not to 'in_beta' — confirm the intended check.
            if row['in_beta'] == 'inBeta':
                print('Warning!!!')
            df.at[index, 'taxid'] = 'NotApoidea'
            df.at[index, 'load'] = 'autoNoSkip'
            df.at[index, 'in_beta'] = 'inBetaN/A'
            strSkip = 'On black list, skip'

        # Report the values as they now stand in the frame.
        print(str(index + 1) + '/' + str(s0) + '    ', str1, s1, df.at[index, 'in_beta'], df.at[index, 'taxid'], df.at[index, 'load'], row['has_sra'], row['has_proj'], strSkip)

    df.to_csv(fn + 'Drop2.csv', encoding='utf-8', index=False)

# combine all files in one file
listCSVs = sorted(glob.glob(dnOut + '*Drop2.csv'))
if listCSVs:
    combined_csv = pd.concat([pd.read_csv(f) for f in listCSVs])
    combined_csv.to_csv(dnOut + "combinedAll_missed.csv", index=False, encoding='utf-8-sig')
else:
    # pd.concat raises on an empty list; report instead of crashing.
    print('No *Drop2.csv files found in ' + dnOut)

-- D:\arh2021\_poGc21\bPortal\Apoidea2Mar22\biosample_XMLtoCSVs\Apoidea_biosample_10_xmlDrop.csv 57
-- D:\arh2021\_poGc21\bPortal\Apoidea2Mar22\biosample_XMLtoCSVs\Apoidea_biosample_11_xmlDrop.csv 0
-- D:\arh2021\_poGc21\bPortal\Apoidea2Mar22\biosample_XMLtoCSVs\Apoidea_biosample_12_xmlDrop.csv 403
-- D:\arh2021\_poGc21\bPortal\Apoidea2Mar22\biosample_XMLtoCSVs\Apoidea_biosample_13_xmlDrop.csv 20
-- D:\arh2021\_poGc21\bPortal\Apoidea2Mar22\biosample_XMLtoCSVs\Apoidea_biosample_14_xmlDrop.csv 27
-- D:\arh2021\_poGc21\bPortal\Apoidea2Mar22\biosample_XMLtoCSVs\Apoidea_biosample_15_xmlDrop.csv 2
-- D:\arh2021\_poGc21\bPortal\Apoidea2Mar22\biosample_XMLtoCSVs\Apoidea_biosample_16_xmlDrop.csv 295
-- D:\arh2021\_poGc21\bPortal\Apoidea2Mar22\biosample_XMLtoCSVs\Apoidea_biosample_17_xmlDrop.csv 806
-- D:\arh2021\_poGc21\bPortal\Apoidea2Mar22\biosample_XMLtoCSVs\Apoidea_biosample_18_xmlDrop.csv 12
-- D:\arh2021\_poGc21\bPortal\Apoidea2Mar22\biosample_XMLtoCSVs\Apoidea_biosample_19_xmlDrop.csv 23