In [1]:
import re
import numpy as np
import pandas as pd
import requests
import bs4
from bs4 import BeautifulSoup 

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1500)
pd.set_option('max_colwidth', 100)

In [4]:
# Create the DataFrame that will store everything in the end:
# one row per scraped species, one column per extracted attribute.
datasetColumns = [
    # Names
    'scientificName', 'commonName', 'classification', 'etymology',
    # Environment and distribution
    'environment', 'depthRange', 'usualRange',
    'distributionRangeName', 'distributionRangeCoordinates', 'distributionDescription',
    # Size, weight, age
    'sizeRange', 'maxLength', 'commonLength', 'maxWeight', 'maxAge',
    # Free-text sections
    'biologyInfo', 'lifeAndMateInfo', 'redListStatus',
    # Values parsed out of the short description
    'dorsalSpines', 'dorsalSoftRays', 'analSpines', 'analSoftRays', 'vertebrae',
    'shortDescription', 'threatToHumans',
    # "Estimates of some properties" section
    'preferredTemperatureMin', 'preferredTemperatureMean', 'preferredTemperatureMax',
    'cellsBasedOn', 'phylogeneticDiversity',
    'bayesianLengthWeightA', 'bayesianLengthWeightB',
    'resilience', 'priorR', 'sdRange', 'vulnerability', 'priceCategory',
    # Which website the row came from
    'source',
]
dataset = pd.DataFrame(columns=datasetColumns)

speciesList = pd.read_csv('../speciesList.csv') # Catalog of Mediterranean Species
for i, row in speciesList.iterrows():
    # The summary URLs use the species name with spaces replaced by dashes
    species = row['Species'].replace(' ', '-')

    # Collect the summary page from both sites.
    # The explicit timeout (in seconds) matters: requests applies no timeout by default,
    # so a single stalled connection would otherwise block the whole scrape indefinitely.
    # A timeout surfaces as requests.exceptions.Timeout, like any other connection error here.
    page_de = requests.get('https://www.fishbase.de/summary/'+species+'.html', timeout=30)
    page_ca = requests.get('https://www.sealifebase.ca/summary/'+species+'.html', timeout=30)

    # Create a BeautifulSoup object for each page
    soup_de = BeautifulSoup(page_de.text, 'html.parser')
    soup_ca = BeautifulSoup(page_ca.text, 'html.parser')

    # A page counts as a hit when it does NOT contain the site's "not in the public version" sentence.
    # FishBase is checked first, so it wins when both sites know the species.
    if ('Species name is not in the public version of FishBase.' not in str(soup_de)):
        print('Using www.fishbase.de for\t:',row['Species'])
        website = 'de'
        soup = soup_de
    elif ('Species name is not in the public version of SeaLifeBase.' not in str(soup_ca)):
        print('Using www.sealifebase.ca for\t:',row['Species'])
        website = 'ca'
        soup = soup_ca
    else:
        # Neither site has the species: `soup` is left untouched and no parsing branch below runs.
        print(row['Species'], 'species not existing in any of www.fishbase.de and www.sealifebase.ca')
        website = 'non'
    
    if (website=='de'):  

        # Start every output value at NaN, so a value the parsing below never sets is stored as missing.
        # Names and environment
        commonName = classification = etymology = environment = np.nan
        depthRange = usualRange = np.nan
        distributionRangeName = distributionRangeCoordinates = distributionDescription = np.nan
        # Size, free-text sections, red list
        sizeRange = maxLength = commonLength = maxWeight = maxAge = np.nan
        biologyInfo = lifeAndMateInfo = redListStatus = np.nan
        # Short-description values and preferred temperature
        dorsalSpines = dorsalSoftRays = analSpines = analSoftRays = vertebrae = np.nan
        shortDescription = threatToHumans = np.nan
        preferredTemperatureMin = preferredTemperatureMean = preferredTemperatureMax = cellsBasedOn = np.nan
        # Remaining estimated properties
        phylogeneticDiversity = bayesianLengthWeightA = bayesianLengthWeightB = np.nan
        resilience = priorR = sdRange = vulnerability = priceCategory = np.nan


        # Scientific name: first child of every <a> inside the element with id 'ss-sciname',
        # joined with single spaces (the name is spread over one or more <a> tags).
        divScientificName = soup.find(id='ss-sciname')
        divScientificNameUrls = divScientificName.find_all('a')
        scientificName = " ".join(link.contents[0] for link in divScientificNameUrls).rstrip()

        # Common name: first child of the 'sheader2' element with new lines and tabs removed;
        # an empty result is stored as NaN.
        commonName = re.sub(r"[\n\t\r]*", "", soup.find(class_='sheader2').contents[0])
        commonName = np.nan if commonName == '' else commonName



        # Get fields and headers to use later.
        # Every <h1> inside #ss-main is a section header; the element of class 'smallSpace' found at the
        # same position is taken as that section's content.
        # NOTE(review): fields[i] assumes the two lists line up one-to-one — confirm against the page markup.
        divMain = soup.find(id='ss-main')
        fields = divMain.find_all(class_='smallSpace') # Here is all the content we need
        fieldHeaders = divMain.find_all('h1') # Store the headers because not all of them are always present

        # False is the "section not present" sentinel that the parsing blocks further down test with `!= False`.
        classificationField, environmentField, distributionField, metricsField, shortDescriptionField, biologyField, lifeAndMateField, redListStatusField, threatToHumansField, humanUsesField, estimatesPropertiesField = False, False, False, False, False, False, False, False, False, False, False   
        # NOTE(review): `i` is also the row index of the enclosing speciesList loop and is overwritten here.
        for i, header in enumerate(fieldHeaders):
            fieldHeader = header.get_text()
            fieldHeader = re.sub(r"[\n\t\r]*", "", fieldHeader)

            # Each header has always the strings specified in the if statements. If such a string does not exist, then the header and the content are missing
            # The checks are independent `if`s (not `elif`), so one header may fill more than one variable.
            # Classification and Estimates keep the whole element; every other section keeps the
            # child list (`.contents`) of the element's first <span>.
            if ('Classification / Names' in fieldHeader):
                classificationField = fields[i]
            if ('Environment: milieu' in fieldHeader):
                environmentField = fields[i]
                environmentField = environmentField.find('span').contents
            if ('Distribution' in fieldHeader):
                distributionField = fields[i]
                distributionField = distributionField.find('span').contents
            if ('Size' in fieldHeader):
                metricsField = fields[i]
                metricsField = metricsField.find('span').contents
            if ('Short description' in fieldHeader):
                shortDescriptionField = fields[i]
                shortDescriptionField = shortDescriptionField.find('span').contents
            if ('Biology' in fieldHeader):
                biologyField = fields[i]   
                biologyField = biologyField.find('span').contents
            if ('Life cycle' in fieldHeader):
                lifeAndMateField = fields[i]   
                lifeAndMateField = lifeAndMateField.find('span').contents
            if ('Red List' in fieldHeader):
                redListStatusField = fields[i]
                redListStatusField = redListStatusField.find('span').contents
            if ('Threat to humans' in fieldHeader):
                threatToHumansField = fields[i]  
                threatToHumansField = threatToHumansField.find('span').contents
            if ('Human uses' in fieldHeader):
                humanUsesField = fields[i]   
                humanUsesField = humanUsesField.find('span').contents
            if ('Estimates of' in fieldHeader):
                estimatesPropertiesField = fields[i]

        # Classification and Etymology
        # The field's children are a mix of plain text nodes and tags. Text nodes are used as they are,
        # tags contribute their get_text() only when they contain an <a>, everything else is skipped.
        if (classificationField != False): # If there is no field all variables are nan.
            classification = ''
            etymology = ''
            for field in classificationField.contents:
                if ('Etymology' in str(field)):
                    text = re.sub(r"[\n\t\r]*", "", field.get_text())
                    # partition() instead of split(...)[1]: when the word is present but the exact
                    # "Etymology: " separator is not, keep going instead of raising IndexError.
                    before, marker, tail = text.partition("Etymology: ")
                    if marker:
                        etymology = tail
                    continue
                if isinstance(field, bs4.element.NavigableString): # Plain text node (no tags inside).
                    classification += re.sub(r"[\n\t\r]*", "", field)+" "
                elif field.find('a') is not None: # Tag that contains at least one <a>.
                    classification += re.sub(r"[\n\t\r]*", "", field.get_text())+" "
                # Any other tag (like <br/>) contributes nothing.

            if (classification==''):
                classification = np.nan
            else:
                classification = classification.rstrip()
            if (etymology==''):
                etymology = np.nan
            else:
                etymology = re.sub(r'[^\x00-\x7f]',r'',etymology.rstrip()) # Drop non-ASCII characters
                # Raw string: '\(' and '\d' are invalid escapes in a normal literal (SyntaxWarning on Python 3.12+).
                etymology = re.sub(r'(\(Ref.)+ \d*\)', '', etymology) # Remove strings like (Ref. 1234)
        else:
            classification = np.nan
            etymology = np.nan



        # Environment, Climate Zone, Depth, Distribution Coordinates
        # Builds one ASCII-only string (environmentInfo) from the section's nodes and then pulls the
        # individual values out of it with regular expressions and split().
        # NOTE(review): environmentField is the span's own `.contents` list and extract() detaches each
        # node from that same list while it is being iterated, so this loop presumably visits only every
        # other child (the text nodes) and skips the <a> reference links in between — which would explain
        # the leftover '(Ref. )' that is removed afterwards. Confirm before restructuring this loop.
        if (environmentField != False):
            environmentInfo = ''
            for field in environmentField:
                field = field.extract()
                field = re.sub(r"[\n\t\r]*", "", field)
                environmentInfo += field
            environmentInfo = re.sub(r'[^\x00-\x7f]',r'',environmentInfo.rstrip())
            environmentInfo = environmentInfo.replace('(Ref. )', '')

            # Get environment values: the ';'-separated parts before 'depth range' that contain no ':'
            try:
                beforeDepthRange = re.search('(.+?)depth range', environmentInfo).group(1).strip() # Match whatever is before the specific string
                environment = ''
                for value in beforeDepthRange.split(';'):
                    if (':' not in value):
                        environment += value+","
                environment = re.sub('( ){2,}', ' ', environment) # Make 2 or more spaces 1.
                environment = environment.rstrip(',')
            except AttributeError: # re.search returned None: no 'depth range' in the text
                environment = np.nan 

            # Look for depth range in environmentInfo.
            try:
                depthRange = re.search('depth range(.+?)m', environmentInfo).group(1).strip() # Match everything between 2 specific strings.
            except AttributeError:
                depthRange = np.nan 

            # Look for usual range in environmentInfo.
            try:
                usualRange = re.search('usually(.+?)m', environmentInfo).group(1).strip()
            except AttributeError:
                usualRange = np.nan     

            # Look for distribution range in environmentInfo: everything after the first "m . "
            # (or, failing that, the first "m. "). split(...)[1] raises IndexError when the separator is absent.
            try:
                distributionRange = environmentInfo.split("m . ",1)[1]
                distributionRangeName = ''
                for value in distributionRange.split(';'): # Split in case many locations exist.

                    # Parts without any digit are appended to the name; a part with digits becomes the
                    # coordinates (a later one overwrites an earlier one).
                    if (bool(re.search(r'\d', value)) is False):
                        distributionRangeName += value
                    else:
                        distributionRangeCoordinates = value.strip()
            except IndexError:
                try:
                    distributionRange = environmentInfo.split("m. ",1)[1] # Split and get 2nd part.
                    distributionRangeName = ''
                    for value in distributionRange.split(';'): # Split in case many locations exist.

                        if (bool(re.search(r'\d', value)) is False):
                            distributionRangeName += value
                        else:
                            distributionRangeCoordinates = value.strip()
                except IndexError:             
                    distributionRangeName = np.nan
                    distributionRangeCoordinates = np.nan   

        else:
            # Section missing from the page: every value derived from it is NaN
            environment = np.nan
            depthRange = np.nan
            usualRange = np.nan
            distributionRangeName = np.nan
            distributionRangeCoordinates = np.nan


        # Distribution Description
        # Concatenates the section's nodes into one ASCII-only string with '(Ref. )' leftovers removed.
        # NOTE(review): extract() detaches each node from the very list being iterated
        # (distributionField is the span's `.contents`), so presumably only every other child is visited
        # and the <a> reference links are skipped — confirm before restructuring this loop.
        if (distributionField != False):    
            distributionDescriptionInfo = ''
            for field in distributionField:
                field = field.extract()
                field = re.sub(r"[\n\t\r]*", "", field)
                distributionDescriptionInfo += field
            distributionDescriptionInfo = re.sub(r'[^\x00-\x7f]',r'',distributionDescriptionInfo.rstrip())
            distributionDescriptionInfo = distributionDescriptionInfo.replace('(Ref. )', '')

            distributionDescription = distributionDescriptionInfo

        else:
            # Section missing from the page
            distributionDescription = np.nan


        # Length, Weight, Age
        if (metricsField != False):
            # Concatenate the nodes of the section into one string. The statement order inside the loop
            # is kept on purpose: extract() changes the list that is being iterated.
            metricsInfo = ''
            for field in metricsField:
                metricsInfo += re.sub(r"[\n\t\r]*", "", field.extract())
            metricsInfo = re.sub(r'[^\x00-\x7f]',r'',metricsInfo.rstrip()).replace('(Ref. )', '')

            # Each value is the text captured between a label and its unit; NaN when the pattern is absent.
            found = re.search('range(.+?)cm', metricsInfo)
            sizeRange = np.nan if found is None else found.group(1).strip()

            found = re.search('Max length :(.+?)cm', metricsInfo)
            maxLength = np.nan if found is None else found.group(1).strip().replace(',','')

            found = re.search('common length :(.+?)cm', metricsInfo)
            commonLength = np.nan if found is None else found.group(1).strip().replace(',','')

            found = re.search('max. published weight:(.+?)kg', metricsInfo)
            maxWeight = np.nan if found is None else found.group(1).strip().replace(',','')

            found = re.search('max. reported age:(.+?)years', metricsInfo)
            maxAge = np.nan if found is None else found.group(1).strip()

        else:
            # Section missing from the page
            sizeRange = maxLength = commonLength = maxWeight = maxAge = np.nan


        # Short Description and other metrics
        if (shortDescriptionField != False):
            # Flatten the section: text nodes as they are, tags through get_text(), anything else skipped.
            # Every piece is followed by a space, which is why the patterns below contain double spaces.
            shortDescriptionInfo = ''
            for field in shortDescriptionField:
                if (type(field) is bs4.element.NavigableString): # plain text node
                    shortDescriptionInfo += re.sub(r"[\n\t\r]*", "", field)+" "
                elif (type(field) is bs4.element.Tag): # tag: use its text
                    shortDescriptionInfo += re.sub(r"[\n\t\r]*", "", field.get_text())+" "
                # neither exact type: nothing to keep
            shortDescriptionInfo = re.sub(r'[^\x00-\x7f]',r'',shortDescriptionInfo.rstrip())
            # Raw strings from here on: '\(' and '\d' are invalid escapes in a normal literal
            # (SyntaxWarning on Python 3.12+). The patterns themselves are unchanged.
            shortDescriptionInfo = re.sub(r'( \(Ref.)  \d* \)', '', shortDescriptionInfo)
            shortDescriptionInfo = re.sub(r'( \(Ref.)  \d* ,  \d* \)', '', shortDescriptionInfo)

            # Counts: text between the label and the next '.' or ';', NaN when the label is absent.
            found = re.search(r'Dorsal  spines \(total\):(.+?)(\.|;)', shortDescriptionInfo)
            dorsalSpines = found.group(1).strip() if found is not None else np.nan

            found = re.search(r'Dorsal  soft rays \(total\):(.+?)(\.|;)', shortDescriptionInfo)
            dorsalSoftRays = found.group(1).strip() if found is not None else np.nan

            found = re.search(r'Anal  spines : (.+?)(\.|;)', shortDescriptionInfo)
            analSpines = found.group(1).strip() if found is not None else np.nan

            found = re.search(r'Anal  soft rays :(.+?)(\.|;)', shortDescriptionInfo)
            analSoftRays = found.group(1).strip() if found is not None else np.nan

            found = re.search(r'Vertebrae :(.+?)(\.|;)', shortDescriptionInfo)
            vertebrae = found.group(1).strip() if found is not None else np.nan

            # Description text: the '.'-separated parts glued back together WITHOUT the dots, dropping
            # the first part when it holds the fin counts.
            # NOTE(review): the inner periods are lost by this join — kept as in the original, confirm intent.
            sentences = shortDescriptionInfo.split('.')
            if (len(sentences)==2 and 'Dorsal  spines' not in sentences[0]):
                shortDescription = sentences[0]
            elif ('Dorsal  spines' not in sentences[0]):
                shortDescription = ''.join(sentences) + '.'
            else:
                shortDescription = ''.join(sentences[1:]) + '.'
            if (shortDescription == ''):
                shortDescription = np.nan
        else:
            # Section missing from the page
            dorsalSpines = np.nan
            dorsalSoftRays = np.nan
            analSpines = np.nan
            analSoftRays = np.nan
            vertebrae = np.nan
            shortDescription = np.nan


        # Biology
        if (biologyField != False):
            # Flatten the section: text nodes as they are, tags through get_text(), anything else skipped.
            pieces = []
            for field in biologyField:
                if (type(field) is bs4.element.NavigableString): # plain text node
                    text = field
                elif (type(field) is bs4.element.Tag): # tag: use its text
                    text = field.get_text()
                else: # neither exact type: nothing to keep
                    continue
                pieces.append(re.sub(r"[\n\t\r]*", "", text)+" ")
            biologyInfo = re.sub(r'[^\x00-\x7f]',r'',''.join(pieces).strip())
            # Remove reference markers with one, two or three numbers.
            # Raw strings: '\(' and '\d' are invalid escapes in a normal literal (SyntaxWarning on Python 3.12+).
            biologyInfo = re.sub(r'( \(Ref.)  \d* \)', '', biologyInfo)
            biologyInfo = re.sub(r'\(Ref.  \d* ,  \d* \)', '', biologyInfo)
            biologyInfo = re.sub(r'\(Ref.  \d* ,  \d* ,  \d* \)', '', biologyInfo)
        else:
            # Section missing from the page
            biologyInfo = np.nan


        # Life cycle and mating behavior
        if (lifeAndMateField != False):
            # Flatten the section: text nodes as they are, tags through get_text(), anything else skipped.
            pieces = []
            for field in lifeAndMateField:
                if (type(field) is bs4.element.NavigableString): # plain text node
                    text = field
                elif (type(field) is bs4.element.Tag): # tag: use its text
                    text = field.get_text()
                else: # neither exact type: nothing to keep
                    continue
                pieces.append(re.sub(r"[\n\t\r]*", "", text)+" ")
            lifeAndMateInfo = re.sub(r'[^\x00-\x7f]',r'',''.join(pieces).strip())
            # Remove reference markers with one, two or three numbers.
            # Raw strings: '\(' and '\d' are invalid escapes in a normal literal (SyntaxWarning on Python 3.12+).
            lifeAndMateInfo = re.sub(r'( \(Ref.)  \d* \)', '', lifeAndMateInfo)
            lifeAndMateInfo = re.sub(r'\(Ref.  \d* ,  \d* \)', '', lifeAndMateInfo)
            lifeAndMateInfo = re.sub(r'\(Ref.  \d* ,  \d* ,  \d* \)', '', lifeAndMateInfo)
            if (lifeAndMateInfo == ''):
                lifeAndMateInfo = np.nan # section present but empty

        else:
            # Section missing from the page
            lifeAndMateInfo = np.nan


        # Red List Status
        if (redListStatusField != False):
            # Flatten the section: text nodes as they are, tags through get_text(), anything else skipped;
            # every piece is followed by one space.
            pieces = []
            for node in redListStatusField:
                if (type(node) is bs4.element.Tag):
                    raw = node.get_text()
                elif (type(node) is bs4.element.NavigableString):
                    raw = node
                else:
                    continue
                pieces.append(re.sub(r"[\n\t\r]*", "", raw)+" ")
            # Trim the ends and drop every non-ASCII character
            redListStatus = re.sub(r'[^\x00-\x7f]',r'',''.join(pieces).strip())
        else:
            # Section missing from the page
            redListStatus = np.nan

        # Threat to humans
        if (threatToHumansField != False):
            # Flatten the section: text nodes as they are, tags through get_text(), anything else skipped.
            pieces = []
            for field in threatToHumansField:
                if (type(field) is bs4.element.NavigableString): # plain text node
                    text = field
                elif (type(field) is bs4.element.Tag): # tag: use its text
                    text = field.get_text()
                else: # neither exact type: nothing to keep
                    continue
                pieces.append(re.sub(r"[\n\t\r]*", "", text)+" ")
            threatToHumans = re.sub(r'[^\x00-\x7f]',r'',''.join(pieces).strip())
            # Remove reference markers, with and without a space before the closing parenthesis.
            # Raw strings: '\(' and '\d' are invalid escapes in a normal literal (SyntaxWarning on Python 3.12+).
            threatToHumans = re.sub(r'\(Ref. \d* \)', '', threatToHumans)
            threatToHumans = re.sub(r'\(Ref. \d*\)', '', threatToHumans)
        else:
            # Section missing from the page
            threatToHumans = np.nan

        # Human Uses
        # NOTE(review): humanUses is computed here but is not among the keys of the record appended
        # to `dataset` for this branch — confirm whether a column is missing.
        if (humanUsesField != False):
            # Flatten the section: text nodes as they are, tags through get_text(), anything else skipped;
            # every piece is followed by one space.
            pieces = []
            for node in humanUsesField:
                if (type(node) is bs4.element.Tag):
                    raw = node.get_text()
                elif (type(node) is bs4.element.NavigableString):
                    raw = node
                else:
                    continue
                pieces.append(re.sub(r"[\n\t\r]*", "", raw)+" ")
            humanUses = ''.join(pieces).strip()
            # Whitespace-only content counts as missing; otherwise drop non-ASCII characters
            humanUses = np.nan if humanUses == '' else re.sub(r'[^\x00-\x7f]',r'',humanUses)
        else:
            # Section missing from the page
            humanUses = np.nan

        # Other properties ("Estimates of ..." section)
        # Every value starts as NaN, whether or not the section exists, and is only overwritten
        # when the matching pattern is found.
        preferredTemperatureMin = preferredTemperatureMean = preferredTemperatureMax = cellsBasedOn = np.nan
        phylogeneticDiversity = bayesianLengthWeightA = bayesianLengthWeightB = np.nan
        tropicLevel = resilience = priorR = sdRange = vulnerability = priceCategory = np.nan
        if (estimatesPropertiesField != False):
            # Flatten the section: text nodes as they are, tags through get_text(), anything else skipped.
            propertiesInfo = ''
            for field in estimatesPropertiesField.contents:
                if (type(field) is bs4.element.NavigableString): # plain text node
                    propertiesInfo += re.sub(r"[\n\t\r]*", "", field)+" "
                elif (type(field) is bs4.element.Tag): # tag: use its text
                    propertiesInfo += re.sub(r"[\n\t\r]*", "", field.get_text())+" "
                # neither exact type: nothing to keep
            propertiesInfo = re.sub(r'[^\x00-\x7f]',r'',propertiesInfo.strip())

            # Create separator for split(): put a '~' in front of every property label
            separators = (
                (r'Preferred temperature', r'~Preferred temperature '),
                (r'Phylogenetic diversity index', r'~Phylogenetic diversity index '),
                (r'Bayesian length-weight', r'~Bayesian length-weight '),
                (r'Trophic Level', r'~Trophic Level '),
                (r'Resilience', r'~Resilience (Ref '),
                (r'Prior r', r'~Prior r '),
                (r'Vulnerability', r'~Vulnerability '),
                (r'Price category', r'~Price category '),
            )
            for label, marked in separators:
                propertiesInfo = re.sub(label, marked, propertiesInfo)
            # Remove reference markers.
            # Raw strings: '\(' and '\d' are invalid escapes in a normal literal (SyntaxWarning on Python 3.12+).
            propertiesInfo = re.sub(r'(\(Ref.)+ \d*\)', '', propertiesInfo)
            propertiesInfo = re.sub(r'(\(Ref.  )+\d*( \))', '', propertiesInfo)
            propertiesInfo = re.sub(r'(\(Ref)( )*', '', propertiesInfo)

            for value in propertiesInfo.split('~'):
                # Preferred temperature: the values are taken in this order and the first missing
                # pattern stops the rest for this segment (values already read are kept).
                try:
                    preferredTemperatureMean = re.search(r'mean(.+?)\(based', value).group(1).strip()
                    preferredTemperatureMax = re.search(r'- (.+?),', value).group(1).strip()
                    if ('Bayesian length' not in value):
                        preferredTemperatureMin = re.search(r': (.+?) -', value).group(1).strip()
                    cellsBasedOn = re.search(r'\(based on (.+?)\ cells\)', value).group(1).strip()
                except AttributeError:
                    pass # re.search returned None: not a (complete) temperature segment

                # Values captured by a single pattern
                found = re.search(r'PD50 =(.+?)\[Uniqueness', value)
                if found is not None:
                    phylogeneticDiversity = found.group(1).strip()
                found = re.search(r'a=(.+?)\(', value)
                if found is not None:
                    bayesianLengthWeightA = found.group(1).strip()
                found = re.search(r'b=(.+?)\(', value)
                if found is not None:
                    bayesianLengthWeightB = found.group(1).strip()
                found = re.search(r'Prior r  =(.+?),', value)
                if found is not None:
                    priorR = found.group(1).strip()
                found = re.search(r'SD range =(.+?),', value)
                if found is not None:
                    sdRange = found.group(1).strip()

                # Values that are "everything after the label". partition() replaces split(...)[1]:
                # a label with different spacing is now ignored instead of raising an uncaught IndexError.
                before, marker, tail = value.partition("Trophic Level   :")
                if marker:
                    tropicLevel = tail.strip()
                before, marker, tail = value.partition("Resilience :")
                if marker:
                    resilience = tail.strip()
                before, marker, tail = value.partition("Vulnerability   :")
                if marker:
                    vulnerability = tail.strip()
                before, marker, tail = value.partition("Price category   :")
                if marker:
                    priceCategory = re.sub(r"\.", "", tail.strip()) # drop the periods

        # Add this species as one new row.
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the row is wrapped in a
        # one-row frame and concatenated; ignore_index=True keeps the 0..n-1 row index as before.
        record = {
                  'scientificName':scientificName,
                  'commonName':commonName,
                  'classification':classification,
                  'etymology':etymology,
                  'environment':environment,
                  'depthRange':depthRange,
                  'usualRange':usualRange,
                  'distributionRangeName':distributionRangeName,
                  'distributionRangeCoordinates':distributionRangeCoordinates,
                  'distributionDescription':distributionDescription,
                  'sizeRange':sizeRange,
                  'maxLength':maxLength,
                  'commonLength':commonLength,
                  'maxWeight':maxWeight,
                  'maxAge':maxAge,
                  'biologyInfo':biologyInfo,
                  'lifeAndMateInfo':lifeAndMateInfo,
                  'redListStatus':redListStatus,
                  'dorsalSpines':dorsalSpines,
                  'dorsalSoftRays':dorsalSoftRays,
                  'analSpines':analSpines,
                  'analSoftRays':analSoftRays,
                  'vertebrae':vertebrae,
                  'shortDescription':shortDescription,
                  'threatToHumans':threatToHumans,
                  'preferredTemperatureMin':preferredTemperatureMin,
                  'preferredTemperatureMean':preferredTemperatureMean,
                  'preferredTemperatureMax':preferredTemperatureMax,
                  'cellsBasedOn':cellsBasedOn,
                  'phylogeneticDiversity':phylogeneticDiversity,
                  'bayesianLengthWeightA':bayesianLengthWeightA,
                  'bayesianLengthWeightB':bayesianLengthWeightB,
                  'resilience':resilience,
                  'priorR':priorR,
                  'sdRange':sdRange,
                  'vulnerability':vulnerability,
                  'priceCategory':priceCategory,
                  'source':website
                  }
        dataset = pd.concat([dataset, pd.DataFrame([record])], ignore_index=True)
        
    if (website=='ca'):
        
        # Initialize every final variable to nan
        commonName, classification, etymology, environment, depthRange, usualRange, distributionRangeName, distributionRangeCoordinates, distributionDescription = np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
        sizeRange, maxLength, commonLength, maxWeight, maxAge, biologyInfo, lifeAndMateInfo, redListStatus = np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan
        dorsalSpines, dorsalSoftRays, analSpines, analSoftRays, vertebrae, shortDescription, threatToHumans, preferredTemperatureMin, preferredTemperatureMean, preferredTemperatureMax, cellsBasedOn = np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan
        phylogeneticDiversity, bayesianLengthWeightA, bayesianLengthWeightB, resilience, priorR, sdRange, vulnerability, priceCategory = np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan,np.nan

        # Collect scientific name of species
        scientificName = ''
        divScientificName = soup.find(id='ss-sciname') # Find a specific id in html
        divScientificNameUrls = divScientificName.find('b')
        scientificName = divScientificNameUrls.contents[0]
        scientificName

        # Collect common name
        commonName = soup.find(class_='sheader2').contents[0]
        commonName = re.sub(r"[\n\t\r]*", "", commonName)
        if (commonName == ''):
            commonName = np.nan    

        # Classification and etymology, both read from the 'Classification / Names' section.
        classification, etymology = '', ''
        fields = soup.find_all(class_='slabel bottomBorder')
        for field in fields:
            if ('Classification / Names' in field.text):
                # The text is assembled from the label's following siblings: the 1st (used as a
                # string, '|' separators turned into '>'), the 2nd and the 4th.
                firstSibling = field.next_sibling
                secondSibling = firstSibling.next_sibling
                fourthSibling = secondSibling.next_sibling.next_sibling
                classification = firstSibling.replace('|','>')+secondSibling.text+" > "+fourthSibling.text
                # Everything after the first "Etymology: " marker is the etymology. partition()
                # leaves etymology untouched when the exact marker is absent, where indexing the
                # result of split() would raise IndexError.
                _, marker, afterMarker = classification.partition("Etymology: ")
                if (marker):
                    etymology = afterMarker

        classification = re.sub(r"[\n\t\r]*", "", classification)
        etymology = re.sub(r"[\n\t\r]*", "", etymology)
        if (classification==''):
            classification = np.nan
        if (etymology==''):
            etymology = np.nan

        # I don't initialize them with np.nan from the beginning, because they might exist and be empty ''.
        environmentInfo, distributionDescription, metricsInfo, biologyInfo, lifeAndMateInfo = '', '', '', '', ''
        fields = soup.find_all('span', recursive=True)
        lastIndex = len(fields) - 1
        for i, field in enumerate(fields):
            # Each section is located through a marker span; the wanted text is in the span right
            # after it (right before it for the size metrics). These flags keep the neighbour
            # lookups inside the list, so a marker in the last span cannot raise IndexError and
            # index 0 cannot wrap around to the last span.
            hasNext = i < lastIndex
            hasPrevious = i > 0

            # Environment, Climate Zone, Depth, Distribution Coordinates
            if (hasNext and 'Ecology' == field.text and 'Ecology' != fields[i+1].text and 'Diet' != fields[i+1].text):
                environmentInfo = fields[i+1].text
                environmentInfo = re.sub(r"[\n\t\r]*", "", environmentInfo)
                environmentInfo = re.sub(r'(\(Ref. )+\d*(\))', '', environmentInfo) # Drop "(Ref. 123)" citations.
                environmentInfo = re.sub(r'[^\x00-\x7f]',r'',environmentInfo.rstrip()) # Drop non-ASCII characters.
                # Get environment values
                try:
                    beforeDepthRange = re.search('(.+?)depth range', environmentInfo).group(1).strip() # Match whatever is before specific string
                    environment = ''
                    for value in beforeDepthRange.split(';'):
                        if (':' not in value):
                            environment += value+","
                    environment = re.sub('( ){2,}', ' ', environment) # Make 2 or more spaces 1.
                    environment = environment.rstrip(' ,')
                except AttributeError:
                    environment = np.nan

                # Look for depth range in environmentInfo.
                try:
                    depthRange = re.search('depth range(.+?)m', environmentInfo).group(1).strip() # Match everything between 2 specific strings.
                except AttributeError:
                    depthRange = np.nan

                try:
                    usualRange = re.search('usually(.+?)m', environmentInfo).group(1).strip()
                except AttributeError:
                    usualRange = np.nan

                # Look for distribution range in environmentInfo.
                try:
                    distributionRange = environmentInfo.split("m . ",1)[1]
                    distributionRangeName = ''
                    for value in distributionRange.split(';'): # Split in case many locations exist.

                        if (bool(re.search(r'(\d*C)', value)) is True):
                            distributionRangeName += value
                        elif (bool(re.search(r'\d', value)) is False):
                            distributionRangeName += value
                        else:
                            distributionRangeCoordinates = value.strip()
                except IndexError:
                    # "m . " separator not found: retry with the "m. " spelling.
                    try:
                        distributionRange = environmentInfo.split("m. ",1)[1] # Split and get 2nd part.
                        distributionRangeName = ''
                        for value in distributionRange.split(';'): # Split in case many locations exist.

                            if (bool(re.search(r'\d', value)) is False):
                                distributionRangeName += value
                            else:
                                distributionRangeCoordinates = value.strip()
                    except IndexError:
                        distributionRangeName = np.nan
                        distributionRangeCoordinates = np.nan

            # Distribution Description
            if (hasNext and 'Introductions' == field.text and 'Stocks' != fields[i+1].text):
                distributionDescription = fields[i+1].text.strip()

            # Length, Weight, Age
            if (hasPrevious and 'Glossary' in field.text and 'Glossary' not in fields[i-1].text):
                metricsInfo = fields[i-1].text
                metricsInfo = re.sub(r'(\(Ref. )+\d*(\))', '', metricsInfo)
                metricsInfo = re.sub(r'[^\x00-\x7f]',r'',metricsInfo.strip())

                try:
                    sizeRange = re.search('range(.+?)cm', metricsInfo).group(1).strip()
                except AttributeError:
                    sizeRange = np.nan

                try:
                    maxLength = re.search('Max length :(.+?)cm', metricsInfo).group(1).strip()
                    maxLength = maxLength.replace(',','')
                except AttributeError:
                    maxLength = np.nan

                try:
                    commonLength = re.search('common length :(.+?)cm', metricsInfo).group(1).strip()
                    commonLength = commonLength.replace(',','')
                except AttributeError:
                    commonLength = np.nan

                try:
                    maxWeight = re.search('max. published weight:(.+?)kg', metricsInfo).group(1).strip()
                    maxWeight = maxWeight.replace(',','')
                except AttributeError:
                    maxWeight = np.nan

                try:
                    maxAge = re.search('max. reported age:(.+?)years', metricsInfo).group(1).strip()
                except AttributeError:
                    maxAge = np.nan

            # Biology
            if (hasNext and '(e.g.  epibenthic)' in field.text and 'Glossary' not in fields[i+1].text):
                biologyInfo = fields[i+1].text
                biologyInfo = re.sub(r'[^\x00-\x7f]',r'',biologyInfo.strip())
                # Strip the citation variants: "(Ref. N)", "(Ref. N, p. M)" and the spaced forms.
                biologyInfo = re.sub(r'(\(Ref.) \d*\)', '', biologyInfo)
                biologyInfo = re.sub(r'(\(Ref. )\d*, p. \d*\)', '', biologyInfo)
                biologyInfo = re.sub(r'( \(Ref.)  \d* \)', '', biologyInfo)
                biologyInfo = re.sub(r'\(Ref.  \d* ,  \d* \)', '', biologyInfo)
                biologyInfo = re.sub(r'\(Ref.  \d* ,  \d* ,  \d* \)', '', biologyInfo)

            # Life cycle and mating behavior
            if (hasNext and 'Larvae' == field.text and 'Abundance' != fields[i+1].text):
                lifeAndMateInfo = re.sub(r'[^\x00-\x7f]',r'',fields[i+1].text.strip())
                lifeAndMateInfo = re.sub(r'( \(Ref.)  \d* \)', '', lifeAndMateInfo)
                lifeAndMateInfo = re.sub(r'\(Ref.  \d* ,  \d* \)', '', lifeAndMateInfo)
                lifeAndMateInfo = re.sub(r'\(Ref.  \d* ,  \d* ,  \d* \)', '', lifeAndMateInfo)

        # Sections that were not found (still '') leave all of their derived fields as NaN.
        if (environmentInfo == ''):
            environment = np.nan
            depthRange = np.nan
            usualRange = np.nan
            distributionRangeName = np.nan
            distributionRangeCoordinates = np.nan
        if (distributionDescription == ''):
            distributionDescription = np.nan
        if (metricsInfo == ''):
            sizeRange = np.nan
            maxLength = np.nan
            commonLength = np.nan
            maxWeight = np.nan
            maxAge = np.nan
        if (biologyInfo == ''):
            biologyInfo = np.nan
        if (lifeAndMateInfo == ''):
            lifeAndMateInfo = np.nan

        # Red List Status: text of the link whose href points to www.iucnredlist.org.
        # Only links carrying an href are scanned; when several match, the last one wins.
        redListStatus = ''
        fields = soup.find_all('a', href=True)
        for field in fields:
            if ('www.iucnredlist.org' in field['href']):
                redListStatus = re.sub(r"[\n\t\r]*", "", field.text)

        if (redListStatus == ''):
            redListStatus = np.nan

        # Short Description and other metrics,
        shortDescriptionInfo, threatToHumans, humanUses= '', '', ''
        fields = soup.find_all(class_='slabel bottomBorder')
        for field in fields:
            if ('Short description' in field.text): # Short Description
                # The section text is in the element two siblings after the label.
                shortDescriptionInfo = field.next_sibling.next_sibling.text
                shortDescriptionInfo = re.sub(r'[^\x00-\x7f]',r'',shortDescriptionInfo.strip())
                shortDescriptionInfo = re.sub(r'( \(Ref.)  \d* \)', '', shortDescriptionInfo)
                shortDescriptionInfo = re.sub(r'( \(Ref.)  \d* ,  \d* \)', '', shortDescriptionInfo)

                # Each count is the text between its label and the next '.' or ';'.
                try:
                    dorsalSpines = re.search(r'Dorsal  spines \(total\):(.+?)(\.|;)', shortDescriptionInfo).group(1).strip()
                except AttributeError:
                    dorsalSpines = np.nan

                try:
                    dorsalSoftRays = re.search(r'Dorsal  soft rays \(total\):(.+?)(\.|;)', shortDescriptionInfo).group(1).strip()
                except AttributeError:
                    dorsalSoftRays = np.nan

                try:
                    analSpines = re.search(r'Anal  spines : (.+?)(\.|;)', shortDescriptionInfo).group(1).strip()
                except AttributeError:
                    analSpines = np.nan

                try:
                    analSoftRays = re.search(r'Anal  soft rays :(.+?)(\.|;)', shortDescriptionInfo).group(1).strip()
                except AttributeError:
                    analSoftRays = np.nan

                try:
                    vertebrae = re.search(r'Vertebrae :(.+?)(\.|;)', shortDescriptionInfo).group(1).strip()
                except AttributeError:
                    vertebrae = np.nan

                # Free-text description: the section split on '.', without the leading part when
                # that part holds the fin counts.
                # NOTE(review): the pieces are glued back together without the periods, so sentence
                # boundaries and decimal points are lost; kept as-is to preserve the existing
                # output. Confirm whether that is intended.
                sentences = shortDescriptionInfo.split('.')
                startsWithCounts = 'Dorsal  spines' in sentences[0]
                if (len(sentences)==2 and not startsWithCounts):
                    shortDescription = sentences[0]
                elif (not startsWithCounts):
                    shortDescription = ''.join(sentences) + '.'
                else:
                    shortDescription = ''.join(sentences[1:]) + '.'


            # Threat to humans
            if ('Threat to humans ' in field.text): # Threat to humans
                threatToHumans = field.next_sibling.next_sibling.text
                threatToHumans = re.sub(r'[^\x00-\x7f]',r'',threatToHumans.strip())
                threatToHumans = re.sub(r'\(Ref. \d* \)', '', threatToHumans)
                threatToHumans = re.sub(r'\(Ref. \d*\)', '', threatToHumans)

            # Human Uses
            if ('Human uses' in field.text): # Human Uses
                humanUses = field.next_sibling.next_sibling.text
                humanUses = re.sub(r'[^\x00-\x7f]',r'',humanUses.strip())

        # Other properties
        # Every field starts as '' and is converted to NaN further down when nothing was found.
        preferredTemperatureMean, preferredTemperatureMin, preferredTemperatureMax = '', '', ''
        phylogeneticDiversity, bayesianLengthWeightA, bayesianLengthWeightB = '', '', ''
        resilience, priorR, sdRange, vulnerability, priceCategory = '', '', '', '', ''
        fields = soup.find_all(class_='slabel')
        for field in fields:
            if ('Preferred temperature' in field.text): # Preferred temperature
                # The values are read from the text node three siblings after the label. A
                # different layout (missing sibling, or a tag instead of text) yields '' so that
                # the searches below simply find nothing instead of aborting the whole scrape.
                try:
                    preferredTemperatureInfo = field.next_sibling.next_sibling.next_sibling.strip()
                except (AttributeError, TypeError):
                    preferredTemperatureInfo = ''

                # Each value is extracted independently; a pattern that does not match leaves
                # its field unchanged.
                meanMatch = re.search(r'mean(.+?)\(based', preferredTemperatureInfo)
                if (meanMatch):
                    preferredTemperatureMean = meanMatch.group(1).strip()
                if ('Bayesian length' not in preferredTemperatureInfo):
                    minMatch = re.search(r': (.+?) -', preferredTemperatureInfo)
                    if (minMatch):
                        preferredTemperatureMin = minMatch.group(1).strip()
                cellsMatch = re.search(r'\(based on (.+?)\ cells\)', preferredTemperatureInfo)
                if (cellsMatch):
                    cellsBasedOn = cellsMatch.group(1).strip()
                maxMatch = re.search(r'- (.+?),', preferredTemperatureInfo)
                if (maxMatch):
                    preferredTemperatureMax = maxMatch.group(1).strip()
            if ('Vulnerability' in field.text): # Vulnerability
                try:
                    vulnerability = field.next_sibling.next_sibling.next_sibling.next_sibling.text.strip()
                except AttributeError:
                    pass # Expected sibling is missing: keep the current value.
            if ('Price category' in field.text): # Price category
                try:
                    priceCategory = field.next_sibling.next_sibling.next_sibling.next_sibling.text.strip()
                except AttributeError:
                    pass # Expected sibling is missing: keep the current value.

        # Replace every field that is still an empty string with NaN before the row is stored.
        if (shortDescription == ''):
            # An empty short description also blanks the fin/vertebrae counts.
            dorsalSpines = dorsalSoftRays = analSpines = analSoftRays = vertebrae = shortDescription = np.nan
        threatToHumans = np.nan if threatToHumans == '' else threatToHumans
        humanUses = np.nan if humanUses == '' else humanUses
        preferredTemperatureMin = np.nan if preferredTemperatureMin == '' else preferredTemperatureMin
        preferredTemperatureMean = np.nan if preferredTemperatureMean == '' else preferredTemperatureMean
        preferredTemperatureMax = np.nan if preferredTemperatureMax == '' else preferredTemperatureMax
        cellsBasedOn = np.nan if cellsBasedOn == '' else cellsBasedOn
        phylogeneticDiversity = np.nan if phylogeneticDiversity == '' else phylogeneticDiversity
        bayesianLengthWeightA = np.nan if bayesianLengthWeightA == '' else bayesianLengthWeightA
        bayesianLengthWeightB = np.nan if bayesianLengthWeightB == '' else bayesianLengthWeightB
        resilience = np.nan if resilience == '' else resilience
        priorR = np.nan if priorR == '' else priorR
        sdRange = np.nan if sdRange == '' else sdRange
        vulnerability = np.nan if vulnerability == '' else vulnerability
        priceCategory = np.nan if priceCategory == '' else priceCategory

        # Store this species as one new row at the end of the dataset.
        # DataFrame.append was removed in pandas 2.0, so the row is built as a one-row frame
        # and concatenated; ignore_index=True renumbers the rows 0..n-1 as append did.
        speciesRow = pd.DataFrame([{
                                'scientificName':scientificName,
                                'commonName':commonName,
                                'classification':classification,
                                'etymology':etymology,
                                'environment':environment,
                                'depthRange':depthRange,
                                'usualRange':usualRange,
                                'distributionRangeName':distributionRangeName,
                                'distributionRangeCoordinates':distributionRangeCoordinates,
                                'distributionDescription':distributionDescription,
                                'sizeRange':sizeRange,
                                'maxLength':maxLength,
                                'commonLength':commonLength,
                                'maxWeight':maxWeight,
                                'maxAge':maxAge,
                                'biologyInfo':biologyInfo,
                                'lifeAndMateInfo':lifeAndMateInfo,
                                'redListStatus':redListStatus,
                                'dorsalSpines':dorsalSpines,
                                'dorsalSoftRays':dorsalSoftRays,
                                'analSpines':analSpines,
                                'analSoftRays':analSoftRays,
                                'vertebrae':vertebrae,
                                'shortDescription':shortDescription,
                                'threatToHumans':threatToHumans,
                                'preferredTemperatureMin':preferredTemperatureMin,
                                'preferredTemperatureMean':preferredTemperatureMean,
                                'preferredTemperatureMax':preferredTemperatureMax,
                                'cellsBasedOn':cellsBasedOn,
                                'phylogeneticDiversity':phylogeneticDiversity,
                                'bayesianLengthWeightA':bayesianLengthWeightA,
                                'bayesianLengthWeightB':bayesianLengthWeightB,
                                'resilience':resilience,
                                'priorR':priorR,
                                'sdRange':sdRange,
                                'vulnerability':vulnerability,
                                'priceCategory':priceCategory,
                                'source':website
                                }])
        dataset = pd.concat([dataset, speciesRow], ignore_index=True)

Using www.sealifebase.ca for	: Abra alba
Using www.sealifebase.ca for	: Abralia veranyi
Using www.sealifebase.ca for	: Abraliopsis pfefferi
Using www.sealifebase.ca for	: Abylopsis tetragona
Using www.sealifebase.ca for	: Acanthocardia aculeata
Using www.sealifebase.ca for	: Acanthocardia echinata
Using www.sealifebase.ca for	: Acanthocardia paucicostata
Using www.sealifebase.ca for	: Acanthocardia spinosa
Using www.sealifebase.ca for	: Acanthocardia tuberculata
Using www.sealifebase.ca for	: Acanthochitona fascicularis
Using www.fishbase.de for	: Acanthocybium solandri
Using www.sealifebase.ca for	: Acanthodasys aculeatus
Using www.fishbase.de for	: Acantholabrus palloni
Using www.sealifebase.ca for	: Acanthomysis longicornis
Using www.sealifebase.ca for	: Acar plicata
Using www.fishbase.de for	: Acipenser gueldenstaedtii
Using www.fishbase.de for	: Acipenser naccarii
Using www.fishbase.de for	: Acipenser nudiventris
Using www.fishbase.de for	: Acipenser stellatus
Using www.fishbase.d

Using www.fishbase.de for	: Bathypterois dubius
Using www.fishbase.de for	: Bathysolea profundicola
Using www.sealifebase.ca for	: Beania hirtissima
Using www.sealifebase.ca for	: Beania magellanica
Using www.sealifebase.ca for	: Beania mirabilis
Using www.fishbase.de for	: Bellottia apoda
Using www.fishbase.de for	: Belone belone
Using www.fishbase.de for	: Belone svetovidovi
Using www.sealifebase.ca for	: Benthomangelia macra
Using www.sealifebase.ca for	: Benthonella tenella
Using www.fishbase.de for	: Benthosema glaciale
Using www.sealifebase.ca for	: Berthella stellata
Using www.fishbase.de for	: Beryx decadactylus
Using www.sealifebase.ca for	: Bhawania goodei
Using www.sealifebase.ca for	: Bispira crassicornis
Using www.sealifebase.ca for	: Bittium reticulatum
Using www.fishbase.de for	: Blennius ocellaris
Using www.sealifebase.ca for	: Bolinus brandaris
Using www.sealifebase.ca for	: Bolitaena pygmaea
Using www.sealifebase.ca for	: Bonellia viridis
Using www.fishbase.de for	: B

Using www.sealifebase.ca for	: Coscinasterias tenuispina
Using www.sealifebase.ca for	: Cotylorhiza tuberculata
Using www.sealifebase.ca for	: Cranchia scabra
Using www.sealifebase.ca for	: Crangon crangon
Using www.sealifebase.ca for	: Crassostrea angulata
Using www.sealifebase.ca for	: Crassostrea virginica
Using www.sealifebase.ca for	: Crenella decussata
Using www.sealifebase.ca for	: Crepidula fornicata
Using www.sealifebase.ca for	: Creseis acicula
Using www.sealifebase.ca for	: Creseis virgula
Using www.fishbase.de for	: Crystallogobius linearis
Using www.sealifebase.ca for	: Ctena decussata
Using www.fishbase.de for	: Ctenolabrus rupestris
Using www.fishbase.de for	: Cubiceps gracilis
Using www.sealifebase.ca for	: Cunina octonaria
Using www.sealifebase.ca for	: Cuspidaria rostrata
Using www.sealifebase.ca for	: Cuthona perca
Using www.sealifebase.ca for	: Cyathura carinata
Using www.fishbase.de for	: Cyclothone braueri
Using www.fishbase.de for	: Cyclothone microdon
Using www.

Using www.sealifebase.ca for	: Fenestrulina malusii
Using www.sealifebase.ca for	: Ficopomatus enigmaticus
Using www.sealifebase.ca for	: Figularia figularis
Using www.sealifebase.ca for	: Filellum serratum
Using www.sealifebase.ca for	: Firoloida desmarestia
Using www.sealifebase.ca for	: Flabelligera diplochaitos
Using www.sealifebase.ca for	: Flaccisagitta enflata
Using www.sealifebase.ca for	: Flaccisagitta hexaptera
Using www.sealifebase.ca for	: Flexopecten glaber
Using www.sealifebase.ca for	: Fritillaria pellucida
Using www.sealifebase.ca for	: Funchalia villosa
Using www.sealifebase.ca for	: Funchalia woodwardi
Using www.sealifebase.ca for	: Funchalia woodwardi
Using www.sealifebase.ca for	: Funiculina quadrangularis
Using www.sealifebase.ca for	: Fustiaria rubescens
Using www.fishbase.de for	: Gadella maraldi
Using www.fishbase.de for	: Gadiculus argenteus
Using www.sealifebase.ca for	: Gadila strangulata
Using www.fishbase.de for	: Gaidropsarus biscayensis
Using www.fishbase

Using www.sealifebase.ca for	: Javania cailleti
Using www.sealifebase.ca for	: Josephella marenzelleri
Using www.fishbase.de for	: Kajikia albida
Using www.fishbase.de for	: Katsuwonus pelamis
Using www.sealifebase.ca for	: Kellia suborbicularis
Using www.fishbase.de for	: Knipowitschia caucasica
Using www.sealifebase.ca for	: Kophobelemnon stelliferum
Using www.sealifebase.ca for	: Krohnitta subtilis
Using www.fishbase.de for	: Kyphosus sectatrix
Using www.sealifebase.ca for	: Labioleanira yhleni
Using www.fishbase.de for	: Labrus merula
Using www.fishbase.de for	: Labrus mixtus
Using www.fishbase.de for	: Labrus viridis
Using www.sealifebase.ca for	: Laevicardium crassum
Using www.fishbase.de for	: Lagocephalus lagocephalus
Using www.fishbase.de for	: Lamna nasus
Using www.fishbase.de for	: Lampanyctus crocodilus
Using www.fishbase.de for	: Lampanyctus pusillus
Using www.fishbase.de for	: Lampris guttatus
Using www.sealifebase.ca for	: Lanice conchilega
Using www.sealifebase.ca for	:

Using www.sealifebase.ca for	: Microcosmus claudicans
Using www.sealifebase.ca for	: Microcosmus sabatieri
Using www.sealifebase.ca for	: Microcosmus savignyi
Using www.sealifebase.ca for	: Microcosmus vulgaris
Using www.fishbase.de for	: Microlipophrys adriaticus
Using www.fishbase.de for	: Microlipophrys canevae
Using www.fishbase.de for	: Microlipophrys dalmatinus
Using www.fishbase.de for	: Microlipophrys nigriceps
Using www.fishbase.de for	: Micromesistius poutassou
Using www.sealifebase.ca for	: Microporella ciliata
Using www.fishbase.de for	: Microstoma microstoma
Using www.fishbase.de for	: Millerigobius macrocephalus
Using www.sealifebase.ca for	: Mimachlamys varia
Using www.fishbase.de for	: Minyichthys sentus
Using www.fishbase.de for	: Mobula mobular
Using www.sealifebase.ca for	: Modeeria rotunda
Using www.sealifebase.ca for	: Modiolus adriaticus
Using www.sealifebase.ca for	: Modiolus barbatus
Using www.fishbase.de for	: Mola mola
Using www.fishbase.de for	: Molva dyptery

Using www.sealifebase.ca for	: Paramysis arenosa
Using www.sealifebase.ca for	: Paranthura costana
Using www.sealifebase.ca for	: Paranthura nigropunctata
Using www.sealifebase.ca for	: Parapenaeus longirostris
Using www.sealifebase.ca for	: Paraprionospio pinnata
Using www.fishbase.de for	: Parapristipoma octolineatum
Using www.sealifebase.ca for	: Parapseudomma calloplura
Using www.sealifebase.ca for	: Parasagitta friderici
Using www.sealifebase.ca for	: Parasmittina tropica
Using www.sealifebase.ca for	: Parastichopus regalis
Using www.sealifebase.ca for	: Paraturbanella teissieri
Using www.sealifebase.ca for	: Parazoanthus axinellae
Using www.sealifebase.ca for	: Parellisina curvirostris
Using www.sealifebase.ca for	: Parerythrops obesa
Using www.sealifebase.ca for	: Paromola cuvieri
Using www.fishbase.de for	: Parophidion vassali
Using www.sealifebase.ca for	: Parvamussium fenestratum
Using www.sealifebase.ca for	: Pasiphaea sivado
Using www.sealifebase.ca for	: Patella ulyssipone

Using www.fishbase.de for	: Remora remora
Using www.sealifebase.ca for	: Reptadeonella violacea
Using www.sealifebase.ca for	: Rhinoclavis kochi
Using www.fishbase.de for	: Rhinoptera marginata
Using www.sealifebase.ca for	: Rhodine gracilior
Using www.sealifebase.ca for	: Rhodosoma turcicum
Using www.sealifebase.ca for	: Rocinela dumerilii
Using www.sealifebase.ca for	: Rondeletiola minor
Using www.sealifebase.ca for	: Rosacea cymbiformis
Using www.sealifebase.ca for	: Rosacea plicata
Using www.sealifebase.ca for	: Rossia macrosoma
Using www.fishbase.de for	: Rostroraja alba
Using www.sealifebase.ca for	: Royella sinon
Using www.sealifebase.ca for	: Ruditapes decussatus
Using www.fishbase.de for	: Ruvettus pretiosus
Using www.fishbase.de for	: Salaria basilisca
Using www.fishbase.de for	: Salaria pavo
Using www.sealifebase.ca for	: Salmacina dysteri
Using www.fishbase.de for	: Salmo salar
Using www.fishbase.de for	: Salmo trutta
Using www.sealifebase.ca for	: Salpa fusiformis
Using ww

Using www.fishbase.de for	: Symphodus tinca
Using www.fishbase.de for	: Symphurus nigrescens
Using www.sealifebase.ca for	: Synalpheus tumidomanus
Using www.fishbase.de for	: Synapturichthys kleinii
Using www.fishbase.de for	: Synchiropus phaeton
Using www.fishbase.de for	: Syngnathus abaster
Using www.fishbase.de for	: Syngnathus acus
Using www.fishbase.de for	: Syngnathus phlegon
Using www.fishbase.de for	: Syngnathus typhle
Using www.sealifebase.ca for	: Synisoma capito
Using www.sealifebase.ca for	: Synnotum aegyptiacum
Using www.fishbase.de for	: Synodus saurus
Using www.sealifebase.ca for	: Syphonota geographica
Using www.fishbase.de for	: Taeniura grabata
Using www.sealifebase.ca for	: Talochlamys multistriata
Using www.sealifebase.ca for	: Tanais dulongii
Using www.sealifebase.ca for	: Taningia danae
Using www.sealifebase.ca for	: Tellina donacina
Using www.sealifebase.ca for	: Tellina tenuis
Using www.sealifebase.ca for	: Telmatactis cricoides
Using www.sealifebase.ca for	: Te

In [3]:
# Preview the first rows only: display.max_rows is set to 1500 at the top of the notebook,
# so a bare `dataset` would render the whole frame into the cell output.
dataset.head(10)

Unnamed: 0,scientificName,commonName,classification,etymology,environment,depthRange,usualRange,distributionRangeName,distributionRangeCoordinates,distributionDescription,sizeRange,maxLength,commonLength,maxWeight,maxAge,biologyInfo,lifeAndMateInfo,redListStatus,dorsalSpines,dorsalSoftRays,analSpines,analSoftRays,vertebrae,shortDescription,threatToHumans,preferredTemperatureMin,preferredTemperatureMean,preferredTemperatureMax,cellsBasedOn,phylogeneticDiversity,bayesianLengthWeightA,bayesianLengthWeightB,resilience,priorR,sdRange,vulnerability,priceCategory,source
0,Tetraplatia volitans,,Hydrozoa > Narcomedusae > Tetraplatiidae,,Pelagic,0 - 900,,Tropical,,"Atlantic, Pacific Oceans and the Mediterranean: Japan, Chile, Algeria, Brazil, Canada and USA.",,,,,,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Not Evaluated,,,,,,"Body up to 13 cm long; with 4 flying buttresses, joining the basal corners of the oral aboral po...",,3.3,12.3,21.7,1171,,,,,,,Low vulnerability (10 of 100),Unknown,ca
1,Tetraplatia volitans,,Hydrozoa > Narcomedusae > Tetraplatiidae,,Pelagic,0 - 900,,Tropical,,"Atlantic, Pacific Oceans and the Mediterranean: Japan, Chile, Algeria, Brazil, Canada and USA.",,,,,,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Not Evaluated,,,,,,"Body up to 13 cm long; with 4 flying buttresses, joining the basal corners of the oral aboral po...",,3.3,12.3,21.7,1171,,,,,,,Low vulnerability (10 of 100),Unknown,ca
2,Tetraplatia volitans,,Hydrozoa > Narcomedusae > Tetraplatiidae,,Pelagic,0 - 900,,Tropical,,"Atlantic, Pacific Oceans and the Mediterranean: Japan, Chile, Algeria, Brazil, Canada and USA.",,,,,,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Not Evaluated,,,,,,"Body up to 13 cm long; with 4 flying buttresses, joining the basal corners of the oral aboral po...",,3.3,12.3,21.7,1171,,,,,,,Low vulnerability (10 of 100),Unknown,ca
3,Tetraplatia volitans,,Hydrozoa > Narcomedusae > Tetraplatiidae,,Pelagic,0 - 900,,Tropical,,"Atlantic, Pacific Oceans and the Mediterranean: Japan, Chile, Algeria, Brazil, Canada and USA.",,,,,,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Not Evaluated,,,,,,"Body up to 13 cm long; with 4 flying buttresses, joining the basal corners of the oral aboral po...",,3.3,12.3,21.7,1171,,,,,,,Low vulnerability (10 of 100),Unknown,ca
4,Tetraplatia volitans,,Hydrozoa > Narcomedusae > Tetraplatiidae,,Pelagic,0 - 900,,Tropical,,"Atlantic, Pacific Oceans and the Mediterranean: Japan, Chile, Algeria, Brazil, Canada and USA.",,,,,,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Not Evaluated,,,,,,"Body up to 13 cm long; with 4 flying buttresses, joining the basal corners of the oral aboral po...",,3.3,12.3,21.7,1171,,,,,,,Low vulnerability (10 of 100),Unknown,ca
5,Tetraplatia volitans,,Hydrozoa > Narcomedusae > Tetraplatiidae,,Pelagic,0 - 900,,Tropical,,"Atlantic, Pacific Oceans and the Mediterranean: Japan, Chile, Algeria, Brazil, Canada and USA.",,,,,,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Not Evaluated,,,,,,"Body up to 13 cm long; with 4 flying buttresses, joining the basal corners of the oral aboral po...",,3.3,12.3,21.7,1171,,,,,,,Low vulnerability (10 of 100),Unknown,ca
6,Tetraplatia volitans,,Hydrozoa > Narcomedusae > Tetraplatiidae,,Pelagic,0 - 900,,Tropical,,"Atlantic, Pacific Oceans and the Mediterranean: Japan, Chile, Algeria, Brazil, Canada and USA.",,,,,,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Not Evaluated,,,,,,"Body up to 13 cm long; with 4 flying buttresses, joining the basal corners of the oral aboral po...",,3.3,12.3,21.7,1171,,,,,,,Low vulnerability (10 of 100),Unknown,ca
7,Tetraplatia volitans,,Hydrozoa > Narcomedusae > Tetraplatiidae,,Pelagic,0 - 900,,Tropical,,"Atlantic, Pacific Oceans and the Mediterranean: Japan, Chile, Algeria, Brazil, Canada and USA.",,,,,,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Not Evaluated,,,,,,"Body up to 13 cm long; with 4 flying buttresses, joining the basal corners of the oral aboral po...",,3.3,12.3,21.7,1171,,,,,,,Low vulnerability (10 of 100),Unknown,ca
8,Tetraplatia volitans,,Hydrozoa > Narcomedusae > Tetraplatiidae,,Pelagic,0 - 900,,Tropical,,"Atlantic, Pacific Oceans and the Mediterranean: Japan, Chile, Algeria, Brazil, Canada and USA.",,,,,,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Not Evaluated,,,,,,"Body up to 13 cm long; with 4 flying buttresses, joining the basal corners of the oral aboral po...",,3.3,12.3,21.7,1171,,,,,,,Low vulnerability (10 of 100),Unknown,ca
9,Tetraplatia volitans,,Hydrozoa > Narcomedusae > Tetraplatiidae,,Pelagic,0 - 900,,Tropical,,"Atlantic, Pacific Oceans and the Mediterranean: Japan, Chile, Algeria, Brazil, Canada and USA.",,,,,,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Members of the order Narcomedusae include L-form hydroids. Life cycle: The zygote develops int...,Not Evaluated,,,,,,"Body up to 13 cm long; with 4 flying buttresses, joining the basal corners of the oral aboral po...",,3.3,12.3,21.7,1171,,,,,,,Low vulnerability (10 of 100),Unknown,ca


In [7]:
# Text columns holding "a - b" ranges; the separator is rewritten as "a--b".
RANGE_COLUMNS = ['depthRange', 'usualRange', 'distributionRangeCoordinates', 'sizeRange',
                 'dorsalSpines', 'analSoftRays', 'vertebrae', 'sdRange']
# Columns converted from scraped text to numbers.
NUMERIC_COLUMNS = ['maxLength', 'maxWeight', 'maxAge',
                   'preferredTemperatureMin', 'preferredTemperatureMean', 'preferredTemperatureMax',
                   'cellsBasedOn', 'phylogeneticDiversity',
                   'bayesianLengthWeightA', 'bayesianLengthWeightB', 'priorR']

for column in RANGE_COLUMNS:
    # Literal match; regex=False gives the same result on every pandas version.
    dataset[column] = dataset[column].str.replace(' - ', '--', regex=False)

# dorsalSoftRays is matched on a bare '-' (no surrounding spaces). Only hyphens that are not
# already next to another hyphen are doubled, so running this cell a second time leaves
# '--' alone instead of turning it into '----'.
dataset['dorsalSoftRays'] = dataset['dorsalSoftRays'].str.replace(r'(?<!-)-(?!-)', '--', regex=True)

for column in NUMERIC_COLUMNS:
    # Strict conversion: a value that is not a number raises instead of silently becoming NaN.
    dataset[column] = pd.to_numeric(dataset[column])

dataset.to_csv('speciesDataset2.csv')

In [15]:
# Write the dataset to speciesDataset2.csv again (same call and path as at the end of the
# previous cell); the row index is written as the first, unnamed CSV column.
dataset.to_csv('speciesDataset2.csv')