In [1]:
import pandas as pd
from pathlib import Path
import sys
import scrapy
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from io import StringIO
from scrapy.crawler import CrawlerProcess
from pathlib import Path
import time
import datetime

# Settings

In [2]:
# Show full cell contents (no column-width truncation) so long URLs stay readable.
pd.options.display.max_colwidth = None

# Constants

In [3]:
# Climb five directory levels up from the current working directory and
# point at the 'data' folder found there.
project_root = Path.cwd()
for _level in range(5):
    project_root = project_root.parent
PATH_DATA = project_root / 'data'
# Displayed as the cell result: sanity check that the folder is really there.
PATH_DATA.exists()

True

In [4]:
# Folder holding the scraped CSV inputs: <data>/raw/scraped.
PATH_SCRAPED = PATH_DATA.joinpath('raw', 'scraped')
# Displayed as the cell result: confirms the folder exists.
PATH_SCRAPED.exists()

True

In [5]:
# Folder for the preprocessed tables: <data>/preprocessed.
PATH_PREPROCESSED = PATH_DATA.joinpath('preprocessed')
# Displayed as the cell result: confirms the folder exists.
PATH_PREPROCESSED.exists()

True

# Archaea

## Data Extraction

In [6]:
# Load the scraped archaea table; the displayed columns are 'name' and 'url'.
df_archaea_name_url = pd.read_csv(PATH_SCRAPED.joinpath('archaea_name_url.csv'))
df_archaea_name_url

Unnamed: 0,name,url
0,ANME-2_cluster_archaeon_HR1,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/
1,Acidianus_ambivalens,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/
2,Acidianus_brierleyi,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/
3,Acidianus_hospitalis,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/
4,Acidianus_infernus,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_infernus/
...,...,...
1232,uncultured_Nitrososphaera_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_Nitrososphaera_sp./
1233,uncultured_archaeon_A07HB70,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HB70/
1234,uncultured_archaeon_A07HN63,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HN63/
1235,uncultured_archaeon_A07HR60,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HR60/


In [7]:
# Work on an independent (deep) copy so the loaded table stays untouched
# while formatting columns are added.
df_archaea_name_formatting = df_archaea_name_url.copy(deep=True)


## Name formatting

### GCM: 'sp.', 'spp.'

In [8]:
# Build the GCM search name from the NCBI directory name:
#   1. cut the name at a standalone 'sp.' / 'spp.' token (that token and
#      everything after it are dropped),
#   2. turn the remaining underscores into spaces.
# The token has to be delimited by '_' or the string edges, so words that merely
# end in 'sp.' (e.g. 'subsp.') are no longer truncated, 'spp.' is actually
# handled, and no dangling separator / trailing space is left on the result.
df_archaea_name_formatting['formatted_name_gcm'] = (
    df_archaea_name_formatting['name']
    .str.replace(r'(?:^|_)spp?\.(?=_|$).*', '', regex=True)
    .str.replace('_', ' ', regex=False)
    .str.strip()
)
df_archaea_name_formatting

Unnamed: 0,name,url,formatted_name_gcm
0,ANME-2_cluster_archaeon_HR1,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/,ANME-2 cluster archaeon HR1
1,Acidianus_ambivalens,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/,Acidianus ambivalens
2,Acidianus_brierleyi,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/,Acidianus brierleyi
3,Acidianus_hospitalis,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/,Acidianus hospitalis
4,Acidianus_infernus,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_infernus/,Acidianus infernus
...,...,...,...
1232,uncultured_Nitrososphaera_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_Nitrososphaera_sp./,uncultured Nitrososphaera
1233,uncultured_archaeon_A07HB70,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HB70/,uncultured archaeon A07HB70
1234,uncultured_archaeon_A07HN63,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HN63/,uncultured archaeon A07HN63
1235,uncultured_archaeon_A07HR60,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HR60/,uncultured archaeon A07HR60


In [9]:
# Spot-check: rows whose name contains 'sp' NOT immediately followed by a dot
# (e.g. 'hospitalis', 'Caldisphaera'), to see how such names were formatted.
# The pattern is a raw string so that '\.' reaches the regex engine as written
# instead of being an invalid string escape.
df_archaea_name_formatting[df_archaea_name_formatting['name'].str.contains(r'sp(?!\.)', regex=True)].head()

Unnamed: 0,name,url,formatted_name_gcm
3,Acidianus_hospitalis,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/,Acidianus hospitalis
38,Caldisphaera_lagunensis,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Caldisphaera_lagunensis/,Caldisphaera lagunensis
39,Caldisphaera_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Caldisphaera_sp./,Caldisphaera
71,Candidatus_Methanolliviera_sp._GoM_asphalt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Candidatus_Methanolliviera_sp._GoM_asphalt/,Candidatus Methanolliviera
78,Candidatus_Methanosphaera_massiliense,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Candidatus_Methanosphaera_massiliense/,Candidatus Methanosphaera massiliense


### NCBI

In [10]:
# NCBI-style name: the directory name with every underscore turned into a space
# (nothing is cut off, so 'sp.' and strain suffixes are kept).
df_archaea_name_formatting['formatted_name_ncbi'] = (
    df_archaea_name_formatting['name'].str.replace(pat='_', repl=' ')
)
df_archaea_name_formatting.head(15)

Unnamed: 0,name,url,formatted_name_gcm,formatted_name_ncbi
0,ANME-2_cluster_archaeon_HR1,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/,ANME-2 cluster archaeon HR1,ANME-2 cluster archaeon HR1
1,Acidianus_ambivalens,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/,Acidianus ambivalens,Acidianus ambivalens
2,Acidianus_brierleyi,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/,Acidianus brierleyi,Acidianus brierleyi
3,Acidianus_hospitalis,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/,Acidianus hospitalis,Acidianus hospitalis
4,Acidianus_infernus,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_infernus/,Acidianus infernus,Acidianus infernus
5,Acidianus_manzaensis,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_manzaensis/,Acidianus manzaensis,Acidianus manzaensis
6,Acidianus_sp._HS-5,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_sp._HS-5/,Acidianus,Acidianus sp. HS-5
7,Acidianus_sp._RZ1,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_sp._RZ1/,Acidianus,Acidianus sp. RZ1
8,Acidianus_sulfidivorans,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_sulfidivorans/,Acidianus sulfidivorans,Acidianus sulfidivorans
9,Acidilobus_saccharovorans,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidilobus_saccharovorans/,Acidilobus saccharovorans,Acidilobus saccharovorans


### search_urls

In [11]:
# Prefix of the GCM search URL; the formatted name is appended as the value of
# the `search` query parameter.
base_url = 'https://gcm.wdcm.org/search?search='
# Strip the name before appending it, exactly as url_gcm_isolation_src does, so
# that names truncated at 'sp.' do not produce a URL ending in a stray space.
# NOTE(review): the name is not percent-encoded, inner spaces stay literal —
# confirm the consumer of these URLs encodes them.
df_archaea_name_formatting['url_gcm_search'] = base_url + df_archaea_name_formatting['formatted_name_gcm'].str.strip()
df_archaea_name_formatting[:15]

Unnamed: 0,name,url,formatted_name_gcm,formatted_name_ncbi,url_gcm_search
0,ANME-2_cluster_archaeon_HR1,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/ANME-2_cluster_archaeon_HR1/,ANME-2 cluster archaeon HR1,ANME-2 cluster archaeon HR1,https://gcm.wdcm.org/search?search=ANME-2 cluster archaeon HR1
1,Acidianus_ambivalens,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_ambivalens/,Acidianus ambivalens,Acidianus ambivalens,https://gcm.wdcm.org/search?search=Acidianus ambivalens
2,Acidianus_brierleyi,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_brierleyi/,Acidianus brierleyi,Acidianus brierleyi,https://gcm.wdcm.org/search?search=Acidianus brierleyi
3,Acidianus_hospitalis,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_hospitalis/,Acidianus hospitalis,Acidianus hospitalis,https://gcm.wdcm.org/search?search=Acidianus hospitalis
4,Acidianus_infernus,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_infernus/,Acidianus infernus,Acidianus infernus,https://gcm.wdcm.org/search?search=Acidianus infernus
5,Acidianus_manzaensis,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_manzaensis/,Acidianus manzaensis,Acidianus manzaensis,https://gcm.wdcm.org/search?search=Acidianus manzaensis
6,Acidianus_sp._HS-5,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_sp._HS-5/,Acidianus,Acidianus sp. HS-5,https://gcm.wdcm.org/search?search=Acidianus
7,Acidianus_sp._RZ1,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_sp._RZ1/,Acidianus,Acidianus sp. RZ1,https://gcm.wdcm.org/search?search=Acidianus
8,Acidianus_sulfidivorans,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidianus_sulfidivorans/,Acidianus sulfidivorans,Acidianus sulfidivorans,https://gcm.wdcm.org/search?search=Acidianus sulfidivorans
9,Acidilobus_saccharovorans,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Acidilobus_saccharovorans/,Acidilobus saccharovorans,Acidilobus saccharovorans,https://gcm.wdcm.org/search?search=Acidilobus saccharovorans


In [12]:
# Same search URL as above, built from the whitespace-stripped name, with the
# '&list=isolate' query suffix appended.
df_archaea_name_formatting['url_gcm_isolation_src'] = (
    base_url
    + df_archaea_name_formatting['formatted_name_gcm'].str.strip()
    + '&list=isolate'
)
df_archaea_name_formatting.iloc[15:]

Unnamed: 0,name,url,formatted_name_gcm,formatted_name_ncbi,url_gcm_search,url_gcm_isolation_src
15,Aciduliprofundum_sp._MAR08-339,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Aciduliprofundum_sp._MAR08-339/,Aciduliprofundum,Aciduliprofundum sp. MAR08-339,https://gcm.wdcm.org/search?search=Aciduliprofundum,https://gcm.wdcm.org/search?search=Aciduliprofundum&list=isolate
16,Actinarchaeum_halophilum,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Actinarchaeum_halophilum/,Actinarchaeum halophilum,Actinarchaeum halophilum,https://gcm.wdcm.org/search?search=Actinarchaeum halophilum,https://gcm.wdcm.org/search?search=Actinarchaeum halophilum&list=isolate
17,Aeropyrum_camini,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Aeropyrum_camini/,Aeropyrum camini,Aeropyrum camini,https://gcm.wdcm.org/search?search=Aeropyrum camini,https://gcm.wdcm.org/search?search=Aeropyrum camini&list=isolate
18,Aeropyrum_pernix,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Aeropyrum_pernix/,Aeropyrum pernix,Aeropyrum pernix,https://gcm.wdcm.org/search?search=Aeropyrum pernix,https://gcm.wdcm.org/search?search=Aeropyrum pernix&list=isolate
19,Aigarchaeota_archaeon_JGI_0000001-A7,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/Aigarchaeota_archaeon_JGI_0000001-A7/,Aigarchaeota archaeon JGI 0000001-A7,Aigarchaeota archaeon JGI 0000001-A7,https://gcm.wdcm.org/search?search=Aigarchaeota archaeon JGI 0000001-A7,https://gcm.wdcm.org/search?search=Aigarchaeota archaeon JGI 0000001-A7&list=isolate
...,...,...,...,...,...,...
1232,uncultured_Nitrososphaera_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_Nitrososphaera_sp./,uncultured Nitrososphaera,uncultured Nitrososphaera sp.,https://gcm.wdcm.org/search?search=uncultured Nitrososphaera,https://gcm.wdcm.org/search?search=uncultured Nitrososphaera&list=isolate
1233,uncultured_archaeon_A07HB70,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HB70/,uncultured archaeon A07HB70,uncultured archaeon A07HB70,https://gcm.wdcm.org/search?search=uncultured archaeon A07HB70,https://gcm.wdcm.org/search?search=uncultured archaeon A07HB70&list=isolate
1234,uncultured_archaeon_A07HN63,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HN63/,uncultured archaeon A07HN63,uncultured archaeon A07HN63,https://gcm.wdcm.org/search?search=uncultured archaeon A07HN63,https://gcm.wdcm.org/search?search=uncultured archaeon A07HN63&list=isolate
1235,uncultured_archaeon_A07HR60,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/archaea/uncultured_archaeon_A07HR60/,uncultured archaeon A07HR60,uncultured archaeon A07HR60,https://gcm.wdcm.org/search?search=uncultured archaeon A07HR60,https://gcm.wdcm.org/search?search=uncultured archaeon A07HR60&list=isolate


In [13]:
# List the columns of the finished archaea table (source columns plus the four derived ones).
df_archaea_name_formatting.columns

Index(['name', 'url', 'formatted_name_gcm', 'formatted_name_ncbi',
       'url_gcm_search', 'url_gcm_isolation_src'],
      dtype='object')

### save

In [14]:
# Export step, currently disabled: uncomment to write the archaea table
# (without the index) to <preprocessed>/archaea/archaea_name_formatting.csv.
# df_archaea_name_formatting.to_csv(PATH_PREPROCESSED / 'archaea' / 'archaea_name_formatting.csv', index=False)

# Bacteria

## Data Extraction

In [6]:
# Load the scraped bacteria table; the displayed columns are 'name' and 'url'.
df_bacteria_name_url = pd.read_csv(PATH_SCRAPED.joinpath('bacteria_name_url.csv'))
df_bacteria_name_url

Unnamed: 0,name,url
0,Abditibacterium_utsteinense,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abditibacterium_utsteinense/
1,Abiotrophia_defectiva,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_defectiva/
2,Abiotrophia_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_sp./
3,Abiotrophia_sp._HMSC24B09,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_sp._HMSC24B09/
4,Absicoccus_porci,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absicoccus_porci/
...,...,...
52904,zeta_proteobacterium_SCGC_AB-137-J06,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-137-J06/
52905,zeta_proteobacterium_SCGC_AB-602-C20,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-602-C20/
52906,zeta_proteobacterium_SCGC_AB-602-E04,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-602-E04/
52907,zeta_proteobacterium_SCGC_AB-604-B04,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-604-B04/


In [7]:
# Work on an independent (deep) copy so the loaded table stays untouched
# while formatting columns are added.
df_bacteria_name_formatting = df_bacteria_name_url.copy(deep=True)

## Name formatting

### GCM: 'sp.', 'spp.'

In [8]:
# Build the GCM search name from the NCBI directory name:
#   1. cut the name at a standalone 'sp.' / 'spp.' token (that token and
#      everything after it are dropped),
#   2. turn the remaining underscores into spaces.
# The token has to be delimited by '_' or the string edges, so words that merely
# end in 'sp.' (e.g. 'subsp.') are no longer truncated, 'spp.' is actually
# handled, and no dangling separator / trailing space is left on the result.
df_bacteria_name_formatting['formatted_name_gcm'] = (
    df_bacteria_name_formatting['name']
    .str.replace(r'(?:^|_)spp?\.(?=_|$).*', '', regex=True)
    .str.replace('_', ' ', regex=False)
    .str.strip()
)
df_bacteria_name_formatting

Unnamed: 0,name,url,formatted_name_gcm
0,Abditibacterium_utsteinense,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abditibacterium_utsteinense/,Abditibacterium utsteinense
1,Abiotrophia_defectiva,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_defectiva/,Abiotrophia defectiva
2,Abiotrophia_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_sp./,Abiotrophia
3,Abiotrophia_sp._HMSC24B09,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_sp._HMSC24B09/,Abiotrophia
4,Absicoccus_porci,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absicoccus_porci/,Absicoccus porci
...,...,...,...
52904,zeta_proteobacterium_SCGC_AB-137-J06,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-137-J06/,zeta proteobacterium SCGC AB-137-J06
52905,zeta_proteobacterium_SCGC_AB-602-C20,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-602-C20/,zeta proteobacterium SCGC AB-602-C20
52906,zeta_proteobacterium_SCGC_AB-602-E04,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-602-E04/,zeta proteobacterium SCGC AB-602-E04
52907,zeta_proteobacterium_SCGC_AB-604-B04,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-604-B04/,zeta proteobacterium SCGC AB-604-B04


### NCBI

In [9]:
# NCBI-style name: the directory name with every underscore turned into a space
# (nothing is cut off, so 'sp.' and strain suffixes are kept).
df_bacteria_name_formatting['formatted_name_ncbi'] = (
    df_bacteria_name_formatting['name'].str.replace(pat='_', repl=' ')
)
df_bacteria_name_formatting.head(15)

Unnamed: 0,name,url,formatted_name_gcm,formatted_name_ncbi
0,Abditibacterium_utsteinense,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abditibacterium_utsteinense/,Abditibacterium utsteinense,Abditibacterium utsteinense
1,Abiotrophia_defectiva,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_defectiva/,Abiotrophia defectiva,Abiotrophia defectiva
2,Abiotrophia_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_sp./,Abiotrophia,Abiotrophia sp.
3,Abiotrophia_sp._HMSC24B09,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_sp._HMSC24B09/,Abiotrophia,Abiotrophia sp. HMSC24B09
4,Absicoccus_porci,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absicoccus_porci/,Absicoccus porci,Absicoccus porci
5,Absicoccus_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absicoccus_sp./,Absicoccus,Absicoccus sp.
6,Absicoccus_sp._CLA-KB-P134,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absicoccus_sp._CLA-KB-P134/,Absicoccus,Absicoccus sp. CLA-KB-P134
7,Absiella_sp._AM09-45,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absiella_sp._AM09-45/,Absiella,Absiella sp. AM09-45
8,Absiella_sp._AM09-50,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absiella_sp._AM09-50/,Absiella,Absiella sp. AM09-50
9,Absiella_sp._AM10-20,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absiella_sp._AM10-20/,Absiella,Absiella sp. AM10-20


### search_urls

In [10]:
# Prefix of the GCM search URL; the formatted name is appended as the value of
# the `search` query parameter.
base_url = 'https://gcm.wdcm.org/search?search='
# Strip the name before appending it, exactly as url_gcm_isolation_src does, so
# that names truncated at 'sp.' do not produce a URL ending in a stray space.
# NOTE(review): the name is not percent-encoded, inner spaces stay literal —
# confirm the consumer of these URLs encodes them.
df_bacteria_name_formatting['url_gcm_search'] = base_url + df_bacteria_name_formatting['formatted_name_gcm'].str.strip()
df_bacteria_name_formatting[:15]

Unnamed: 0,name,url,formatted_name_gcm,formatted_name_ncbi,url_gcm_search
0,Abditibacterium_utsteinense,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abditibacterium_utsteinense/,Abditibacterium utsteinense,Abditibacterium utsteinense,https://gcm.wdcm.org/search?search=Abditibacterium utsteinense
1,Abiotrophia_defectiva,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_defectiva/,Abiotrophia defectiva,Abiotrophia defectiva,https://gcm.wdcm.org/search?search=Abiotrophia defectiva
2,Abiotrophia_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_sp./,Abiotrophia,Abiotrophia sp.,https://gcm.wdcm.org/search?search=Abiotrophia
3,Abiotrophia_sp._HMSC24B09,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_sp._HMSC24B09/,Abiotrophia,Abiotrophia sp. HMSC24B09,https://gcm.wdcm.org/search?search=Abiotrophia
4,Absicoccus_porci,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absicoccus_porci/,Absicoccus porci,Absicoccus porci,https://gcm.wdcm.org/search?search=Absicoccus porci
5,Absicoccus_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absicoccus_sp./,Absicoccus,Absicoccus sp.,https://gcm.wdcm.org/search?search=Absicoccus
6,Absicoccus_sp._CLA-KB-P134,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absicoccus_sp._CLA-KB-P134/,Absicoccus,Absicoccus sp. CLA-KB-P134,https://gcm.wdcm.org/search?search=Absicoccus
7,Absiella_sp._AM09-45,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absiella_sp._AM09-45/,Absiella,Absiella sp. AM09-45,https://gcm.wdcm.org/search?search=Absiella
8,Absiella_sp._AM09-50,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absiella_sp._AM09-50/,Absiella,Absiella sp. AM09-50,https://gcm.wdcm.org/search?search=Absiella
9,Absiella_sp._AM10-20,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absiella_sp._AM10-20/,Absiella,Absiella sp. AM10-20,https://gcm.wdcm.org/search?search=Absiella


In [11]:
# Same search URL as above, built from the whitespace-stripped name, with the
# '&list=isolate' query suffix appended.
df_bacteria_name_formatting['url_gcm_isolation_src'] = (
    base_url
    + df_bacteria_name_formatting['formatted_name_gcm'].str.strip()
    + '&list=isolate'
)
df_bacteria_name_formatting.iloc[15:]

Unnamed: 0,name,url,formatted_name_gcm,formatted_name_ncbi,url_gcm_search,url_gcm_isolation_src
15,Abyssibacter_profundi,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abyssibacter_profundi/,Abyssibacter profundi,Abyssibacter profundi,https://gcm.wdcm.org/search?search=Abyssibacter profundi,https://gcm.wdcm.org/search?search=Abyssibacter profundi&list=isolate
16,Abyssibacter_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abyssibacter_sp./,Abyssibacter,Abyssibacter sp.,https://gcm.wdcm.org/search?search=Abyssibacter,https://gcm.wdcm.org/search?search=Abyssibacter&list=isolate
17,Abyssibius_alkaniclasticus,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abyssibius_alkaniclasticus/,Abyssibius alkaniclasticus,Abyssibius alkaniclasticus,https://gcm.wdcm.org/search?search=Abyssibius alkaniclasticus,https://gcm.wdcm.org/search?search=Abyssibius alkaniclasticus&list=isolate
18,Abyssicoccus_albus,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abyssicoccus_albus/,Abyssicoccus albus,Abyssicoccus albus,https://gcm.wdcm.org/search?search=Abyssicoccus albus,https://gcm.wdcm.org/search?search=Abyssicoccus albus&list=isolate
19,Abyssisolibacter_fermentans,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abyssisolibacter_fermentans/,Abyssisolibacter fermentans,Abyssisolibacter fermentans,https://gcm.wdcm.org/search?search=Abyssisolibacter fermentans,https://gcm.wdcm.org/search?search=Abyssisolibacter fermentans&list=isolate
...,...,...,...,...,...,...
52904,zeta_proteobacterium_SCGC_AB-137-J06,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-137-J06/,zeta proteobacterium SCGC AB-137-J06,zeta proteobacterium SCGC AB-137-J06,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-137-J06,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-137-J06&list=isolate
52905,zeta_proteobacterium_SCGC_AB-602-C20,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-602-C20/,zeta proteobacterium SCGC AB-602-C20,zeta proteobacterium SCGC AB-602-C20,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-602-C20,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-602-C20&list=isolate
52906,zeta_proteobacterium_SCGC_AB-602-E04,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-602-E04/,zeta proteobacterium SCGC AB-602-E04,zeta proteobacterium SCGC AB-602-E04,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-602-E04,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-602-E04&list=isolate
52907,zeta_proteobacterium_SCGC_AB-604-B04,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-604-B04/,zeta proteobacterium SCGC AB-604-B04,zeta proteobacterium SCGC AB-604-B04,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-604-B04,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-604-B04&list=isolate


In [12]:
# List the columns of the finished bacteria table (source columns plus the four derived ones).
df_bacteria_name_formatting.columns

Index(['name', 'url', 'formatted_name_gcm', 'formatted_name_ncbi',
       'url_gcm_search', 'url_gcm_isolation_src'],
      dtype='object')

### No duplicates

In [6]:
# Re-read the bacteria table from the preprocessed CSV; this rebinds
# df_bacteria_name_formatting to the on-disk version.
# NOTE(review): the cell that writes this file is commented out in this
# notebook, so the CSV has to exist on disk already — confirm before Run All.
df_bacteria_name_formatting = pd.read_csv(
    PATH_PREPROCESSED.joinpath('bacteria', 'bacteria_name_formatting.csv')
)
df_bacteria_name_formatting

Unnamed: 0,name,url,formatted_name_gcm,formatted_name_ncbi,url_gcm_search,url_gcm_isolation_src
0,Abditibacterium_utsteinense,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abditibacterium_utsteinense/,Abditibacterium utsteinense,Abditibacterium utsteinense,https://gcm.wdcm.org/search?search=Abditibacterium utsteinense,https://gcm.wdcm.org/search?search=Abditibacterium utsteinense&list=isolate
1,Abiotrophia_defectiva,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_defectiva/,Abiotrophia defectiva,Abiotrophia defectiva,https://gcm.wdcm.org/search?search=Abiotrophia defectiva,https://gcm.wdcm.org/search?search=Abiotrophia defectiva&list=isolate
2,Abiotrophia_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_sp./,Abiotrophia,Abiotrophia sp.,https://gcm.wdcm.org/search?search=Abiotrophia,https://gcm.wdcm.org/search?search=Abiotrophia&list=isolate
3,Abiotrophia_sp._HMSC24B09,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_sp._HMSC24B09/,Abiotrophia,Abiotrophia sp. HMSC24B09,https://gcm.wdcm.org/search?search=Abiotrophia,https://gcm.wdcm.org/search?search=Abiotrophia&list=isolate
4,Absicoccus_porci,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absicoccus_porci/,Absicoccus porci,Absicoccus porci,https://gcm.wdcm.org/search?search=Absicoccus porci,https://gcm.wdcm.org/search?search=Absicoccus porci&list=isolate
...,...,...,...,...,...,...
52904,zeta_proteobacterium_SCGC_AB-137-J06,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-137-J06/,zeta proteobacterium SCGC AB-137-J06,zeta proteobacterium SCGC AB-137-J06,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-137-J06,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-137-J06&list=isolate
52905,zeta_proteobacterium_SCGC_AB-602-C20,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-602-C20/,zeta proteobacterium SCGC AB-602-C20,zeta proteobacterium SCGC AB-602-C20,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-602-C20,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-602-C20&list=isolate
52906,zeta_proteobacterium_SCGC_AB-602-E04,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-602-E04/,zeta proteobacterium SCGC AB-602-E04,zeta proteobacterium SCGC AB-602-E04,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-602-E04,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-602-E04&list=isolate
52907,zeta_proteobacterium_SCGC_AB-604-B04,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-604-B04/,zeta proteobacterium SCGC AB-604-B04,zeta proteobacterium SCGC AB-604-B04,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-604-B04,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-604-B04&list=isolate


In [7]:
# Keep only the first row for each distinct isolation-source URL: many
# 'Genus sp. <strain>' entries collapse to the same genus-level search.
df_bacteria_name_formatting_no_duplicates = df_bacteria_name_formatting[
    ~df_bacteria_name_formatting.duplicated(subset=['url_gcm_isolation_src'], keep='first')
]
df_bacteria_name_formatting_no_duplicates

Unnamed: 0,name,url,formatted_name_gcm,formatted_name_ncbi,url_gcm_search,url_gcm_isolation_src
0,Abditibacterium_utsteinense,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abditibacterium_utsteinense/,Abditibacterium utsteinense,Abditibacterium utsteinense,https://gcm.wdcm.org/search?search=Abditibacterium utsteinense,https://gcm.wdcm.org/search?search=Abditibacterium utsteinense&list=isolate
1,Abiotrophia_defectiva,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_defectiva/,Abiotrophia defectiva,Abiotrophia defectiva,https://gcm.wdcm.org/search?search=Abiotrophia defectiva,https://gcm.wdcm.org/search?search=Abiotrophia defectiva&list=isolate
2,Abiotrophia_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Abiotrophia_sp./,Abiotrophia,Abiotrophia sp.,https://gcm.wdcm.org/search?search=Abiotrophia,https://gcm.wdcm.org/search?search=Abiotrophia&list=isolate
4,Absicoccus_porci,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absicoccus_porci/,Absicoccus porci,Absicoccus porci,https://gcm.wdcm.org/search?search=Absicoccus porci,https://gcm.wdcm.org/search?search=Absicoccus porci&list=isolate
5,Absicoccus_sp.,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/Absicoccus_sp./,Absicoccus,Absicoccus sp.,https://gcm.wdcm.org/search?search=Absicoccus,https://gcm.wdcm.org/search?search=Absicoccus&list=isolate
...,...,...,...,...,...,...
52904,zeta_proteobacterium_SCGC_AB-137-J06,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-137-J06/,zeta proteobacterium SCGC AB-137-J06,zeta proteobacterium SCGC AB-137-J06,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-137-J06,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-137-J06&list=isolate
52905,zeta_proteobacterium_SCGC_AB-602-C20,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-602-C20/,zeta proteobacterium SCGC AB-602-C20,zeta proteobacterium SCGC AB-602-C20,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-602-C20,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-602-C20&list=isolate
52906,zeta_proteobacterium_SCGC_AB-602-E04,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-602-E04/,zeta proteobacterium SCGC AB-602-E04,zeta proteobacterium SCGC AB-602-E04,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-602-E04,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-602-E04&list=isolate
52907,zeta_proteobacterium_SCGC_AB-604-B04,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/zeta_proteobacterium_SCGC_AB-604-B04/,zeta proteobacterium SCGC AB-604-B04,zeta proteobacterium SCGC AB-604-B04,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-604-B04,https://gcm.wdcm.org/search?search=zeta proteobacterium SCGC AB-604-B04&list=isolate


### save

In [13]:
# Export step, currently disabled: uncomment to write the bacteria table
# (without the index) to <preprocessed>/bacteria/bacteria_name_formatting.csv,
# the same file the 'No duplicates' section reads back.
# df_bacteria_name_formatting.to_csv(PATH_PREPROCESSED / 'bacteria' / 'bacteria_name_formatting.csv', index=False)

In [8]:
# Export step, currently disabled: uncomment to write the de-duplicated bacteria
# table (without the index) to <preprocessed>/bacteria/bacteria_name_formatting_no_duplicates.csv.
# df_bacteria_name_formatting_no_duplicates.to_csv(PATH_PREPROCESSED / 'bacteria' / 'bacteria_name_formatting_no_duplicates.csv', index=False)