In [1]:
import pandas as pd
from pathlib import Path

In [2]:
import sys
# Extend the import search path with a relative entry pointing two directory
# levels up, so that the `utils` package imported on the next line can be
# resolved (presumably the project root -- confirm against the repo layout).
sys.path.append("./../../")  
from utils.text_files_processor import TextFilesProcessor  

# Constants

In [3]:
# Root data folder: climb four levels from the current working directory,
# then descend into 'data'. The trailing expression shows whether it exists.
PATH_DATA = Path.cwd().parent.parent.parent.parent.joinpath('data')
PATH_DATA.exists()

True

In [4]:
# Folder holding the raw scraped files, nested under the data root.
PATH_SCRAPED = PATH_DATA.joinpath('raw', 'scraped')
PATH_SCRAPED.exists()

True

In [5]:
# Folder under the data root that the export cells of this notebook target.
PATH_PREPROCESSED = PATH_DATA.joinpath('preprocessed')
PATH_PREPROCESSED.exists()

True

# Data Extraction

In [6]:
# List everything found in the scraped-data folder.
# `Path.glob` returns a one-shot generator: iterating it once would leave
# `files` exhausted for any later use, and the order it yields entries in is
# filesystem-dependent. Materialising it with `sorted` keeps `files` reusable
# as a list and makes the printed listing deterministic across runs.
files = sorted(PATH_SCRAPED.glob('*'))
for file in files:
    print(file.name)

bacteria_text_files_0_15000.json
urls_bacteria_organisms.txt
archaea_text_files.json
test.json
bacteria_text_files_30000_49760.json
bacteria_text_files_15000_30000.json
bacteria_text_files_sep23.json


In [7]:
# Load the scraped archaea records, tag every row with its organism type and
# keep the columns of interest in a fixed order.
df_organisms = (
    pd.read_json(PATH_SCRAPED / 'archaea_text_files.json')
    .assign(type='archaea')
    .loc[:, ['organism', 'type', 'url', 'filename', 'content']]
)
print(df_organisms.shape)
df_organisms.head(10)

(2415, 5)


Unnamed: 0,organism,type,url,filename,content
0,ANME-2_cluster_archaeon_HR1,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary_historical.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
1,ANME-2_cluster_archaeon_HR1,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,annotation_hashes.txt,# Assembly accession\tDescriptors hash\tDescri...
2,Acidianus_brierleyi,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary_historical.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
3,Acidianus_brierleyi,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
4,Acidianus_ambivalens,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
5,Acidianus_ambivalens,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,annotation_hashes.txt,# Assembly accession\tDescriptors hash\tDescri...
6,Acidianus_infernus,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
7,Acidianus_sulfidivorans,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary_historical.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
8,Acidianus_sp._RZ1,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
9,Acidianus_infernus,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,annotation_hashes.txt,# Assembly accession\tDescriptors hash\tDescri...


# Text processing of 'content'

In [8]:
# Number of scraped records available for each kind of text file.
df_organisms.loc[:, 'filename'].value_counts()

filename
annotation_hashes.txt              1156
assembly_summary.txt               1032
assembly_summary_historical.txt     227
Name: count, dtype: int64

In [9]:
# Single processor instance reused by the parsing cells of this notebook,
# which call its `get_pdseries_from_text` method on each record's raw text.
text_pro = TextFilesProcessor()

## annotation_hashes.txt

In [10]:
# Raw text of the first 'annotation_hashes.txt' record, printed to inspect
# the file layout before parsing.
annotation_hashes_0 = df_organisms.loc[
    df_organisms['filename'] == 'annotation_hashes.txt', 'content'
].iloc[0]
print(annotation_hashes_0)

# Assembly accession	Descriptors hash	Descriptors last changed	Features hash	Features last changed	Locations hash	Locations last change	Protein names hash	Protein names last changed
GCF_002926195.1	8CA404B5F60C119A2C43EC4AF02195FA	2018/02/17 15:37:00	E217B9860CC56B81BE5B665501BC7CCA	2018/02/17 15:37:00	73B928E31BFB3E085715F26DF60C1D87	2018/02/17 15:37:00	3090555A37745BC2AE0AA911A8EE7A11	2018/02/17 15:37:00



In [11]:
# The header is the first line of the sample; its fields are tab-separated.
columns_annotation_hashes = annotation_hashes_0.strip().partition('\n')[0].split('\t')
columns_annotation_hashes

['# Assembly accession',
 'Descriptors hash',
 'Descriptors last changed',
 'Features hash',
 'Features last changed',
 'Locations hash',
 'Locations last change',
 'Protein names hash',
 'Protein names last changed']

In [12]:
# Rows of df_organisms that carry an 'annotation_hashes.txt' file.
df_org_annotation_hashes = df_organisms.loc[df_organisms['filename'] == 'annotation_hashes.txt']
# Parse each record's raw text with the shared processor and stack the result
# beneath an empty frame that carries the header extracted from the sample,
# so those header columns come first in the resulting table.
df_annotation_hashes = pd.concat([
    pd.DataFrame(columns=columns_annotation_hashes),
    df_org_annotation_hashes['content'].apply(
        lambda raw_text: text_pro.get_pdseries_from_text("annotation_hashes.txt", raw_text)
    ),
])
print(df_annotation_hashes.shape)
df_annotation_hashes.head(10)

(1156, 9)


Unnamed: 0,# Assembly accession,Descriptors hash,Descriptors last changed,Features hash,Features last changed,Locations hash,Locations last change,Protein names hash,Protein names last changed
1,GCF_002926195.1,8CA404B5F60C119A2C43EC4AF02195FA,2018/02/17 15:37:00,E217B9860CC56B81BE5B665501BC7CCA,2018/02/17 15:37:00,73B928E31BFB3E085715F26DF60C1D87,2018/02/17 15:37:00,3090555A37745BC2AE0AA911A8EE7A11,2018/02/17 15:37:00
5,GCF_009428885.1,13930A0174316BD1CD10ABBFB23D5C85,2023/06/10 14:29:00,89FE920552570CEE9B680F2121649608,2023/06/10 14:29:00,E6989F1E613B47E058D4C56C91375B57,2023/06/10 14:29:00,5BD3024B511FE600FD251C6762E26ACE,2023/06/10 14:29:00
9,GCF_009729545.1,F9F4D6141230C3A24706E462E21DE384,2023/05/05 17:16:00,E776E7A4A2176E9A32DD529BE26BB43E,2023/05/05 17:16:00,46E8238E890DCB56638BCB199533C7EF,2023/05/05 17:16:00,195A586CC83988DFA2A1391902296EC8,2023/05/05 17:16:00
12,GCF_003201765.1,7CA82B923A7B171FE4E12201E27517F0,2020/03/04 08:42:00,439477814832DD76BC0F9B67B4B33390,2020/03/04 08:42:00,57A12108007B9BA52A11BBA5DAD61FDC,2020/03/04 08:42:00,3DDB6D6A809477EFB8FB674F5B3D8E35,2020/03/04 08:42:00
15,GCF_027368325.1,6CFE27813E8637618B3809BCD250F7E0,2023/06/28 03:30:00,0671178CE7CA897EF900FAE0E2F5FC69,2023/06/28 03:30:00,F473815B859A604BF56ECA6AAE0E93B0,2023/06/28 03:30:00,BE7837986B420BAB3E7E18D5DC7A9810,2023/06/28 03:30:00
17,GCF_001399695.1,C8880AA472B7757F32C952A0418E9677,2022/12/19 01:34:00,69B4C00C4468C45AC4FE9E50B101A3C3,2022/12/19 01:34:00,1E046751993205F1B8715F342F9F7A07,2022/12/19 01:34:00,7082CCA6DC27D73E8C04D58EF883AE53,2022/12/19 01:34:00
22,GCF_004323575.1,5D64420AD2C5A2892F251175542F31BD,2023/09/21 07:03:00,ECFBDDE57592B655AC7E8B3A0C6965BB,2023/08/28 10:00:00,BDE09A08829F3D9CB145DD73F9676AF0,2023/08/28 10:00:00,C8812D72330253DE1CB1043CCC4A1DE5,2023/09/21 07:03:00
24,GCF_000380685.1,FDBE5F11F3C29E56076B3D7B6B6E30E5,2015/03/19 01:36:00,94299E36DDB6A7EEF2404E13F3C04FE5,2014/11/05 21:36:00,0D62517DAA457AF386F7EBC484E25DD3,2014/11/05 21:36:00,,
25,GCF_000398585.1,0CB89F35F6C7F4B8F17E97BCF431B957,2015/03/19 01:38:00,879CC168130901B3DF04627D61E3AD44,2014/11/05 21:42:00,5CC54E2EA5DAF6D825983CE283EBCE32,2014/11/05 21:42:00,,
26,GCF_000398565.1,2DD1DBBC88E2AE046BCCD15FF79D83F9,2015/03/24 17:40:00,28A781DF295025AC410048FB36E0A5AA,2014/11/05 21:42:00,60C75F8709EFE75CDC363B2C19C0435E,2014/11/05 21:42:00,,


### df_annotation_hashes_e0 (join and save)

In [13]:
# Check which columns df_organisms offers before picking the metadata to join.
df_organisms.columns

Index(['organism', 'type', 'url', 'filename', 'content'], dtype='object')

In [14]:
# Place the organism metadata next to the parsed fields. The column-wise
# concat aligns the two frames on their index labels.
df_annotation_hashes_e0 = pd.concat(
    [
        df_organisms.loc[
            df_organisms['filename'] == 'annotation_hashes.txt',
            ['organism', 'type', 'filename', 'url'],
        ],
        df_annotation_hashes,
    ],
    axis='columns',
)
print(df_annotation_hashes_e0.shape)
df_annotation_hashes_e0.head()

(1156, 13)


Unnamed: 0,organism,type,filename,url,# Assembly accession,Descriptors hash,Descriptors last changed,Features hash,Features last changed,Locations hash,Locations last change,Protein names hash,Protein names last changed
1,ANME-2_cluster_archaeon_HR1,archaea,annotation_hashes.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_002926195.1,8CA404B5F60C119A2C43EC4AF02195FA,2018/02/17 15:37:00,E217B9860CC56B81BE5B665501BC7CCA,2018/02/17 15:37:00,73B928E31BFB3E085715F26DF60C1D87,2018/02/17 15:37:00,3090555A37745BC2AE0AA911A8EE7A11,2018/02/17 15:37:00
5,Acidianus_ambivalens,archaea,annotation_hashes.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_009428885.1,13930A0174316BD1CD10ABBFB23D5C85,2023/06/10 14:29:00,89FE920552570CEE9B680F2121649608,2023/06/10 14:29:00,E6989F1E613B47E058D4C56C91375B57,2023/06/10 14:29:00,5BD3024B511FE600FD251C6762E26ACE,2023/06/10 14:29:00
9,Acidianus_infernus,archaea,annotation_hashes.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_009729545.1,F9F4D6141230C3A24706E462E21DE384,2023/05/05 17:16:00,E776E7A4A2176E9A32DD529BE26BB43E,2023/05/05 17:16:00,46E8238E890DCB56638BCB199533C7EF,2023/05/05 17:16:00,195A586CC83988DFA2A1391902296EC8,2023/05/05 17:16:00
12,Acidianus_sulfidivorans,archaea,annotation_hashes.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_003201765.1,7CA82B923A7B171FE4E12201E27517F0,2020/03/04 08:42:00,439477814832DD76BC0F9B67B4B33390,2020/03/04 08:42:00,57A12108007B9BA52A11BBA5DAD61FDC,2020/03/04 08:42:00,3DDB6D6A809477EFB8FB674F5B3D8E35,2020/03/04 08:42:00
15,Acidiplasma_sp.,archaea,annotation_hashes.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_027368325.1,6CFE27813E8637618B3809BCD250F7E0,2023/06/28 03:30:00,0671178CE7CA897EF900FAE0E2F5FC69,2023/06/28 03:30:00,F473815B859A604BF56ECA6AAE0E93B0,2023/06/28 03:30:00,BE7837986B420BAB3E7E18D5DC7A9810,2023/06/28 03:30:00


In [15]:
# Export step (currently disabled): uncomment to write the joined table to CSV.
# df_annotation_hashes_e0.to_csv(
#     PATH_PREPROCESSED / 'archaea' / 'annotation_hashes_e0.csv', 
#     index=True, header=True
#     )

## assembly_summary.txt

In [16]:
# Raw text of the first 'assembly_summary.txt' record, printed to inspect
# the file layout before parsing.
assembly_summary_0 = df_organisms.loc[
    df_organisms['filename'] == 'assembly_summary.txt', 'content'
].iloc[0]
print(assembly_summary_0)

##  See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file.
#assembly_accession	bioproject	biosample	wgs_master	refseq_category	taxid	species_taxid	organism_name	infraspecific_name	isolate	version_status	assembly_level	release_type	genome_rep	seq_rel_date	asm_name	asm_submitter	gbrs_paired_asm	paired_asm_comp	ftp_path	excluded_from_refseq	relation_to_type_material	asm_not_live_date	assembly_type	group	genome_size	genome_size_ungapped	gc_percent	replicon_count	scaffold_count	contig_count	annotation_provider	annotation_name	annotation_date	total_gene_count	protein_coding_gene_count	non_coding_gene_count	pubmed_id
GCF_003201835.2	PRJNA224116	SAMN09071529	na	representative genome	41673	41673	Acidianus brierleyi	strain=DSM 1651	na	latest	Complete Genome	Major	Full	2020/03/10	ASM320183v2	North Carolina State University	GCA_003201835.2	identical	https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/201/835/GCF_003201835.2_ASM320183v2	na	ass

In [17]:
# The sample starts with a '##' comment line, so the tab-separated header is
# taken from the second line.
columns_assembly_summary = assembly_summary_0.strip().split('\n', 2)[1].split('\t')
print(len(columns_assembly_summary))
columns_assembly_summary

38


['#assembly_accession',
 'bioproject',
 'biosample',
 'wgs_master',
 'refseq_category',
 'taxid',
 'species_taxid',
 'organism_name',
 'infraspecific_name',
 'isolate',
 'version_status',
 'assembly_level',
 'release_type',
 'genome_rep',
 'seq_rel_date',
 'asm_name',
 'asm_submitter',
 'gbrs_paired_asm',
 'paired_asm_comp',
 'ftp_path',
 'excluded_from_refseq',
 'relation_to_type_material',
 'asm_not_live_date',
 'assembly_type',
 'group',
 'genome_size',
 'genome_size_ungapped',
 'gc_percent',
 'replicon_count',
 'scaffold_count',
 'contig_count',
 'annotation_provider',
 'annotation_name',
 'annotation_date',
 'total_gene_count',
 'protein_coding_gene_count',
 'non_coding_gene_count',
 'pubmed_id']

In [18]:
# Rows of df_organisms that carry an 'assembly_summary.txt' file.
df_org_assembly_summary = df_organisms.loc[df_organisms['filename'] == 'assembly_summary.txt']
# Parse each record's raw text with the shared processor and stack the result
# beneath an empty frame that carries the header extracted from the sample,
# so those header columns come first in the resulting table.
df_assembly_summary = pd.concat([
    pd.DataFrame(columns=columns_assembly_summary),
    df_org_assembly_summary['content'].apply(
        lambda raw_text: text_pro.get_pdseries_from_text("assembly_summary.txt", raw_text)
    ),
])
print(df_assembly_summary.shape)
df_assembly_summary.head()

(1032, 38)


Unnamed: 0,#assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate,...,replicon_count,scaffold_count,contig_count,annotation_provider,annotation_name,annotation_date,total_gene_count,protein_coding_gene_count,non_coding_gene_count,pubmed_id
3,GCF_003201835.2,PRJNA224116,SAMN09071529,na,representative genome,41673,41673,Acidianus brierleyi,strain=DSM 1651,na,...,1,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,06/11/23,3201,3031,51,na
4,GCF_009428885.1,PRJNA224116,SAMN13103967,WHYS00000000.1,na,2283,2283,Acidianus ambivalens,strain=DSM 3772,na,...,0,65,65,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,05/18/23,2759,2481,52,na
6,GCF_009729545.1,PRJNA224116,SAMN09933090,WFIY00000000.1,representative genome,12915,12915,Acidianus infernus,strain=DSM 3191,na,...,0,4,4,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,03/28/23,2467,2252,51,na
8,GCF_013133895.1,PRJNA224116,SAMN14829352,JABGBQ000000000.1,na,1540082,1540082,Acidianus sp. RZ1,strain=DSM 29099,na,...,0,746,746,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,08/14/23,2665,2338,32,na
10,GCF_003201765.2,PRJNA224116,SAMN09071530,na,representative genome,619593,312539,Acidianus sulfidivorans JP7,strain=JP7,na,...,1,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,03/29/23,2351,2292,51,na


### df_assembly_summary_e0 (join and save)

In [19]:
# Place the organism metadata next to the parsed fields. The column-wise
# concat aligns the two frames on their index labels.
df_assembly_summary_e0 = pd.concat(
    [
        df_organisms.loc[
            df_organisms['filename'] == 'assembly_summary.txt',
            ['organism', 'type', 'filename', 'url'],
        ],
        df_assembly_summary,
    ],
    axis='columns',
)
print(df_assembly_summary_e0.shape)
df_assembly_summary_e0.head()

(1032, 42)


Unnamed: 0,organism,type,filename,url,#assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,...,replicon_count,scaffold_count,contig_count,annotation_provider,annotation_name,annotation_date,total_gene_count,protein_coding_gene_count,non_coding_gene_count,pubmed_id
3,Acidianus_brierleyi,archaea,assembly_summary.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_003201835.2,PRJNA224116,SAMN09071529,na,representative genome,41673,...,1,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,06/11/23,3201,3031,51,na
4,Acidianus_ambivalens,archaea,assembly_summary.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_009428885.1,PRJNA224116,SAMN13103967,WHYS00000000.1,na,2283,...,0,65,65,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,05/18/23,2759,2481,52,na
6,Acidianus_infernus,archaea,assembly_summary.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_009729545.1,PRJNA224116,SAMN09933090,WFIY00000000.1,representative genome,12915,...,0,4,4,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,03/28/23,2467,2252,51,na
8,Acidianus_sp._RZ1,archaea,assembly_summary.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_013133895.1,PRJNA224116,SAMN14829352,JABGBQ000000000.1,na,1540082,...,0,746,746,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,08/14/23,2665,2338,32,na
10,Acidianus_sulfidivorans,archaea,assembly_summary.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_003201765.2,PRJNA224116,SAMN09071530,na,representative genome,619593,...,1,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,03/29/23,2351,2292,51,na


In [20]:
# Export step (currently disabled): uncomment to write the joined table to CSV.
# df_assembly_summary_e0.to_csv(
#     PATH_PREPROCESSED / 'archaea' / 'assembly_summary_e0.csv', 
#     index=True, header=True
#     )

## assembly_summary_historical.txt

In [21]:
# Raw text of the first 'assembly_summary_historical.txt' record, printed to
# inspect the file layout before parsing.
assembly_summary_historical_0 = df_organisms.loc[
    df_organisms['filename'] == 'assembly_summary_historical.txt', 'content'
].iloc[0]
print(assembly_summary_historical_0)

##  See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file.
#assembly_accession	bioproject	biosample	wgs_master	refseq_category	taxid	species_taxid	organism_name	infraspecific_name	isolate	version_status	assembly_level	release_type	genome_rep	seq_rel_date	asm_name	asm_submitter	gbrs_paired_asm	paired_asm_comp	ftp_path	excluded_from_refseq	relation_to_type_material	asm_not_live_date	assembly_type	group	genome_size	genome_size_ungapped	gc_percent	replicon_count	scaffold_count	contig_count	annotation_provider	annotation_name	annotation_date	total_gene_count	protein_coding_gene_count	non_coding_gene_count	pubmed_id
GCF_002926195.1	PRJNA224116	SAMN06562579	MZXQ00000000.1	na	1968520	1968520	ANME-2 cluster archaeon HR1	na	HR1	suppressed	Scaffold	Major	Full	2018/02/13	ASM292619v1	California Institute of Technology	GCA_002926195.1	identical	https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/926/195/GCF_002926195.1_ASM292619v1	derived from 

In [22]:
# The sample starts with a '##' comment line, so the tab-separated header is
# taken from the second line.
columns_assembly_summary_historical = (
    assembly_summary_historical_0.strip().split('\n', 2)[1].split('\t')
)
print(len(columns_assembly_summary_historical))
columns_assembly_summary_historical

38


['#assembly_accession',
 'bioproject',
 'biosample',
 'wgs_master',
 'refseq_category',
 'taxid',
 'species_taxid',
 'organism_name',
 'infraspecific_name',
 'isolate',
 'version_status',
 'assembly_level',
 'release_type',
 'genome_rep',
 'seq_rel_date',
 'asm_name',
 'asm_submitter',
 'gbrs_paired_asm',
 'paired_asm_comp',
 'ftp_path',
 'excluded_from_refseq',
 'relation_to_type_material',
 'asm_not_live_date',
 'assembly_type',
 'group',
 'genome_size',
 'genome_size_ungapped',
 'gc_percent',
 'replicon_count',
 'scaffold_count',
 'contig_count',
 'annotation_provider',
 'annotation_name',
 'annotation_date',
 'total_gene_count',
 'protein_coding_gene_count',
 'non_coding_gene_count',
 'pubmed_id']

In [25]:
# Rows of df_organisms that carry an 'assembly_summary_historical.txt' file.
df_org_assembly_summary_historical = df_organisms.loc[
    df_organisms['filename'] == 'assembly_summary_historical.txt'
]
# Parse each record's raw text with the shared processor and stack the result
# beneath an empty frame that carries the header extracted from the sample,
# so those header columns come first in the resulting table.
df_assembly_summary_historical = pd.concat([
    pd.DataFrame(columns=columns_assembly_summary_historical),
    df_org_assembly_summary_historical['content'].apply(
        lambda raw_text: text_pro.get_pdseries_from_text(
            "assembly_summary_historical.txt", raw_text
        )
    ),
])
print(df_assembly_summary_historical.shape)
df_assembly_summary_historical.head()

(227, 38)


Unnamed: 0,#assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,isolate,...,replicon_count,scaffold_count,contig_count,annotation_provider,annotation_name,annotation_date,total_gene_count,protein_coding_gene_count,non_coding_gene_count,pubmed_id
0,GCF_002926195.1,PRJNA224116,SAMN06562579,MZXQ00000000.1,na,1968520,1968520,ANME-2 cluster archaeon HR1,na,HR1,...,0,335,335,NCBI,NCBI Prokaryotic Genome Annotation Pipeline,02/15/18,2464,1927,48,na
2,GCF_003201835.1,PRJNA224116,SAMN09071529,na,na,41673,41673,Acidianus brierleyi,strain=DSM 1651,na,...,1,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,06/11/23,3165,2894,51,na
7,GCF_003201765.1,PRJNA224116,SAMN09071530,na,na,619593,312539,Acidianus sulfidivorans JP7,strain=JP7,na,...,1,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,03/29/23,2365,2269,51,na
20,GCF_000398565.1,PRJNA203587,SAMN02440951,ASLY00000000.1,na,1052838,1052838,Aigarchaeota archaeon JGI 0000001-A7,strain=JGI 0000001-A7,na,...,0,33,33,DOE Joint Genome Institute,Annotation submitted by DOE Joint Genome Insti...,na,na,na,na,na
21,GCF_000398585.1,PRJNA203586,SAMN02440533,ASLZ00000000.1,na,1052841,1052841,Aigarchaeota archaeon JGI 0000001-H6,strain=JGI 0000001-H6,na,...,0,21,21,NCBI,NCBI Prokaryotic Genome Annotation Pipeline,05/27/14,na,na,na,na


### df_assembly_summary_historical_e0 (join and save)

In [26]:
# Place the organism metadata next to the parsed fields. The column-wise
# concat aligns the two frames on their index labels.
df_assembly_summary_historical_e0 = pd.concat(
    [
        df_organisms.loc[
            df_organisms['filename'] == 'assembly_summary_historical.txt',
            ['organism', 'type', 'filename', 'url'],
        ],
        df_assembly_summary_historical,
    ],
    axis='columns',
)
print(df_assembly_summary_historical_e0.shape)
df_assembly_summary_historical_e0.head()

(227, 42)


Unnamed: 0,organism,type,filename,url,#assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,...,replicon_count,scaffold_count,contig_count,annotation_provider,annotation_name,annotation_date,total_gene_count,protein_coding_gene_count,non_coding_gene_count,pubmed_id
0,ANME-2_cluster_archaeon_HR1,archaea,assembly_summary_historical.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_002926195.1,PRJNA224116,SAMN06562579,MZXQ00000000.1,na,1968520,...,0,335,335,NCBI,NCBI Prokaryotic Genome Annotation Pipeline,02/15/18,2464,1927,48,na
2,Acidianus_brierleyi,archaea,assembly_summary_historical.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_003201835.1,PRJNA224116,SAMN09071529,na,na,41673,...,1,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,06/11/23,3165,2894,51,na
7,Acidianus_sulfidivorans,archaea,assembly_summary_historical.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_003201765.1,PRJNA224116,SAMN09071530,na,na,619593,...,1,1,1,NCBI RefSeq,NCBI Prokaryotic Genome Annotation Pipeline (P...,03/29/23,2365,2269,51,na
20,Aigarchaeota_archaeon_JGI_0000001-A7,archaea,assembly_summary_historical.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_000398565.1,PRJNA203587,SAMN02440951,ASLY00000000.1,na,1052838,...,0,33,33,DOE Joint Genome Institute,Annotation submitted by DOE Joint Genome Insti...,na,na,na,na,na
21,Aigarchaeota_archaeon_JGI_0000001-H6,archaea,assembly_summary_historical.txt,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,GCF_000398585.1,PRJNA203586,SAMN02440533,ASLZ00000000.1,na,1052841,...,0,21,21,NCBI,NCBI Prokaryotic Genome Annotation Pipeline,05/27/14,na,na,na,na


In [27]:
# Export step (currently disabled): uncomment to write the joined table to CSV.
# df_assembly_summary_historical_e0.to_csv(
#     PATH_PREPROCESSED / 'archaea' / 'assembly_summary_historical_e0.csv', 
#     index=True, header=True
#     )