In [1]:
import pandas as pd
from pathlib import Path

# Constants

In [2]:
# The data directory is looked up four levels above the current working
# directory; the trailing expression displays whether that location exists.
working_dir = Path.cwd()
PATH_DATA = working_dir.parent.parent.parent.parent.joinpath('data')
PATH_DATA.exists()

True

In [3]:
# Folder with the scraped files: <PATH_DATA>/raw/scraped.
# The trailing expression displays whether it exists.
PATH_SCRAPED = PATH_DATA.joinpath('raw', 'scraped')
PATH_SCRAPED.exists()

True

# Data Extraction

In [4]:
# List every entry in the scraped-data folder.
# `Path.glob` returns a one-shot generator; materialise it into a list so that
# `files` is still populated (rather than silently exhausted) after this loop.
files = list(PATH_SCRAPED.glob('*'))
for file in files:
    print(file.name)

bacteria_text_files_0_15000.json
urls_bacteria_organisms.txt
archaea_text_files.json
test.json
bacteria_text_files_30000_49760.json
bacteria_text_files_15000_30000.json
bacteria_text_files_sep23.json


In [6]:
# Read the archaea JSON file, set the `type` column to 'archaea' on every row,
# and keep the five columns below in this order.
df_organisms = (
    pd.read_json(PATH_SCRAPED / 'archaea_text_files.json')
    .assign(type='archaea')
    .loc[:, ['organism', 'type', 'url', 'filename', 'content']]
)
print(df_organisms.shape)
df_organisms.head()

(2415, 5)


Unnamed: 0,organism,type,url,filename,content
0,ANME-2_cluster_archaeon_HR1,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary_historical.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
1,ANME-2_cluster_archaeon_HR1,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,annotation_hashes.txt,# Assembly accession\tDescriptors hash\tDescri...
2,Acidianus_brierleyi,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary_historical.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
3,Acidianus_brierleyi,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
4,Acidianus_ambivalens,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...


# Text processing of 'content'

In [8]:
# Number of rows per distinct value of the `filename` column.
df_organisms.loc[:, 'filename'].value_counts()

filename
annotation_hashes.txt              1156
assembly_summary.txt               1032
assembly_summary_historical.txt     227
Name: count, dtype: int64

In [10]:
# Pull the raw text stored in `content` for the row labelled 0 and print it in
# full to see its layout.
assembly_summary_historical_0 = df_organisms.at[0, 'content']
print(assembly_summary_historical_0)


##  See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file.
#assembly_accession	bioproject	biosample	wgs_master	refseq_category	taxid	species_taxid	organism_name	infraspecific_name	isolate	version_status	assembly_level	release_type	genome_rep	seq_rel_date	asm_name	asm_submitter	gbrs_paired_asm	paired_asm_comp	ftp_path	excluded_from_refseq	relation_to_type_material	asm_not_live_date	assembly_type	group	genome_size	genome_size_ungapped	gc_percent	replicon_count	scaffold_count	contig_count	annotation_provider	annotation_name	annotation_date	total_gene_count	protein_coding_gene_count	non_coding_gene_count	pubmed_id
GCF_002926195.1	PRJNA224116	SAMN06562579	MZXQ00000000.1	na	1968520	1968520	ANME-2 cluster archaeon HR1	na	HR1	suppressed	Scaffold	Major	Full	2018/02/13	ASM292619v1	California Institute of Technology	GCA_002926195.1	identical	https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/926/195/GCF_002926195.1_ASM292619v1	derived from 

In [19]:
# Trim surrounding whitespace, split the text on newlines, and drop the first
# line (the `##  See ...` pointer line visible in the printed content above),
# so the list starts at the `#assembly_accession...` column-header line.
assembly_summary_historical_0.strip().split('\n')[1:]

['#assembly_accession\tbioproject\tbiosample\twgs_master\trefseq_category\ttaxid\tspecies_taxid\torganism_name\tinfraspecific_name\tisolate\tversion_status\tassembly_level\trelease_type\tgenome_rep\tseq_rel_date\tasm_name\tasm_submitter\tgbrs_paired_asm\tpaired_asm_comp\tftp_path\texcluded_from_refseq\trelation_to_type_material\tasm_not_live_date\tassembly_type\tgroup\tgenome_size\tgenome_size_ungapped\tgc_percent\treplicon_count\tscaffold_count\tcontig_count\tannotation_provider\tannotation_name\tannotation_date\ttotal_gene_count\tprotein_coding_gene_count\tnon_coding_gene_count\tpubmed_id',
 'GCF_002926195.1\tPRJNA224116\tSAMN06562579\tMZXQ00000000.1\tna\t1968520\t1968520\tANME-2 cluster archaeon HR1\tna\tHR1\tsuppressed\tScaffold\tMajor\tFull\t2018/02/13\tASM292619v1\tCalifornia Institute of Technology\tGCA_002926195.1\tidentical\thttps://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/926/195/GCF_002926195.1_ASM292619v1\tderived from metagenome; genus undefined\tna\t2018/02/19\thaploid\ta

In [27]:
# Small independent working copy for experimenting: rows labelled 0 through 3
# (label-based slices include the end label) and only the columns listed here.
sample_columns = ['organism', 'filename', 'content']
sample = df_organisms.loc[0:3, sample_columns].copy()
sample

Unnamed: 0,organism,filename,content
0,ANME-2_cluster_archaeon_HR1,assembly_summary_historical.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
1,ANME-2_cluster_archaeon_HR1,annotation_hashes.txt,# Assembly accession\tDescriptors hash\tDescri...
2,Acidianus_brierleyi,assembly_summary_historical.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
3,Acidianus_brierleyi,assembly_summary.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...


In [24]:
def drop_first_line(text):
    """Strip `text`, split it on newlines, and return every line after the first."""
    return text.strip().split('\n')[1:]


# Element-wise version: call the helper once per `content` value.
sample['processed'] = sample['content'].apply(drop_first_line)


In [26]:
# Show the list of lines stored in `processed` for the row labelled 0.
sample.at[0, 'processed']

['#assembly_accession\tbioproject\tbiosample\twgs_master\trefseq_category\ttaxid\tspecies_taxid\torganism_name\tinfraspecific_name\tisolate\tversion_status\tassembly_level\trelease_type\tgenome_rep\tseq_rel_date\tasm_name\tasm_submitter\tgbrs_paired_asm\tpaired_asm_comp\tftp_path\texcluded_from_refseq\trelation_to_type_material\tasm_not_live_date\tassembly_type\tgroup\tgenome_size\tgenome_size_ungapped\tgc_percent\treplicon_count\tscaffold_count\tcontig_count\tannotation_provider\tannotation_name\tannotation_date\ttotal_gene_count\tprotein_coding_gene_count\tnon_coding_gene_count\tpubmed_id',
 'GCF_002926195.1\tPRJNA224116\tSAMN06562579\tMZXQ00000000.1\tna\t1968520\t1968520\tANME-2 cluster archaeon HR1\tna\tHR1\tsuppressed\tScaffold\tMajor\tFull\t2018/02/13\tASM292619v1\tCalifornia Institute of Technology\tGCA_002926195.1\tidentical\thttps://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/926/195/GCF_002926195.1_ASM292619v1\tderived from metagenome; genus undefined\tna\t2018/02/19\thaploid\ta

In [28]:
# Vectorised version using the `.str` accessor: strip each text, split it on
# newlines, then keep everything after the first element of each list.
sample['processed'] = (
    sample['content']
    .str.strip()
    .str.split('\n')
    .str.slice(start=1)
)

In [29]:
# Show the `processed` value for the row labelled 0 to check the result.
sample.at[0, 'processed']

['#assembly_accession\tbioproject\tbiosample\twgs_master\trefseq_category\ttaxid\tspecies_taxid\torganism_name\tinfraspecific_name\tisolate\tversion_status\tassembly_level\trelease_type\tgenome_rep\tseq_rel_date\tasm_name\tasm_submitter\tgbrs_paired_asm\tpaired_asm_comp\tftp_path\texcluded_from_refseq\trelation_to_type_material\tasm_not_live_date\tassembly_type\tgroup\tgenome_size\tgenome_size_ungapped\tgc_percent\treplicon_count\tscaffold_count\tcontig_count\tannotation_provider\tannotation_name\tannotation_date\ttotal_gene_count\tprotein_coding_gene_count\tnon_coding_gene_count\tpubmed_id',
 'GCF_002926195.1\tPRJNA224116\tSAMN06562579\tMZXQ00000000.1\tna\t1968520\t1968520\tANME-2 cluster archaeon HR1\tna\tHR1\tsuppressed\tScaffold\tMajor\tFull\t2018/02/13\tASM292619v1\tCalifornia Institute of Technology\tGCA_002926195.1\tidentical\thttps://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/926/195/GCF_002926195.1_ASM292619v1\tderived from metagenome; genus undefined\tna\t2018/02/19\thaploid\ta