In [1]:
import pandas as pd
from pathlib import Path

# Constants

In [2]:
PATH_DATA = Path.cwd().parent.parent.parent.parent / 'data'
PATH_DATA.exists()

True

In [3]:
PATH_SCRAPED = PATH_DATA / 'raw' / 'scraped'
PATH_SCRAPED.exists()

True

# Data Extraction

In [4]:
files = PATH_SCRAPED.glob('*')
for file in files:
    print(file.name)

bacteria_text_files_0_15000.json
urls_bacteria_organisms.txt
archaea_text_files.json
test.json
bacteria_text_files_30000_49760.json
bacteria_text_files_15000_30000.json
bacteria_text_files_sep23.json


In [5]:
df_organisms = pd.read_json(PATH_SCRAPED / 'archaea_text_files.json')
df_organisms['type'] = 'archaea'
df_organisms = df_organisms[['organism', 'type', 'url', 'filename', 'content']]
print(df_organisms.shape)
df_organisms.head()

(2415, 5)


Unnamed: 0,organism,type,url,filename,content
0,ANME-2_cluster_archaeon_HR1,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary_historical.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
1,ANME-2_cluster_archaeon_HR1,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,annotation_hashes.txt,# Assembly accession\tDescriptors hash\tDescri...
2,Acidianus_brierleyi,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary_historical.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
3,Acidianus_brierleyi,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...
4,Acidianus_ambivalens,archaea,https://ftp.ncbi.nlm.nih.gov/genomes/refseq/ar...,assembly_summary.txt,## See ftp://ftp.ncbi.nlm.nih.gov/genomes/REA...


# Text processing of 'content'

In [6]:
df_organisms['filename'].value_counts()

filename
annotation_hashes.txt              1156
assembly_summary.txt               1032
assembly_summary_historical.txt     227
Name: count, dtype: int64

## get_series_from_text()

In [7]:
def get_series_from_text(text_line, columns):
    values = text_line.split('\n')[1].split('\t')
    return pd.Series(values, index=columns)

## annotation_hashes.txt

In [8]:
annotation_hashes_0 = df_organisms[ df_organisms['filename'] == 'annotation_hashes.txt' ].iloc[0,:]['content']
print(annotation_hashes_0)

# Assembly accession	Descriptors hash	Descriptors last changed	Features hash	Features last changed	Locations hash	Locations last change	Protein names hash	Protein names last changed
GCF_002926195.1	8CA404B5F60C119A2C43EC4AF02195FA	2018/02/17 15:37:00	E217B9860CC56B81BE5B665501BC7CCA	2018/02/17 15:37:00	73B928E31BFB3E085715F26DF60C1D87	2018/02/17 15:37:00	3090555A37745BC2AE0AA911A8EE7A11	2018/02/17 15:37:00



In [9]:
columns_annotation_hashes = annotation_hashes_0.strip().split('\n')[0].split('\t')
columns_annotation_hashes

['# Assembly accession',
 'Descriptors hash',
 'Descriptors last changed',
 'Features hash',
 'Features last changed',
 'Locations hash',
 'Locations last change',
 'Protein names hash',
 'Protein names last changed']

In [10]:
df_org_annotation_hashes = df_organisms[ df_organisms['filename'] == 'annotation_hashes.txt' ]
df_annotation_hashes = pd.DataFrame(columns=columns_annotation_hashes)
df_annotation_hashes = pd.concat([df_annotation_hashes, df_org_annotation_hashes['content']\
                                  .apply(lambda x: get_series_from_text(x, columns_annotation_hashes))])
print(df_annotation_hashes.shape)
df_annotation_hashes.head(10)

(1156, 9)


Unnamed: 0,# Assembly accession,Descriptors hash,Descriptors last changed,Features hash,Features last changed,Locations hash,Locations last change,Protein names hash,Protein names last changed
1,GCF_002926195.1,8CA404B5F60C119A2C43EC4AF02195FA,2018/02/17 15:37:00,E217B9860CC56B81BE5B665501BC7CCA,2018/02/17 15:37:00,73B928E31BFB3E085715F26DF60C1D87,2018/02/17 15:37:00,3090555A37745BC2AE0AA911A8EE7A11,2018/02/17 15:37:00
5,GCF_009428885.1,13930A0174316BD1CD10ABBFB23D5C85,2023/06/10 14:29:00,89FE920552570CEE9B680F2121649608,2023/06/10 14:29:00,E6989F1E613B47E058D4C56C91375B57,2023/06/10 14:29:00,5BD3024B511FE600FD251C6762E26ACE,2023/06/10 14:29:00
9,GCF_009729545.1,F9F4D6141230C3A24706E462E21DE384,2023/05/05 17:16:00,E776E7A4A2176E9A32DD529BE26BB43E,2023/05/05 17:16:00,46E8238E890DCB56638BCB199533C7EF,2023/05/05 17:16:00,195A586CC83988DFA2A1391902296EC8,2023/05/05 17:16:00
12,GCF_003201765.1,7CA82B923A7B171FE4E12201E27517F0,2020/03/04 08:42:00,439477814832DD76BC0F9B67B4B33390,2020/03/04 08:42:00,57A12108007B9BA52A11BBA5DAD61FDC,2020/03/04 08:42:00,3DDB6D6A809477EFB8FB674F5B3D8E35,2020/03/04 08:42:00
15,GCF_027368325.1,6CFE27813E8637618B3809BCD250F7E0,2023/06/28 03:30:00,0671178CE7CA897EF900FAE0E2F5FC69,2023/06/28 03:30:00,F473815B859A604BF56ECA6AAE0E93B0,2023/06/28 03:30:00,BE7837986B420BAB3E7E18D5DC7A9810,2023/06/28 03:30:00
17,GCF_001399695.1,C8880AA472B7757F32C952A0418E9677,2022/12/19 01:34:00,69B4C00C4468C45AC4FE9E50B101A3C3,2022/12/19 01:34:00,1E046751993205F1B8715F342F9F7A07,2022/12/19 01:34:00,7082CCA6DC27D73E8C04D58EF883AE53,2022/12/19 01:34:00
22,GCF_004323575.1,5D64420AD2C5A2892F251175542F31BD,2023/09/21 07:03:00,ECFBDDE57592B655AC7E8B3A0C6965BB,2023/08/28 10:00:00,BDE09A08829F3D9CB145DD73F9676AF0,2023/08/28 10:00:00,C8812D72330253DE1CB1043CCC4A1DE5,2023/09/21 07:03:00
24,GCF_000380685.1,FDBE5F11F3C29E56076B3D7B6B6E30E5,2015/03/19 01:36:00,94299E36DDB6A7EEF2404E13F3C04FE5,2014/11/05 21:36:00,0D62517DAA457AF386F7EBC484E25DD3,2014/11/05 21:36:00,,
25,GCF_000398585.1,0CB89F35F6C7F4B8F17E97BCF431B957,2015/03/19 01:38:00,879CC168130901B3DF04627D61E3AD44,2014/11/05 21:42:00,5CC54E2EA5DAF6D825983CE283EBCE32,2014/11/05 21:42:00,,
26,GCF_000398565.1,2DD1DBBC88E2AE046BCCD15FF79D83F9,2015/03/24 17:40:00,28A781DF295025AC410048FB36E0A5AA,2014/11/05 21:42:00,60C75F8709EFE75CDC363B2C19C0435E,2014/11/05 21:42:00,,


## assembly_summary.txt

In [11]:
assembly_summary_0 = df_organisms[ df_organisms['filename'] == 'assembly_summary.txt' ].iloc[0,:]['content']
print(assembly_summary_0)

##  See ftp://ftp.ncbi.nlm.nih.gov/genomes/README_assembly_summary.txt for a description of the columns in this file.
#assembly_accession	bioproject	biosample	wgs_master	refseq_category	taxid	species_taxid	organism_name	infraspecific_name	isolate	version_status	assembly_level	release_type	genome_rep	seq_rel_date	asm_name	asm_submitter	gbrs_paired_asm	paired_asm_comp	ftp_path	excluded_from_refseq	relation_to_type_material	asm_not_live_date	assembly_type	group	genome_size	genome_size_ungapped	gc_percent	replicon_count	scaffold_count	contig_count	annotation_provider	annotation_name	annotation_date	total_gene_count	protein_coding_gene_count	non_coding_gene_count	pubmed_id
GCF_003201835.2	PRJNA224116	SAMN09071529	na	representative genome	41673	41673	Acidianus brierleyi	strain=DSM 1651	na	latest	Complete Genome	Major	Full	2020/03/10	ASM320183v2	North Carolina State University	GCA_003201835.2	identical	https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/003/201/835/GCF_003201835.2_ASM320183v2	na	ass

In [19]:
columns_assembly_summary = assembly_summary_0.strip().split('\n')[1].split('\t')
print(len(columns_assembly_summary))
columns_assembly_summary

38


['#assembly_accession',
 'bioproject',
 'biosample',
 'wgs_master',
 'refseq_category',
 'taxid',
 'species_taxid',
 'organism_name',
 'infraspecific_name',
 'isolate',
 'version_status',
 'assembly_level',
 'release_type',
 'genome_rep',
 'seq_rel_date',
 'asm_name',
 'asm_submitter',
 'gbrs_paired_asm',
 'paired_asm_comp',
 'ftp_path',
 'excluded_from_refseq',
 'relation_to_type_material',
 'asm_not_live_date',
 'assembly_type',
 'group',
 'genome_size',
 'genome_size_ungapped',
 'gc_percent',
 'replicon_count',
 'scaffold_count',
 'contig_count',
 'annotation_provider',
 'annotation_name',
 'annotation_date',
 'total_gene_count',
 'protein_coding_gene_count',
 'non_coding_gene_count',
 'pubmed_id']

In [25]:
'\t'.join(columns_assembly_summary)

'#assembly_accession\tbioproject\tbiosample\twgs_master\trefseq_category\ttaxid\tspecies_taxid\torganism_name\tinfraspecific_name\tisolate\tversion_status\tassembly_level\trelease_type\tgenome_rep\tseq_rel_date\tasm_name\tasm_submitter\tgbrs_paired_asm\tpaired_asm_comp\tftp_path\texcluded_from_refseq\trelation_to_type_material\tasm_not_live_date\tassembly_type\tgroup\tgenome_size\tgenome_size_ungapped\tgc_percent\treplicon_count\tscaffold_count\tcontig_count\tannotation_provider\tannotation_name\tannotation_date\ttotal_gene_count\tprotein_coding_gene_count\tnon_coding_gene_count\tpubmed_id'

In [22]:
df_org_assembly_summary = df_organisms[ df_organisms['filename'] == 'assembly_summary.txt' ]
df_org_assembly_summary['content'] = df_org_assembly_summary['content'].\
    apply(lambda x: '\t'.join(x.split('\n')[1:]))
df_assembly_summary = pd.DataFrame(columns=columns_assembly_summary)
df_assembly_summary = pd.concat([df_assembly_summary, df_org_assembly_summary['content']\
                                  .apply(lambda x: get_series_from_text(x, 
                                    columns_assembly_summary))])
# print(df_assembly_summary.shape)
# df_assembly_summary.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_org_assembly_summary['content'] = df_org_assembly_summary['content'].\


IndexError: list index out of range

## temp

In [None]:
assembly_summary_historical_0 = df_organisms.loc[0, 'content']
print(assembly_summary_historical_0)


In [None]:
assembly_summary_historical_0.strip().split('\n')[1:]

In [None]:
sample = df_organisms.loc[0:3, ['organism', 'filename', 'content']].copy()
sample

In [None]:
sample['processed'] = sample['content'].apply(lambda x: x.strip().split('\n')[1:])


In [None]:
sample.loc[0, 'processed']

In [None]:
sample['processed'] = sample['content'].str.strip().str.split('\n').str[1:]

In [None]:
sample.loc[0, 'processed']