# RefSeq archaea genomes selection

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import os

## Create DataFrame
* set the column names
* convert the date and time columns into a single DateTime column
* remove the trailing '/' from each organism name

### Read the .txt file

In [2]:
# Working directory of the running kernel; the data paths below are built from it.
CURRENT_DIR = Path.cwd()
print(str(CURRENT_DIR))

/home/diegoflm/Documents/UV/courses_and_subjects/final_year_proyect/project/data_and_notebooks/list_of_genomes/notebooks/archaea


In [3]:
# Raw RefSeq archaea listing, located two directory levels above CURRENT_DIR.
PATH_REFSEQ_ARCHAEA_LIST = CURRENT_DIR.parent.parent.joinpath(
    'raw/archaea/refseq_archaea_list.txt'
)
print(str(PATH_REFSEQ_ARCHAEA_LIST))
# Shown as the cell output: tells whether the listing file is present on disk.
PATH_REFSEQ_ARCHAEA_LIST.exists()

/home/diegoflm/Documents/UV/courses_and_subjects/final_year_proyect/project/data_and_notebooks/list_of_genomes/raw/archaea/refseq_archaea_list.txt


True

In [4]:
# Column names of the final, cleaned table (applied once the helper columns
# have been dropped further down).
header_refseq_archaea_list = ['organism', 'last_update']

# Read the raw listing: it has no header row and its fields are separated by
# runs of whitespace, so the columns come out labelled 0, 1, 2, 3.
# The separator is a raw string: '\s' in a normal string literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
list_refseq_archaea = pd.read_csv(
    PATH_REFSEQ_ARCHAEA_LIST,
    header=None,
    sep=r'\s+',
)
list_refseq_archaea.head()


Unnamed: 0,0,1,2,3
0,ANME-2_cluster_archaeon_HR1/,2023-06-22,16:03,-
1,Acidianus_ambivalens/,2023-06-22,18:31,-
2,Acidianus_brierleyi/,2023-06-22,19:13,-
3,Acidianus_hospitalis/,2023-06-22,12:44,-
4,Acidianus_infernus/,2023-06-22,18:31,-


### Format the date into DateTime

In [5]:
# Join column 1 (date, YYYY-MM-DD) and column 2 (time, HH:MM) with a space,
# then parse the combined text into a single DateTime column.
timestamp_text = list_refseq_archaea[1] + ' ' + list_refseq_archaea[2]
list_refseq_archaea['last_update'] = pd.to_datetime(timestamp_text)
list_refseq_archaea.head()

Unnamed: 0,0,1,2,3,last_update
0,ANME-2_cluster_archaeon_HR1/,2023-06-22,16:03,-,2023-06-22 16:03:00
1,Acidianus_ambivalens/,2023-06-22,18:31,-,2023-06-22 18:31:00
2,Acidianus_brierleyi/,2023-06-22,19:13,-,2023-06-22 19:13:00
3,Acidianus_hospitalis/,2023-06-22,12:44,-,2023-06-22 12:44:00
4,Acidianus_infernus/,2023-06-22,18:31,-,2023-06-22 18:31:00


In [6]:
# Spot-check the first row: the new column should hold pandas Timestamp values.
list_refseq_archaea.loc[0, 'last_update']

Timestamp('2023-06-22 16:03:00')

### Set column names and dtype of 'organism'

In [7]:
# Columns 1 and 2 (date, time) are now merged into 'last_update' and column 3
# is not used, so keep only the organism name (column 0) and 'last_update'.
# The result is rebound instead of using inplace=True, which pandas discourages
# and which makes the mutation harder to follow across cells.
list_refseq_archaea = list_refseq_archaea.drop(columns=[1, 2, 3])
list_refseq_archaea.head()

Unnamed: 0,0,last_update
0,ANME-2_cluster_archaeon_HR1/,2023-06-22 16:03:00
1,Acidianus_ambivalens/,2023-06-22 18:31:00
2,Acidianus_brierleyi/,2023-06-22 19:13:00
3,Acidianus_hospitalis/,2023-06-22 12:44:00
4,Acidianus_infernus/,2023-06-22 18:31:00


In [8]:
# Label the two remaining columns with the names in header_refseq_archaea_list.
list_refseq_archaea = list_refseq_archaea.set_axis(
    header_refseq_archaea_list, axis='columns'
)
list_refseq_archaea.head()

Unnamed: 0,organism,last_update
0,ANME-2_cluster_archaeon_HR1/,2023-06-22 16:03:00
1,Acidianus_ambivalens/,2023-06-22 18:31:00
2,Acidianus_brierleyi/,2023-06-22 19:13:00
3,Acidianus_hospitalis/,2023-06-22 12:44:00
4,Acidianus_infernus/,2023-06-22 18:31:00


In [9]:
# Inspect the column dtypes before converting 'organism' to a string dtype.
list_refseq_archaea.dtypes

organism               object
last_update    datetime64[ns]
dtype: object

In [10]:
# Cast 'organism' from the generic object dtype to pandas' dedicated string
# dtype; 'last_update' is left as it is.
list_refseq_archaea = list_refseq_archaea.astype({'organism': 'string'})
list_refseq_archaea.dtypes

organism       string[python]
last_update    datetime64[ns]
dtype: object

### Remove '/' at the end of organism name

In [11]:
# Each name in the listing ends with '/'; strip it so that only the organism
# name itself remains.
list_refseq_archaea = list_refseq_archaea.assign(
    organism=lambda frame: frame['organism'].str.rstrip('/')
)
list_refseq_archaea.head()

Unnamed: 0,organism,last_update
0,ANME-2_cluster_archaeon_HR1,2023-06-22 16:03:00
1,Acidianus_ambivalens,2023-06-22 18:31:00
2,Acidianus_brierleyi,2023-06-22 19:13:00
3,Acidianus_hospitalis,2023-06-22 12:44:00
4,Acidianus_infernus,2023-06-22 18:31:00


### Save the DataFrame

In [12]:
# Output directory for the cleaned tables, two directory levels above CURRENT_DIR.
PATH_PREPROCESSED_ARCHAEA = CURRENT_DIR.parent.parent.joinpath('preprocessed/archaea')
print(str(PATH_PREPROCESSED_ARCHAEA))
# Shown as the cell output: tells whether the directory is present on disk.
PATH_PREPROCESSED_ARCHAEA.exists()

/home/diegoflm/Documents/UV/courses_and_subjects/final_year_proyect/project/data_and_notebooks/list_of_genomes/preprocessed/archaea


True

In [13]:
# Write the cleaned listing to CSV; both the row index and the header line
# are included in the file.
list_refseq_archaea.to_csv(
    PATH_PREPROCESSED_ARCHAEA.joinpath('list_refseq_archaea.csv'),
    header=True,
    index=True,
)

In [14]:
# Read the saved file back to check the round trip; the first CSV column holds
# the row index that was written out.
temp = pd.read_csv(
    PATH_PREPROCESSED_ARCHAEA.joinpath('list_refseq_archaea.csv'),
    index_col=0,
)
temp.head()

Unnamed: 0,organism,last_update
0,ANME-2_cluster_archaeon_HR1,2023-06-22 16:03:00
1,Acidianus_ambivalens,2023-06-22 18:31:00
2,Acidianus_brierleyi,2023-06-22 19:13:00
3,Acidianus_hospitalis,2023-06-22 12:44:00
4,Acidianus_infernus,2023-06-22 18:31:00


## Random selection of 100 RefSeq archaea genomes

In [15]:
import numpy as np
import pandas as pd
from pathlib import Path

In [16]:
# Paths used by the sampling section: the kernel's working directory and the
# folder holding the preprocessed archaea tables two levels above it.
CURRENT_DIR = Path.cwd()
PATH_PREPROCESSED_ARCHAEA = CURRENT_DIR.parent.parent.joinpath('preprocessed/archaea')

### Sampling

In [17]:
# Load the cleaned listing saved earlier. The first CSV column is the row
# index and the first line is the header.
# CSV stores no dtypes, so 'last_update' would come back as plain text;
# parse_dates restores it as a datetime column.
list_archaea = pd.read_csv(
    PATH_PREPROCESSED_ARCHAEA / 'list_refseq_archaea.csv',
    index_col=0,
    header=0,
    parse_dates=['last_update'],
)
list_archaea.head()

Unnamed: 0,organism,last_update
0,ANME-2_cluster_archaeon_HR1,2023-06-22 16:03:00
1,Acidianus_ambivalens,2023-06-22 18:31:00
2,Acidianus_brierleyi,2023-06-22 19:13:00
3,Acidianus_hospitalis,2023-06-22 12:44:00
4,Acidianus_infernus,2023-06-22 18:31:00


In [18]:
# (rows, columns) of the full listing, i.e. how many genomes can be sampled from.
list_archaea.shape

(1094, 2)

In [30]:
# Draw a reproducible random sample of 100 genomes.
# The seed is handed directly to DataFrame.sample, so the draw no longer
# depends on NumPy's global random state (and therefore not on which other
# cells happened to run before this one).
RANDOM_SEED = 42
sample_archaea = (
    list_archaea
    .sample(n=100, random_state=RANDOM_SEED)
    .sort_index()  # restore the original (alphabetical) row order
)
sample_archaea.head()

Unnamed: 0,organism,last_update
31,Archaeoglobus_profundus,2023-06-22 12:40:00
44,Candidatus_Bathyarchaeota_archaeon,2023-06-22 16:34:00
51,Candidatus_Korarchaeum_sp.,2023-06-22 22:41:00
56,Candidatus_Methanolliviera_sp._GoM_oil,2023-06-22 17:55:00
59,Candidatus_Methanomethylophilus_sp._1R26,2023-06-22 14:06:00


### Save the sample DataFrame

In [31]:
# Output directory for the sample, two directory levels above CURRENT_DIR.
PATH_PREPROCESSED_ARCHAEA = CURRENT_DIR.parent.parent.joinpath('preprocessed/archaea')
print(str(PATH_PREPROCESSED_ARCHAEA))
# Shown as the cell output: tells whether the directory is present on disk.
PATH_PREPROCESSED_ARCHAEA.exists()

/home/diegoflm/Documents/UV/courses_and_subjects/final_year_proyect/project/data_and_notebooks/list_of_genomes/preprocessed/archaea


True

In [32]:
# Write the sampled genomes to CSV; the row index (position in the full
# listing) and the header line are both included.
sample_archaea.to_csv(
    PATH_PREPROCESSED_ARCHAEA.joinpath('sample_archaea.csv'),
    header=True,
    index=True,
)


In [33]:
# Read the sample back to check the round trip; the first CSV column holds
# the row index that was written out.
temp = pd.read_csv(
    PATH_PREPROCESSED_ARCHAEA.joinpath('sample_archaea.csv'),
    index_col=0,
)
temp.head()

Unnamed: 0,organism,last_update
31,Archaeoglobus_profundus,2023-06-22 12:40:00
44,Candidatus_Bathyarchaeota_archaeon,2023-06-22 16:34:00
51,Candidatus_Korarchaeum_sp.,2023-06-22 22:41:00
56,Candidatus_Methanolliviera_sp._GoM_oil,2023-06-22 17:55:00
59,Candidatus_Methanomethylophilus_sp._1R26,2023-06-22 14:06:00
