In [1]:
# import libraries
import os
import subprocess
import rpy2
import pandas as pd
from Bio import Entrez

In [2]:
# Load the rpy2 IPython extension; it provides the %%R cell magic that the
# R cells of this notebook start with.
%load_ext rpy2.ipython

In [3]:
%%R -o ecological_data_tibble

# attach the packages used by the R cells
library(tidyverse)
library(magrittr)
library(glue)

# column names applied to the metadata table (they replace the file's own header row)
ecological_data_columns <- c(
    'HEADER', 'scaffold ID_ORF', 'scaffold ID_ORF (SPADES)', 'CONFIDENCE', 'METAGENOMICS PROJECT',
    'TAXA', 'scaffold length', 'COMPLETE SEQUENCE', 'LENGTH', 'SIGNAL PEPTIDE?', 'TMs', 'ECTODOMAIN',
    'ECTO LENGTH', 'C', 'Num CYS', 'ECTO Isoelectric point', 'COMMENTS', 'BIOSAMPLE', 'MG NAME',
    'HABITAT_Detailed', 'Temperature_Detailed', 'elev mts', 'collec DATE', 'HABITAT', 'AUTHORS',
    'CONTACT', 'PAPER DOI', 'ISOLATION', 'SOLID', 'AQUEOUS', 'SALT?', 'pH', 'T_Classified',
    'ALT_DEPT (mts)', 'FILTER FRACTION', 'O2'
)

# read the ecological metadata, skipping the first line of the file
ecological_data.tibble <- readr::read_tsv(
    file = '../data/metadata/modified_FsxAs-Kosher-Taxo-26-Abr-2021_Taxon_and_Kosher.tsv',
    col_names = ecological_data_columns,
    skip = 1
)

# shorten over-long labels by removing the ' (new...)' suffix from HEADER, then
# split BIOSAMPLE entries joined by ' y ' into one row per BioSample ID
ecological_data.tibble <- ecological_data.tibble %>%
    dplyr::mutate(HEADER = stringr::str_replace_all(HEADER, ' \\(new.*\\)', '')) %>%
    tidyr::separate_rows(BIOSAMPLE, sep = ' y ')

# copy the table under the name listed after -o in the cell magic
ecological_data_tibble <- ecological_data.tibble

R[write to console]: ── [1mAttaching packages[22m ───────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

R[write to console]: [32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.2     [32m✔[39m [34mdplyr  [39m 1.0.6
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

R[write to console]: ── [1mConflicts[22m ──────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()

R[write to console]: 
Attaching package: ‘magrittr’


R[write to console]: The following object is masked from ‘package:purrr’:

    set_names


R[write to console]: The following object is masked from


[36m──[39m [1m[1mColumn specification[1m[22m [36m──────────────────────────────────────────────────────────────────────────────────────────────────────[39m
cols(
  .default = col_character(),
  `scaffold length` = [32mcol_double()[39m,
  LENGTH = [32mcol_double()[39m,
  `SIGNAL PEPTIDE?` = [32mcol_double()[39m,
  TMs = [32mcol_double()[39m,
  `ECTO LENGTH` = [32mcol_double()[39m,
  `Num CYS` = [32mcol_double()[39m,
  `ECTO Isoelectric point` = [32mcol_double()[39m,
  SOLID = [32mcol_double()[39m,
  AQUEOUS = [32mcol_double()[39m,
  `SALT?` = [32mcol_double()[39m,
  pH = [32mcol_double()[39m,
  T_Classified = [32mcol_double()[39m,
  `ALT_DEPT (mts)` = [32mcol_double()[39m,
  O2 = [32mcol_double()[39m
)
[36mℹ[39m Use [30m[47m[30m[47m`spec()`[47m[30m[49m[39m for the full column specifications.



In [None]:
import collections.abc

import xmltodict

# NCBI asks E-utilities clients to supply a contact address. The ENTREZ_EMAIL
# environment variable overrides the notebook default without editing the code.
Entrez.email = os.environ.get('ENTREZ_EMAIL') or 'mauricio.langleib@gmail.com'

# column order of the output table
GEO_DATA_COLUMNS = ['BioSample ID', 'Geographical location', 'Latitude', 'Longitude',
                    'Isolation source', 'Title', 'Taxa ID', 'Organism']


def _as_list(node):
    """Normalise an xmltodict node to a list: None -> [], list -> itself, else [node].

    xmltodict yields a single mapping for an element that occurs once and a
    list when it repeats, so callers that iterate need this normalisation.
    """
    if node is None:
        return []
    return node if isinstance(node, list) else [node]


def _parse_lat_lon(lat_lon_text):
    """Split '<value> <N|S> <value> <E|W>' into (latitude, longitude) strings.

    Returns ('NA', 'NA') when the text does not contain exactly four
    whitespace-separated tokens, so one unparsable coordinate does not discard
    the rest of the record.
    """
    tokens = str(lat_lon_text).split()
    if len(tokens) != 4:
        return 'NA', 'NA'
    return '{0} {1}'.format(tokens[0], tokens[1]), '{0} {1}'.format(tokens[2], tokens[3])


def _parse_biosample(biosample_id, sample):
    """Build one output row (a dict keyed by GEO_DATA_COLUMNS) from a parsed <BioSample>.

    Every field starts as 'NA' and is overwritten only when present, so no
    value can leak in from a previously parsed sample.

    Raises:
        TypeError: if `sample` is not a mapping.
    """
    if not isinstance(sample, collections.abc.Mapping):
        raise TypeError('unexpected BioSample node: {0!r}'.format(type(sample)))

    record = dict.fromkeys(GEO_DATA_COLUMNS, 'NA')
    record['BioSample ID'] = biosample_id

    attributes = sample.get('Attributes')
    if isinstance(attributes, collections.abc.Mapping):
        for attribute in _as_list(attributes.get('Attribute')):
            if not isinstance(attribute, collections.abc.Mapping):
                continue
            name = attribute.get('@attribute_name')
            text = attribute.get('#text')
            if text is None:
                continue
            if name == 'geo_loc_name':
                record['Geographical location'] = text
            elif name == 'lat_lon':
                record['Latitude'], record['Longitude'] = _parse_lat_lon(text)
            elif name == 'isolation_source':
                record['Isolation source'] = text

    description = sample.get('Description')
    if isinstance(description, collections.abc.Mapping):
        record['Title'] = description.get('Title') or 'NA'
        organism = description.get('Organism')
        if isinstance(organism, collections.abc.Mapping):
            record['Taxa ID'] = organism.get('@taxonomy_id') or 'NA'
            record['Organism'] = organism.get('OrganismName') or 'NA'
    return record


def _fetch_biosample_records(biosample_id):
    """Query NCBI for `biosample_id` and return one row dict per <BioSample> returned.

    Raises:
        LookupError: if the search finds no UID or the fetched document
            contains no <BioSample> element.
    """
    # first get the UID(s) for the given BioSample ID
    handle = Entrez.esearch(db = 'biosample', term = biosample_id)
    try:
        uid_list = Entrez.read(handle)['IdList']
    finally:
        handle.close()
    if not uid_list:
        raise LookupError('no BioSample UID found')

    # now fetch the record(s) for those UIDs; retmode is the E-utilities
    # parameter that selects the XML document handed to xmltodict
    handle = Entrez.efetch(db = 'biosample', id = uid_list, retmode = 'xml')
    try:
        biosample_dict = xmltodict.parse(handle)
    finally:
        handle.close()

    # <BioSample> is a mapping for one sample and a list for several; the
    # isinstance checks used here accept both OrderedDict and the plain dict
    # returned by newer xmltodict releases
    biosample_set = biosample_dict.get('BioSampleSet')
    if not isinstance(biosample_set, collections.abc.Mapping):
        raise LookupError('empty BioSampleSet')
    samples = _as_list(biosample_set.get('BioSample'))
    if not samples:
        raise LookupError('no BioSample element in response')
    return [_parse_biosample(biosample_id, sample) for sample in samples]


# rows of the output table, one dict per (BioSample ID occurrence, fetched sample)
geo_data_rows = []
# successful look-ups by ID, so repeated IDs are downloaded only once
fetched_records = {}

# loop over BioSample IDs to get geographical data; repeated IDs still yield
# one row per occurrence
for biosample_id in ecological_data_tibble['BIOSAMPLE'].to_list():
    try:
        if biosample_id not in fetched_records:
            fetched_records[biosample_id] = _fetch_biosample_records(biosample_id)
        geo_data_rows.extend(dict(record) for record in fetched_records[biosample_id])
    except Exception as error:
        # keep going, but report the cause and mark the row; Exception (not a
        # bare except) lets the user interrupt the loop with Ctrl-C
        print('BioSample {0}: look-up failed ({1!r})'.format(biosample_id, error))
        error_row = dict.fromkeys(GEO_DATA_COLUMNS, 'ERROR')
        error_row['BioSample ID'] = biosample_id
        geo_data_rows.append(error_row)

# build the table once from the collected rows (also valid when there are none)
geo_data_table = pd.DataFrame(geo_data_rows, columns = GEO_DATA_COLUMNS)

In [None]:
# write the geographical table as tab-separated text, without the row index
geo_data_table.to_csv(
    '../results/geodata_table.tsv',
    sep = '\t',
    index = False,
)

In [None]:
# rows whose look-up failed (marked with the 'ERROR' placeholder)
geo_data_table[geo_data_table['Geographical location'] == 'ERROR']

In [36]:
# inspect the row(s) stored for a single accession
geo_data_table[geo_data_table['BioSample ID'] == 'SRS2617844']

Unnamed: 0,BioSample ID,Geographical location,Latitude,Longitude,Isolation source,Title,Taxa ID,Organism
0,SRS2617844,"USA:Emeryville, CA",37.840854 N,122.289843 W,"Eden Landing Ponds, San Francisco, CA, USA",Aerobic enrichment media from Eden Landing Pon...,408172,marine metagenome


In [34]:
# import
import requests

# ask the EBI metagenomics API for the metadata of one sample accession
biosample_id = 'SRS2617844'
response = requests.get('https://www.ebi.ac.uk/metagenomics/api/v1/samples/{0}'.format(biosample_id),
                        timeout = 30)  # without a timeout a stalled connection blocks the kernel indefinitely
# stop on an HTTP error status instead of treating an error payload as sample metadata
response.raise_for_status()
# getting request JSON data structure
biosample_metadata_json = response.json()


In [35]:
# display the parsed JSON response (nested dict) as the cell output
biosample_metadata_json

{'data': {'type': 'samples',
  'id': 'SRS2617844',
  'attributes': {'longitude': -122.2898,
   'latitude': 37.8409,
   'biosample': 'SAMN06268793',
   'sample-metadata': [{'key': 'geographic location (longitude)',
     'value': '-122.28984',
     'unit': None},
    {'key': 'geographic location (country and/or sea,region)',
     'value': 'USA:Emeryville, CA',
     'unit': None},
    {'key': 'collection date', 'value': '2012-04-25', 'unit': None},
    {'key': 'geographic location (latitude)',
     'value': '37.840855',
     'unit': None},
    {'key': 'instrument model', 'value': 'Illumina HiSeq 2000', 'unit': None},
    {'key': 'last update date', 'value': '2017-10-27', 'unit': None}],
   'accession': 'SRS2617844',
   'analysis-completed': '2018-10-25',
   'collection-date': '2012-04-25',
   'geo-loc-name': 'USA:Emeryville, CA',
   'sample-desc': 'Aerobic enrichment media from Eden Landing Ponds, California, USA - A23 E3 (3)',
   'environment-biome': None,
   'environment-feature': None,

In [33]:
# all BioSample IDs of the ecological table, in row order (repeats included)
ecological_data_tibble.loc[:, 'BIOSAMPLE'].tolist()

['SAMN06268799',
 'SAMN07630777',
 'SAMN07630779',
 'SAMN07630779',
 'SAMN07631793',
 'SAMN07630784',
 'SAMN08777606',
 'SAMN06267216',
 'SAMN06267216',
 'SAMN06267216',
 'SAMN06267216',
 'SAMN06267216',
 'SAMN06267216',
 'SAMN06267216',
 'SAMN06267220',
 'SAMN06267220',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06267222',
 'SAMN06343954',
 'SAMN06343954',
 'SAMN06343954',
 'SAMN06343954',
 'SAMN06343954',
 'SAMN06343954',
 'SAMN06343954',
 'SAMN06343954',
 'SAMN06343954',
 'SAMN06343954',
 'SAMN06343954',
 'SAMN06343954',
 'SAMN03766243',
 'SAMN03781032',
 'SAMN06268369',
 'SAMN10863950',
 'SAMN09201407',
 'SAMN06267292',
 'SAMN05421986',
 'SAMN03842451',
 'SAMN05421978',
 'SAMEA2619974',
 'SRS3033589',
 'SRR6823441',
 

In [54]:
%%R -i geo_data_table

# drop rows whose longitude is the 'ERROR' placeholder or missing,
# then look at row 54 of what is left
geo_data_table <- geo_data_table %>%
    as_tibble() %>%
    dplyr::filter(Longitude != 'ERROR', !is.na(Longitude))
geo_data_table[54, ]

[90m# A tibble: 1 x 8[39m
  `BioSample ID` `Geographical loc… Latitude Longitude `Isolation sourc… Title  
  [3m[90m<chr>[39m[23m          [3m[90m<chr>[39m[23m              [3m[90m<chr>[39m[23m    [3m[90m<chr>[39m[23m     [3m[90m<chr>[39m[23m             [3m[90m<chr>[39m[23m  
[90m1[39m SAMEA2619974   NA                 NA       NA        NA                TARA_Y…
[90m# … with 2 more variables: Taxa ID <chr>, Organism <chr>[39m


In [83]:
%%R -i geo_data_table

# load libraries (all at the top, before their functions are used)
library(tidyverse)
library(leaflet)
library(htmlwidgets)
library(IRdisplay)

# keep only rows with usable coordinates: drop failed look-ups ('ERROR') and
# missing values (a real NA as well as the literal string 'NA')
geo_data_table %<>% as_tibble() %>% dplyr::filter(`Longitude` != 'ERROR' & !is.na(Longitude) & Longitude != 'NA')

# coordinates arrive as '<value> <hemisphere letter>' strings: take the letter
# as the sign, convert the value to numeric and negate it for S / W.
# Any other letter gives NA in case_when() and the row is removed by the final filter.
geo_data_table %<>% dplyr::mutate(`Latitude sign` = `Latitude` %>% str_split(' ') %>% purrr::map_chr(2),
                                  `Longitude sign` = `Longitude` %>% str_split(' ') %>% purrr::map_chr(2),
                                  `Latitude` = `Latitude` %>% str_split(' ') %>% purrr::map_chr(1) %>% as.numeric(),
                                  `Latitude` = case_when(`Latitude sign` == 'N' ~ `Latitude`,
                                                         `Latitude sign` == 'S' ~ `Latitude`*-1),
                                  `Longitude` = `Longitude` %>% str_split(' ') %>% purrr::map_chr(1) %>% as.numeric(),
                                  `Longitude` = case_when(`Longitude sign` == 'E' ~ `Longitude`,
                                                          `Longitude sign` == 'W' ~ `Longitude`*-1)) %>%
                    dplyr::filter(!is.na(`Latitude`) & !is.na(`Longitude`))

# attach the HEADER of every sequence found in each BioSample
geo_data_table %<>%
    dplyr::left_join(x = .,
                    y = (ecological_data.tibble %>% dplyr::select(HEADER, BIOSAMPLE)),
                    by = c('BioSample ID' = 'BIOSAMPLE'))

# NOTE(review): a grouped summary (HEADERs pasted per BioSample, duplicates
# dropped) used to be computed here without being assigned or printed, so it
# had no effect and was removed. The map below still draws one marker per
# joined row; collapsing to one row per BioSample would change the plotted
# table - confirm whether that is wanted.

# load table and create a dictionary from BioSample IDs to MP IDs
metagenome_project_ID_tips_and_biosample_corr_table = readr::read_tsv('../results/metagenome_project_ID_tips_and_biosample_corr_table.tsv')
metagenome_projects.dict = metagenome_project_ID_tips_and_biosample_corr_table$`Metagenome Project ID`
names(metagenome_projects.dict) = metagenome_project_ID_tips_and_biosample_corr_table$BIOSAMPLE

# marker label: the MP ID when the BioSample has one, otherwise the BioSample ID itself
geo_data_table %<>%
    dplyr::mutate(`BioSample ID2` = `BioSample ID` %>% tidytidbits::lookup_chr(., dict = metagenome_projects.dict , default = identity))

# minimal one-marker demo map, written to demo.html
m <- leaflet() %>%
  addTiles() %>%  # Add default OpenStreetMap map tiles
  addMarkers(lng=174.768, lat=-36.852, popup="The birthplace of R")

saveWidget(m, 'demo.html', selfcontained = TRUE)

# map of the samples; popups are rendered as HTML, where '\n' is collapsed like
# any other whitespace, so the fields are separated with <br/> instead
k = leaflet::leaflet(geo_data_table) %>%
  addTiles() %>%  # Add default OpenStreetMap map tiles
  addMarkers(~Longitude, ~Latitude, label = geo_data_table$`BioSample ID2`, popup = glue('Biosample ID: {geo_data_table$`BioSample ID`} <br/> Title:{geo_data_table$`Title`} <br/> Geographical location: {geo_data_table$`Geographical location`} <br/> Isolation source: {geo_data_table$`Isolation source`}'))

saveWidget(k, 'demo2.html', selfcontained = TRUE)


geo_data_table


[36m──[39m [1m[1mColumn specification[1m[22m [36m─────────────────────────────────────────────────────────────────────────────[39m
cols(
  new_tip_label = [31mcol_character()[39m,
  BIOSAMPLE = [31mcol_character()[39m,
  `Metagenome Project ID` = [31mcol_character()[39m
)

[90m# A tibble: 608 x 12[39m
   `BioSample ID` `Geographical lo… Latitude Longitude `Isolation sourc… Title  
   [3m[90m<chr>[39m[23m          [3m[90m<chr>[39m[23m                [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m             [3m[90m<chr>[39m[23m  
[90m 1[39m SAMN06268799   USA:Emeryville, …     37.8    -[31m122[39m[31m.[39m  Eden Landing Pon… Aerobi…
[90m 2[39m SAMN07630777   Canada: Ontario       43.4     -[31m80[39m[31m.[39m[31m6[39m composite cistern Leacha…
[90m 3[39m SAMN07630779   Canada: Ontario       43.4     -[31m80[39m[31m.[39m[31m6[39m aquifer           Ground…
[90m 4[39m SAMN07630779   Canada: Ontario       4

In [85]:
%%R

# one row per BioSample (its HEADER values pasted into a single string),
# then list the labels stored in `BioSample ID2`
geo_data_table %>%
    dplyr::group_by(`BioSample ID`) %>%
    dplyr::mutate(HEADERs = paste(HEADER, collapse = ', ')) %>%
    dplyr::select(-HEADER) %>%
    unique() %>%
    dplyr::pull(`BioSample ID2`)

 [1] "MP 1"         "MP 2"         "MP 3"         "MP 9"         "MP 4"        
 [6] "MP 11"        "MP 19"        "MP 6"         "MP 7"         "MP 22"       
[11] "MP 16"        "MP 10"        "MP 28"        "MP 20"        "MP 12"       
[16] "MP 14"        "MP 34"        "MP 18"        "SRS2617844"   "SRS2617729"  
[21] "MP 44"        "SRS2617736"   "SRS2617843"   "SRS2617738"   "MP 49"       
[26] "SRS2236490"   "SRS2617845"   "MP 40"        "MP 47"        "MP 38"       
[31] "MP 35"        "MP 31"        "MP 17"        "SAMN03217314" "MP 32"       
[36] "MP 36"        "MP 5"         "MP 21"        "MP 33"        "MP 24"       


In [84]:
from IPython.display import IFrame

# embed the saved leaflet widget (demo2.html) in the notebook output
IFrame('./demo2.html', 1200, 700)

In [60]:
%%R

# show the HEADER / BIOSAMPLE pairs of the ecological table
dplyr::select(ecological_data.tibble, HEADER, BIOSAMPLE)

[90m# A tibble: 99 x 2[39m
   HEADER                                                       BIOSAMPLE   
   [3m[90m<chr>[39m[23m                                                        [3m[90m<chr>[39m[23m       
[90m 1[39m 3300000868-JGI12330J12834-1000008-299010-8                   SAMN06268799
[90m 2[39m 3300014206-Ga0172377-10000119-870930-129                     SAMN07630777
[90m 3[39m 3300014208-Ga0172379-10000243-871512-158                     SAMN07630779
[90m 4[39m 3300014208-Ga0172379-10001592-871560-40                      SAMN07630779
[90m 5[39m 3300014494-Ga0182017-10003408-872394-14                      SAMN07631793
[90m 6[39m 3300014613-Ga0180008-1001212-875221-12                       SAMN07630784
[90m 7[39m 3300018015-Ga0187866-1000629-915963-9                        SAMN08777606
[90m 8[39m AntAceMinimDraft_10_1070366.scaffolds.fasta_scaffold00069_38 SAMN06267216
[90m 9[39m AntAceMinimDraft_10_1070366.scaffolds.fasta_scaffold00511_6  SAMN06267

In [42]:
%%R

# open the R documentation page for leaflet()
help('leaflet')

File: /tmp/RtmpjD89pD/Rtxt64f27b644ae9
leaflet                package:leaflet                 R Documentation



Create a Leaflet map widget



Description:



     This function creates a Leaflet map widget using ‘htmlwidgets’.

     The widget can be rendered on HTML pages generated from R

     Markdown, Shiny, or other applications.



Usage:



     leaflet(

       data = NULL,

       width = NULL,

       height = NULL,

       padding = 0,

       options = leafletOptions(),

       elementId = NULL,

       sizingPolicy = leafletSizingPolicy(padding = padding)

     )

     

     leafletOptions(

       minZoom = NULL,

       maxZoom = NULL,

       crs = leafletCRS(),

       worldCopyJump = NULL,

       preferCanvas = NULL,

       ...

     )

     

     leafletCRS(

       crsClass = "L.CRS.EPSG3857",

       code = NULL,

       proj4def = NULL,

       projectedBounds = NULL,

       origin = NULL,

       transformation = NULL,

       scales = NULL,

       resolu