# MGRAST request to geojson
11/17/2015. 12/13,4/2014. Emilio Mayorga

In [1]:
import requests
from collections import defaultdict
import json
from geojson import Point, Feature, FeatureCollection
import numpy as np
import pandas as pd
import geopandas as gpd

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import mplleaflet

**Issuing HTTP requests using the `requests` library**
See *IGSNxml_to_geojson.ipynb* for a bit of code on post requests.  
Tutorials:    
http://engineering.hackerearth.com/2014/08/21/python-requests-module/  
http://docs.python-requests.org/en/latest/user/quickstart/  
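To specify URL query parameters and custom headers in a POST request (note that `params=` is encoded into the URL query string; use `data=` to send a form-encoded request body instead):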
```
headers = {'user-agent': 'WADOH_NANOOS_pythonrequests_test'}
payload = {'key1': 'value1', 'key2': 'value2'}
r = requests.post('http://api.ech2odata.com/wdoh/dxd.cgi', params=payload, headers=headers)
```

In [3]:
# Query the MG-RAST metagenome endpoint for up to 1000 records at the 'mixs'
# verbosity level.
payload = {'verbosity': 'mixs', 'limit': 1000}
urlendpoint = 'http://api.metagenomics.anl.gov//metagenome'
# requests waits indefinitely by default; a timeout keeps the notebook from
# hanging forever on an unresponsive server.
r = requests.get(urlendpoint, params=payload, timeout=120)
# Fail here, with the HTTP status, instead of later while parsing an error page.
r.raise_for_status()

In [4]:
# Show the final request URL, including the encoded query string
r.url

u'http://api.metagenomics.anl.gov//metagenome?verbosity=mixs&limit=1000'

In [5]:
# Decode the JSON response body into Python objects
mgrast_resp = r.json()

In [6]:
#mgrast_json_fpath = "/usr/mayorgadat/workmain/RIGHT NOW/2013_NSF_BiGCZ_SSI/ProjectWork/bioinformatics/metagenome_limit10k_mixs_firefox.json"
#with open(mgrast_json_fpath, 'r') as infile:
#    mgrast_resp = json.load(infile)

In [7]:
# Top-level keys of the decoded response
mgrast_resp.keys()

[u'url',
 u'total_count',
 u'next',
 u'version',
 u'limit',
 u'offset',
 u'prev',
 u'data',
 u'order']

In [8]:
# Number of records returned under the 'data' key
len(mgrast_resp['data'])

1000

In [9]:
# Keep a handle on the list of metagenome records and preview the first two
mgrast_resp_data_lst = mgrast_resp['data']
mgrast_resp_data_lst[:2]

[{u'PI_firstname': u'Dana',
  u'PI_lastname': u'Willner',
  u'biome': u'animal-associated habitat',
  u'collection_date': u'2009-07-01 UTC',
  u'country': u'United States of America',
  u'created': u'2007-04-27T14:47:11Z',
  u'env_package_type': u'human-associated',
  u'feature': u'animal-associated habitat',
  u'id': u'mgm4440026.3',
  u'latitude': 32.8781,
  u'location': u'Adult cystic fibrosis Clinic at the University of California San Diego Medical Center',
  u'longitude': -117.1072,
  u'material': u'animal-associated habitat',
  u'name': u'CFLungPat001Rep1SDVir20060505',
  u'project_id': u'mgp31',
  u'project_name': u'Human Lung Healthy vs Cystic Fibrosis Metagenome',
  u'seq_method': u'454',
  u'sequence_type': u'WGS',
  u'status': u'public',
  u'url': u'http://api.metagenomics.anl.gov/beta/metagenome/mgm4440026.3'},
 {u'PI_firstname': u'Elizabeth',
  u'PI_lastname': u'Dinsdale',
  u'biome': u'marine habitat',
  u'collection_date': u'2005-08-21 UTC',
  u'country': u'United States

In [10]:
# Build the list of MG-RAST records that can be mapped:
#   * longitude and latitude must both be real numbers (i.e. not missing or
#     placeholder text), and
#   * the country must be the United States (either spelling used in the data).
# JSON whole-number values (e.g. 45) are decoded as int, not float, so both
# types are accepted; bool is rejected explicitly because it subclasses int.
def _is_coord(value):
    """Return True if `value` is a real number usable as a coordinate."""
    return isinstance(value, (int, float)) and not isinstance(value, bool)

mgrast_valid_lst = [mgrast for mgrast in mgrast_resp_data_lst
                    if _is_coord(mgrast['longitude']) and _is_coord(mgrast['latitude'])
                    and mgrast['country'] in ('USA', 'United States of America')]
len(mgrast_valid_lst)
# NOTE(review): the original notes in this cell referred to "igsn" responses
# (~400 valid out of ~954) and to 9528 elements before the USA filter was
# added. Those counts do not match a limit=1000 request; presumably they come
# from another notebook / a larger (10k) download -- TODO confirm or drop.

369

In [11]:
# Wrap every valid record in a GeoJSON point Feature (x = longitude,
# y = latitude), carrying the whole record along as the feature's properties,
# then bundle the features into one FeatureCollection.
features = [
    Feature(
        geometry=Point((rec['longitude'], rec['latitude'])),
        id=rec['id'],
        properties=rec,
    )
    for rec in mgrast_valid_lst
]

mgrast_featcoll = FeatureCollection(features)

In [12]:
# Inspect the first two features of the collection
mgrast_featcoll['features'][:2]

[{"geometry": {"coordinates": [-117.1072, 32.8781], "type": "Point"}, "id": "mgm4440026.3", "properties": {"PI_firstname": "Dana", "PI_lastname": "Willner", "biome": "animal-associated habitat", "collection_date": "2009-07-01 UTC", "country": "United States of America", "created": "2007-04-27T14:47:11Z", "env_package_type": "human-associated", "feature": "animal-associated habitat", "id": "mgm4440026.3", "latitude": 32.8781, "location": "Adult cystic fibrosis Clinic at the University of California San Diego Medical Center", "longitude": -117.1072, "material": "animal-associated habitat", "name": "CFLungPat001Rep1SDVir20060505", "project_id": "mgp31", "project_name": "Human Lung Healthy vs Cystic Fibrosis Metagenome", "seq_method": "454", "sequence_type": "WGS", "status": "public", "url": "http://api.metagenomics.anl.gov/beta/metagenome/mgm4440026.3"}, "type": "Feature"},
 {"geometry": {"coordinates": [-162.33472, 6.3856], "type": "Point"}, "id": "mgm4440036.3", "properties": {"PI_first

In [13]:
# Turn the GeoJSON features into a GeoDataFrame.
# Caveat: every column ends up with the generic "object" dtype rather than a
# more specific, appropriate one such as 'string' or 'float' (e.g. for
# latitude & longitude).
# TODO: look into how to fix that.
mgrast_fc_gdf = gpd.GeoDataFrame.from_features(mgrast_featcoll['features'])

In [14]:
# Preview the first rows of the GeoDataFrame
mgrast_fc_gdf.head()

Unnamed: 0,PI_firstname,PI_lastname,biome,collection_date,country,created,env_package_type,feature,geometry,id,...,location,longitude,material,name,project_id,project_name,seq_method,sequence_type,status,url
0,Dana,Willner,animal-associated habitat,2009-07-01 UTC,United States of America,2007-04-27T14:47:11Z,human-associated,animal-associated habitat,POINT (-117.1072 32.8781),mgm4440026.3,...,Adult cystic fibrosis Clinic at the University...,-117.1072,animal-associated habitat,CFLungPat001Rep1SDVir20060505,mgp31,Human Lung Healthy vs Cystic Fibrosis Metagenome,454,WGS,public,http://api.metagenomics.anl.gov/beta/metagenom...
1,Elizabeth,Dinsdale,marine habitat,2005-08-21 UTC,United States of America,2007-05-21T18:37:44Z,water,marine habitat,POINT (-162.33472 6.3856),mgm4440036.3,...,Kingman Atoll,-162.33472,marine habitat,KingLIVir20050821,mgp40,Northern Line Islands,454,WGS,public,http://api.metagenomics.anl.gov/beta/metagenom...
2,Elizabeth,Dinsdale,marine habitat,2005-08-21 UTC,United States of America,2007-05-21T18:50:32Z,water,marine habitat,POINT (-162.33472 6.3856),mgm4440037.3,...,Kingman Atoll,-162.33472,marine habitat,KingLIMic20050821,mgp40,Northern Line Islands,454,WGS,public,http://api.metagenomics.anl.gov/beta/metagenom...
3,Elizabeth,Dinsdale,marine habitat,2005-08-05 UTC,United States of America,2007-05-21T19:06:22Z,water,marine habitat,POINT (-157.48407 1.9923167),mgm4440038.3,...,Christmas (Kiritimati) Atoll,-157.48407,marine habitat,XmasLIVir20050805,mgp40,Northern Line Islands,454,WGS,public,http://api.metagenomics.anl.gov/beta/metagenom...
4,Elizabeth,Dinsdale,marine habitat,2005-08-18 UTC,United States of America,2007-05-21T19:07:07Z,water,marine habitat,POINT (-162.1278 5.866944),mgm4440039.3,...,Palmyra Atoll,-162.1278,marine habitat,PalmLIMic20050818,mgp40,Northern Line Islands,454,WGS,public,http://api.metagenomics.anl.gov/beta/metagenom...


In [15]:
# Column names: the record fields plus the geometry column
mgrast_fc_gdf.columns

Index([    u'PI_firstname',      u'PI_lastname',            u'biome',
        u'collection_date',          u'country',          u'created',
       u'env_package_type',          u'feature',         u'geometry',
                     u'id',         u'latitude',         u'location',
              u'longitude',         u'material',             u'name',
             u'project_id',     u'project_name',       u'seq_method',
          u'sequence_type',           u'status',              u'url'],
      dtype='object')

In [16]:
# Record count per country spelling
mgrast_fc_gdf['country'].value_counts()

United States of America    336
USA                          33
Name: country, dtype: int64

In [17]:
# Record count per environmental package type
mgrast_fc_gdf['env_package_type'].value_counts()

host-associated          180
water                     99
soil                      25
                          24
human-gut                 18
sediment                   9
human-associated           5
miscellaneous              4
human-oral                 2
built environment          1
wastewater|sludge          1
microbial mat|biofilm      1
Name: env_package_type, dtype: int64

In [18]:
# Alphabetical list of the distinct environmental package types
sorted(mgrast_fc_gdf['env_package_type'].value_counts().index)

[u'',
 u'built environment',
 u'host-associated',
 u'human-associated',
 u'human-gut',
 u'human-oral',
 u'microbial mat|biofilm',
 u'miscellaneous',
 u'sediment',
 u'soil',
 u'wastewater|sludge',
 u'water']

In [19]:
# Record count per biome
mgrast_fc_gdf['biome'].value_counts()

feces                                                  107
animal-associated habitat                               77
marine habitat                                          64
marine biome                                            23
grassland biome                                         16
soil                                                    15
Small lake biome                                        10
freshwater                                               9
Temperate grasslands                                     8
freshwater biome                                         7
hot spring ; microbial mat                               6
cultured habitat                                         5
extreme habitat ; hypersaline                            3
hot spring                                               3
mine drainage                                            3
marine habitat, animal-associated habitat                2
sludge                                                  

In [20]:
# Record count per material
mgrast_fc_gdf['material'].value_counts()

feces                                                  115
animal-associated habitat                               77
marine habitat                                          64
saline water                                            23
bulk soil                                               16
soil                                                    15
bodily fluid                                            10
freshwater                                               9
sediment                                                 7
hot spring ; microbial mat                               6
cultured habitat                                         5
extreme habitat ; hypersaline                            3
hot spring                                               3
mine drainage                                            3
marine habitat, animal-associated habitat                2
anaerobic sludge                                         1
marine sediment                                         

In [21]:
# looks like this "slicing" results in a DataFrame that's *not*
# a GeoDataFrame! Double check ...
#mgrast_usa_fc_gdf = mgrast_fc_gdf[mgrast_fc_gdf.isin(['USA', 'United States of America'])]

In [22]:
# Plot the feature locations on an interactive Leaflet map.
# The x & y values are extracted explicitly from the GeoJSON features (the
# GeoDataFrame is not used here at all); not ideal, but ok for now.
# Usage pattern borrowed from
# http://nbviewer.ipython.org/github/ioos/secoora/blob/master/notebooks/HF_radar/HFR_comparison.ipynb
mgrast_fc_xy = np.array(
    [feat['geometry']['coordinates'] for feat in mgrast_featcoll['features']]
)
lons, lats = mgrast_fc_xy[:, 0], mgrast_fc_xy[:, 1]
fig, ax = plt.subplots()
ptscatterplt = ax.scatter(lons, lats, c='r', marker='o', s=50)
mplleaflet.display(fig=fig)

In [23]:
# Second GeoJSON feature collection, restricted to records that
#   * lie strictly inside the bounding box below, and
#   * belong to one of the listed environmental package types.
# Originally labelled the "czo bounding box"; the box applied is
# US Lower 48 bbox: -125.68,24.53,-65.04,50.06  (west, south, east, north)
features = []
for mgrast_rec in mgrast_valid_lst:
    lon, lat = mgrast_rec['longitude'], mgrast_rec['latitude']
    inside_bbox = (-125.68 < lon < -65.04) and (24.53 < lat < 50.06)
    if not inside_bbox:
        continue
    if mgrast_rec['env_package_type'] not in (
            'air', 'built environment', 'microbial mat|biofilm',
            'plant-associated', 'sediment', 'soil', 'water'):
        continue
    features.append(Feature(geometry=Point((lon, lat)),
                            id=mgrast_rec['id'], properties=mgrast_rec))

mgrast_featcoll_usabb = FeatureCollection(features)

In [24]:
# Save geojson to file so I can get back to the content later w/o issuing all the requests again
#with open('/usr/mayorgadat/workmain/RIGHT NOW/2013_NSF_BiGCZ_SSI/ProjectWork/bioinformatics/mgrast_usa1_geojsonfc.json', 'w') as outfile:
#     json.dump(mgrast_featcoll_usabb, outfile, indent=4)

In [25]:
# GeoDataFrame built from the bounding-box-filtered features
mgrast_usabb_fc_gdf2 = gpd.GeoDataFrame.from_features(mgrast_featcoll_usabb['features'])

In [26]:
# Number of records that passed the bounding-box and environment-type filter
len(mgrast_usabb_fc_gdf2)

92

In [27]:
# Plot the bounding-box-filtered feature locations on an interactive Leaflet map
mgrast_fc_xy = np.array(
    [feat['geometry']['coordinates'] for feat in mgrast_featcoll_usabb['features']]
)
lons, lats = mgrast_fc_xy[:, 0], mgrast_fc_xy[:, 1]
fig, ax = plt.subplots()
ptscatterplt = ax.scatter(lons, lats, c='r', marker='o', s=50)
mplleaflet.display(fig=fig)

In [28]:
# Record count per material within the filtered subset
mgrast_usabb_fc_gdf2['material'].value_counts()

marine habitat                   27
saline water                     23
bulk soil                        16
freshwater                        8
sediment                          6
soil                              5
hot spring                        2
mine drainage                     2
biofilm                           1
marine sediment                   1
mine drainage ; microbial mat     1
Name: material, dtype: int64

In [29]:
# Record count per 'feature' value within the filtered subset
mgrast_usabb_fc_gdf2['feature'].value_counts()

marine habitat                   27
saline evaporation pond          23
meadow soil                      16
freshwater                        8
sediment                          6
soil                              5
hot spring                        2
mine drainage                     2
biofilm                           1
abyssal plain                     1
mine drainage ; microbial mat     1
Name: feature, dtype: int64

In [30]:
# Record count per status within the filtered subset
mgrast_usabb_fc_gdf2['status'].value_counts()

public    92
Name: status, dtype: int64