In [1]:
import os
from matplotlib import pyplot as plt
import rasterio
from shapely.geometry import Point
import geopandas as gpd
import numpy as np
import pandas as pd
from random import sample
from fiona.crs import from_string
from functools import partial

In [2]:
# Work from the project root so the relative "./Data/..." paths used below resolve.
# Set the SDM_PROJECT_DIR environment variable to run the notebook from another
# checkout; the original author's location is kept as the fallback.
PROJECT_DIR = os.environ.get(
    "SDM_PROJECT_DIR",
    "/Users/pranavkulkarni/SDM/Climate_Models_Arenaviruses",
)
os.chdir(PROJECT_DIR)

In [3]:
# Open the reference raster; it is passed to process_train further down as the
# spatial reference. Its bounding box is kept in `e` and displayed as the cell output.
ref_rast = rasterio.open("./Data/Input/Processed/Resampled/guan/bclim_01.tif")
e = ref_rast.bounds
e

BoundingBox(left=-84.75, bottom=-4.375, right=-47.25, top=14.375)

In [4]:
# Load the raw GBIF occurrence table and print a summary of its columns and dtypes.
gbif_raw = pd.read_csv("./Data/Input/raw/GBIF/gbif_sig01.csv")
gbif_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 326 entries, 0 to 325
Columns: 144 entries, acceptedScientificName to downloadDate
dtypes: bool(2), float64(16), int64(12), object(114)
memory usage: 362.4+ KB


In [5]:
def random_points(mask, n, seed=None):
    """Draw `n` uniformly distributed random points inside the extent of `mask`.

    Parameters
    ----------
    mask : object exposing ``bounds`` with ``left``, ``bottom``, ``right`` and
        ``top`` attributes (an opened rasterio dataset is what this notebook passes).
    n : number of points to draw. Coerced with ``int()``; values below zero
        yield an empty frame.
    seed : optional seed for a private random generator, for reproducible
        samples. When None (the default) NumPy's global random state is used.

    Returns
    -------
    pandas.DataFrame with float columns ``lon`` and ``lat``, one row per point.
    """
    bounds = mask.bounds
    count = max(int(n), 0)

    # Global state by default; a dedicated generator only when a seed is given,
    # so seeding here never disturbs other code that relies on np.random.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # One vectorized draw: column 0 spans the x extent, column 1 the y extent.
    coords = rng.uniform(
        [bounds.left, bounds.bottom],
        [bounds.right, bounds.top],
        size=(count, 2),
    )
    return pd.DataFrame({'lon': coords[:, 0], 'lat': coords[:, 1]})

In [13]:

def process_train(gbif_data, coord_ref, pseudo_abs_f, path_write, layer_name, driver_write):
    """Build a presence / background training set and write it as a vector layer.

    Parameters
    ----------
    gbif_data : table with ``lon`` and ``lat`` attributes holding occurrence coordinates.
    coord_ref : reference raster; its ``bounds`` clip the presences and define the
        sampling extent, and its ``crs`` is assigned to the output layer.
    pseudo_abs_f : number of random background points per retained presence.
        The product is rounded to the nearest whole number of points.
    path_write, layer_name, driver_write : forwarded to ``GeoDataFrame.to_file``
        as the path, ``layer`` and ``driver`` arguments.

    Returns
    -------
    dict with keys ``'bg'`` (background points), ``'train'`` (CLASS/lon/lat table,
    CLASS 1 for presences and 0 for background), ``'crs'`` and ``'dataMap_gbif'``
    (the GeoDataFrame that was written).
    """
    # Unique, complete presence coordinates.
    gbif = (
        pd.DataFrame({'lon': gbif_data.lon, 'lat': gbif_data.lat})
        .drop_duplicates()
        .dropna()
    )

    # Keep only presences inside the reference extent (edges inclusive).
    bounds = coord_ref.bounds
    inside = (
        gbif['lon'].between(bounds.left, bounds.right)
        & gbif['lat'].between(bounds.bottom, bounds.top)
    )
    gbif = gbif[inside]

    # random_points expects a whole number of points; a fractional ratio would
    # otherwise hand it a float.
    n_background = int(round(len(gbif) * pseudo_abs_f))
    bg = random_points(coord_ref, n_background)

    coords = pd.concat([gbif, bg], ignore_index=True)
    pa_train = np.concatenate([np.ones(len(gbif)), np.zeros(len(bg))])
    train = pd.DataFrame({'CLASS': pa_train, 'lon': coords['lon'], 'lat': coords['lat']})
    class_pa = pd.DataFrame({'CLASS': train['CLASS']})

    # Take the CRS from the raster passed in, not from a notebook-level variable,
    # so the function also works for other reference rasters and on a fresh kernel.
    # NOTE(review): fiona.crs.from_string appears to be deprecated in recent Fiona
    # releases — confirm, and consider handing coord_ref.crs to GeoDataFrame directly.
    crs = from_string(coord_ref.crs.to_string())
    geometry = [Point(xy) for xy in zip(train['lon'], train['lat'])]
    data_map_gbif = gpd.GeoDataFrame(class_pa, crs=crs, geometry=geometry)

    data_map_gbif.to_file(path_write, layer=layer_name, driver=driver_write, index=False)

    return {'bg': bg, 'train': train, 'crs': crs, 'dataMap_gbif': data_map_gbif}

In [14]:
# Build the training set with pseudo_abs_f=1 (as many random background points as
# retained presences) and write it as layer "test" under ./Data in ESRI Shapefile format.
process_list = process_train(gbif_raw, ref_rast, 1, "./Data", "test", "ESRI Shapefile")

  process_list = process_train(gbif_raw, ref_rast, 1, "./Data", "test", "ESRI Shapefile")
