# 2022 EY Data Science Challenge
## Model Building
Now that we have our response variable and a few predictor variables, we can begin developing a prediction model. The first step is to collect the predictor variables and the response variable (frog or no frog) into a single dataframe, ready for model training. Once this dataframe is created, we use it to train a machine learning model and then evaluate that model's accuracy.


In [2]:
# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

# Plotting tools
import matplotlib.pyplot as plt

# Data science tools
import pandas as pd
import numpy as np

# Geospatial tools
import geopandas as gpd
import contextily as cx
from shapely.geometry import Point, Polygon
import xarray as xr
import rasterio.features
# import xrspatial.multispectral as ms

# API tools
import requests
import json


### Creating the Dataframe

In [3]:
# Read the frog occurrence records, then promote the positional row index to an
# explicit 'key' column.
frog_data = pd.read_csv('richmond_frogs.csv')
frog_data = frog_data.reset_index().rename(columns={'index': 'key'})
frog_data

Unnamed: 0,key,eventDate,decimalLatitude,decimalLongitude,occurrenceStatus,geometry
0,0,2021-01-10T13:00:00,-33.640336,150.687247,1,POINT (150.687247 -33.640336)
1,1,2021-02-01T10:24:11,-33.609240,150.748344,1,POINT (150.748344 -33.60924)
2,2,2021-02-17T20:06:00,-33.687430,150.708166,1,POINT (150.708166 -33.68743)
3,3,2021-04-03T21:07:40,-33.567792,150.790437,1,POINT (150.790437 -33.567792)
4,4,2021-07-05T15:14:39,-33.629142,150.713045,1,POINT (150.713045 -33.629142)
...,...,...,...,...,...,...
2907,2907,,-33.490500,150.739700,0,POINT (150.73970000000003 -33.49049999999999)
2908,2908,,-33.540900,150.622100,0,POINT (150.62210000000002 -33.54089999999999)
2909,2909,,-33.637500,150.647300,0,POINT (150.64730000000003 -33.637499999999996)
2910,2910,,-33.557700,150.722900,0,POINT (150.7229 -33.5577)


In [4]:
# Read the JRC mosaic sample fully into memory and combine its data variables
# into a single DataArray. The context manager closes the underlying file
# handle as soon as the values have been loaded.
with xr.open_dataset("jrc_mosaic_sample.nc") as jrc_dataset:
    data = jrc_dataset.load().to_array()

In [5]:
# Display the array loaded in the previous cell.
data

In [8]:


# Start from the identifier, coordinates and response of every point; the raster
# values sampled below are merged onto this frame one file at a time.
model_data = frog_data[['key', 'decimalLongitude', 'decimalLatitude', 'occurrenceStatus']]

# Raster files to sample at every point in frog_data.
filenames = ['jrc_mosaic_sample.nc', 'S2_mosaic_sample.nc']

for file in filenames:
    # Load everything into memory, then let the context manager release the file handle.
    with xr.open_dataset(file) as dataset:
        data = dataset.load().to_array()

    # One output column per band, plus the join key.
    columns = list(data.band.values) + ['key']

    # Collect one row per point and build the frame once after the loop.
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
    # every call, which made the loop quadratic in the number of points.
    rows = []
    for i, (lon, lat, key) in enumerate(zip(frog_data.decimalLongitude, frog_data.decimalLatitude, frog_data.key)):
        if i%500==0:
            print(f"{i} of {len(frog_data)}")

        # Values of the grid cell closest to this point.
        nearest_point = data.sel(x=lon, y=lat, method="nearest")

        # atleast_1d keeps a single-band raster concatenable after the squeeze.
        band_values = np.atleast_1d(np.squeeze(nearest_point.values))
        rows.append(np.concatenate((band_values, np.array([key]))))

    # Passing explicit columns means an empty frog_data still yields a frame
    # with a 'key' column, so the merge below does not fail.
    data_per_point = pd.DataFrame(rows, columns=columns)

    # The concatenation above promotes the key to the band dtype; restore the
    # original dtype so the merge compares like with like.
    data_per_point['key'] = data_per_point['key'].astype(frog_data['key'].dtype)

    model_data = model_data.merge(
        data_per_point,
        on=['key'],
        how='inner'
    )
    
    
    


0 of 2912
500 of 2912
1000 of 2912
1500 of 2912
2000 of 2912
2500 of 2912
0 of 2912
500 of 2912
1000 of 2912
1500 of 2912
2000 of 2912
2500 of 2912


In [11]:

# Predictor matrix: model_data without the identifier, coordinate and response
# columns. `columns=` replaces the positional axis argument (`drop(labels, 1)`),
# which pandas 2.0 no longer accepts.
X = model_data.drop(
    columns=['key', 'decimalLongitude', 'decimalLatitude', 'occurrenceStatus']
)


Unnamed: 0,occurrenceStatus,change,extent,occurrence,recurrence,seasonality,transitions,red,green,blue,nir
0,1,253.0,0.0,0.0,0.0,0.0,0.0,513.0,373.0,278.0,2002.0
1,1,253.0,0.0,0.0,0.0,0.0,0.0,888.0,930.0,794.0,1826.0
2,1,253.0,0.0,0.0,0.0,0.0,0.0,630.0,711.0,390.0,2868.0
3,1,253.0,0.0,0.0,0.0,0.0,0.0,642.0,789.0,389.0,3906.0
4,1,253.0,0.0,0.0,0.0,0.0,0.0,277.0,278.0,175.0,1556.0
...,...,...,...,...,...,...,...,...,...,...,...
2907,0,253.0,0.0,0.0,0.0,0.0,0.0,1216.0,1234.0,940.0,2744.0
2908,0,253.0,0.0,0.0,0.0,0.0,0.0,208.0,292.0,162.0,2498.0
2909,0,253.0,0.0,0.0,0.0,0.0,0.0,393.0,361.0,253.0,2082.0
2910,0,253.0,0.0,0.0,0.0,0.0,0.0,420.0,558.0,295.0,3252.0


### Training a Model

In [12]:
from sklearn.linear_model import LogisticRegressionCV

# Predictors: everything except the identifier, the coordinates and the response.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas 2.0 no longer accepts.
X = model_data.drop(
    columns=['key', 'decimalLongitude', 'decimalLatitude', 'occurrenceStatus']
)

# Response: the occurrenceStatus column (frog or no frog).
y = model_data.occurrenceStatus

# Logistic regression with 5-fold cross-validation; random_state fixes the seed.
# NOTE(review): warnings are silenced at the top of the notebook, so a solver
# convergence warning would not be visible here.
clf = LogisticRegressionCV(cv=5, random_state=0).fit(X, y)

# Mean accuracy on the same data the model was fitted on (training accuracy).
clf.score(X, y)


0.5803571428571429

### Evaluating the Model