In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
import geopandas as gpd

# Mount Google Drive at /content/drive; the data files read later in this
# notebook live under that mount point. force_remount=True asks Colab for a
# fresh mount on every run.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
import plotly.express as px

Mounted at /content/drive


In [13]:
# Data preprocessing: read the GADM (gadm41_USA_2) shapefile and keep only the
# rows flagged as California counties.
gdf_3 = gpd.read_file('/content/drive/My Drive/Mantises!/data/gadm41_USA_shp/gadm41_USA_2.shp')
gdf_3 = gdf_3[
    gdf_3['NAME_1'].eq('California')
    & gdf_3['TYPE_2'].eq('County')
]

# The occurrence export is tab-separated despite its .csv extension.
df = pd.read_csv(
    '/content/drive/My Drive/Mantises!/data/mantodea_ca_gbif_simple.csv',
    sep='\t',
)

In [14]:
# List the columns of the occurrence table to pick the fields to keep.
df.columns

Index(['gbifID', 'datasetKey', 'occurrenceID', 'kingdom', 'phylum', 'class',
       'order', 'family', 'genus', 'species', 'infraspecificEpithet',
       'taxonRank', 'scientificName', 'verbatimScientificName',
       'verbatimScientificNameAuthorship', 'countryCode', 'locality',
       'stateProvince', 'occurrenceStatus', 'individualCount',
       'publishingOrgKey', 'decimalLatitude', 'decimalLongitude',
       'coordinateUncertaintyInMeters', 'coordinatePrecision', 'elevation',
       'elevationAccuracy', 'depth', 'depthAccuracy', 'eventDate', 'day',
       'month', 'year', 'taxonKey', 'speciesKey', 'basisOfRecord',
       'institutionCode', 'collectionCode', 'catalogNumber', 'recordNumber',
       'identifiedBy', 'dateIdentified', 'license', 'rightsHolder',
       'recordedBy', 'typeStatus', 'establishmentMeans', 'lastInterpreted',
       'mediaType', 'issue'],
      dtype='object')

In [15]:
# Keep the species label plus the coordinate, elevation and depth fields;
# the coordinates are turned into point geometries in the next step.
species_df = df.loc[
    :,
    ['species', 'decimalLatitude', 'decimalLongitude', 'elevation', 'depth'],
]

In [16]:
from shapely.geometry import Point

# Build the point geometries in one vectorised call (x = longitude,
# y = latitude) instead of constructing shapely Points row by row in Python.
geometry = gpd.points_from_xy(species_df['decimalLongitude'], species_df['decimalLatitude'])

# Declare the coordinates as EPSG:4326 at construction time; this is the same
# CRS the notebook assigns to this frame afterwards, so the later set_crs call
# remains valid.
geo_species_df = gpd.GeoDataFrame(species_df, geometry=geometry, crs='EPSG:4326')


In [17]:
# Label both layers as EPSG:4326. set_crs only records the CRS on the data;
# it does not reproject any coordinates.
gdf_3.set_crs(crs='EPSG:4326', inplace=True)
geo_species_df.set_crs(crs='EPSG:4326', inplace=True)

# Bring the occurrences into the county layer's CRS ahead of the spatial join.
# Both layers were just labelled EPSG:4326, so this acts as a safeguard.
geo_species_df = geo_species_df.to_crs(gdf_3.crs)

In [18]:
# Attach the attributes of the intersecting county polygon to each occurrence
# point; how="inner" drops points that match no county.
# `predicate` is the current keyword for the spatial relation; the older `op`
# keyword is deprecated and triggers a warning.
joined = gpd.sjoin(geo_species_df, gdf_3, how="inner", predicate="intersects")

  if (await self.run_code(code, result,  async_=asy)):


In [19]:
# Preview the spatial join result: occurrence fields followed by the matched
# county attributes.
joined.head()

Unnamed: 0,species,decimalLatitude,decimalLongitude,elevation,depth,geometry,index_right,GID_2,GID_0,COUNTRY,GID_1,NAME_1,NL_NAME_1,NAME_2,VARNAME_2,NL_NAME_2,TYPE_2,ENGTYPE_2,CC_2,HASC_2
0,Mantis religiosa,38.569964,-122.689257,,,POINT (-122.68926 38.56996),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO
2,Mantis religiosa,38.569659,-122.689175,,,POINT (-122.68918 38.56966),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO
9,Mantis religiosa,38.624724,-122.87158,,,POINT (-122.87158 38.62472),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO
10,Mantis religiosa,38.57523,-122.69398,,,POINT (-122.69398 38.57523),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO
11,Mantis religiosa,38.58435,-122.69993,,,POINT (-122.69993 38.58435),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO


In [20]:
# County population totals. The file name suggests ACS 2020 5-year table
# B01003 -- TODO confirm the provenance.
population_df = pd.read_csv(
    '/content/drive/My Drive/Mantises!/data/ACSDT5Y2020.B01003-Data.csv'
)

In [21]:
# Preview the county population table.
population_df.head()

Unnamed: 0,County,Population Total
0,Alameda,1661584
1,Alpine,1159
2,Amador,39023
3,Butte,223344
4,Calaveras,45828


In [26]:
# Land area of each California county.
county_size_df = pd.read_csv(
    '/content/drive/My Drive/Mantises!/data/California_Counties_Area.csv'
)

In [27]:
# Preview the county land-area table. The frame is named `county_size_df`;
# the previous `country_size_df` spelling is not defined anywhere in the
# notebook and raises NameError on a fresh kernel.
county_size_df.head()

Unnamed: 0,Rank,County,Land Area (sq mi)
0,1,San Bernardino,20068.01
1,2,Inyo,10197.26
2,3,Kern,8134.65
3,4,Riverside,7209.27
4,5,Siskiyou,6278.77


In [28]:
# Combine land area and population per county. The inner join keeps only the
# counties whose name appears in both tables.
county_density_df = county_size_df.merge(population_df, how='inner', on='County')

In [29]:
# Preview the merged land-area / population table.
county_density_df.head()

Unnamed: 0,Rank,County,Land Area (sq mi),Population Total
0,1,San Bernardino,20068.01,2162532
1,2,Inyo,10197.26,17930
2,3,Kern,8134.65,892458
3,4,Riverside,7209.27,2437864
4,5,Siskiyou,6278.77,43516


In [31]:
# Residents per square mile of land area, computed per county.
county_density_df['human_population_density'] = (
    county_density_df['Population Total']
    .div(county_density_df['Land Area (sq mi)'])
)

In [32]:
# Confirm that the human_population_density column was added.
county_density_df.head()

Unnamed: 0,Rank,County,Land Area (sq mi),Population Total,human_population_density
0,1,San Bernardino,20068.01,2162532,107.760162
1,2,Inyo,10197.26,17930,1.758315
2,3,Kern,8134.65,892458,109.710682
3,4,Riverside,7209.27,2437864,338.156845
4,5,Siskiyou,6278.77,43516,6.930657


In [33]:
# Look at the joined occurrences again: the county name sits in NAME_2,
# while the county tables use a column called County.
joined.head()

Unnamed: 0,species,decimalLatitude,decimalLongitude,elevation,depth,geometry,index_right,GID_2,GID_0,COUNTRY,GID_1,NAME_1,NL_NAME_1,NAME_2,VARNAME_2,NL_NAME_2,TYPE_2,ENGTYPE_2,CC_2,HASC_2
0,Mantis religiosa,38.569964,-122.689257,,,POINT (-122.68926 38.56996),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO
2,Mantis religiosa,38.569659,-122.689175,,,POINT (-122.68918 38.56966),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO
9,Mantis religiosa,38.624724,-122.87158,,,POINT (-122.87158 38.62472),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO
10,Mantis religiosa,38.57523,-122.69398,,,POINT (-122.69398 38.57523),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO
11,Mantis religiosa,38.58435,-122.69993,,,POINT (-122.69993 38.58435),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO


In [35]:
# Rename NAME_2 to County so it matches the key column of county_density_df.
joined = joined.rename({'NAME_2': 'County'}, axis='columns')

In [36]:
# Verify that the former NAME_2 column now appears as County.
joined.head()

Unnamed: 0,species,decimalLatitude,decimalLongitude,elevation,depth,geometry,index_right,GID_2,GID_0,COUNTRY,GID_1,NAME_1,NL_NAME_1,County,VARNAME_2,NL_NAME_2,TYPE_2,ENGTYPE_2,CC_2,HASC_2
0,Mantis religiosa,38.569964,-122.689257,,,POINT (-122.68926 38.56996),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO
2,Mantis religiosa,38.569659,-122.689175,,,POINT (-122.68918 38.56966),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO
9,Mantis religiosa,38.624724,-122.87158,,,POINT (-122.87158 38.62472),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO
10,Mantis religiosa,38.57523,-122.69398,,,POINT (-122.69398 38.57523),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO
11,Mantis religiosa,38.58435,-122.69993,,,POINT (-122.69993 38.58435),231,USA.5.49_1,USA,United States,USA.5_1,California,,Sonoma,,,County,County,,US.CA.SO


In [37]:
# Keep the occurrence fields, the point geometry and the county name;
# every other boundary attribute from the spatial join is discarded.
joined = joined[
    [
        'species',
        'decimalLatitude',
        'decimalLongitude',
        'elevation',
        'depth',
        'geometry',
        'County',
    ]
]

In [38]:
# Preview the trimmed occurrence table.
joined.head()

Unnamed: 0,species,decimalLatitude,decimalLongitude,elevation,depth,geometry,County
0,Mantis religiosa,38.569964,-122.689257,,,POINT (-122.68926 38.56996),Sonoma
2,Mantis religiosa,38.569659,-122.689175,,,POINT (-122.68918 38.56966),Sonoma
9,Mantis religiosa,38.624724,-122.87158,,,POINT (-122.87158 38.62472),Sonoma
10,Mantis religiosa,38.57523,-122.69398,,,POINT (-122.69398 38.57523),Sonoma
11,Mantis religiosa,38.58435,-122.69993,,,POINT (-122.69993 38.58435),Sonoma


In [39]:
# Attach county land area, population and density to each occurrence through
# the shared County column. The inner join drops occurrences whose county name
# has no match in county_density_df.
all_data = pd.merge(left=joined, right=county_density_df, how='inner', on='County')

In [40]:
# Preview the occurrences enriched with county land area, population and density.
all_data.head()

Unnamed: 0,species,decimalLatitude,decimalLongitude,elevation,depth,geometry,County,Rank,Land Area (sq mi),Population Total,human_population_density
0,Mantis religiosa,38.569964,-122.689257,,,POINT (-122.68926 38.56996),Sonoma,29,1575.63,496801,315.303085
1,Mantis religiosa,38.569659,-122.689175,,,POINT (-122.68918 38.56966),Sonoma,29,1575.63,496801,315.303085
2,Mantis religiosa,38.624724,-122.87158,,,POINT (-122.87158 38.62472),Sonoma,29,1575.63,496801,315.303085
3,Mantis religiosa,38.57523,-122.69398,,,POINT (-122.69398 38.57523),Sonoma,29,1575.63,496801,315.303085
4,Mantis religiosa,38.58435,-122.69993,,,POINT (-122.69993 38.58435),Sonoma,29,1575.63,496801,315.303085


In [41]:
# Discard the Rank column that came along from the county land-area table.
all_data = all_data.drop(columns='Rank')

In [42]:
# Check which columns remain after dropping Rank.
all_data.columns

Index(['species', 'decimalLatitude', 'decimalLongitude', 'elevation', 'depth',
       'geometry', 'County', 'Land Area (sq mi)', 'Population Total',
       'human_population_density'],
      dtype='object')

In [43]:
# Load the California climate table from the project's output folder.
climate_data = pd.read_csv(
    '/content/drive/My Drive/Mantises!/output/climatic_data/california_climate_data.csv'
)

In [44]:
# Preview the climate records (one row per State / County / Year in the sample shown).
climate_data.head()

Unnamed: 0,State,County,Year,tmax,tmin,prcp_monttl
0,California,Alameda,1981,,21.91,
1,California,Alpine,1981,,-3.91,
2,California,Amador,1981,,,15.53
3,California,Butte,1981,29.61,,
4,California,Calaveras,1981,,,18.55
