### Set-up

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
!cd /content/drive/MyDrive/Mantises!/

Mounted at /content/drive


In [38]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import ast
import numpy as np
import os
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

### Create main table structure

In [7]:
counties_shapefile = '/content/drive/My Drive/Mantises!/data/gadm41_USA_shp/gadm41_USA_2.shp'
counties = gpd.read_file(counties_shapefile)

# Filter to include only counties in California
california_counties = counties[counties['NAME_1'] == 'California']
merged_table1 = pd.DataFrame(california_counties['NAME_2'].unique(), columns=['County'])


merged_table1.head()


Unnamed: 0,County
0,Alameda
1,Alpine
2,Amador
3,Butte
4,Calaveras


### Join with ecoregion data

In [9]:
# Load the ecoregion data
ecoregion_data_path = '/content/drive/My Drive/Mantises!/output/ecoregions_per_county.csv'
ecoregion_data = pd.read_csv(ecoregion_data_path)

unique_ecoregions = set()
for ecoregions_str in ecoregion_data['Unique_Ecoregions']:
    ecoregions = ast.literal_eval(ecoregions_str)
    unique_ecoregions.update(ecoregions)

# Convert the 'Unique_Ecoregions' column in ecoregion_data to actual sets
ecoregion_data['Unique_Ecoregions'] = ecoregion_data['Unique_Ecoregions'].apply(ast.literal_eval)

# Create a dictionary mapping counties to their ecoregions
county_ecoregion_map = ecoregion_data.set_index('County')['Unique_Ecoregions'].to_dict()

# Add binary columns for each ecoregion
for ecoregion in unique_ecoregions:
    merged_table1['Eco_' + ecoregion.replace(' ', '_')] = merged_table1['County'].map(
        lambda county: ecoregion in county_ecoregion_map.get(county, set())
    )

merged_table1.head()

Unnamed: 0,County,Eco_Central_California_Foothills_and_Coastal_Mountains,Eco_Eastern_Cascades_Slopes_and_Foothills,Eco_Cascades,Eco_Sonoran_Basin_and_Range,Eco_Central_Basin_and_Range,Eco_Klamath_Mountains/California_High_North_Coast_Range,Eco_Northern_Basin_and_Range,Eco_Mojave_Basin_and_Range,Eco_Central_California_Valley,Eco_Southern_California_Mountains,Eco_Sierra_Nevada,Eco_Coast_Range,Eco_Southern_California/Northern_Baja_Coast
0,Alameda,True,False,False,False,False,False,False,False,True,False,False,False,False
1,Alpine,False,False,False,False,True,False,False,False,False,False,True,False,False
2,Amador,True,False,False,False,False,False,False,False,False,False,True,False,False
3,Butte,True,False,True,False,False,False,False,False,True,False,True,False,False
4,Calaveras,True,False,False,False,False,False,False,False,False,False,True,False,False


### Join with human population data

In [11]:
# Load the human population data
human_population_data_path = '/content/drive/My Drive/Mantises!/output/human_population_per_county.csv'
human_population_data = pd.read_csv(human_population_data_path)
human_population_data = human_population_data.rename(columns={'NAME_2': 'County'})
human_population_data = human_population_data[['County', 'population_total']]

# Merge
merged_table2 = pd.merge(merged_table1, human_population_data, on='County', how='left')

merged_table2.head()


Unnamed: 0,County,Eco_Central_California_Foothills_and_Coastal_Mountains,Eco_Eastern_Cascades_Slopes_and_Foothills,Eco_Cascades,Eco_Sonoran_Basin_and_Range,Eco_Central_Basin_and_Range,Eco_Klamath_Mountains/California_High_North_Coast_Range,Eco_Northern_Basin_and_Range,Eco_Mojave_Basin_and_Range,Eco_Central_California_Valley,Eco_Southern_California_Mountains,Eco_Sierra_Nevada,Eco_Coast_Range,Eco_Southern_California/Northern_Baja_Coast,population_total
0,Alameda,True,False,False,False,False,False,False,False,True,False,False,False,False,1661584
1,Alpine,False,False,False,False,True,False,False,False,False,False,True,False,False,1159
2,Amador,True,False,False,False,False,False,False,False,False,False,True,False,False,39023
3,Butte,True,False,True,False,False,False,False,False,True,False,True,False,False,223344
4,Calaveras,True,False,False,False,False,False,False,False,False,False,True,False,False,45828


### Join table with climatic data

In [17]:
# Load the climate data
climate_data_path = '/content/drive/My Drive/Mantises!/output/climatic_data/california_climate_dummy_data.csv'
climate_data = pd.read_csv(climate_data_path)
climate_data = climate_data.drop(columns=['State'])
climate_data_per_county = climate_data.groupby('County').agg({'tmax': 'mean', 'tmin': 'mean', 'prcp_monttl': 'mean'}).reset_index()


# Merge
merged_table3 = pd.merge(merged_table2, climate_data_per_county, on='County', how='left')

merged_table3.head()


Unnamed: 0,County,Eco_Central_California_Foothills_and_Coastal_Mountains,Eco_Eastern_Cascades_Slopes_and_Foothills,Eco_Cascades,Eco_Sonoran_Basin_and_Range,Eco_Central_Basin_and_Range,Eco_Klamath_Mountains/California_High_North_Coast_Range,Eco_Northern_Basin_and_Range,Eco_Mojave_Basin_and_Range,Eco_Central_California_Valley,Eco_Southern_California_Mountains,Eco_Sierra_Nevada,Eco_Coast_Range,Eco_Southern_California/Northern_Baja_Coast,population_total,tmax,tmin,prcp_monttl
0,Alameda,True,False,False,False,False,False,False,False,True,False,False,False,False,1661584,26.572426,10.113893,9.689462
1,Alpine,False,False,False,False,True,False,False,False,False,False,True,False,False,1159,28.133128,8.786456,9.88497
2,Amador,True,False,False,False,False,False,False,False,False,False,True,False,False,39023,26.865087,11.162264,9.877558
3,Butte,True,False,True,False,False,False,False,False,True,False,True,False,False,223344,27.455577,9.70883,10.432373
4,Calaveras,True,False,False,False,False,False,False,False,False,False,True,False,False,45828,28.084286,9.06259,10.175897


### Join county and biodiversity data



In [14]:
biodiversity_data = pd.read_csv(f'/content/drive/My Drive/Mantises!/data/mantodea_ca_gbif_simple.csv', sep='\t')

status = {
    'Mantis religiosa': 'introduced',
    'Stagmomantis californica': 'native',
    'Litaneutria pacifica': 'native',
    'Litaneutria skinneri': 'native',
    'Stagmomantis limbata': 'native',
    'Iris oratoria': 'introduced',
    'Miomantis caffra': 'introduced',
    'Litaneutria ocularis': 'native',
    'Tenodera sinensis': 'introduced',
    'Litaneutria chaparrali': 'native',
    'Hierodula patellifera': 'introduced',
    'Litaneutria minor': 'native',
    'Yersiniops newboldi': 'native',
    'Thesprotia graminis': 'native'
}

# Add status column
biodiversity_data['status'] = biodiversity_data['species'].map(status)

unique_species = biodiversity_data['species'].unique()

biodiversity_data.head()

Unnamed: 0,gbifID,datasetKey,occurrenceID,kingdom,phylum,class,order,family,genus,species,...,dateIdentified,license,rightsHolder,recordedBy,typeStatus,establishmentMeans,lastInterpreted,mediaType,issue,status
0,923926829,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/766957,Animalia,Arthropoda,Insecta,Mantodea,Mantidae,Mantis,Mantis religiosa,...,2014-07-03T20:54:23,CC_BY_NC_4_0,Callahan Charleton,Callahan Charleton,,,2023-09-28T11:04:15.229Z,StillImage,CONTINENT_DERIVED_FROM_COORDINATES;TAXON_MATCH...,introduced
1,923921190,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/750035,Animalia,Arthropoda,Insecta,Mantodea,Mantidae,Mantis,Mantis religiosa,...,2014-06-23T01:27:47,CC0_1_0,Tony Iwane,Tony Iwane,,,2023-09-28T12:35:45.908Z,StillImage,COORDINATE_ROUNDED;CONTINENT_DERIVED_FROM_COOR...,introduced
2,923917961,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/741586,Animalia,Arthropoda,Insecta,Mantodea,Mantidae,Mantis,Mantis religiosa,...,2014-06-18T17:43:13,CC_BY_NC_4_0,Todd Plummer,Todd Plummer,,,2023-09-28T11:04:17.149Z,StillImage,CONTINENT_DERIVED_FROM_COORDINATES;TAXON_MATCH...,introduced
3,899975780,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/645036,Animalia,Arthropoda,Insecta,Mantodea,Mantidae,Mantis,Mantis religiosa,...,2014-04-27T06:25:51,CC_BY_NC_4_0,Eric Jacob,Eric Jacob,,,2023-09-28T12:34:59.254Z,StillImage,CONTINENT_DERIVED_FROM_COORDINATES;TAXON_MATCH...,introduced
4,891755288,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/439613,Animalia,Arthropoda,Insecta,Mantodea,Mantidae,Mantis,Mantis religiosa,...,2013-10-25T04:13:21,CC_BY_NC_4_0,Paul G. Johnson,Paul G. Johnson,,,2023-09-28T11:04:05.380Z,StillImage,CONTINENT_DERIVED_FROM_COORDINATES;TAXON_MATCH...,introduced


In [15]:
# Load the shapefile with US counties
counties_shapefile = '/content/drive/My Drive/Mantises!/data/gadm41_USA_shp/gadm41_USA_2.shp'
counties = gpd.read_file(counties_shapefile)

# Convert the biodiversity data into a GeoDataFrame
geometry = [Point(xy) for xy in zip(biodiversity_data['decimalLongitude'], biodiversity_data['decimalLatitude'])]
biodiversity_gdf = gpd.GeoDataFrame(biodiversity_data, geometry=geometry)
biodiversity_gdf.set_crs(counties.crs, inplace=True)

# Spatial join to find which county each point falls into
joined = gpd.sjoin(biodiversity_gdf, counties, how='inner', predicate='within')

# Aggregate data to count occurrences per species, status, and county
biodiv_per_county = joined.groupby(['species', 'status', 'NAME_2']).size().reset_index(name='count')
biodiv_per_county = biodiv_per_county.rename(columns={'NAME_2': 'County'})

biodiv_per_county.head()

Unnamed: 0,species,status,County,count
0,Hierodula patellifera,introduced,Alameda,1
1,Iris oratoria,introduced,Alameda,1
2,Iris oratoria,introduced,Butte,16
3,Iris oratoria,introduced,Contra Costa,11
4,Iris oratoria,introduced,Fresno,11


In [20]:
# Initialize columns for each species
for species in unique_species:
    merged_table3[species] = 0

# Update the species column values
for index, row in biodiv_per_county.iterrows():
    species = row['species']
    county = row['County']
    count = row['count']

    if count > 0:
        # Update the species column for the county to 1 (presence)
        merged_table3.loc[merged_table3['County'] == county, species] = 1

# Display the updated merged_table
merged_table3.head()

      County  Eco_Central_California_Foothills_and_Coastal_Mountains  \
0    Alameda                                               True        
1     Alpine                                              False        
2     Amador                                               True        
3      Butte                                               True        
4  Calaveras                                               True        

   Eco_Eastern_Cascades_Slopes_and_Foothills  Eco_Cascades  \
0                                      False         False   
1                                      False         False   
2                                      False         False   
3                                      False          True   
4                                      False         False   

   Eco_Sonoran_Basin_and_Range  Eco_Central_Basin_and_Range  \
0                        False                        False   
1                        False                         True   
2    

### Decision trees per species

In [30]:
# Define ecoregion columns based on unique_ecoregions set
ecoregion_columns = ['Eco_' + ecoregion.replace(' ', '_') for ecoregion in unique_ecoregions]

# Output directory for decision tree visualizations
output_dir = '/content/drive/My Drive/Mantises!/output/decision_trees'
os.makedirs(output_dir, exist_ok=True)

# Iterate over each species to build a model and visualize the decision tree
for species in unique_species:
    # Prepare features (X) and target variable (y)
    X = merged_table3[ecoregion_columns]
    y = merged_table3[species]

    # Check if there's enough data to train the model
    if y.nunique() <= 1:
        print(f"Skipping model for {species}: insufficient variation in target variable.")
        continue

    # Initialize and train the decision tree classifier
    model = DecisionTreeClassifier(criterion='entropy', random_state=42)
    model.fit(X, y)

    # Visualize the decision tree
    plt.figure(figsize=(20, 10))
    plot_tree(model, feature_names=ecoregion_columns, class_names=['Absent', 'Present'], filled=True)
    plt.title(f"Decision Tree for {species}")

    # Save the figure
    plt.savefig(os.path.join(output_dir, f"decision_tree_{species}.png"))
    plt.close()

    # Extract and print feature importances
    feature_importances = model.feature_importances_
    importance_df = pd.DataFrame(feature_importances, index=ecoregion_columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
    print(f"Feature Importances for {species}:\n", importance_df)
    print("\n")


Feature Importances for Mantis religiosa:
                                                     Importance
Eco_Central_California_Foothills_and_Coastal_Mo...    0.205653
Eco_Cascades                                          0.137736
Eco_Klamath_Mountains/California_High_North_Coa...    0.130165
Eco_Sonoran_Basin_and_Range                           0.118756
Eco_Mojave_Basin_and_Range                            0.095725
Eco_Sierra_Nevada                                     0.088841
Eco_Southern_California/Northern_Baja_Coast           0.084494
Eco_Central_California_Valley                         0.072480
Eco_Southern_California_Mountains                     0.066149
Eco_Eastern_Cascades_Slopes_and_Foothills             0.000000
Eco_Central_Basin_and_Range                           0.000000
Eco_Northern_Basin_and_Range                          0.000000
Eco_Coast_Range                                       0.000000


Feature Importances for Stagmomantis californica:
                       

### Decision trees for introduced/native

In [37]:
# Convert all column names to strings
merged_table3.columns = merged_table3.columns.map(str)

# Define ecoregion columns
ecoregion_columns = [col for col in merged_table3.columns if col.startswith('Eco_')]

# Features for the model
feature_columns = ['tmax', 'tmin', 'prcp_monttl', 'population_total'] + ecoregion_columns

# Prepare features (X) and target variable (y)
X = merged_data[feature_columns]
y = merged_data['status'].apply(lambda x: 1 if x == 'introduced' else 0)

# Initialize and train the decision tree classifier
model = DecisionTreeClassifier(criterion='entropy', random_state=42)
model.fit(X, y)

# Visualize the decision tree
plt.figure(figsize=(20, 10))
plot_tree(model, feature_names=feature_columns, class_names=['Native', 'Introduced'], filled=True)
plt.title("Decision Tree for Introduced vs. Native Species")

# Save the figure
plt.savefig(os.path.join(output_dir, "decision_tree_introduced_vs_native.png"))
plt.close()

# Extract and print feature importances
feature_importances = model.feature_importances_
importance_df = pd.DataFrame(feature_importances, index=feature_columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
print("Feature Importances for Introduced vs. Native Species:\n", importance_df)


Feature Importances for Introduced vs. Native Species:
                                                     Importance
tmin                                                  0.235126
tmax                                                  0.176623
prcp_monttl                                           0.129455
Eco_Central_California_Valley                         0.126558
population_total                                      0.123293
Eco_Central_California_Foothills_and_Coastal_Mo...    0.112166
Eco_Southern_California_Mountains                     0.059572
Eco_Sierra_Nevada                                     0.037208
Eco_Mojave_Basin_and_Range                            0.000000
Eco_Coast_Range                                       0.000000
Eco_Central_Basin_and_Range                           0.000000
Eco_Northern_Basin_and_Range                          0.000000
Eco_Klamath_Mountains/California_High_North_Coa...    0.000000
Eco_Sonoran_Basin_and_Range                           0.000000