### Set-up

In [19]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
!cd /content/drive/MyDrive/Mantises!/

Mounted at /content/drive


In [41]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import ast
import numpy as np
import os
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
import matplotlib.pyplot as plt

### Create main table structure

In [21]:
counties_shapefile = '/content/drive/My Drive/Mantises!/data/gadm41_USA_shp/gadm41_USA_2.shp'
counties = gpd.read_file(counties_shapefile)

# Filter to include only counties in California
california_counties = counties[counties['NAME_1'] == 'California']
merged_table1 = pd.DataFrame(california_counties['NAME_2'].unique(), columns=['County'])


merged_table1.head()


Unnamed: 0,County
0,Alameda
1,Alpine
2,Amador
3,Butte
4,Calaveras


### Join with ecoregion data

In [22]:
# Load the ecoregion data
ecoregion_data_path = '/content/drive/My Drive/Mantises!/output/ecoregions_per_county.csv'
ecoregion_data = pd.read_csv(ecoregion_data_path)

unique_ecoregions = set()
for ecoregions_str in ecoregion_data['Unique_Ecoregions']:
    ecoregions = ast.literal_eval(ecoregions_str)
    unique_ecoregions.update(ecoregions)

# Convert the 'Unique_Ecoregions' column in ecoregion_data to actual sets
ecoregion_data['Unique_Ecoregions'] = ecoregion_data['Unique_Ecoregions'].apply(ast.literal_eval)

# Create a dictionary mapping counties to their ecoregions
county_ecoregion_map = ecoregion_data.set_index('County')['Unique_Ecoregions'].to_dict()

# Add binary columns for each ecoregion
for ecoregion in unique_ecoregions:
    merged_table1['Eco_' + ecoregion.replace(' ', '_')] = merged_table1['County'].map(
        lambda county: ecoregion in county_ecoregion_map.get(county, set())
    )

merged_table1.head()

Unnamed: 0,County,Eco_Central_Basin_and_Range,Eco_Mojave_Basin_and_Range,Eco_Coast_Range,Eco_Southern_California/Northern_Baja_Coast,Eco_Northern_Basin_and_Range,Eco_Eastern_Cascades_Slopes_and_Foothills,Eco_Southern_California_Mountains,Eco_Sierra_Nevada,Eco_Cascades,Eco_Central_California_Valley,Eco_Sonoran_Basin_and_Range,Eco_Klamath_Mountains/California_High_North_Coast_Range,Eco_Central_California_Foothills_and_Coastal_Mountains
0,Alameda,False,False,False,False,False,False,False,False,False,True,False,False,True
1,Alpine,True,False,False,False,False,False,False,True,False,False,False,False,False
2,Amador,False,False,False,False,False,False,False,True,False,False,False,False,True
3,Butte,False,False,False,False,False,False,False,True,True,True,False,False,True
4,Calaveras,False,False,False,False,False,False,False,True,False,False,False,False,True


### Join with human population data

In [23]:
# Load the human population data
human_population_data_path = '/content/drive/My Drive/Mantises!/output/human_population_per_county.csv'
human_population_data = pd.read_csv(human_population_data_path)
human_population_data = human_population_data.rename(columns={'NAME_2': 'County'})
human_population_data = human_population_data[['County', 'population_total']]

# Merge
merged_table2 = pd.merge(merged_table1, human_population_data, on='County', how='left')

merged_table2.head()


Unnamed: 0,County,Eco_Central_Basin_and_Range,Eco_Mojave_Basin_and_Range,Eco_Coast_Range,Eco_Southern_California/Northern_Baja_Coast,Eco_Northern_Basin_and_Range,Eco_Eastern_Cascades_Slopes_and_Foothills,Eco_Southern_California_Mountains,Eco_Sierra_Nevada,Eco_Cascades,Eco_Central_California_Valley,Eco_Sonoran_Basin_and_Range,Eco_Klamath_Mountains/California_High_North_Coast_Range,Eco_Central_California_Foothills_and_Coastal_Mountains,population_total
0,Alameda,False,False,False,False,False,False,False,False,False,True,False,False,True,1661584
1,Alpine,True,False,False,False,False,False,False,True,False,False,False,False,False,1159
2,Amador,False,False,False,False,False,False,False,True,False,False,False,False,True,39023
3,Butte,False,False,False,False,False,False,False,True,True,True,False,False,True,223344
4,Calaveras,False,False,False,False,False,False,False,True,False,False,False,False,True,45828


### Join table with climatic data

In [24]:
# Load the climate data
climate_data_path = '/content/drive/My Drive/Mantises!/output/climatic_data/california_climate_dummy_data.csv'
climate_data = pd.read_csv(climate_data_path)
climate_data = climate_data.drop(columns=['State'])
climate_data_per_county = climate_data.groupby('County').agg({'tmax': 'mean', 'tmin': 'mean', 'prcp_monttl': 'mean'}).reset_index()


# Merge
merged_table3 = pd.merge(merged_table2, climate_data_per_county, on='County', how='left')

merged_table3.head()


Unnamed: 0,County,Eco_Central_Basin_and_Range,Eco_Mojave_Basin_and_Range,Eco_Coast_Range,Eco_Southern_California/Northern_Baja_Coast,Eco_Northern_Basin_and_Range,Eco_Eastern_Cascades_Slopes_and_Foothills,Eco_Southern_California_Mountains,Eco_Sierra_Nevada,Eco_Cascades,Eco_Central_California_Valley,Eco_Sonoran_Basin_and_Range,Eco_Klamath_Mountains/California_High_North_Coast_Range,Eco_Central_California_Foothills_and_Coastal_Mountains,population_total,tmax,tmin,prcp_monttl
0,Alameda,False,False,False,False,False,False,False,False,False,True,False,False,True,1661584,26.572426,10.113893,9.689462
1,Alpine,True,False,False,False,False,False,False,True,False,False,False,False,False,1159,28.133128,8.786456,9.88497
2,Amador,False,False,False,False,False,False,False,True,False,False,False,False,True,39023,26.865087,11.162264,9.877558
3,Butte,False,False,False,False,False,False,False,True,True,True,False,False,True,223344,27.455577,9.70883,10.432373
4,Calaveras,False,False,False,False,False,False,False,True,False,False,False,False,True,45828,28.084286,9.06259,10.175897


### Join county and biodiversity data



In [25]:
biodiversity_data = pd.read_csv(f'/content/drive/My Drive/Mantises!/data/mantodea_ca_gbif_simple.csv', sep='\t')

status = {
    'Mantis religiosa': 'introduced',
    'Stagmomantis californica': 'native',
    'Litaneutria pacifica': 'native',
    'Litaneutria skinneri': 'native',
    'Stagmomantis limbata': 'native',
    'Iris oratoria': 'introduced',
    'Miomantis caffra': 'introduced',
    'Litaneutria ocularis': 'native',
    'Tenodera sinensis': 'introduced',
    'Litaneutria chaparrali': 'native',
    'Hierodula patellifera': 'introduced',
    'Litaneutria minor': 'native',
    'Yersiniops newboldi': 'native',
    'Thesprotia graminis': 'native'
}

# Add status column
biodiversity_data['status'] = biodiversity_data['species'].map(status)

unique_species = biodiversity_data['species'].unique()
unique_species = [species for species in unique_species if not pd.isna(species)]

biodiversity_data.head()

Unnamed: 0,gbifID,datasetKey,occurrenceID,kingdom,phylum,class,order,family,genus,species,...,dateIdentified,license,rightsHolder,recordedBy,typeStatus,establishmentMeans,lastInterpreted,mediaType,issue,status
0,923926829,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/766957,Animalia,Arthropoda,Insecta,Mantodea,Mantidae,Mantis,Mantis religiosa,...,2014-07-03T20:54:23,CC_BY_NC_4_0,Callahan Charleton,Callahan Charleton,,,2023-09-28T11:04:15.229Z,StillImage,CONTINENT_DERIVED_FROM_COORDINATES;TAXON_MATCH...,introduced
1,923921190,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/750035,Animalia,Arthropoda,Insecta,Mantodea,Mantidae,Mantis,Mantis religiosa,...,2014-06-23T01:27:47,CC0_1_0,Tony Iwane,Tony Iwane,,,2023-09-28T12:35:45.908Z,StillImage,COORDINATE_ROUNDED;CONTINENT_DERIVED_FROM_COOR...,introduced
2,923917961,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/741586,Animalia,Arthropoda,Insecta,Mantodea,Mantidae,Mantis,Mantis religiosa,...,2014-06-18T17:43:13,CC_BY_NC_4_0,Todd Plummer,Todd Plummer,,,2023-09-28T11:04:17.149Z,StillImage,CONTINENT_DERIVED_FROM_COORDINATES;TAXON_MATCH...,introduced
3,899975780,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/645036,Animalia,Arthropoda,Insecta,Mantodea,Mantidae,Mantis,Mantis religiosa,...,2014-04-27T06:25:51,CC_BY_NC_4_0,Eric Jacob,Eric Jacob,,,2023-09-28T12:34:59.254Z,StillImage,CONTINENT_DERIVED_FROM_COORDINATES;TAXON_MATCH...,introduced
4,891755288,50c9509d-22c7-4a22-a47d-8c48425ef4a7,http://www.inaturalist.org/observations/439613,Animalia,Arthropoda,Insecta,Mantodea,Mantidae,Mantis,Mantis religiosa,...,2013-10-25T04:13:21,CC_BY_NC_4_0,Paul G. Johnson,Paul G. Johnson,,,2023-09-28T11:04:05.380Z,StillImage,CONTINENT_DERIVED_FROM_COORDINATES;TAXON_MATCH...,introduced


In [26]:
# Load the shapefile with US counties
counties_shapefile = '/content/drive/My Drive/Mantises!/data/gadm41_USA_shp/gadm41_USA_2.shp'
counties = gpd.read_file(counties_shapefile)

# Convert the biodiversity data into a GeoDataFrame
geometry = [Point(xy) for xy in zip(biodiversity_data['decimalLongitude'], biodiversity_data['decimalLatitude'])]
biodiversity_gdf = gpd.GeoDataFrame(biodiversity_data, geometry=geometry)
biodiversity_gdf.set_crs(counties.crs, inplace=True)

# Spatial join to find which county each point falls into
joined = gpd.sjoin(biodiversity_gdf, counties, how='inner', predicate='within')

# Aggregate data to count occurrences per species, status, and county
biodiv_per_county = joined.groupby(['species', 'status', 'NAME_2']).size().reset_index(name='count')
biodiv_per_county = biodiv_per_county.rename(columns={'NAME_2': 'County'})

biodiv_per_county.head()

Unnamed: 0,species,status,County,count
0,Hierodula patellifera,introduced,Alameda,1
1,Iris oratoria,introduced,Alameda,1
2,Iris oratoria,introduced,Butte,16
3,Iris oratoria,introduced,Contra Costa,11
4,Iris oratoria,introduced,Fresno,11


In [30]:
# Initialize columns for each species
for species in unique_species:
    merged_table3[species] = 0

# Update the species column values
for index, row in biodiv_per_county.iterrows():
    species = row['species']
    county = row['County']
    count = row['count']

    if count > 0:
        # Update the species column for the county to 1 (presence)
        merged_table3.loc[merged_table3['County'] == county, species] = 1

# Display the updated merged_table
merged_table3.head()

Unnamed: 0,County,Eco_Central_Basin_and_Range,Eco_Mojave_Basin_and_Range,Eco_Coast_Range,Eco_Southern_California/Northern_Baja_Coast,Eco_Northern_Basin_and_Range,Eco_Eastern_Cascades_Slopes_and_Foothills,Eco_Southern_California_Mountains,Eco_Sierra_Nevada,Eco_Cascades,...,Stagmomantis limbata,Iris oratoria,Miomantis caffra,Litaneutria ocularis,Tenodera sinensis,Litaneutria chaparrali,Hierodula patellifera,Litaneutria minor,Yersiniops newboldi,Thesprotia graminis
0,Alameda,False,False,False,False,False,False,False,False,False,...,1,1,0,1,1,0,1,0,0,0
1,Alpine,True,False,False,False,False,False,False,True,False,...,0,0,0,0,0,0,0,0,0,0
2,Amador,False,False,False,False,False,False,False,True,False,...,1,0,0,0,0,0,0,0,0,0
3,Butte,False,False,False,False,False,False,False,True,True,...,1,1,0,1,0,0,0,0,0,0
4,Calaveras,False,False,False,False,False,False,False,True,False,...,1,0,0,0,0,0,0,0,0,0


### Decision trees per species

In [38]:
# Define ecoregion columns based on unique_ecoregions set
ecoregion_columns = ['Eco_' + ecoregion.replace(' ', '_') for ecoregion in unique_ecoregions]

# Prepare a DataFrame to store metrics for each species
metrics_df = pd.DataFrame(columns=['Species', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Output directory for decision tree visualizations
output_dir = '/content/drive/My Drive/Mantises!/output/decision_trees'
os.makedirs(output_dir, exist_ok=True)

# Iterate over each species to build a model and visualize the decision tree
for species in unique_species:
    # Prepare features (X) and target variable (y)
    X = merged_table3[ecoregion_columns]
    y = merged_table3[species]

    # Check if there's enough data to train the model
    if y.nunique() <= 1:
        print(f"Skipping model for {species}: insufficient variation in target variable.")
        continue

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the decision tree classifier
    model = DecisionTreeClassifier(criterion='entropy', random_state=42)
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')

    # Append metrics to the DataFrame
    metrics_df = metrics_df.append({
        'Species': species,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }, ignore_index=True)

    # Visualize the decision tree
    plt.figure(figsize=(20, 10))
    plot_tree(model, feature_names=ecoregion_columns, class_names=['Absent', 'Present'], filled=True)
    plt.title(f"Decision Tree for {species}")

    # Save the figure
    plt.savefig(os.path.join(output_dir, f"decision_tree_{species}.png"))
    plt.close()

    # Extract and print feature importances
    feature_importances = model.feature_importances_
    importance_df = pd.DataFrame(feature_importances, index=ecoregion_columns, columns=['Importance']).sort_values(by='Importance', ascending=False)
    print(f"Feature Importances for {species}:\n", importance_df)
    print("\n")

# Export the metrics to CSV
metrics_csv_path = '/content/drive/My Drive/Mantises!/output/decision_trees/species_decision_tree_metrics.csv'
metrics_df.to_csv(metrics_csv_path, index=False)

metrics_df.head()


  metrics_df = metrics_df.append({


Feature Importances for Mantis religiosa:
                                                     Importance
Eco_Southern_California/Northern_Baja_Coast           0.267081
Eco_Sierra_Nevada                                     0.194368
Eco_Central_California_Foothills_and_Coastal_Mo...    0.156554
Eco_Cascades                                          0.131476
Eco_Southern_California_Mountains                     0.123981
Eco_Central_California_Valley                         0.094319
Eco_Klamath_Mountains/California_High_North_Coa...    0.032221
Eco_Central_Basin_and_Range                           0.000000
Eco_Mojave_Basin_and_Range                            0.000000
Eco_Coast_Range                                       0.000000
Eco_Northern_Basin_and_Range                          0.000000
Eco_Eastern_Cascades_Slopes_and_Foothills             0.000000
Eco_Sonoran_Basin_and_Range                           0.000000




  metrics_df = metrics_df.append({


Feature Importances for Stagmomantis californica:
                                                     Importance
Eco_Southern_California/Northern_Baja_Coast           0.390246
Eco_Central_California_Foothills_and_Coastal_Mo...    0.156238
Eco_Klamath_Mountains/California_High_North_Coa...    0.152534
Eco_Central_California_Valley                         0.139085
Eco_Coast_Range                                       0.058296
Eco_Mojave_Basin_and_Range                            0.054854
Eco_Sierra_Nevada                                     0.048747
Eco_Central_Basin_and_Range                           0.000000
Eco_Northern_Basin_and_Range                          0.000000
Eco_Eastern_Cascades_Slopes_and_Foothills             0.000000
Eco_Southern_California_Mountains                     0.000000
Eco_Cascades                                          0.000000
Eco_Sonoran_Basin_and_Range                           0.000000




  metrics_df = metrics_df.append({


Feature Importances for Litaneutria pacifica:
                                                     Importance
Eco_Mojave_Basin_and_Range                            0.274823
Eco_Central_California_Foothills_and_Coastal_Mo...    0.226557
Eco_Central_Basin_and_Range                           0.156990
Eco_Klamath_Mountains/California_High_North_Coa...    0.114642
Eco_Southern_California/Northern_Baja_Coast           0.076539
Eco_Cascades                                          0.065893
Eco_Central_California_Valley                         0.058175
Eco_Coast_Range                                       0.024155
Eco_Sierra_Nevada                                     0.002225
Eco_Northern_Basin_and_Range                          0.000000
Eco_Eastern_Cascades_Slopes_and_Foothills             0.000000
Eco_Southern_California_Mountains                     0.000000
Eco_Sonoran_Basin_and_Range                           0.000000




  _warn_prf(average, modifier, msg_start, len(result))
  metrics_df = metrics_df.append({


Feature Importances for Litaneutria skinneri:
                                                     Importance
Eco_Sierra_Nevada                                     0.500165
Eco_Cascades                                          0.208606
Eco_Central_California_Valley                         0.158234
Eco_Central_Basin_and_Range                           0.092521
Eco_Mojave_Basin_and_Range                            0.040473
Eco_Coast_Range                                       0.000000
Eco_Southern_California/Northern_Baja_Coast           0.000000
Eco_Northern_Basin_and_Range                          0.000000
Eco_Eastern_Cascades_Slopes_and_Foothills             0.000000
Eco_Southern_California_Mountains                     0.000000
Eco_Sonoran_Basin_and_Range                           0.000000
Eco_Klamath_Mountains/California_High_North_Coa...    0.000000
Eco_Central_California_Foothills_and_Coastal_Mo...    0.000000




  metrics_df = metrics_df.append({


Feature Importances for Stagmomantis limbata:
                                                     Importance
Eco_Cascades                                          0.270023
Eco_Klamath_Mountains/California_High_North_Coa...    0.226413
Eco_Central_Basin_and_Range                           0.214203
Eco_Central_California_Valley                         0.098850
Eco_Sierra_Nevada                                     0.071558
Eco_Coast_Range                                       0.063026
Eco_Southern_California/Northern_Baja_Coast           0.050526
Eco_Central_California_Foothills_and_Coastal_Mo...    0.005401
Eco_Mojave_Basin_and_Range                            0.000000
Eco_Northern_Basin_and_Range                          0.000000
Eco_Eastern_Cascades_Slopes_and_Foothills             0.000000
Eco_Southern_California_Mountains                     0.000000
Eco_Sonoran_Basin_and_Range                           0.000000




  metrics_df = metrics_df.append({


Feature Importances for Iris oratoria:
                                                     Importance
Eco_Central_California_Valley                         0.391976
Eco_Southern_California/Northern_Baja_Coast           0.337635
Eco_Central_California_Foothills_and_Coastal_Mo...    0.157471
Eco_Coast_Range                                       0.043113
Eco_Sierra_Nevada                                     0.038869
Eco_Klamath_Mountains/California_High_North_Coa...    0.024369
Eco_Southern_California_Mountains                     0.006566
Eco_Central_Basin_and_Range                           0.000000
Eco_Mojave_Basin_and_Range                            0.000000
Eco_Northern_Basin_and_Range                          0.000000
Eco_Eastern_Cascades_Slopes_and_Foothills             0.000000
Eco_Cascades                                          0.000000
Eco_Sonoran_Basin_and_Range                           0.000000




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  metrics_df = metrics_df.append({


Feature Importances for Miomantis caffra:
                                                     Importance
Eco_Southern_California/Northern_Baja_Coast           0.283193
Eco_Sierra_Nevada                                     0.246540
Eco_Mojave_Basin_and_Range                            0.240177
Eco_Sonoran_Basin_and_Range                           0.099845
Eco_Central_California_Valley                         0.092192
Eco_Klamath_Mountains/California_High_North_Coa...    0.024049
Eco_Central_California_Foothills_and_Coastal_Mo...    0.014004
Eco_Central_Basin_and_Range                           0.000000
Eco_Coast_Range                                       0.000000
Eco_Northern_Basin_and_Range                          0.000000
Eco_Eastern_Cascades_Slopes_and_Foothills             0.000000
Eco_Southern_California_Mountains                     0.000000
Eco_Cascades                                          0.000000




  _warn_prf(average, modifier, msg_start, len(result))
  metrics_df = metrics_df.append({


Feature Importances for Litaneutria ocularis:
                                                     Importance
Eco_Southern_California_Mountains                     0.589390
Eco_Central_California_Valley                         0.238221
Eco_Mojave_Basin_and_Range                            0.068116
Eco_Klamath_Mountains/California_High_North_Coa...    0.038077
Eco_Sonoran_Basin_and_Range                           0.037078
Eco_Central_California_Foothills_and_Coastal_Mo...    0.021161
Eco_Sierra_Nevada                                     0.007957
Eco_Central_Basin_and_Range                           0.000000
Eco_Coast_Range                                       0.000000
Eco_Southern_California/Northern_Baja_Coast           0.000000
Eco_Northern_Basin_and_Range                          0.000000
Eco_Eastern_Cascades_Slopes_and_Foothills             0.000000
Eco_Cascades                                          0.000000




  metrics_df = metrics_df.append({


Feature Importances for Tenodera sinensis:
                                                     Importance
Eco_Klamath_Mountains/California_High_North_Coa...    0.188971
Eco_Southern_California/Northern_Baja_Coast           0.172950
Eco_Southern_California_Mountains                     0.171558
Eco_Coast_Range                                       0.135544
Eco_Mojave_Basin_and_Range                            0.128955
Eco_Central_California_Foothills_and_Coastal_Mo...    0.088436
Eco_Sierra_Nevada                                     0.060999
Eco_Sonoran_Basin_and_Range                           0.041969
Eco_Central_California_Valley                         0.010618
Eco_Central_Basin_and_Range                           0.000000
Eco_Northern_Basin_and_Range                          0.000000
Eco_Eastern_Cascades_Slopes_and_Foothills             0.000000
Eco_Cascades                                          0.000000




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  metrics_df = metrics_df.append({


Feature Importances for Litaneutria chaparrali:
                                                     Importance
Eco_Southern_California_Mountains                      0.59589
Eco_Southern_California/Northern_Baja_Coast            0.40411
Eco_Central_Basin_and_Range                            0.00000
Eco_Mojave_Basin_and_Range                             0.00000
Eco_Coast_Range                                        0.00000
Eco_Northern_Basin_and_Range                           0.00000
Eco_Eastern_Cascades_Slopes_and_Foothills              0.00000
Eco_Sierra_Nevada                                      0.00000
Eco_Cascades                                           0.00000
Eco_Central_California_Valley                          0.00000
Eco_Sonoran_Basin_and_Range                            0.00000
Eco_Klamath_Mountains/California_High_North_Coa...     0.00000
Eco_Central_California_Foothills_and_Coastal_Mo...     0.00000




  _warn_prf(average, modifier, msg_start, len(result))
  metrics_df = metrics_df.append({


Feature Importances for Hierodula patellifera:
                                                     Importance
Eco_Central_Basin_and_Range                                0.0
Eco_Mojave_Basin_and_Range                                 0.0
Eco_Coast_Range                                            0.0
Eco_Southern_California/Northern_Baja_Coast                0.0
Eco_Northern_Basin_and_Range                               0.0
Eco_Eastern_Cascades_Slopes_and_Foothills                  0.0
Eco_Southern_California_Mountains                          0.0
Eco_Sierra_Nevada                                          0.0
Eco_Cascades                                               0.0
Eco_Central_California_Valley                              0.0
Eco_Sonoran_Basin_and_Range                                0.0
Eco_Klamath_Mountains/California_High_North_Coa...         0.0
Eco_Central_California_Foothills_and_Coastal_Mo...         0.0




  _warn_prf(average, modifier, msg_start, len(result))
  metrics_df = metrics_df.append({


Feature Importances for Litaneutria minor:
                                                     Importance
Eco_Southern_California/Northern_Baja_Coast           0.617424
Eco_Sierra_Nevada                                     0.131941
Eco_Southern_California_Mountains                     0.092985
Eco_Central_California_Valley                         0.084142
Eco_Mojave_Basin_and_Range                            0.066812
Eco_Central_California_Foothills_and_Coastal_Mo...    0.006696
Eco_Central_Basin_and_Range                           0.000000
Eco_Coast_Range                                       0.000000
Eco_Northern_Basin_and_Range                          0.000000
Eco_Eastern_Cascades_Slopes_and_Foothills             0.000000
Eco_Cascades                                          0.000000
Eco_Sonoran_Basin_and_Range                           0.000000
Eco_Klamath_Mountains/California_High_North_Coa...    0.000000




  _warn_prf(average, modifier, msg_start, len(result))
  metrics_df = metrics_df.append({


Feature Importances for Yersiniops newboldi:
                                                     Importance
Eco_Sierra_Nevada                                     0.479815
Eco_Central_California_Valley                         0.440989
Eco_Mojave_Basin_and_Range                            0.079196
Eco_Central_Basin_and_Range                           0.000000
Eco_Coast_Range                                       0.000000
Eco_Southern_California/Northern_Baja_Coast           0.000000
Eco_Northern_Basin_and_Range                          0.000000
Eco_Eastern_Cascades_Slopes_and_Foothills             0.000000
Eco_Southern_California_Mountains                     0.000000
Eco_Cascades                                          0.000000
Eco_Sonoran_Basin_and_Range                           0.000000
Eco_Klamath_Mountains/California_High_North_Coa...    0.000000
Eco_Central_California_Foothills_and_Coastal_Mo...    0.000000




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))
  metrics_df = metrics_df.append({


Feature Importances for Thesprotia graminis:
                                                     Importance
Eco_Klamath_Mountains/California_High_North_Coa...     0.61617
Eco_Central_California_Foothills_and_Coastal_Mo...     0.38383
Eco_Central_Basin_and_Range                            0.00000
Eco_Mojave_Basin_and_Range                             0.00000
Eco_Coast_Range                                        0.00000
Eco_Southern_California/Northern_Baja_Coast            0.00000
Eco_Northern_Basin_and_Range                           0.00000
Eco_Eastern_Cascades_Slopes_and_Foothills              0.00000
Eco_Southern_California_Mountains                      0.00000
Eco_Sierra_Nevada                                      0.00000
Eco_Cascades                                           0.00000
Eco_Central_California_Valley                          0.00000
Eco_Sonoran_Basin_and_Range                            0.00000




Unnamed: 0,Species,Accuracy,Precision,Recall,F1 Score
0,Mantis religiosa,0.75,0.9,0.818182,0.857143
1,Stagmomantis californica,0.583333,0.0,0.0,0.0
2,Litaneutria pacifica,0.333333,0.25,0.166667,0.2
3,Litaneutria skinneri,0.833333,0.0,0.0,0.0
4,Stagmomantis limbata,0.25,0.2,0.166667,0.181818


### Decision trees for introduced/native

In [44]:
# Aggregate environmental data for each species, including ecoregions
species_data = pd.DataFrame(index=unique_species)
for species in unique_species:
    species_rows = merged_table3[merged_table3[species] == 1]  # Rows where the species is present
    for col in ['tmax', 'tmin', 'prcp_monttl', 'population_total'] + ecoregion_columns:
        # Mean for continuous variables, mode (most common value) for ecoregions
        if col in ecoregion_columns:
            species_data.at[species, col] = species_rows[col].mode()[0]
        else:
            species_data.at[species, col] = species_rows[col].mean()

# Add species status
species_data['status'] = [1 if status[species] == 'introduced' else 0 for species in species_data.index]

# Split data into features (X) and target (y)
X = species_data.drop(columns=['status'])
y = species_data['status']

# Decision Tree Model
model = DecisionTreeClassifier(criterion='entropy', random_state=42)
model.fit(X, y)

# Cross-validation
scoring = {'accuracy': 'accuracy',
           'precision': make_scorer(precision_score, zero_division=0),
           'recall': make_scorer(recall_score, zero_division=0),
           'f1': make_scorer(f1_score, zero_division=0)}

cv_results = cross_validate(model, X, y, cv=5, scoring=scoring)

# Average across folds
metrics_avg = {metric: np.mean(cv_results['test_' + metric]) for metric in scoring.keys()}

print("Cross-validated metrics:")
print(metrics_avg)

# Save metrics to CSV
output_dir = '/content/drive/My Drive/Mantises!/output/decision_trees'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
metrics_csv_path = os.path.join(output_dir, 'introduced_native_decision_tree_metrics.csv')
pd.DataFrame([metrics_avg]).to_csv(metrics_csv_path, index=False)

# Decision Tree Visualization
plt.figure(figsize=(20, 10))
plot_tree(model, feature_names=X.columns, class_names=['Native', 'Introduced'], filled=True)
plt.title("Decision Tree for Species Status Prediction")
plt.savefig(os.path.join(output_dir, 'introduced_native_decision_tree.png'))
plt.close()

print(f"Metrics saved to: {metrics_csv_path}")


Cross-validated metrics:
{'accuracy': 0.4, 'precision': 0.26666666666666666, 'recall': 0.4, 'f1': 0.3}
Metrics saved to: /content/drive/My Drive/Mantises!/output/decision_trees/introduced_native_decision_tree_metrics.csv
