In [1]:
import pandas as pd
import numpy as np
import sqlite3
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

In [2]:
conn = sqlite3.connect('../usdaplantsapi/usdadb_new.sqlite3')
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
print(cursor.fetchall())

[('usda',)]


In [3]:
c = conn.cursor()

In [4]:
data = pd.read_sql("SELECT * FROM usda", conn)

In [5]:
data

Unnamed: 0,id,Symbol,Accepted_Symbol_x,Synonym_Symbol_x,Scientific_Name_x,Hybrid_Genus_Indicator,Hybrid_Species_Indicator,Species,Subspecies_Prefix,Hybrid_Subspecies_Indicator,...,Naval_Store_Product,Nursery_Stock_Product,Palatable_Browse_Animal,Palatable_Graze_Animal,Palatable_Human,Post_Product,Protein_Potential,Pulpwood_Product,Veneer_Product,Genus
0,1,ABAB,ABAB,,Abutilon abutiloides (Jacq.) Garcke ex Hochr.,,,abutiloides,,,...,,,,,,,,,,Abutilon
1,2,ABAB2,ABPR3,ABAB2,"Abrus abrus (L.) W. Wight, nom. inval.",,,abrus,,,...,,,,,,,,,,Abrus
2,3,ABAB3,ABTH,ABAB3,"Abutilon abutilon (L.) Rusby, nom. inval.",,,abutilon,,,...,,,,,,,,,,Abutilon
3,4,ABAB70,ABAB70,,Abietinella abietina (Hedw.) Fleisch.,,,abietina,,,...,,,,,,,,,,Abietinella
4,5,ABAC,ABUMB,ABAC,Abronia acutalata Standl.,,,acutalata,,,...,,,,,,,,,,Abronia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92166,92167,ZYVIR,ZYVIR,,Zygodon viridissimus (Dicks.) Brid. var. rupes...,,,viridissimus,,,...,,,,,,,,,,Zygodon
92167,92168,ZYVIR2,ZYVIR,ZYVIR2,Zygodon viridissimus (Dicks.) Brid. var. rufot...,,,viridissimus,,,...,,,,,,,,,,Zygodon
92168,92169,ZYVIV,ZYVIV,,Zygodon viridissimus (Dicks.) Brid. var. virid...,,,viridissimus,,,...,,,,,,,,,,Zygodon
92169,92170,ZYVIV2,ZYVIR,ZYVIV2,Zygodon viridissimus (Dicks.) Brid. var. vulga...,,,viridissimus,,,...,,,,,,,,,,Zygodon


In [6]:
df = data.copy()

In [7]:
df.to_csv('usda_plantsdb.csv')

In [8]:
# get list of df columns
column_list = list(df)
column_list

['id',
 'Symbol',
 'Accepted_Symbol_x',
 'Synonym_Symbol_x',
 'Scientific_Name_x',
 'Hybrid_Genus_Indicator',
 'Hybrid_Species_Indicator',
 'Species',
 'Subspecies_Prefix',
 'Hybrid_Subspecies_Indicator',
 'Subspecies',
 'Variety_Prefix',
 'Hybrid_Variety_Indicator',
 'Variety',
 'Subvariety_Prefix',
 'Subvariety',
 'Forma_Prefix',
 'Forma',
 'Genera_Binomial_Author',
 'Trinomial_Author',
 'Quadranomial_Author',
 'Questionable_Taxon_Indicator',
 'Parents',
 'Common_Name',
 'State_and_Province',
 'Category',
 'Family',
 'Family_Symbol',
 'Family_Common_Name',
 'xOrder',
 'SubClass',
 'Class',
 'SubDivision',
 'Division',
 'SuperDivision',
 'SubKingdom',
 'Kingdom',
 'Duration',
 'Growth_Habit',
 'Native_Status',
 'Federal_Noxious_Status',
 'State_Noxious_Status',
 'State_Noxious_Common_Name',
 'Invasive',
 'Federal_T_E_Status',
 'State_T_E_Status',
 'State_T_E_Common_Name',
 'Accepted_Symbol_y',
 'Synonym_Symbol_y',
 'Scientific_Name_y',
 'Active_Growth_Period',
 'After_Harvest_Regrowth

In [9]:
len(column_list)

134

In [10]:
# find empty columns
df_null_list = df.isnull().sum().head(100)
print(df_null_list)

id                           0
Symbol                       0
Accepted_Symbol_x            0
Synonym_Symbol_x             0
Scientific_Name_x            0
                            ..
Root_Depth_Minimum_inches    0
Salinity_Tolerance           0
Shade_Tolerance              0
Temperature_Minimum_F        0
Bloom_Period                 0
Length: 100, dtype: int64


In [11]:
df.isnull().count().sort_values()

id                           92171
Shade_Tolerance              92171
Salinity_Tolerance           92171
Root_Depth_Minimum_inches    92171
Precipitation_Maximum        92171
                             ...  
Duration                     92171
Kingdom                      92171
SubKingdom                   92171
Scientific_Name_y            92171
Genus                        92171
Length: 134, dtype: int64

In [12]:
df_state_nox_list = df["State_Noxious_Status"].isnull().sum()
df_state_nox_list

0

In [13]:
def in_nc(string):
        if string.find('NC') == -1:
            return False
        else:
            return True

In [14]:
def in_l48(string):
        if string.find('L48') == -1:
            return False
        else:
            return True

In [15]:
# for column in column_list:
#     print(df[column])
#     print(df[column].isna().sum())
#     print(df[column].describe())

In [16]:
# get
print(df.State_Noxious_Status.value_counts())

                                                                                    91551
HI (NW)                                                                                56
AL (CAW), CA (Q), FL (NW), MA (P), MN (PNW), NC (CAW), OR (Q), SC (PP), VT (CAW)       29
AL (CAW), CA (Q), MN (PNW), NC (CAW), OR (Q), SC (PP), VT (CAW)                        22
CA (BW)                                                                                21
                                                                                    ...  
CO (AW), OR (BDW, Q)                                                                    1
CA (BW), WA (CAW, NWSPQ)                                                                1
FL (NW), SC (PP)                                                                        1
SD (RNPS)                                                                               1
CA (AW), ID (NW), OR (ADW, Q), WA (CAW, NWSPQ)                                          1
Name: Stat

In [17]:
# get
print(df.State_and_Province.describe())

count     92171
unique    10922
top            
freq      56506
Name: State_and_Province, dtype: object


In [18]:
# get
df.State_Noxious_Status.describe()

count     92171
unique      283
top            
freq      91551
Name: State_Noxious_Status, dtype: object

In [19]:
df.State_and_Province.describe()

count     92171
unique    10922
top            
freq      56506
Name: State_and_Province, dtype: object

In [20]:
df.State_T_E_Status.value_counts()

                                  86924
FL (E)                              355
AZ (SR)                             252
WA (S)                              156
CA (E)                              121
                                  ...  
IL (), NY ()                          1
IL (E), NY (E)                        1
CT (E), IN (E), OH (T), PA (X)        1
MD (E, X), MI (T), OH (T)             1
FL (E), NJ (E), TN (T)                1
Name: State_T_E_Status, Length: 1646, dtype: int64

In [21]:
# df['In_NC'] = df_native['State_and_Province'].apply(in_nc)

## SLIM DATASET

In [22]:
df_enriched = pd.DataFrame(df, columns = [
 'Accepted_Symbol_x',
 'Hybrid_Genus_Indicator',
 'Hybrid_Species_Indicator',
 'Genus',
 'Species',
 'Subspecies_Prefix',
 'Variety',
 'Subvariety_Prefix',
 'Genera_Binomial_Author',
 'Common_Name',
 'State_and_Province',
 'Category',
 'Family_Symbol',
 'Family_Common_Name',
 'xOrder',
 'SubClass',
 'Class',
 'Division',
 'SuperDivision',
 'SubKingdom',
 'Kingdom',
 'Duration',
 'Growth_Habit',
 'Native_Status',
 'Federal_Noxious_Status',
 'State_Noxious_Status',
 'Invasive',
 'Federal_T_E_Status',
 'State_T_E_Status',
 'Active_Growth_Period',
 'After_Harvest_Regrowth_Rate',
 'Bloat',
 'C_N_Ratio',
 'Coppice_Potential',
 'Fall_Conspicuous',
 'Fire_Resistance',
 'Flower_Color',
 'Flower_Conspicuous',
 'Foliage_Color',
 'Foliage_Porosity_Summer',
 'Foliage_Porosity_Winter',
 'Foliage_Texture',
 'Fruit_Color',
 'Fruit_Conspicuous',
 'Growth_Form',
 'Growth_Rate',
 'Height_at_Base_Age_Maximum_feet',
 'Height_Mature_feet',
 'Known_Allelopath',
 'Leaf_Retention',
 'Lifespan',
 'Low_Growing_Grass',
 'Nitrogen_Fixation',
 'Resprout_Ability',
 'Shape_and_Orientation',
 'Toxicity',
 'Adapted_to_Coarse_Textured_Soils',
 'Adapted_to_Medium_Textured_Soils',
 'Adapted_to_Fine_Textured_Soils',
 'Anaerobic_Tolerance',
 'CaCO_3_Tolerance',
 'Cold_Stratification_Required',
 'Drought_Tolerance',
 'Fertility_Requirement',
 'Fire_Tolerance',
 'Frost_Free_Days_Minimum',
 'Hedge_Tolerance',
 'Moisture_Use',
 'pH_Minimum',
 'pH_Maximum',
 'Planting_Density_per_Acre_Minimum',
 'Planting_Density_per_Acre_Maximum',
 'Precipitation_Minimum',
 'Precipitation_Maximum',
 'Root_Depth_Minimum_inches',
 'Salinity_Tolerance',
 'Shade_Tolerance',
 'Temperature_Minimum_F',
 'Bloom_Period',
 'Commercial_Availability',
 'Fruit_Seed_Abundance',
 'Fruit_Seed_Period_Begin',
 'Fruit_Seed_Period_End',
 'Fruit_Seed_Persistence',
 'Propogated_by_Bare_Root',
 'Propogated_by_Bulbs',
 'Propogated_by_Container',
 'Propogated_by_Corms',
 'Propogated_by_Cuttings',
 'Propogated_by_Seed',
 'Propogated_by_Sod',
 'Propogated_by_Sprigs',
 'Propogated_by_Tubers',
 'Seeds_per_Pound',
 'Seed_Spread_Rate',
 'Seedling_Vigor',
 'Small_Grain',
 'Vegetative_Spread_Rate',
 'Berry_Nut_Seed_Product',
 'Christmas_Tree_Product',
 'Fodder_Product',
 'Fuelwood_Product',
 'Lumber_Product',
 'Naval_Store_Product',
 'Nursery_Stock_Product',
 'Palatable_Browse_Animal',
 'Palatable_Graze_Animal',
 'Palatable_Human',
 'Post_Product',
 'Protein_Potential',
 'Pulpwood_Product',
 'Veneer_Product'
])

In [23]:
df_enriched.head()

Unnamed: 0,Accepted_Symbol_x,Hybrid_Genus_Indicator,Hybrid_Species_Indicator,Genus,Species,Subspecies_Prefix,Variety,Subvariety_Prefix,Genera_Binomial_Author,Common_Name,...,Lumber_Product,Naval_Store_Product,Nursery_Stock_Product,Palatable_Browse_Animal,Palatable_Graze_Animal,Palatable_Human,Post_Product,Protein_Potential,Pulpwood_Product,Veneer_Product
0,ABAB,,,Abutilon,abutiloides,,,,(Jacq.) Garcke ex Hochr.,shrubby Indian mallow,...,,,,,,,,,,
1,ABPR3,,,Abrus,abrus,,,,"(L.) W. Wight, nom. inval.",,...,,,,,,,,,,
2,ABTH,,,Abutilon,abutilon,,,,"(L.) Rusby, nom. inval.",,...,,,,,,,,,,
3,ABAB70,,,Abietinella,abietina,,,,(Hedw.) Fleisch.,abietinella moss,...,,,,,,,,,,
4,ABUMB,,,Abronia,acutalata,,,,Standl.,,...,,,,,,,,,,


In [24]:
df_enriched['Hybrid_Genus_Indicator'].value_counts()

     92037
×      134
Name: Hybrid_Genus_Indicator, dtype: int64

In [25]:
df_enriched['Hybrid_Species_Indicator'].value_counts()

     90689
×     1482
Name: Hybrid_Species_Indicator, dtype: int64

In [26]:
df_enriched['Subspecies_Prefix'].value_counts()

        85730
ssp.     6441
Name: Subspecies_Prefix, dtype: int64

In [27]:
df_enriched['Common_Name'].value_counts()

                            48971
rim lichen                    127
dot lichen                    105
wart lichen                   104
lecidea lichen                101
                            ...  
false wheatgrass                1
eremocrinum                     1
eremogone                       1
eremophila                      1
Reinwardt's zygodon moss        1
Name: Common_Name, Length: 31248, dtype: int64

In [28]:
df_enriched['Common_Name'].isna().sum()
df_enriched['Common_Name'].describe()

count     92171
unique    31248
top            
freq      48971
Name: Common_Name, dtype: object

In [29]:
df_enriched['State_and_Province'].value_counts()

                                                                                                                                                                                               56506
USA (CA)                                                                                                                                                                                        3487
USA (HI)                                                                                                                                                                                        1875
USA+ (PR)                                                                                                                                                                                       1574
USA (TX)                                                                                                                                                                                        1155
               

In [30]:
df_enriched.isna().sum().sort_values()

Accepted_Symbol_x              0
Fruit_Seed_Period_Begin        0
Fruit_Seed_Abundance           0
Commercial_Availability        0
Bloom_Period                   0
                              ..
Bloat                          0
After_Harvest_Regrowth_Rate    0
Active_Growth_Period           0
Pulpwood_Product               0
Veneer_Product                 0
Length: 112, dtype: int64

In [31]:
df_enriched['Category'].value_counts()

              43773
Dicot         31496
Monocot        7697
Lichen         4610
Moss           1842
Fern           1198
Liverwort       933
Gymnosperm      359
Lycopod         142
Quillwort        60
Hornwort         24
Horsetail        24
Green alga        5
RA                4
Whisk-fern        4
Name: Category, dtype: int64

In [32]:
df_enriched['Category'].value_counts()

              43773
Dicot         31496
Monocot        7697
Lichen         4610
Moss           1842
Fern           1198
Liverwort       933
Gymnosperm      359
Lycopod         142
Quillwort        60
Hornwort         24
Horsetail        24
Green alga        5
RA                4
Whisk-fern        4
Name: Category, dtype: int64

In [33]:
df_enriched['State_Noxious_Status'].value_counts()

                                                                                    91551
HI (NW)                                                                                56
AL (CAW), CA (Q), FL (NW), MA (P), MN (PNW), NC (CAW), OR (Q), SC (PP), VT (CAW)       29
AL (CAW), CA (Q), MN (PNW), NC (CAW), OR (Q), SC (PP), VT (CAW)                        22
CA (BW)                                                                                21
                                                                                    ...  
CO (AW), OR (BDW, Q)                                                                    1
CA (BW), WA (CAW, NWSPQ)                                                                1
FL (NW), SC (PP)                                                                        1
SD (RNPS)                                                                               1
CA (AW), ID (NW), OR (ADW, Q), WA (CAW, NWSPQ)                                          1
Name: Stat

In [34]:
df_core = pd.DataFrame(df, columns = [
 'Accepted_Symbol_x',
 'Hybrid_Genus_Indicator',
 'Hybrid_Species_Indicator',
 'Species',
 'Subspecies_Prefix',
 'Variety',
 'Subvariety_Prefix',
 'Genera_Binomial_Author',
 'Common_Name',
 'State_and_Province',
 'Category',
 'Family_Symbol',
 'Family_Common_Name',
 'xOrder',
 'SubClass',
 'Class',
 'Division',
 'SuperDivision',
 'SubKingdom',
 'Kingdom',
 'Duration',
 'Growth_Habit',
 'Native_Status',
 'Federal_Noxious_Status',
 'State_Noxious_Status',
 'Invasive',
 'Federal_T_E_Status',
 'State_T_E_Status',
 'Genus'])

In [35]:
df_core.head()

Unnamed: 0,Accepted_Symbol_x,Hybrid_Genus_Indicator,Hybrid_Species_Indicator,Species,Subspecies_Prefix,Variety,Subvariety_Prefix,Genera_Binomial_Author,Common_Name,State_and_Province,...,Kingdom,Duration,Growth_Habit,Native_Status,Federal_Noxious_Status,State_Noxious_Status,Invasive,Federal_T_E_Status,State_T_E_Status,Genus
0,ABAB,,,abutiloides,,,,(Jacq.) Garcke ex Hochr.,shrubby Indian mallow,"USA (AZ, TX), USA+ (PR, VI)",...,Plantae,Perennial,"Subshrub, Forb/herb","L48 (N), PR (N), VI (I)",,,,,,Abutilon
1,ABPR3,,,abrus,,,,"(L.) W. Wight, nom. inval.",,,...,,,,,,,,,,Abrus
2,ABTH,,,abutilon,,,,"(L.) Rusby, nom. inval.",,,...,,,,,,,,,,Abutilon
3,ABAB70,,,abietina,,,,(Hedw.) Fleisch.,abietinella moss,USA (NJ),...,Plantae,,Nonvascular,NA (N),,,,,,Abietinella
4,ABUMB,,,acutalata,,,,Standl.,,,...,,,,,,,,,,Abronia


## PROCESS TARGET COLUMN

In [36]:
df_core['Invasive'].apply(lambda x: x.find('NC')).sum()

-92171

In [37]:
df_core['Invasive'].apply(lambda x: x.find('SE')).sum()

-91282

In [38]:
df_core['Invasive'].apply(lambda x: x.find("N'EAST")).sum()

-91449

In [39]:
df_core['Invasive'].describe()

count     92171
unique      167
top            
freq      90735
Name: Invasive, dtype: object

In [40]:
unique_invasive = df_core['Invasive'].unique()
print(unique_invasive)

['' 'SWSS' 'FLEPPC' "KY, N'EAST, NE&GP, STATE, SWSS, WSWS" 'STATE' 'HEAR'
 'WI' 'WSWS' 'HEAR, STATE' "KY, N'EAST, NE&GP, SWSS" "N'EAST" 'KY, SWSS'
 "N'EAST, WI" "N'EAST, SWSS" 'NE&GP, STATE, SWSS, WSWS' 'STATE, US'
 'STATE, SWSS' 'Cal-IPC, STATE' 'Cal-IPC, HEAR, STATE, US'
 "N'EAST, STATE, WSWS" 'KY, NE&GP, STATE, SWSS' 'NE&GP, WSWS'
 'Cal-IPC, SEEPPC, STATE, WI' 'STATE, WSWS' 'NE&GP, SWSS' 'SEEPPC, WI'
 'FLEPPC, SEEPPC' 'FLEPPC, HEAR' 'SEEPPC, STATE, WI'
 'FLEPPC, SEEPPC, STATE, SWSS' 'Cal-IPC, STATE, WSWS'
 "KY, N'EAST, SEEPPC, STATE, SWSS" "N'EAST, NE&GP, SWSS, WSWS" 'Cal-IPC'
 "KY, N'EAST, SWSS" 'SWSS, WSWS' 'NE&GP'
 "N'EAST, NE&GP, STATE, SWSS, WSWS" "KY, N'EAST" 'KY, STATE, SWSS, WSWS'
 "HEAR, KY, N'EAST, STATE, SWSS" 'KY, NE&GP, SWSS, WSWS' 'KY'
 'Cal-IPC, HEAR, SEEPPC, STATE' 'FLEPPC, HEAR, STATE' 'SEEPPC'
 'HEAR, SWSS' "KY, N'EAST, NE&GP, STATE, SWSS, WI, WSWS" "N'EAST, SEEPPC"
 'Cal-IPC, STATE, SWSS, WSWS' 'NE&GP, STATE, SWSS' "N'EAST, WSWS"
 'STATE, WI' "N'EAST, NE&GP" 'SEEP

In [41]:
(len(df_core))+(df_core['State_Noxious_Status'].apply(lambda x: x.find("NC")).sum())

4614

## DF_CORE TARGET COLUMN ISOLATION

In [42]:
df_core['In_NC'] = df_core['State_and_Province'].apply(in_nc)
df_core['Native_L48'] = df_core['Native_Status'].apply(in_l48)
df_core['Native_NC'] = df_core['Native_Status'].apply(in_nc)

In [43]:
# Drop the label to create the X data
X = df_core.drop('Invasive', axis=1)
X


Unnamed: 0,Accepted_Symbol_x,Hybrid_Genus_Indicator,Hybrid_Species_Indicator,Species,Subspecies_Prefix,Variety,Subvariety_Prefix,Genera_Binomial_Author,Common_Name,State_and_Province,...,Growth_Habit,Native_Status,Federal_Noxious_Status,State_Noxious_Status,Federal_T_E_Status,State_T_E_Status,Genus,In_NC,Native_L48,Native_NC
0,ABAB,,,abutiloides,,,,(Jacq.) Garcke ex Hochr.,shrubby Indian mallow,"USA (AZ, TX), USA+ (PR, VI)",...,"Subshrub, Forb/herb","L48 (N), PR (N), VI (I)",,,,,Abutilon,False,True,False
1,ABPR3,,,abrus,,,,"(L.) W. Wight, nom. inval.",,,...,,,,,,,Abrus,False,False,False
2,ABTH,,,abutilon,,,,"(L.) Rusby, nom. inval.",,,...,,,,,,,Abutilon,False,False,False
3,ABAB70,,,abietina,,,,(Hedw.) Fleisch.,abietinella moss,USA (NJ),...,Nonvascular,NA (N),,,,,Abietinella,False,False,False
4,ABUMB,,,acutalata,,,,Standl.,,,...,,,,,,,Abronia,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92166,ZYVIR,,,viridissimus,,rupestris,,(Dicks.) Brid.,zygodon moss,,...,Nonvascular,NA (N),,,,,Zygodon,False,False,False
92167,ZYVIR,,,viridissimus,,rufotomentosus,,(Dicks.) Brid.,,,...,,,,,,,Zygodon,False,False,False
92168,ZYVIV,,,viridissimus,,viridissimus,,(Dicks.) Brid.,zygodon moss,,...,Nonvascular,NA (N),,,,,Zygodon,False,False,False
92169,ZYVIR,,,viridissimus,,vulgaris,,(Dicks.) Brid.,,,...,,,,,,,Zygodon,False,False,False


In [44]:
# # One-hot encoding the 'workclass' column creates 9 new columns, one for each category
# pd.get_dummies(X['workclass'])


In [45]:
# One-hot encoding the entire dataframe
# X_dummies = pd.get_dummies(X)
# print(X_dummies.columns)
# X_dummies

## PRODUCES 92171 rows × 149451 columns

In [46]:
# Converting output labels to 0 and 1
y_label = LabelEncoder().fit_transform(df_core['Invasive'])
y_label


array([0, 0, 0, ..., 0, 0, 0])

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y_label, random_state=99)

In [48]:
# Scaling the X data by using StandardScaler()
# scaler = StandardScaler().fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# X_train_scaled


In [49]:
# # Alternatively, scaling the data by using MinMaxScaler()
# scaler = MinMaxScaler().fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# X_train_scaled


In [50]:
# Transforming the test dataset based on the fit from the training dataset
# X_test_scaled = scaler.transform(X_test)
# X_test_scaled


In [51]:
df_core.fillna("unknown")

Unnamed: 0,Accepted_Symbol_x,Hybrid_Genus_Indicator,Hybrid_Species_Indicator,Species,Subspecies_Prefix,Variety,Subvariety_Prefix,Genera_Binomial_Author,Common_Name,State_and_Province,...,Native_Status,Federal_Noxious_Status,State_Noxious_Status,Invasive,Federal_T_E_Status,State_T_E_Status,Genus,In_NC,Native_L48,Native_NC
0,ABAB,,,abutiloides,,,,(Jacq.) Garcke ex Hochr.,shrubby Indian mallow,"USA (AZ, TX), USA+ (PR, VI)",...,"L48 (N), PR (N), VI (I)",,,,,,Abutilon,False,True,False
1,ABPR3,,,abrus,,,,"(L.) W. Wight, nom. inval.",,,...,,,,,,,Abrus,False,False,False
2,ABTH,,,abutilon,,,,"(L.) Rusby, nom. inval.",,,...,,,,,,,Abutilon,False,False,False
3,ABAB70,,,abietina,,,,(Hedw.) Fleisch.,abietinella moss,USA (NJ),...,NA (N),,,,,,Abietinella,False,False,False
4,ABUMB,,,acutalata,,,,Standl.,,,...,,,,,,,Abronia,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92166,ZYVIR,,,viridissimus,,rupestris,,(Dicks.) Brid.,zygodon moss,,...,NA (N),,,,,,Zygodon,False,False,False
92167,ZYVIR,,,viridissimus,,rufotomentosus,,(Dicks.) Brid.,,,...,,,,,,,Zygodon,False,False,False
92168,ZYVIV,,,viridissimus,,viridissimus,,(Dicks.) Brid.,zygodon moss,,...,NA (N),,,,,,Zygodon,False,False,False
92169,ZYVIR,,,viridissimus,,vulgaris,,(Dicks.) Brid.,,,...,,,,,,,Zygodon,False,False,False


In [52]:
for col in df_core.columns:
    if df_core[col].dtype == 'object':
        df_core[col] = pd.to_numeric(df_core[col], errors='coerce')

In [53]:
clf = RandomForestClassifier(random_state=99, n_estimators=500).fit(X_train, y_train)
print(f'Training Score: {clf.score(X_train, y_train)}')
print(f'Testing Score: {clf.score(X_test, y_test)}')

ValueError: could not convert string to float: 'CLIC2'

## DF_NATIVE TARGET COLUMN ISOLATION

In [None]:
df_native = pd.DataFrame(df, columns = ['id', 'Symbol', 'Scientific_Name_x', 'State_Noxious_Status', 'Federal_Noxious_Status','Invasive','Native_Status', 'State_and_Province'])
df_native

In [None]:
df_invasive_features = pd.DataFrame(df, columns = ['id',
                                          'Symbol',
                                          'Scientific_Name_x',
                                          'State_Noxious_Status',
                                          'Invasive','Native_Status',
                                          'State_and_Province',
                                          'Resprout_Ability',
                                          'Growth_Habit', 
                                          'Leaf_Retention',
                                          'Lifespan','Moisture_Use',
                                          'pH_Minimum',
                                          'pH_Maximum',
                                          'Planting_Density_per_Acre_Minimum',
                                          'Planting_Density_per_Acre_Maximum',
                                          'Precipitation_Minimum',
                                          'Precipitation_Maximum',
                                          'Root_Depth_Minimum_inches',
                                          'Salinity_Tolerance','Shade_Tolerance','Category'
])
df_invasive_features

In [None]:
df_native
# view df[State_and] columns 
# what type is it?

In [None]:
df_state_prov_list = df_native["State_and_Province"].isnull().sum()
df_state_prov_list
# no null values

In [None]:
df_native.head()

In [None]:
df_native['In_NC'] = df_native['State_and_Province'].apply(in_nc)
df_native['Native_L48'] = df_native['Native_Status'].apply(in_l48)
df_native['Native_NC'] = df_native['Native_Status'].apply(in_nc)

In [None]:
df_native.head()

In [None]:
print(df_native['In_NC'].value_counts())
print(df_native['Native_L48'].value_counts())
print(df_native['Native_NC'].value_counts())

In [None]:
df_In_NC = df_native[df_native['In_NC']==True]
df_In_NC

In [None]:
df_L48_NC = df_native[df_native['Native_L48']==True]
df_L48_NC

In [None]:
df_Native_NC = df_native[df_native['Native_NC']==True]
df_Native_NC

## SUPERVISED ML WORK BEGINS

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

In [None]:
# Converting output labels to 0 and 1
y_label = LabelEncoder().fit_transform(df['income'])
y_label

In [None]:
Hybrid_Genus_Indicator 2
Hybrid_Species_Indicator 2
Subspecies_Prefix 2
SuperDivision 2
SubKingdom, 2
Kingdom 3
Federal_T_E_Status (3)

In [None]:
[AL_Native,
L48,
S_Native,
AZ_Native,
AR_Native,
CO_Native,
CT_Native,
DE_Native,
DC_Native,
FM_Native,
FL_Native,
GA_Native,
GU_Native,
ID_Native,
IL_Native,
IN_Native,
IA_Native,
KS_Native,
KY_Native,
LA_Native,
ME_Native,
MH_Native,
MD_Native,
MN_Native,
MA_Native,
MI_Native,
MO_Native,
MS_Native,
WY_Native,
WI_Native,
WV_Native,
VA_Native,
WA_Native,
VT_Native,
UT_Native,
TX_Native,
TN_Native,
SD_Native,
SC_Native,
RI_Native,
PA_Native,
PW_Native,
OR_Native,
OK_Native,
OH_Native,
MP_Native,
ND_Native,
NC_Native,
NY_Native,
NM_Native,
NJ_Native,
NH_Native,
NV_Native,
NE_Native,
MT_Native]
