In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import collections

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn import tree

In [2]:
# Study data files
data_path = "usda_plantsdb.csv"

# Read the mouse data and the study results
data = pd.read_csv(data_path,low_memory=False)

In [3]:
# Combine the data into a single dataset
df_og = pd.DataFrame(data)
df = df_og.copy()

# Display the data table for preview
df.head()

Unnamed: 0.1,Unnamed: 0,id,Symbol,Accepted_Symbol_x,Synonym_Symbol_x,Scientific_Name_x,Hybrid_Genus_Indicator,Hybrid_Species_Indicator,Species,Subspecies_Prefix,...,Naval_Store_Product,Nursery_Stock_Product,Palatable_Browse_Animal,Palatable_Graze_Animal,Palatable_Human,Post_Product,Protein_Potential,Pulpwood_Product,Veneer_Product,Genus
0,0,1,ABAB,ABAB,,Abutilon abutiloides (Jacq.) Garcke ex Hochr.,,,abutiloides,,...,,,,,,,,,,Abutilon
1,1,2,ABAB2,ABPR3,ABAB2,"Abrus abrus (L.) W. Wight, nom. inval.",,,abrus,,...,,,,,,,,,,Abrus
2,2,3,ABAB3,ABTH,ABAB3,"Abutilon abutilon (L.) Rusby, nom. inval.",,,abutilon,,...,,,,,,,,,,Abutilon
3,3,4,ABAB70,ABAB70,,Abietinella abietina (Hedw.) Fleisch.,,,abietina,,...,,,,,,,,,,Abietinella
4,4,5,ABAC,ABUMB,ABAC,Abronia acutalata Standl.,,,acutalata,,...,,,,,,,,,,Abronia


In [4]:
df.shape

(92171, 135)

In [5]:
column_list = list(df)
print(column_list)
# Does T_E_ mean tax-exempt?
# are _y columns duplicated?

['Unnamed: 0', 'id', 'Symbol', 'Accepted_Symbol_x', 'Synonym_Symbol_x', 'Scientific_Name_x', 'Hybrid_Genus_Indicator', 'Hybrid_Species_Indicator', 'Species', 'Subspecies_Prefix', 'Hybrid_Subspecies_Indicator', 'Subspecies', 'Variety_Prefix', 'Hybrid_Variety_Indicator', 'Variety', 'Subvariety_Prefix', 'Subvariety', 'Forma_Prefix', 'Forma', 'Genera_Binomial_Author', 'Trinomial_Author', 'Quadranomial_Author', 'Questionable_Taxon_Indicator', 'Parents', 'Common_Name', 'State_and_Province', 'Category', 'Family', 'Family_Symbol', 'Family_Common_Name', 'xOrder', 'SubClass', 'Class', 'SubDivision', 'Division', 'SuperDivision', 'SubKingdom', 'Kingdom', 'Duration', 'Growth_Habit', 'Native_Status', 'Federal_Noxious_Status', 'State_Noxious_Status', 'State_Noxious_Common_Name', 'Invasive', 'Federal_T_E_Status', 'State_T_E_Status', 'State_T_E_Common_Name', 'Accepted_Symbol_y', 'Synonym_Symbol_y', 'Scientific_Name_y', 'Active_Growth_Period', 'After_Harvest_Regrowth_Rate', 'Bloat', 'C_N_Ratio', 'Coppic

In [6]:
# find empty columns
df_null_list = df.isnull().sum().sort_values(ascending=False)
print(df_null_list)

Questionable_Taxon_Indicator    92171
Hybrid_Subspecies_Indicator     92166
Hybrid_Variety_Indicator        92165
Subvariety_Prefix               92163
Subvariety                      92163
                                ...  
id                                  0
Scientific_Name_x                   0
Accepted_Symbol_x                   0
Symbol                              0
Genus                               0
Length: 135, dtype: int64


In [7]:
for column in column_list:
    print(df[column].nunique())
    print('----------------')

92171
----------------
92171
----------------
92171
----------------
48398
----------------
43773
----------------
92163
----------------
1
----------------
1
----------------
20631
----------------
1
----------------
1
----------------
3428
----------------
1
----------------
1
----------------
7649
----------------
1
----------------
8
----------------
1
----------------
56
----------------
20255
----------------
11690
----------------
225
----------------
0
----------------
1068
----------------
31247
----------------
10921
----------------
14
----------------
547
----------------
547
----------------
311
----------------
156
----------------
18
----------------
24
----------------
3
----------------
16
----------------
1
----------------
1
----------------
2
----------------
9
----------------
63
----------------
347
----------------
2
----------------
282
----------------
573
----------------
166
----------------
2
----------------
1645
----------------
4872
----------------
1081


In [8]:
# bin 3 lists of column names by number of unique values
one_unique_list = []
two_unique_list = []
three_unique_list = []
four_unique_list = []
med_unique_list = []

for x in column_list:
    if df[x].nunique() == 1:
        one_unique_list.append(x)
    elif df[x].nunique() == 2:
        two_unique_list.append(x)
    elif df[x].nunique() == 3:
        three_unique_list.append(x)
    elif df[x].nunique() == 4:
        four_unique_list.append(x)
    else: 
        med_unique_list.append(x)


In [9]:
# check for matches
if collections.Counter(one_unique_list) == collections.Counter(two_unique_list):
    print("matches found")
else: print ("no matches found")

no matches found


In [10]:
# check for matches
if collections.Counter(two_unique_list) == collections.Counter(three_unique_list):
    print("matches found")
else: print ("no matches found")

no matches found


In [11]:
# check for matches
if collections.Counter(three_unique_list) == collections.Counter(four_unique_list):
    print("matches found")
else: print ("no matches found")

no matches found


In [12]:
# check for matches
if collections.Counter(four_unique_list) == collections.Counter(med_unique_list):
    print("matches found")
else: print ("no matches found")

no matches found


In [13]:
# double check for correct nunique
for x in one_unique_list:
    print(df[x].nunique())

# CORRECT

1
1
1
1
1
1
1
1
1
1


In [14]:
# double check for correct nunique
for x in two_unique_list:
    print(df[x].nunique())

# CORRECT

2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2


In [15]:
# double check for correct nunique
for x in three_unique_list:
    print(df[x].nunique())
    
# CORRECT

3
3
3
3
3
3
3
3
3
3
3
3
3
3
3


In [16]:
# double check for correct nunique
for x in four_unique_list:
    print(df[x].nunique())


# CORRECT

4
4
4
4
4
4
4
4
4
4
4
4
4


In [17]:
# double check for correct nunique
for x in med_unique_list:
    print(df[x].nunique())


# CORRECT

92171
92171
92171
48398
43773
92163
20631
3428
7649
8
56
20255
11690
225
0
1068
31247
10921
14
547
547
311
156
18
24
16
9
63
347
282
573
166
1645
4872
1081
30
1111
8
8
6
9
8
34
101
11
62
35
42
62
59
42
66
30
69
12
5
5
564
6877


In [18]:
len(one_unique_list)

10

In [19]:
len(two_unique_list)

38

In [20]:
len(three_unique_list)

15

In [21]:
len(four_unique_list)

13

In [22]:
len(med_unique_list)

59

## Replace nan with 0 and categorical values with numerical values inplace for one_unique_list

In [23]:
# check rate of na values in low variety columns
for x in one_unique_list:
    print(df[x].isna().sum())
    print("--------------")

92037
--------------
90689
--------------
85730
--------------
92166
--------------
72851
--------------
92165
--------------
92163
--------------
92108
--------------
52621
--------------
51193
--------------


In [24]:
# check name of unique values in low variety columns
for x in one_unique_list:
    print(df[x].value_counts())
    print("--------------")

×    134
Name: Hybrid_Genus_Indicator, dtype: int64
--------------
×    1482
Name: Hybrid_Species_Indicator, dtype: int64
--------------
ssp.    6441
Name: Subspecies_Prefix, dtype: int64
--------------
×    5
Name: Hybrid_Subspecies_Indicator, dtype: int64
--------------
var.    19320
Name: Variety_Prefix, dtype: int64
--------------
×    6
Name: Hybrid_Variety_Indicator, dtype: int64
--------------
subvar.    8
Name: Subvariety_Prefix, dtype: int64
--------------
f.    63
Name: Forma_Prefix, dtype: int64
--------------
Spermatophyta    39550
Name: SuperDivision, dtype: int64
--------------
Tracheobionta    40978
Name: SubKingdom, dtype: int64
--------------


In [25]:
# replace single column value with numeric
for x in one_unique_list:
    df[x].fillna(0,inplace=True)
    df[x]=df[x].apply(lambda x: 1 if x!=0 else 0)


#     print(len(y)) = 2 x10
#     df[x].replace({y:1},inplace=True)

In [26]:
for x in one_unique_list:
    print(df[x].value_counts())

0    92037
1      134
Name: Hybrid_Genus_Indicator, dtype: int64
0    90689
1     1482
Name: Hybrid_Species_Indicator, dtype: int64
0    85730
1     6441
Name: Subspecies_Prefix, dtype: int64
0    92166
1        5
Name: Hybrid_Subspecies_Indicator, dtype: int64
0    72851
1    19320
Name: Variety_Prefix, dtype: int64
0    92165
1        6
Name: Hybrid_Variety_Indicator, dtype: int64
0    92163
1        8
Name: Subvariety_Prefix, dtype: int64
0    92108
1       63
Name: Forma_Prefix, dtype: int64
0    52621
1    39550
Name: SuperDivision, dtype: int64
0    51193
1    40978
Name: SubKingdom, dtype: int64


In [27]:
# check rate of na values in low variety columns
for x in one_unique_list:
    print(df[x].isna().sum())
    print("--------------")

0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------


## Check/Factorize/Check two_unique_list

In [28]:
# check rate of na values in low variety columns
for x in two_unique_list:
    print(df[x].isna().sum())
    print("--------------")

43775
--------------
92076
--------------
91411
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------
91090
--------------


In [29]:
# check name of unique values in low variety columns
for x in two_unique_list:
    print(df[x].value_counts())
    print("--------------")

Plantae    43786
Fungi       4610
Name: Kingdom, dtype: int64
--------------
NW       94
NW, Q     1
Name: Federal_Noxious_Status, dtype: int64
--------------
Endangered    609
Threatened    151
Name: Federal_T_E_Status, dtype: int64
--------------
No     936
Yes    145
Name: Coppice_Potential, dtype: int64
--------------
No     868
Yes    213
Name: Fall_Conspicuous, dtype: int64
--------------
No     991
Yes     90
Name: Fire_Resistance, dtype: int64
--------------
No     604
Yes    477
Name: Flower_Conspicuous, dtype: int64
--------------
No     734
Yes    347
Name: Fruit_Conspicuous, dtype: int64
--------------
No     1048
Yes      33
Name: Known_Allelopath, dtype: int64
--------------
No     942
Yes    139
Name: Leaf_Retention, dtype: int64
--------------
No     1000
Yes      81
Name: Low_Growing_Grass, dtype: int64
--------------
No     747
Yes    334
Name: Resprout_Ability, dtype: int64
--------------
Yes    767
No     314
Name: Adapted_to_Coarse_Textured_Soils, dtype: int64
----

In [30]:
# replace single column value with numerics
for x in two_unique_list:
    df[x] = pd.factorize(df[x], na_sentinel=None)[0]

In [31]:
# check name of unique values in low variety columns
for x in two_unique_list:
    print(df[x].value_counts())
    print("--------------")

0    43786
2    43775
1     4610
Name: Kingdom, dtype: int64
--------------
2    92076
0       94
1        1
Name: Federal_Noxious_Status, dtype: int64
--------------
2    91411
0      609
1      151
Name: Federal_T_E_Status, dtype: int64
--------------
2    91090
0      936
1      145
Name: Coppice_Potential, dtype: int64
--------------
2    91090
0      868
1      213
Name: Fall_Conspicuous, dtype: int64
--------------
2    91090
0      991
1       90
Name: Fire_Resistance, dtype: int64
--------------
2    91090
0      604
1      477
Name: Flower_Conspicuous, dtype: int64
--------------
2    91090
0      734
1      347
Name: Fruit_Conspicuous, dtype: int64
--------------
2    91090
0     1048
1       33
Name: Known_Allelopath, dtype: int64
--------------
2    91090
1      942
0      139
Name: Leaf_Retention, dtype: int64
--------------
2    91090
0     1000
1       81
Name: Low_Growing_Grass, dtype: int64
--------------
2    91090
0      747
1      334
Name: Resprout_Ability, dtype: 

In [32]:
# check rate of na values in low variety columns
for x in two_unique_list:
    print(df[x].isna().sum())
    print("--------------")

0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------


## Check/Factorize/Check three_unique_list

In [33]:
# check rate of na values in low variety columns
for x in three_unique_list:
    print(df[x].isna().sum())
    print("--------------")

89375
--------------
91577
--------------
91122
--------------
91100
--------------
91106
--------------
91090
--------------
91092
--------------
91144
--------------
91090
--------------
91090
--------------
91090
--------------
91115
--------------
92027
--------------
91112
--------------
91161
--------------


In [34]:
# check name of unique values in low variety columns
for x in three_unique_list:
    print(df[x].value_counts())
    print("--------------")

Musci           1839
Hepaticae        933
Anthocerotae      24
Name: SubDivision, dtype: int64
--------------
Slow        397
Moderate    136
Rapid        61
Name: After_Harvest_Regrowth_Rate, dtype: int64
--------------
Medium    520
High      413
Low       116
Name: C_N_Ratio, dtype: int64
--------------
Moderate    413
Dense       339
Porous      319
Name: Foliage_Porosity_Summer, dtype: int64
--------------
Porous      817
Moderate    166
Dense        82
Name: Foliage_Porosity_Winter, dtype: int64
--------------
Medium    433
Coarse    354
Fine      294
Name: Foliage_Texture, dtype: int64
--------------
Moderate    487
Rapid       369
Slow        223
Name: Growth_Rate, dtype: int64
--------------
Moderate    457
Long        321
Short       249
Name: Lifespan, dtype: int64
--------------
Low       534
Medium    486
High       61
Name: Fertility_Requirement, dtype: int64
--------------
Medium    450
Low       358
High      273
Name: Moisture_Use, dtype: int64
--------------
Intoleran

In [35]:
# replace single column value with numerics
for x in three_unique_list:
    df[x] = pd.factorize(df[x], na_sentinel=None)[0]

In [36]:
# check name of unique values in low variety columns
for x in three_unique_list:
    print(df[x].value_counts())
    print("--------------")

3    89375
0     1839
1      933
2       24
Name: SubDivision, dtype: int64
--------------
3    91577
1      397
0      136
2       61
Name: After_Harvest_Regrowth_Rate, dtype: int64
--------------
3    91122
1      520
0      413
2      116
Name: C_N_Ratio, dtype: int64
--------------
3    91100
1      413
0      339
2      319
Name: Foliage_Porosity_Summer, dtype: int64
--------------
3    91106
2      817
1      166
0       82
Name: Foliage_Porosity_Winter, dtype: int64
--------------
3    91090
0      433
1      354
2      294
Name: Foliage_Texture, dtype: int64
--------------
3    91092
1      487
2      369
0      223
Name: Growth_Rate, dtype: int64
--------------
3    91144
2      457
1      321
0      249
Name: Lifespan, dtype: int64
--------------
3    91090
1      534
0      486
2       61
Name: Fertility_Requirement, dtype: int64
--------------
3    91090
0      450
1      358
2      273
Name: Moisture_Use, dtype: int64
--------------
3    91090
2      544
1      305
0      

In [37]:
# check rate of na values in low variety columns
for x in three_unique_list:
    print(df[x].isna().sum())
    print("--------------")

0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------


## Check/Factorize/Check four_unique_list

In [38]:
# check rate of na values in low variety columns
for x in four_unique_list:
    print(df[x].isna().sum())
    print("--------------")

91090
--------------
91090
--------------
91096
--------------
91090
--------------
91094
--------------
91090
--------------
91106
--------------
91096
--------------
91090
--------------
91092
--------------
91111
--------------
91094
--------------
91099
--------------


In [39]:
# check name of unique values in low variety columns
for x in four_unique_list:
    print(df[x].value_counts())
    print("--------------")

None      1037
Low         38
Medium       4
High         2
Name: Bloat, dtype: int64
--------------
None      974
Medium     50
Low        49
High        8
Name: Nitrogen_Fixation, dtype: int64
--------------
None        959
Slight       57
Moderate     40
Severe       19
Name: Toxicity, dtype: int64
--------------
None      528
Low       214
High      175
Medium    164
Name: Anaerobic_Tolerance, dtype: int64
--------------
Medium    403
Low       326
High      267
None       81
Name: CaCO_3_Tolerance, dtype: int64
--------------
Low       353
High      347
Medium    289
None       92
Name: Drought_Tolerance, dtype: int64
--------------
High      489
Medium    274
Low       209
None       93
Name: Fire_Tolerance, dtype: int64
--------------
None      702
Low       153
High      115
Medium    105
Name: Hedge_Tolerance, dtype: int64
--------------
None      581
Low       275
Medium    164
High       61
Name: Salinity_Tolerance, dtype: int64
--------------
Routinely Available       575
N

In [40]:
# replace single column value with numerics
for x in four_unique_list:
    df[x] = pd.factorize(df[x], na_sentinel=None)[0]

In [41]:
# check name of unique values in low variety columns
for x in four_unique_list:
    print(df[x].value_counts())
    print("--------------")

4    91090
0     1037
1       38
2        4
3        2
Name: Bloat, dtype: int64
--------------
4    91090
0      974
2       50
1       49
3        8
Name: Nitrogen_Fixation, dtype: int64
--------------
4    91096
0      959
1       57
2       40
3       19
Name: Toxicity, dtype: int64
--------------
4    91090
0      528
1      214
2      175
3      164
Name: Anaerobic_Tolerance, dtype: int64
--------------
4    91094
1      403
0      326
2      267
3       81
Name: CaCO_3_Tolerance, dtype: int64
--------------
4    91090
0      353
3      347
1      289
2       92
Name: Drought_Tolerance, dtype: int64
--------------
4    91106
2      489
1      274
0      209
3       93
Name: Fire_Tolerance, dtype: int64
--------------
4    91096
2      702
0      153
1      115
3      105
Name: Hedge_Tolerance, dtype: int64
--------------
4    91090
0      581
1      275
2      164
3       61
Name: Salinity_Tolerance, dtype: int64
--------------
4    91092
0      575
3      287
2      166
1       

In [42]:
# check rate of na values in low variety columns
for x in four_unique_list:
    print(df[x].isna().sum())
    print("--------------")

0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------


## Check/Factorize/Check med_unique_list

In [43]:
# check rate of na values in high variety columns
for x in med_unique_list:
    print(df[x].isna().sum())
    print("--------------")

0
--------------
0
--------------
0
--------------
0
--------------
48398
--------------
0
--------------
5011
--------------
85730
--------------
72849
--------------
92163
--------------
92108
--------------
4984
--------------
66679
--------------
91862
--------------
92171
--------------
91097
--------------
48971
--------------
56506
--------------
43773
--------------
43775
--------------
43773
--------------
51092
--------------
43858
--------------
50169
--------------
43775
--------------
43775
--------------
59943
--------------
52693
--------------
53845
--------------
91551
--------------
91564
--------------
90735
--------------
86924
--------------
87139
--------------
91060
--------------
92141
--------------
91060
--------------
91092
--------------
91122
--------------
91090
--------------
91117
--------------
91090
--------------
91756
--------------
91091
--------------
91090
--------------
91092
--------------
91093
--------------
91093
--------------
91403
--------

In [44]:
# check name of unique values in high variety columns
for x in med_unique_list:
    print(df[x].value_counts())
    print("--------------")

0        1
61454    1
61452    1
61451    1
61450    1
        ..
30722    1
30721    1
30720    1
30719    1
92170    1
Name: Unnamed: 0, Length: 92171, dtype: int64
--------------
1        1
61455    1
61453    1
61452    1
61451    1
        ..
30723    1
30722    1
30721    1
30720    1
92171    1
Name: id, Length: 92171, dtype: int64
--------------
ABAB     1
PAGY2    1
PAGUB    1
PAGU4    1
PAGU3    1
        ..
EGPR     1
EGLET    1
EGERI    1
EGDE     1
ZYVU     1
Name: Symbol, Length: 92171, dtype: int64
--------------
MIGU      70
IRHEH     69
TEOA      54
DECE      46
CRSU5     44
          ..
LAPOP      1
LAPOH      1
CAGR37     1
LAPO3      1
HEAM6      1
Name: Accepted_Symbol_x, Length: 48398, dtype: int64
--------------
ABAB2     1
PAFEP     1
PAFA6     1
PAFA7     1
PAFA8     1
         ..
ELAR10    1
ELAR11    1
ELAR12    1
ELAR13    1
ZYVU      1
Name: Synonym_Symbol_x, Length: 43773, dtype: int64
--------------
Carex sonomensis Stacey                                 

In [45]:
# split columns by type and use case
med_int_list = ['Seeds_per_Pound','Planting_Density_per_Acre_Minimum','Temperature_Minimum_F','Root_Depth_Minimum_inches','Precipitation_Maximum','Precipitation_Minimum','Planting_Density_per_Acre_Maximum','Planting_Density_per_Acre_Minimum','pH_Maximum','pH_Minimum','Frost_Free_Days_Minimum','Height_Mature_feet','Height_at_Base_Age_Maximum_feet','id']
med_str_list = ['Genus','Fruit_Seed_Period_End','Bloom_Period','Shape_and_Orientation','Growth_Form','Fruit_Color','Foliage_Color','Flower_Color','Scientific_Name_y','Synonym_Symbol_y','Accepted_Symbol_y','Division','Class','SubClass','xOrder','Family_Common_Name','Family_Symbol','Family','Category','Common_Name','Parents','Quadranomial_Author','Trinomial_Author','Genera_Binomial_Author','Forma','Subvariety','Variety','Subspecies','Species','Scientific_Name_x','Synonym_Symbol_x','Accepted_Symbol_x','Symbol']
med_npa_list = ['Active_Growth_Period','State_T_E_Common_Name','State_T_E_Status','Native_Status','Growth_Habit','Duration']
target_npa_col = ['State_Noxious_Common_Name','Invasive','State_Noxious_Status','State_and_Province','Questionable_Taxon_Indicator','Unnamed']

In [46]:
len(med_unique_list)

59

In [47]:
len(med_int_list)

14

In [48]:
len(med_str_list)

33

In [49]:
len(med_npa_list)

6

In [50]:
len(target_npa_col)

6

In [51]:
len(med_int_list) + len(med_str_list) + len(med_npa_list) + len(target_npa_col)

59

## CHECK/FACTORIZE/CHECK med_str_list

In [52]:
# check rate of na values in high variety columns
for x in med_str_list:
    print(df[x].isna().sum())
    print("--------------")

0
--------------
91113
--------------
91098
--------------
91090
--------------
91090
--------------
91117
--------------
91090
--------------
91122
--------------
91060
--------------
92141
--------------
91060
--------------
43775
--------------
43775
--------------
50169
--------------
43858
--------------
51092
--------------
43773
--------------
43775
--------------
43773
--------------
48971
--------------
91097
--------------
91862
--------------
66679
--------------
4984
--------------
92108
--------------
92163
--------------
72849
--------------
85730
--------------
5011
--------------
0
--------------
48398
--------------
0
--------------
0
--------------


In [53]:
# check name of unique values in high variety columns
for x in med_str_list:
    print(df[x].value_counts())
    print("--------------")

Carex          1288
Astragalus      996
Lupinus         906
Salix           851
Eriogonum       801
               ... 
Garysmithia       1
Galatea           1
Sargentia         1
Gaurella          1
Zygocactus        1
Name: Genus, Length: 6877, dtype: int64
--------------
Fall          527
Summer        452
Spring         51
Winter         19
Year Round      9
Name: Fruit_Seed_Period_End, dtype: int64
--------------
Late Spring      335
Mid Spring       182
Early Spring     100
Early Summer      99
Late Summer       87
Spring            85
Mid Summer        84
Summer            64
Indeterminate     22
Fall               7
Late Winter        5
Winter             3
Name: Bloom_Period, dtype: int64
--------------
Erect         708
Semi-Erect    156
Decumbent      79
Rounded        46
Prostrate      36
Irregular      17
Conical        16
Climbing       11
Vase            7
Columnar        3
Oval            2
Name: Shape_and_Orientation, dtype: int64
--------------
Multiple Stem      265


In [54]:
# replace single column value with numerics
for x in med_str_list:
    df[x] = pd.factorize(df[x], na_sentinel=None)[0]

In [55]:
# check name of unique values in high variety columns
for x in med_str_list:
    print(df[x].value_counts())
    print("--------------")

1004    1288
552      996
3862     906
5638     851
2358     801
        ... 
2635       1
2637       1
5663       1
2639       1
6876       1
Name: Genus, Length: 6877, dtype: int64
--------------
5    91113
0      527
1      452
2       51
3       19
4        9
Name: Fruit_Seed_Period_End, dtype: int64
--------------
12    91098
1       335
2       182
6       100
4        99
3        87
7        85
0        84
5        64
11       22
8         7
9         5
10        3
Name: Bloom_Period, dtype: int64
--------------
11    91090
1       708
2       156
4        79
3        46
5        36
7        17
0        16
6        11
9         7
10        3
8         2
Name: Shape_and_Orientation, dtype: int64
--------------
8    91090
1      265
4      248
2      209
3      136
0      130
6       48
5       35
7       10
Name: Growth_Form, dtype: int64
--------------
9    91117
0      721
2      108
4       79
6       45
5       27
1       25
3       21
8       20
7        8
Name: Fruit_Color,

In [56]:
# check rate of na values in high variety columns
for x in med_str_list:
    print(df[x].isna().sum())
    print("--------------")

0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------


In [57]:
df['Active_Growth_Period'].value_counts()

Spring and Summer          701
Spring, Summer, Fall       116
Spring                      89
Summer                      84
Summer and Fall             32
Fall, Winter and Spring     25
Spring and Fall             20
Year Round                  12
Name: Active_Growth_Period, dtype: int64

In [58]:
# check rate of na values in high variety columns
for x in med_npa_list:
    print(df[x].isna().sum())
    print("--------------")

91092
--------------
87139
--------------
86924
--------------
53845
--------------
52693
--------------
59943
--------------


In [59]:
# check rate of na values in high variety columns
for x in med_npa_list:
    print(df[x].value_counts())
    print("--------------")

Spring and Summer          701
Spring, Summer, Fall       116
Spring                      89
Summer                      84
Summer and Fall             32
Fall, Winter and Spring     25
Spring and Fall             20
Year Round                  12
Name: Active_Growth_Period, dtype: int64
--------------
TN (a liverwort)                                                                                                      12
MN (a species of lichen)                                                                                              11
NJ (sphagnum)                                                                                                         10
KY (a moss)                                                                                                           10
AZ (Missouri foxtail cactus)                                                                                           5
                                                                                           

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92171 entries, 0 to 92170
Columns: 135 entries, Unnamed: 0 to Genus
dtypes: float64(13), int64(111), object(11)
memory usage: 94.9+ MB


In [61]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,Symbol,Accepted_Symbol_x,Synonym_Symbol_x,Scientific_Name_x,Hybrid_Genus_Indicator,Hybrid_Species_Indicator,Species,Subspecies_Prefix,...,Naval_Store_Product,Nursery_Stock_Product,Palatable_Browse_Animal,Palatable_Graze_Animal,Palatable_Human,Post_Product,Protein_Potential,Pulpwood_Product,Veneer_Product,Genus
0,0,1,0,0,43773,0,0,0,0,0,...,2,2,3,3,2,2,2,2,2,0
1,1,2,1,1,0,1,0,0,1,0,...,2,2,3,3,2,2,2,2,2,1
2,2,3,2,2,1,2,0,0,2,0,...,2,2,3,3,2,2,2,2,2,0
3,3,4,3,3,43773,3,0,0,3,0,...,2,2,3,3,2,2,2,2,2,2
4,4,5,4,4,2,4,0,0,4,0,...,2,2,3,3,2,2,2,2,2,3


## fillna for med_int_list

In [62]:
# check rate of na values in high variety columns
# check rate of na values in high variety columns
for x in med_int_list:
    print(df[x].isna().sum())
    print("--------------")

91288
--------------
91403
--------------
91092
--------------
91090
--------------
91097
--------------
91097
--------------
91404
--------------
91403
--------------
91093
--------------
91093
--------------
91092
--------------
91091
--------------
91756
--------------
0
--------------


In [63]:
for x in med_int_list:
    df[x].fillna(0,inplace=True)

In [64]:
# check rate of na values in high variety columns
for x in med_int_list:
    print(df[x].isna().sum())
    print("--------------")

0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------


In [65]:
# check rate of na values in high variety columns
for x in med_npa_list:
    print(df[x].isna().sum())
    print("--------------")

91092
--------------
87139
--------------
86924
--------------
53845
--------------
52693
--------------
59943
--------------


In [71]:
target_npa_col = ['State_Noxious_Common_Name',
 'Invasive',
 'State_Noxious_Status',
 'State_and_Province',
 'Questionable_Taxon_Indicator']

In [72]:
target_npa_col

['State_Noxious_Common_Name',
 'Invasive',
 'State_Noxious_Status',
 'State_and_Province',
 'Questionable_Taxon_Indicator']

In [73]:
# df = df.drop(columns=['Unnamed'])
# # 'Unnamed not found'

In [74]:
for x in target_npa_col:
    df[x].fillna(0,inplace=True)

In [75]:
# check rate of na values in high variety columns
for x in target_npa_col:
    print(df[x].isna().sum())
    print("--------------")

0
--------------
0
--------------
0
--------------
0
--------------
0
--------------


In [76]:
# check rate of na values in high variety columns
for x in med_npa_list:
    print(df[x].isna().sum())
    print("--------------")

91092
--------------
87139
--------------
86924
--------------
53845
--------------
52693
--------------
59943
--------------


In [77]:
for x in med_npa_list:
    df[x].fillna(0,inplace=True)

In [78]:
# check rate of na values in high variety columns
for x in med_npa_list:
    print(df[x].isna().sum())
    print("--------------")

0
--------------
0
--------------
0
--------------
0
--------------
0
--------------
0
--------------


In [None]:
# # replace single column value with numeric
# for x in rando_list:
# #     df[x].fillna(0, inplace = True)
# #     df[x].loc[~df[x].isnull()] = 1
#     df[x].fillna(0,inplace=True)
#     df[x]=df[x].apply(lambda x: 1 if x!=0 else 0)
# #     y = (df[x].unique().tolist())
# #     df[x] = df[x].map({y:1})
# #     for z in y:
# #         print(z) 

# #     df[x].replace({y:1})

# #     print(y)
# #     print(len(y)) = 2 x10
# #     df[x].replace({y:1},inplace=True)

## Enumerate columns with categorical arrays

In [79]:
state_abr_list = ['AL','AK','AS','AZ','AR','CA','CO','CT','DE','DC','FM','FL','GA','GU','HI','ID','IL','IN','IA','KS','KY','LA','ME','MH','MD','MA','MI','MN','MS','MO','MT','NE','NV','NH','NJ','NM','NY','NC'
,'ND','MP','OH','OK','OR','PW','PA','PR','RI','SC','SD','TN','TX','UT','VT','VI','VA','WA','WV','WI','WY','L48']

In [None]:
# ## for state in state_abr_list:
# ## df_core_state[state + '_Native'] = df_core_state['Native_Status'].apply(lambda x: in_state(x, state))


# ## for state in state_abr_list:
# ##    df_core[state] = df_core['State_and_Province'].apply(lambda x: in_state(x, state))
        
# ## rework code
# ### make 2 empty lists outside for loop

# ### list 1 = append new columns +Native names
# for state in state_abr_list:
#     df[state + '_Native']=0

In [None]:
# ### list 2 = append new column from lambda loop
# for state in state_abr_list:
#     df_core_state_data = df_core_state['Native_Status'].apply(lambda x: in_state(x, state))

In [None]:
# ### outside of for loop create new df by merging the two SAME SIZE lists
# df_core_merged = pd.merge(df_core_state,df_core_state_data, how='left', on =[])
# pd.merge

# ### concate new df onto old data frame with axis = 1 so it appends to the side not underneath/above

# #### Native_names = []
# #### Lambda_products = []

In [82]:
# # create column for each state in state_abr_list
# for x in state_abr_list:
#     df_core[x]=0

## CREATE CORE DATASET

In [None]:
df_core = pd.DataFrame(df, columns = [
    'Accepted_Symbol_x','Hybrid_Genus_Indicator','Hybrid_Species_Indicator',
    'Species', 'Subspecies_Prefix', 'Variety',
    'Genera_Binomial_Author', 'Common_Name', 'State_and_Province',
    'Category', 'Family_Symbol', 'Family_Common_Name',
    'xOrder', 'SubClass', 'Class', 'Division',
    'SuperDivision', 'SubKingdom', 'Kingdom',
    'Duration', 'Growth_Habit', 'Native_Status',
    'Federal_T_E_Status', 'State_T_E_Status',
    'Genus','Seeds_per_Pound','Planting_Density_per_Acre_Minimum',
    'Temperature_Minimum_F', 'Root_Depth_Minimum_inches',
    'Precipitation_Maximum','Precipitation_Minimum',
    'Planting_Density_per_Acre_Maximum','Planting_Density_per_Acre_Minimum',
    'pH_Maximum','pH_Minimum','Frost_Free_Days_Minimum','Height_Mature_feet',
    'Height_at_Base_Age_Maximum_feet','Coppice_Potential', 'Fall_Conspicuous', 
    'Fire_Resistance', 'Flower_Conspicuous', 'Fruit_Conspicuous', 'Known_Allelopath', 
    'Leaf_Retention','State_Noxious_Status','Toxicity''Fruit_Seed_Abundance',
    'Seed_Spread_Rate', 'Vegetative_Spread_Rate'])


In [None]:
def state_col(string):
    for x in state_abr_list:
        if string.find(x) != -1:
            return True
        else:
            return False

In [None]:
def in_state(string, state):
        if string.find(state) == -1:
            return 0
        else:
            return 1

In [None]:
df_core['State_and_Province'] = df_core['State_and_Province'].astype('string')

In [None]:
df_core['State_and_Province'].isna().value_counts().sort_values()

In [None]:
for state in state_abr_list:
    df_core[state] = df_core['State_and_Province'].apply(lambda x: in_state(x, state))
    if df_core[state].isna().sum() == 92171:
        df_core[state].drop(column=[df_core[state]])

In [None]:
# create a new columns for each state with _Native string added
# without + _Native, loop would create one column and overwrite itself for each loop
for state in state_abr_list:
    df_core_state[state + '_Native'] = df_core_state['Native_Status'].apply(lambda x: in_state(x, state))

In [None]:
# create a new columns for each state with _Native string added
# without + _Native, loop would create one column and overwrite itself for each loop
for state in state_abr_list:
    df_core_state[state + '_Native'] = df_core_state['Native_Status'].apply(lambda x: in_state(x, state))

In [None]:
# # replace single column value with numerics
# for x in med_npa_list:
#     df[x] = pd.factorize(df[x], na_sentinel=None)[0]

In [None]:
# # replace single column value with dummies
# for x in med_npa_list:
#     pd.get_dummies(df[x], dummy_na=True)

# CREATE ENRICHED DATASET

In [None]:
column_list_groups = [['id', 'drop'],
 ['Symbol', 'drop'],
 ['Accepted_Symbol_x', 'shorthand for scientific name'],
 ['Synonym_Symbol_x', 'drop'],
 ['Scientific_Name_x', 'drop'],
 ['Hybrid_Genus_Indicator', 'T/F'],
 ['Hybrid_Species_Indicator', 'T/F'],
 ['Species', 'repeats'],
 ['Subspecies_Prefix', 'T/F'],
 ['Hybrid_Subspecies_Indicator', 'drop'],
 ['Subspecies', 'repeats (similar to species)'],
 ['Variety_Prefix', 'drop'],
 ['Hybrid_Variety_Indicator', 'drop'],
 ['Variety', '72k nulls some overlap with other columns'],
 ['Subvariety_Prefix', 'drop'],
 ['Subvariety', 'drop'],
 ['Forma_Prefix', 'drop'],
 ['Forma', 'drop'],
 ['Genera_Binomial_Author', 'discovering author'],
 ['Trinomial_Author', 'drop'],
 ['Quadranomial_Author', 'drop'],
 ['Questionable_Taxon_Indicator', 'drop'],
 ['Parents', 'drop'],
 ['Common_Name', 'T/F'],
 ['State_and_Province', 'is present column'],
 ['Category', 'fill na'],
 ['Family', 'drop'],
 ['Family_Symbol', 'fill na'],
 ['Family_Common_Name', 'fill na'],
 ['xOrder', 'fill na'],
 ['SubClass', 'fill na'],
 ['Class', 'fill na'],
 ['SubDivision', 'drop'],
 ['Division', 'fill na'],
 ['SuperDivision', 'fill na'],
 ['SubKingdom', 'fill na'],
 ['Kingdom', 'fill na'],
 ['Duration', 'split and fill na'],
 ['Growth_Habit', 'Split dummies and fill na'],
 ['Native_Status', 'more thought'],
 ['Federal_Noxious_Status', 'Combine with State'],
 ['State_Noxious_Status', 'Filter to NC and combine'],
 ['State_Noxious_Common_Name', 'drop'],
 ['Invasive', 'potential target'],
 ['Federal_T_E_Status', 'fill na'],
 ['State_T_E_Status', 'filter to nc'],
 ['State_T_E_Common_Name', 'drop'],
 ['Accepted_Symbol_y', 'drop'],
 ['Synonym_Symbol_y', 'drop'],
 ['Scientific_Name_y', 'drop'],
 ['Active_Growth_Period', 'Enriched Dataset'],
 ['After_Harvest_Regrowth_Rate', 'Enriched Dataset'],
 ['Bloat', 'Enriched Dataset'],
 ['C_N_Ratio', 'Enriched Dataset'],
 ['Coppice_Potential', 'Enriched Dataset'],
 ['Fall_Conspicuous', 'Enriched Dataset'],
 ['Fire_Resistance', 'Enriched Dataset'],
 ['Flower_Color', 'fill na'],
 ['Flower_Conspicuous', 'Enriched Dataset'],
 ['Foliage_Color', 'Enriched Dataset'],
 ['Foliage_Porosity_Summer', 'Enriched Dataset'],
 ['Foliage_Porosity_Winter', 'Enriched Dataset'],
 ['Foliage_Texture', 'Enriched Dataset'],
 ['Fruit_Color', 'Enriched Dataset'],
 ['Fruit_Conspicuous', 'Enriched Dataset'],
 ['Growth_Form', 'Enriched Dataset'],
 ['Growth_Rate', 'Enriched Dataset'],
 ['Height_at_Base_Age_Maximum_feet', 'Enriched Dataset'],
 ['Height_Mature_feet', 'Enriched Dataset'],
 ['Known_Allelopath', 'Enriched Dataset'],
 ['Leaf_Retention', 'Enriched Dataset'],
 ['Lifespan', 'Enriched Dataset'],
 ['Low_Growing_Grass', 'Enriched Dataset'],
 ['Nitrogen_Fixation', 'Enriched Dataset'],
 ['Resprout_Ability', 'Enriched Dataset'],
 ['Shape_and_Orientation', 'Enriched Dataset'],
 ['Toxicity', 'Enriched Dataset'],
 ['Adapted_to_Coarse_Textured_Soils', 'Enriched Dataset'],
 ['Adapted_to_Medium_Textured_Soils', 'Enriched Dataset'],
 ['Adapted_to_Fine_Textured_Soils', 'Enriched Dataset'],
 ['Anaerobic_Tolerance', 'Enriched Dataset'],
 ['CaCO_3_Tolerance', 'Enriched Dataset'],
 ['Cold_Stratification_Required', 'Enriched Dataset'],
 ['Drought_Tolerance', 'Enriched Dataset'],
 ['Fertility_Requirement', 'Enriched Dataset'],
 ['Fire_Tolerance', 'Enriched Dataset'],
 ['Frost_Free_Days_Minimum', 'Enriched Dataset'],
 ['Hedge_Tolerance', 'Enriched Dataset'],
 ['Moisture_Use', 'Enriched Dataset'],
 ['pH_Minimum', 'Enriched Dataset'],
 ['pH_Maximum', 'Enriched Dataset'],
 ['Planting_Density_per_Acre_Minimum', 'Enriched Dataset'],
 ['Planting_Density_per_Acre_Maximum', 'Enriched Dataset'],
 ['Precipitation_Minimum', 'Enriched Dataset'],
 ['Precipitation_Maximum', 'Enriched Dataset'],
 ['Root_Depth_Minimum_inches', 'Enriched Dataset'],
 ['Salinity_Tolerance', 'Enriched Dataset'],
 ['Shade_Tolerance', 'Enriched Dataset'],
 ['Temperature_Minimum_F', 'Enriched Dataset'],
 ['Bloom_Period', 'Enriched Dataset'],
 ['Commercial_Availability', 'Enriched Dataset'],
 ['Fruit_Seed_Abundance', 'Enriched Dataset'],
 ['Fruit_Seed_Period_Begin', 'Enriched Dataset'],
 ['Fruit_Seed_Period_End', 'Enriched Dataset'],
 ['Fruit_Seed_Persistence', 'Enriched Dataset'],
 ['Propogated_by_Bare_Root', 'Enriched Dataset'],
 ['Propogated_by_Bulbs', 'Enriched Dataset'],
 ['Propogated_by_Container', 'Enriched Dataset'],
 ['Propogated_by_Corms', 'Enriched Dataset'],
 ['Propogated_by_Cuttings', 'Enriched Dataset'],
 ['Propogated_by_Seed', 'Enriched Dataset'],
 ['Propogated_by_Sod', 'Enriched Dataset'],
 ['Propogated_by_Sprigs', 'Enriched Dataset'],
 ['Propogated_by_Tubers', 'Enriched Dataset'],
 ['Seeds_per_Pound', 'Enriched Dataset'],
 ['Seed_Spread_Rate', 'Enriched Dataset'],
 ['Seedling_Vigor', 'Enriched Dataset'],
 ['Small_Grain', 'Enriched Dataset'],
 ['Vegetative_Spread_Rate', 'Enriched Dataset'],
 ['Berry_Nut_Seed_Product', 'Enriched Dataset'],
 ['Christmas_Tree_Product', 'Enriched Dataset'],
 ['Fodder_Product', 'Enriched Dataset'],
 ['Fuelwood_Product', 'Enriched Dataset'],
 ['Lumber_Product', 'Enriched Dataset'],
 ['Naval_Store_Product', 'Enriched Dataset'],
 ['Nursery_Stock_Product', 'Enriched Dataset'],
 ['Palatable_Browse_Animal', 'Enriched Dataset'],
 ['Palatable_Graze_Animal', 'Enriched Dataset'],
 ['Palatable_Human', 'Enriched Dataset'],
 ['Post_Product', 'Enriched Dataset'],
 ['Protein_Potential', 'Enriched Dataset'],
 ['Pulpwood_Product', 'Enriched Dataset'],
 ['Veneer_Product', 'Enriched Dataset'],
 ['Genus', 'use']]

In [None]:
# use list comprehension
enriched_dataset = [col[0] for col in column_list_groups if col[1] == 'Enriched Dataset']

In [None]:
enriched_dataset

In [None]:
# Create enriched dataset
df_enriched = pd.DataFrame(df, columns = [enriched_dataset])

In [None]:
df_enriched.head()

In [None]:
# need to trim rows for only 10000 subset with complete enriched data


In [None]:
one_unique_list

In [None]:
two_unique_list

In [None]:
three_unique_list

In [None]:
four_unique_list

In [None]:
df_core.head()

In [None]:
core_column_list = list(df_core)

In [None]:
# for column in core_column_list:
#     print(df_core[column].isna().value_counts().sort_values())
#     print('-------------')
#     print(df_core[column].describe())
#     print('-------------')
#     print('-------------')
#     print('-------------')

In [None]:
df_core.isna().sum()

In [None]:
# values = {'State_and_Province':'unknown','Native_Status':'unknown',}
df_core.fillna('unknown', inplace=True)

In [None]:
df_core.isna().sum()

In [None]:
# # function to id 'Lower 48' states code in super string
# def in_l48(string):
#         if string.find('L48') == -1:
#             return False
#         else:
#             return True

In [None]:
# # function to id 'NC' state code in super string
# def in_nc(string):
#         if string.find('NC') == -1:
#             return False
#         else:
#             return True

In [None]:
# df_core_nc = df_core.copy()

In [None]:
# create df with nc nativity known
# df_core_nc['In_NC'] = df_core['State_and_Province'].apply(in_nc)
# df_core_nc['Native_L48'] = df_core['Native_Status'].apply(in_l48)
# df_core_nc['Native_NC'] = df_core['Native_Status'].apply(in_nc)

In [None]:
list(df_core.columns)

## DF_CORE TARGET COLUMN ISOLATION, PREPROCESS

In [None]:
df_core

In [None]:
df_core_state.head()

In [None]:
df_core_process = df_core_state.copy()

In [None]:
df_core_process

In [None]:
column_process_list = list(df_core_process)

In [None]:
for column in column_process_list:
    print(df_core_process[column].describe())
    print('----------')

In [None]:
core_process = df_core_process.drop(columns = ['AL_Native', 'L48','AS_Native','MA_Native',
'AZ_Native','AR_Native','CO_Native','CT_Native','DE_Native','DC_Native','MN_Native',
'FM_Native','FL_Native','GA_Native','GU_Native','ID_Native','IL_Native','IN_Native',
'IA_Native','KS_Native','KY_Native','LA_Native','ME_Native','MH_Native','MD_Native',
'MI_Native','MO_Native','MS_Native','WY_Native','WI_Native','WV_Native','VA_Native',
'WA_Native','VT_Native','UT_Native','TX_Native','TN_Native','SD_Native','SC_Native',
'RI_Native','PA_Native','PW_Native','OR_Native','OK_Native','OH_Native','MP_Native',
'NC_Native','NY_Native','NM_Native','NJ_Native','NH_Native','NV_Native','NE_Native',
'ND_Native','MT_Native'])

In [None]:
df_core_process

In [None]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)

In [None]:
list(df_core_process)

In [None]:
# df_core_process.unique()

In [None]:
df_core_process.dtypes

In [None]:
list(df_core_process)

In [None]:
for column in column_process_list:
    print(df_core_process[column].unique())
    print('----------')

## DF TARGET COLUMN ISOLATION

In [None]:
# # Drop the label to create the X data
# X = df_core.drop('State_Noxious_Status', axis=1)
# X

In [None]:
X = df_core_process.drop(['State_Noxious_Status','State_and_Province'],axis=1)

In [None]:
X

In [None]:
# One-hot encoding the entire dataframe
# X_dummies = pd.get_dummies(X)
# X_dummies
# 92171 rows × 138303 columns

In [None]:
# One-hot encoding the entire dataframe
# X_dummies = pd.get_dummies(X)
# print(X_dummies.columns)
# X_dummies

## PRODUCES 92171 rows × 149451 columns

In [None]:
# Converting output labels to 0 and 1
# y_label = LabelEncoder().fit_transform(df_core_process['State_Noxious_Status'])
# y_label


In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X_dummies, y_label, random_state=99)

In [None]:
# Scaling the X data by using StandardScaler()
# scaler = StandardScaler()

In [None]:
# X_train_scaled = scaler.fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# X_train_scaled

In [None]:
# # Alternatively, scaling the data by using MinMaxScaler()
# scaler = MinMaxScaler().fit(X_train)
# X_train_scaled = scaler.transform(X_train)
# X_train_scaled

In [None]:
# Transforming the test dataset based on the fit from the training dataset
# X_test_scaled = scaler.transform(X_test)
# X_test_scaled


## Score Models

In [None]:
# clf = RandomForestClassifier(random_state=99, n_estimators=500).fit(X_train, y_train)
# print(f'Training Score: {clf.score(X_train, y_train)}')
# print(f'Testing Score: {clf.score(X_test, y_test)}')