In [53]:
import numpy as np
import pandas as pd

import lazypredict
from lazypredict.Supervised import LazyClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [54]:
# Read the raw tree dataset from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer="Dataset/Tree_Data.csv")
df.head(5)

Unnamed: 0,No,Plot,Subplot,Species,Light_ISF,Light_Cat,Core,Soil,Adult,Sterile,...,AMF,EMF,Phenolics,Lignin,NSC,Census,Time,Event,Harvest,Alive
0,126,1,C,Acer saccharum,0.11,Med,2017,Prunus serotina,I,Non-Sterile,...,22.0,,-0.56,13.86,12.15,4,14.0,1.0,,
1,11,1,C,Quercus alba,0.11,Med,2017,Quercus rubra,970,Non-Sterile,...,15.82,31.07,5.19,20.52,19.29,33,115.5,0.0,,X
2,12,1,C,Quercus rubra,0.11,Med,2017,Prunus serotina,J,Non-Sterile,...,24.45,28.19,3.36,24.74,15.01,18,63.0,1.0,,
3,2823,7,D,Acer saccharum,0.08,Med,2016,Prunus serotina,J,Non-Sterile,...,22.23,,-0.71,14.29,12.36,4,14.0,1.0,,
4,5679,14,A,Acer saccharum,0.06,Low,2017,Prunus serotina,689,Non-Sterile,...,21.15,,-0.58,10.85,11.2,4,14.0,1.0,,


In [55]:
# Print the per-column dtype and non-null count summary of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2783 entries, 0 to 2782
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   No           2783 non-null   int64  
 1   Plot         2783 non-null   int64  
 2   Subplot      2783 non-null   object 
 3   Species      2783 non-null   object 
 4   Light_ISF    2783 non-null   float64
 5   Light_Cat    2783 non-null   object 
 6   Core         2783 non-null   int64  
 7   Soil         2783 non-null   object 
 8   Adult        2783 non-null   object 
 9   Sterile      2783 non-null   object 
 10  Conspecific  2783 non-null   object 
 11  Myco         2783 non-null   object 
 12  SoilMyco     2783 non-null   object 
 13  PlantDate    2783 non-null   object 
 14  AMF          2783 non-null   float64
 15  EMF          1283 non-null   float64
 16  Phenolics    2783 non-null   float64
 17  Lignin       2783 non-null   float64
 18  NSC          2783 non-null   float64
 19  Census

In [58]:
# Names of the text (object-dtype) columns that are inspected and encoded later on
columns = [
    "Subplot",
    "Species",
    "Soil",
    "Adult",
    "Sterile",
    "Conspecific",
    "Myco",
    "SoilMyco",
]

In [59]:
# Count the distinct values in every column
df.nunique()

No             2783
Plot             18
Subplot           5
Species           4
Light_ISF        53
Light_Cat         3
Core              2
Soil              7
Adult            36
Sterile           2
Conspecific       3
Myco              2
SoilMyco          3
PlantDate        19
AMF             924
EMF             682
Phenolics       494
Lignin         1095
NSC             998
Census           22
Time             22
Event             2
Harvest           1
Alive             1
dtype: int64

In [60]:
# Missing-value count per column (isna is the same check as isnull)
df.isna().sum()

No                0
Plot              0
Subplot           0
Species           0
Light_ISF         0
Light_Cat         0
Core              0
Soil              0
Adult             0
Sterile           0
Conspecific       0
Myco              0
SoilMyco          0
PlantDate         0
AMF               0
EMF            1500
Phenolics         0
Lignin            0
NSC               0
Census            0
Time              0
Event             1
Harvest        2079
Alive          2292
dtype: int64

In [61]:
# Number of rows in the frame
df.shape[0]

2783

In [62]:
# Remove the listed columns from the working frame
df = df.drop(
    ["No", "EMF", "Harvest", "Alive", "PlantDate", "Light_Cat"],
    axis="columns",
)

In [63]:
# Boolean mask marking the rows whose Event value is missing
df["Event"].isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
2778    False
2779    False
2780    False
2781     True
2782    False
Name: Event, Length: 2783, dtype: bool

In [64]:
# Discard every row that has no Event value
df = df.dropna(axis="index", how="any", subset=["Event"])

In [65]:
#df = df.fillna(0)
#df.head()

In [66]:
# Pairwise correlations between the numeric columns only.
# df still contains object-dtype columns (Subplot, Species, Soil, ...); on pandas >= 2.0
# a bare df.corr() raises on those, so the numeric columns are selected explicitly.
df.select_dtypes(include="number").corr()

Unnamed: 0,Plot,Light_ISF,Core,AMF,Phenolics,Lignin,NSC,Census,Time,Event
Plot,1.0,0.25,0.04,0.08,0.04,-0.01,0.03,-0.06,-0.06,-0.07
Light_ISF,0.25,1.0,0.05,0.1,0.1,0.08,0.23,0.02,0.02,-0.05
Core,0.04,0.05,1.0,-0.06,0.05,0.09,0.05,0.05,0.05,-0.05
AMF,0.08,0.1,-0.06,1.0,-0.12,-0.3,-0.18,-0.09,-0.09,0.13
Phenolics,0.04,0.1,0.05,-0.12,1.0,0.77,0.79,0.38,0.38,-0.58
Lignin,-0.01,0.08,0.09,-0.3,0.77,1.0,0.55,0.29,0.29,-0.52
NSC,0.03,0.23,0.05,-0.18,0.79,0.55,1.0,0.37,0.37,-0.41
Census,-0.06,0.02,0.05,-0.09,0.38,0.29,0.37,1.0,1.0,-0.23
Time,-0.06,0.02,0.05,-0.09,0.38,0.29,0.37,1.0,1.0,-0.23
Event,-0.07,-0.05,-0.05,0.13,-0.58,-0.52,-0.41,-0.23,-0.23,1.0


In [68]:
# Separate the feature matrix (everything except Event) from the Event target
X, y = df.drop(columns=["Event"]), df["Event"]

In [69]:
#X.iloc[:, 0]

In [70]:
# Hold out 25% of the rows (scikit-learn's default share) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)
X_train.head(5)

Unnamed: 0,Plot,Subplot,Species,Light_ISF,Core,Soil,Adult,Sterile,Conspecific,Myco,SoilMyco,AMF,Phenolics,Lignin,NSC,Census,Time
361,6,C,Prunus serotina,0.09,2017,Sterile,H,Sterile,Sterilized,AMF,Sterile,8.73,0.06,7.69,11.93,7,24.5
1845,12,B,Prunus serotina,0.06,2016,Quercus rubra,1688,Non-Sterile,Heterospecific,AMF,EMF,21.36,0.68,6.47,10.46,12,42.0
1337,18,A,Acer saccharum,0.14,2016,Quercus alba,1478,Non-Sterile,Heterospecific,AMF,EMF,16.14,0.31,14.16,13.01,7,24.5
1601,11,B,Acer saccharum,0.06,2017,Quercus rubra,970,Non-Sterile,Heterospecific,AMF,EMF,13.31,0.01,10.11,10.79,11,38.5
2600,16,C,Quercus alba,0.13,2016,Populus grandidentata,285,Non-Sterile,Heterospecific,EMF,EMF,30.28,5.26,18.88,23.64,15,52.5


In [71]:
# Fit LazyClassifier's battery of models on the raw split and display the score table
clf_laz = LazyClassifier(ignore_warnings=True, verbose=0)
lazy_model, lazy_pred_str = clf_laz.fit(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)
lazy_model

100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:02<00:00, 11.68it/s]

[LightGBM] [Info] Number of positive: 1185, number of negative: 901
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1224
[LightGBM] [Info] Number of data points in the train set: 2086, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.568073 -> initscore=0.273993
[LightGBM] [Info] Start training from score 0.273993





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
LGBMClassifier,0.98,0.97,0.97,0.98,0.06
RandomForestClassifier,0.97,0.97,0.97,0.97,0.22
XGBClassifier,0.97,0.97,0.97,0.97,0.06
BaggingClassifier,0.97,0.97,0.97,0.97,0.06
AdaBoostClassifier,0.96,0.96,0.96,0.96,0.14
ExtraTreesClassifier,0.96,0.96,0.96,0.96,0.17
DecisionTreeClassifier,0.95,0.95,0.95,0.95,0.02
ExtraTreeClassifier,0.94,0.93,0.93,0.94,0.02
KNeighborsClassifier,0.87,0.86,0.86,0.87,0.05
LabelPropagation,0.84,0.84,0.84,0.84,0.29


In [93]:
import seaborn as sns
import lightgbm as lgb
from sklearn.metrics import classification_report

In [94]:
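# Disabled: LightGBM rejects the raw frame because its object-dtype columns
# have not been encoded yet (see the ValueError shown below).
#clf_lgb = lgb.LGBMClassifier()
#clf_lgb.fit(X_train, y_train)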

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: Subplot: object, Species: object, Soil: object, Adult: object, Sterile: object, Conspecific: object, Myco: object, SoilMyco: object

In [72]:
# Display the list of text column names that are about to be inspected
columns

['Subplot',
 'Species',
 'Soil',
 'Adult',
 'Sterile',
 'Conspecific',
 'Myco',
 'SoilMyco']

In [73]:
# Print the distinct values of each text column, one array per column
for col_name in columns:
    distinct_values = df[col_name].unique()
    print(distinct_values)

['C' 'D' 'A' 'B' 'E']
['Acer saccharum' 'Quercus alba' 'Quercus rubra' 'Prunus serotina']
['Prunus serotina' 'Quercus rubra' 'Acer rubrum' 'Populus grandidentata'
 'Sterile' 'Acer saccharum' 'Quercus alba']
['I' '970' 'J' '689' '1332' '891' '1595' '1323' '394' '561' '1478' '1320'
 '1454' '921' '984' '118' '1757' '1384' '1688' '961' '1715' '50' '1468'
 '1201' '1386' '277' '415' '285' '275' '1205' '1330' '1297' '1326' 'H'
 '1027' 'G']
['Non-Sterile' 'Sterile']
['Heterospecific' 'Sterilized' 'Conspecific']
['AMF' 'EMF']
['AMF' 'EMF' 'Sterile']


In [75]:
# Work on an independent copy so the encoding steps that follow change df_m only.
# A slice (df[:]) is tracked by pandas as derived from df, so assigning columns
# on it triggers SettingWithCopyWarning; an explicit copy avoids that.
df_m = df.copy()

In [77]:
# Encode the subplot letters A..E as the integers 1..5
df_m["Subplot"] = df_m["Subplot"].map(dict(zip("ABCDE", range(1, 6))))

In [78]:
# Integer codes for each remaining low-cardinality text column, keyed by column name
_category_codes = {
    "Species": {
        "Acer saccharum": 1,
        "Quercus alba": 2,
        "Quercus rubra": 3,
        "Prunus serotina": 4,
    },
    "Soil": {
        "Prunus serotina": 1,
        "Quercus rubra": 2,
        "Acer rubrum": 3,
        "Populus grandidentata": 4,
        "Sterile": 5,
        "Acer saccharum": 6,
        "Quercus alba": 7,
    },
    "Sterile": {"Non-Sterile": 1, "Sterile": 2},
    "Conspecific": {"Heterospecific": 1, "Sterilized": 2, "Conspecific": 3},
    "Myco": {"AMF": 1, "EMF": 2},
    "SoilMyco": {"AMF": 1, "EMF": 2, "Sterile": 3},
}

# Replace each column with its coded version; labels missing from a mapping become NaN
for _column, _codes in _category_codes.items():
    df_m[_column] = df_m[_column].map(_codes)

In [79]:
# Pairwise correlations between the numeric columns only.
# Adult is still an object-dtype column at this point; on pandas >= 2.0 a bare
# df_m.corr() raises on it, so the numeric columns are selected explicitly.
df_m.select_dtypes(include="number").corr()

Unnamed: 0,Plot,Subplot,Species,Light_ISF,Core,Soil,Sterile,Conspecific,Myco,SoilMyco,AMF,Phenolics,Lignin,NSC,Census,Time,Event
Plot,1.0,0.01,0.0,0.25,0.04,-0.0,-0.01,0.0,0.01,-0.01,0.08,0.04,-0.01,0.03,-0.06,-0.06,-0.07
Subplot,0.01,1.0,-0.0,0.09,0.04,-0.0,0.02,0.05,0.01,0.01,-0.01,0.01,0.02,0.02,0.0,0.0,-0.0
Species,0.0,-0.0,1.0,-0.0,0.07,0.0,0.0,0.01,-0.01,0.0,0.47,0.05,-0.2,-0.2,-0.02,-0.02,-0.04
Light_ISF,0.25,0.09,-0.0,1.0,0.05,-0.0,-0.01,0.01,0.0,-0.01,0.1,0.1,0.08,0.23,0.02,0.02,-0.05
Core,0.04,0.04,0.07,0.05,1.0,-0.22,0.1,0.06,0.09,-0.02,-0.06,0.05,0.09,0.05,0.05,0.05,-0.05
Soil,-0.0,-0.0,0.0,-0.0,-0.22,1.0,0.22,0.1,-0.01,0.31,-0.04,0.05,-0.02,0.01,-0.02,-0.02,-0.0
Sterile,-0.01,0.02,0.0,-0.01,0.1,0.22,1.0,0.33,0.03,0.76,-0.38,-0.07,-0.0,-0.04,0.04,0.04,-0.08
Conspecific,0.0,0.05,0.01,0.01,0.06,0.1,0.33,1.0,0.0,0.22,-0.17,-0.03,0.03,-0.1,-0.01,-0.01,0.0
Myco,0.01,0.01,-0.01,0.0,0.09,-0.01,0.03,0.0,1.0,0.02,-0.24,0.94,0.88,0.72,0.38,0.38,-0.6
SoilMyco,-0.01,0.01,0.0,-0.01,-0.02,0.31,0.76,0.22,0.02,1.0,-0.31,-0.02,-0.01,-0.04,0.01,0.01,-0.05


In [86]:
# Contingency table counting rows for every (Adult, Event) combination
CrosstabResult = pd.crosstab(df_m["Adult"], df_m["Event"])
print(CrosstabResult)

Event  0.00  1.00
Adult            
1027     32    48
118      37    32
1201     24    41
1205     34    47
1297     39    43
1320     37    41
1323     34    45
1326     24    34
1330     32    51
1332     32    33
1384     25    36
1386     28    37
1454     37    48
1468     26    38
1478     37    48
1595     37    40
1688     38    40
1715     30    50
1757     38    48
275      37    51
277      28    40
285      24    38
394      23    41
415      39    43
50       25    38
561      32    50
689      43    46
891      25    38
921      44    44
961      34    50
970      44    44
984      41    49
G        25    58
H        34    54
I        39    51
J        37    52


In [87]:
# Import the chi-square test of independence for contingency tables
from scipy.stats import chi2_contingency

# Run the chi-square test on the Adult x Event contingency table
ChiSqResult = chi2_contingency(CrosstabResult)

# H0: the two cross-tabulated variables (Adult and Event) are independent.
# The p-value is the probability of a table at least this extreme if H0 holds;
# it is not the probability that H0 is true.
# If p-value > 0.05 we fail to reject H0 at the 5% level.

# The p-value is the second element of the returned result
print('The P-Value of the ChiSq Test is:', ChiSqResult[1])

The P-Value of the ChiSq Test is: 0.8717554419074809


In [96]:
from sklearn.preprocessing import LabelEncoder

# Turn the Adult labels into integer codes; the fitted encoder stays available as `le`
le = LabelEncoder()
df_m["Adult"] = le.fit_transform(df_m["Adult"]).astype("int")

In [97]:
# Preview the first five rows of the encoded frame
df_m.head(5)

Unnamed: 0,Plot,Subplot,Species,Light_ISF,Core,Soil,Adult,Sterile,Conspecific,Myco,SoilMyco,AMF,Phenolics,Lignin,NSC,Census,Time,Event
0,1,3,1,0.11,2017,1,34,1,1,1,1,22.0,-0.56,13.86,12.15,4,14.0,1.0
1,1,3,2,0.11,2017,2,30,1,1,2,2,15.82,5.19,20.52,19.29,33,115.5,0.0
2,1,3,3,0.11,2017,1,35,1,1,2,1,24.45,3.36,24.74,15.01,18,63.0,1.0
3,7,4,1,0.08,2016,1,35,1,1,1,1,22.23,-0.71,14.29,12.36,4,14.0,1.0
4,14,1,1,0.06,2017,1,26,1,1,1,1,21.15,-0.58,10.85,11.2,4,14.0,1.0


In [100]:
# Separate the encoded feature matrix from the Event target
X_m, y_m = df_m.drop(columns=["Event"]), df_m["Event"]

In [101]:
# Hold out 25% of the encoded rows (scikit-learn's default share) with a fixed seed
X_m_train, X_m_test, y_m_train, y_m_test = train_test_split(
    X_m,
    y_m,
    test_size=0.25,
    random_state=42,
)
X_m_train.head(5)

Unnamed: 0,Plot,Subplot,Species,Light_ISF,Core,Soil,Adult,Sterile,Conspecific,Myco,SoilMyco,AMF,Phenolics,Lignin,NSC,Census,Time
361,6,3,4,0.09,2017,5,33,2,2,1,3,8.73,0.06,7.69,11.93,7,24.5
1845,12,2,4,0.06,2016,2,16,1,1,1,2,21.36,0.68,6.47,10.46,12,42.0
1337,18,1,1,0.14,2016,7,14,1,1,1,2,16.14,0.31,14.16,13.01,7,24.5
1601,11,2,1,0.06,2017,2,30,1,1,1,2,13.31,0.01,10.11,10.79,11,38.5
2600,16,3,2,0.13,2016,4,21,1,1,2,2,30.28,5.26,18.88,23.64,15,52.5


In [102]:
# Train a LightGBM classifier with default hyperparameters on the encoded training split
model_lgb = lgb.LGBMClassifier()
model_lgb.fit(X=X_m_train, y=y_m_train)

[LightGBM] [Info] Number of positive: 1185, number of negative: 901
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1205
[LightGBM] [Info] Number of data points in the train set: 2086, number of used features: 17
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.568073 -> initscore=0.273993
[LightGBM] [Info] Start training from score 0.273993


In [103]:
# Predict the held-out rows and print per-class precision, recall and F1
predictions = model_lgb.predict(X_m_test)
report_text = classification_report(y_true=y_m_test, y_pred=predictions)
print(report_text)

              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97       294
         1.0       0.97      0.98      0.98       402

    accuracy                           0.97       696
   macro avg       0.97      0.97      0.97       696
weighted avg       0.97      0.97      0.97       696



In [14]:
#for i in range(X.shape[1]):
    #print(df.columns[i])
    #np.corrcoef(X.iloc[:, i], y)

In [89]:
#scaler = StandardScaler().fit(X_train)
#X_train_scaled = scaler.transform(X_train)
#X_test_scaled = scaler.transform(X_test)

In [None]:
#clf_laz_str = LazyClassifier(verbose=0, ignore_warnings=True)
#lazy_model_str, lazy_pred_str = clf_laz_str.fit(X_train_scaled, X_test_scaled, y_train, y_test)
#lazy_model_str