In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import make_scorer, accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, ParameterGrid
from sklearn import tree, metrics
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier

In [2]:
df = pd.read_csv('D:/FDEC/soybean.csv')
df.tail()

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,class
678,april,,,,,,upper-areas,,,,...,,,,,,,,,,2-4-d-injury
679,april,lt-normal,,lt-norm,,diff-lst-year,scattered,,,,...,,dna,,,,,,,rotted,herbicide-injury
680,june,lt-normal,,lt-norm,,diff-lst-year,scattered,,,,...,,dna,,,,,,,rotted,herbicide-injury
681,april,lt-normal,,lt-norm,,same-lst-yr,whole-field,,,,...,,dna,,,,,,,rotted,herbicide-injury
682,june,lt-normal,,lt-norm,,same-lst-yr,whole-field,,,,...,,dna,,,,,,,rotted,herbicide-injury


In [3]:
df.isna().values.any()

True

In [4]:
df.isna().sum().sum()

2337

In [5]:
df.shape

(683, 36)

In [6]:
original = df.copy()

In [7]:
target = df[['class']]

In [8]:
target.isna().any()
#no missing values in the target so no need to impute it.

class    False
dtype: bool

In [9]:
df.drop(columns='class', inplace=True)

In [10]:
#to handle NaN for categorical variables, we'll use the most frequent value in each variable
categ_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

In [11]:
data= categ_pipeline.fit_transform(df)

In [12]:
#we will encode the label but no imputing since it doesn't contain any missing values
le = LabelEncoder()
y = le.fit_transform(target)

  return f(**kwargs)


In [13]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=0)

In [14]:
#let's use the randomforest and calculate the accuracy using the train/test split
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_score = rf.predict(X_test)
accuracy = rf.score(X_test, y_test)
print(accuracy)
print(f1_score(y_test, y_score, average='weighted'))

0.9562043795620438
0.9547487792013339


The accuracy is high about 97%

In [15]:
#let's use the cross validation for the randomforest
score_5 = cross_val_score(rf, data, y, cv=5)
print(score_5.mean())
score_10 = cross_val_score(rf, data, y, cv=10)
print(score_10.mean())

0.9283490768570202




0.9399403239556692


We see that the accuracy is high about 97% and the score using 10 folds is better than using 5 folds not the same results as we got using weka


let's tune the hyperparameters to get better performance:

In [16]:
param_grid = {
    'n_estimators': [i for i in range(10, 100, 10)],
    'criterion': ['gini', 'entropy'],
    'max_features': ['auto', 'sqrt', 'log2']
}

grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=5)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)

The best estimator here using 5 folds is the RandomForest with entropy and max features as sqrt and n_estimators 90

In [18]:
grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid, cv=10)
grid.fit(X_train, y_train)
print(grid.best_params_)
print(grid.best_score_)
print()



{'criterion': 'entropy', 'max_features': 'auto', 'n_estimators': 50}
0.9303367003367005



In [22]:
print('the best estimator for using 10 folds is:', grid.best_estimator_)

the best estimator for using 10 folds is: RandomForestClassifier(criterion='entropy', n_estimators=50)


let's use the bagging using decision tree

In [24]:
bc = BaggingClassifier(tree.DecisionTreeClassifier(), max_samples=0.5, max_features=0.5, n_estimators=200)
bc.fit(X_train, y_train)
y_score = bc.predict(X_test).copy()
accuracy = bc.score(X_test, y_test)

print(accuracy)
print(f1_score(y_test, y_score, average='weighted'))

0.9635036496350365
0.9645823195458232


The accuracy is 96% and the f1_score is 96% so the model performs well

In [27]:
#let's use the cross validation and see the scores
score_5 = cross_val_score(bc, data, y, cv=5)
print(score_5.mean())
score_10 = cross_val_score(bc, data, y, cv=10)
print(score_10.mean())

0.9224774581365391




0.9282608695652174


THe model performs good but not better than the split train/test because the accuracy is lower also for the f1_score but the model is not generelized since using train/test split it overfits than using cross validation so it could not perform better on unseen data.

let's us the adaaboost with the decision tree as base classifier

In [28]:
abc = AdaBoostClassifier(tree.DecisionTreeClassifier(), n_estimators=500)
abc.fit(X_train, y_train)
y_score = abc.predict(X_test).copy()
accuracy = abc.score(X_test, y_test)

print(accuracy)
print(f1_score(y_test, y_score, average='weighted'))

0.9635036496350365
0.9641899057957452


Here the accuracy and f_score are the same as using the bagging on train and test set

In [29]:
score_5 = cross_val_score(abc, data, y, cv=5)
print(score_5.mean())
score_10 = cross_val_score(abc, data, y, cv=10)
print(score_10.mean())

0.9224667239158437




0.9310954816709293


The score here is higher than using the bagging but for using the train/test split it is lower but more generelizing as model it can perform better on unseen data than using the train/test split that use all its data

In [33]:
#let's use the gradientboost on the decisiontree classifier
gbc = GradientBoostingClassifier(n_estimators=200)
gbc.fit(X_train, y_train)
y_score = gbc.predict(X_test).copy()
accuracy = gbc.score(X_test, y_test)

print(accuracy)
print(f1_score(y_test, y_score, average='weighted'))

0.9124087591240876
0.9082923633840493


The accuracy and f-score here are lower than the other classifiers since we used lower number for estimator but it stays a good model

In [35]:
score_5 = cross_val_score(gbc, data, y, cv=5)
print(score_5.mean())
score_10 = cross_val_score(gbc, data, y, cv=10)
print(score_10.mean())

0.9298089308716186




0.9223785166240409


The score using the 5 folds is better than 10 folds is the same results we got using weka and it is better than using train/test split since it is more generelizing model.

In [36]:
#Let's use the votingclassifier to ensemble all the models we use
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
bag = BaggingClassifier()
bag.fit(X_train, y_train)
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)
grad = GradientBoostingClassifier()
grad.fit(X_train, y_train)

GradientBoostingClassifier()

In [37]:
eclf1 = VotingClassifier(estimators=[('rf', rf), ('bc', bag), ('abc', ada), ('gbc', grad)])
eclf1.fit(X_train, y_train)
y_pred = eclf1.predict(X_test)

In [38]:
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, average="weighted"))
print(cross_val_score(rf, data, y, cv=5).mean())
print(cross_val_score(rf, data, y, cv=10).mean())

0.948905109489051
0.9479412315178738
0.9356590811507084




0.9370417732310315


We see that the accuracy of the ensembling method using the voting with all classifiers is overall better than each model appart.