In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
df = pd.read_csv('D:\PROJECTWORSHOP\Soybean_Seeds\Csv data\hat_cuoi_basic_glcm_lbp.csv')
df.head()

Unnamed: 0,area,length,width,length_width_ratio,major_axis_length,minor_axis_length,convex_area,perimeter,r_mean,g_mean,...,1,2,3,4,5,6,7,8,9,class
0,89673,260,459,0.566449,455.995692,254.486127,92794,182549.6827,193.28266,204.596936,...,0.015058,0.009503,0.032839,0.054997,0.056311,0.024992,0.018933,0.756933,0.022592,Kirmizi_Pistachio
1,77242,306,386,0.792746,419.518438,235.842058,80317,155983.7209,171.47201,187.837951,...,0.010692,0.006278,0.029842,0.047219,0.055489,0.022267,0.016508,0.789869,0.016433,Kirmizi_Pistachio
2,91228,368,410,0.897561,490.239529,239.81988,93077,183450.9039,191.164193,194.322412,...,0.013833,0.011358,0.037514,0.063042,0.0586,0.025261,0.016306,0.743939,0.022367,Kirmizi_Pistachio
3,93910,300,456,0.657895,485.497337,249.129221,94954,187818.1997,184.290374,199.038196,...,0.014108,0.010272,0.036328,0.060631,0.060633,0.025722,0.018108,0.745097,0.021761,Kirmizi_Pistachio
4,72568,228,414,0.550725,417.2703,228.51545,75303,148327.506,216.744088,231.689395,...,0.007658,0.003603,0.016697,0.030242,0.041675,0.011364,0.011147,0.862417,0.0111,Kirmizi_Pistachio


In [3]:
X = df.drop(columns=['class'])
y = df['class']

scaler = StandardScaler()
X_scaler = scaler.fit_transform(X)

le = LabelEncoder()
y_scaler = le.fit_transform(y) 


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X_scaler, y_scaler, test_size=0.3, random_state=42)

In [5]:
model_accuracies = {}

KNN

In [7]:
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance']
}

knn = KNeighborsClassifier()
grid_search_knn = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy')

grid_search_knn.fit(X_train, y_train)

print("Best parameters for KNN:", grid_search_knn.best_params_)
print("Best score for KNN:", grid_search_knn.best_score_)

y_pred_knn = grid_search_knn.predict(X_test)
print(classification_report(y_test, y_pred_knn))

Best parameters for KNN: {'n_neighbors': 5, 'weights': 'distance'}
Best score for KNN: 0.9480974529346623
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       372
           1       0.97      0.92      0.94       273

    accuracy                           0.95       645
   macro avg       0.95      0.95      0.95       645
weighted avg       0.95      0.95      0.95       645



SVM

In [8]:
param_grid_svm = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto']
}

svm = SVC()
grid_search_svm = GridSearchCV(svm, param_grid_svm, cv=5, scoring='accuracy')

grid_search_svm.fit(X_train, y_train)

print("Best parameters for SVM:", grid_search_svm.best_params_)
print("Best score for SVM:", grid_search_svm.best_score_)

y_pred_svm = grid_search_svm.predict(X_test)
print(classification_report(y_test, y_pred_svm))

Best parameters for SVM: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Best score for SVM: 0.9713820598006645
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       372
           1       0.96      0.96      0.96       273

    accuracy                           0.97       645
   macro avg       0.97      0.97      0.97       645
weighted avg       0.97      0.97      0.97       645



RF

In [25]:
param_grid_rf = {
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'n_estimators': [100, 300, 500],
    'bootstrap': [True, False],
	'random_state' : [0,2,4]
}

rf = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy')
grid_search_rf.fit(X_train, y_train)

print("Best parameters for Random Forest:", grid_search_rf.best_params_)
print("Best score for Random Forest:", grid_search_rf.best_score_)

y_pred_rf = grid_search_rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))

Best parameters for Random Forest: {'bootstrap': False, 'criterion': 'entropy', 'n_estimators': 300, 'random_state': 0}
Best score for Random Forest: 0.956093023255814
              precision    recall  f1-score   support

           0       0.94      0.96      0.95       372
           1       0.94      0.92      0.93       273

    accuracy                           0.94       645
   macro avg       0.94      0.94      0.94       645
weighted avg       0.94      0.94      0.94       645



LR

In [26]:
param_grid_lr = {
    'penalty': ['l1', 'l2', 'elasticnet', 'none'],
    'C': [0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear', 'saga'],
    'max_iter': [100, 500, 1000]
}

lr = LogisticRegression(random_state=42)
grid_search_lr = GridSearchCV(lr, param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train, y_train)

print("Best parameters for Logistic Regression:", grid_search_lr.best_params_)
print("Best score for Logistic Regression:", grid_search_lr.best_score_)

y_pred_lr = grid_search_lr.predict(X_test)
print(classification_report(y_test, y_pred_lr))



Best parameters for Logistic Regression: {'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Best score for Logistic Regression: 0.9680575858250278
              precision    recall  f1-score   support

           0       0.98      0.96      0.97       372
           1       0.95      0.97      0.96       273

    accuracy                           0.96       645
   macro avg       0.96      0.96      0.96       645
weighted avg       0.96      0.96      0.96       645



420 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\baohu\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\baohu\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\baohu\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\linear_model\_logistic.py", line 1194, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Us

XG Boost

In [27]:
param_grid_xgb = {
    'n_estimators': [100, 300, 500],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.6, 1.0]
}

xgb_model = xgb.XGBClassifier(eval_metric='logloss', random_state=42)
grid_search_xgb = GridSearchCV(xgb_model, param_grid_xgb, cv=5, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train)

print("Best parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best score for XGBoost:", grid_search_xgb.best_score_)

y_pred_xgb = grid_search_xgb.predict(X_test)
print(classification_report(y_test, y_pred_xgb))

Best parameters for XGBoost: {'learning_rate': 0.1, 'n_estimators': 300, 'subsample': 0.6}
Best score for XGBoost: 0.9700598006644519
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       372
           1       0.97      0.95      0.96       273

    accuracy                           0.97       645
   macro avg       0.97      0.97      0.97       645
weighted avg       0.97      0.97      0.97       645



CatBoost

In [28]:
param_grid_catboost = {
    'iterations': [100, 300, 500],
    'learning_rate': [0.01, 0.1, 0.2, 0.3],
    'depth': [4, 6, 8, 10]
}

catboost_model = CatBoostClassifier(verbose=0, random_state=42)
grid_search_catboost = GridSearchCV(catboost_model, param_grid_catboost, cv=5, scoring='accuracy')
grid_search_catboost.fit(X_train, y_train)

print("Best parameters for CatBoost:", grid_search_catboost.best_params_)
print("Best score for CatBoost:", grid_search_catboost.best_score_)

y_pred_catboost = grid_search_catboost.predict(X_test)
print(classification_report(y_test, y_pred_catboost))

Best parameters for CatBoost: {'depth': 4, 'iterations': 500, 'learning_rate': 0.3}
Best score for CatBoost: 0.9700575858250277
              precision    recall  f1-score   support

           0       0.97      0.97      0.97       372
           1       0.96      0.95      0.95       273

    accuracy                           0.96       645
   macro avg       0.96      0.96      0.96       645
weighted avg       0.96      0.96      0.96       645



ET

In [29]:
param_grid_et = {
    'n_estimators': [100, 300, 500],
    'max_depth': [None, 10, 30, 50],
    'bootstrap': [True, False]
}

et = ExtraTreesClassifier(random_state=42)
grid_search_et = GridSearchCV(et, param_grid_et, cv=5, scoring='accuracy')
grid_search_et.fit(X_train, y_train)

print("Best parameters for Extra Trees:", grid_search_et.best_params_)
print("Best score for Extra Trees:", grid_search_et.best_score_)

y_pred_et = grid_search_et.predict(X_test)
print(classification_report(y_test, y_pred_et))

Best parameters for Extra Trees: {'bootstrap': False, 'max_depth': None, 'n_estimators': 500}
Best score for Extra Trees: 0.9507663344407529
              precision    recall  f1-score   support

           0       0.95      0.95      0.95       372
           1       0.93      0.93      0.93       273

    accuracy                           0.94       645
   macro avg       0.94      0.94      0.94       645
weighted avg       0.94      0.94      0.94       645



DT

In [30]:
param_grid_dt = {
    'criterion' : ['gini', 'entropy', 'log_loss'],
    'splitter' : ['best', 'random'],
    'random_state' : [0,1,2,3,4]
}

dt = DecisionTreeClassifier(random_state=42)
grid_search_dt = GridSearchCV(dt, param_grid_dt, cv=5, scoring='accuracy')
grid_search_dt.fit(X_train, y_train)

print("Best parameters for Decision Tree:", grid_search_dt.best_params_)
print("Best score for Decision Tree:", grid_search_dt.best_score_)

y_pred_dt = grid_search_dt.predict(X_test)
print(classification_report(y_test, y_pred_dt))

Best parameters for Decision Tree: {'criterion': 'entropy', 'random_state': 0, 'splitter': 'best'}
Best score for Decision Tree: 0.9234795127353266
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       372
           1       0.89      0.93      0.91       273

    accuracy                           0.92       645
   macro avg       0.92      0.92      0.92       645
weighted avg       0.92      0.92      0.92       645



GNB

In [31]:
param_grid_gnb = {
    'var_smoothing': [1e-09, 1e-05]
}

gnb = GaussianNB()
grid_search_gnb = GridSearchCV(gnb, param_grid_gnb, cv=5, scoring='accuracy')
grid_search_gnb.fit(X_train, y_train)

print("Best parameters for Gaussian Naive Bayes:", grid_search_gnb.best_params_)
print("Best score for Gaussian Naive Bayes:", grid_search_gnb.best_score_)

y_pred_gnb = grid_search_gnb.predict(X_test)
print(classification_report(y_test, y_pred_gnb))

Best parameters for Gaussian Naive Bayes: {'var_smoothing': 1e-09}
Best score for Gaussian Naive Bayes: 0.8310033222591363
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       372
           1       0.82      0.85      0.84       273

    accuracy                           0.86       645
   macro avg       0.86      0.86      0.86       645
weighted avg       0.86      0.86      0.86       645



MLPClassifier

In [32]:
param_grid_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (150, ), (200, )],
    'activation': ['tanh', 'relu', 'logistic', 'identity'],
    'solver': ['adam'],
    'learning_rate': ['constant', 'adaptive', 'invscaling']
}

mlp = MLPClassifier(random_state=42, max_iter=1000)
grid_search_mlp = GridSearchCV(mlp, param_grid_mlp, cv=5, scoring='accuracy')
grid_search_mlp.fit(X_train, y_train)

print("Best parameters for MLPClassifier:", grid_search_mlp.best_params_)
print("Best score for MLPClassifier:", grid_search_mlp.best_score_)

y_pred_mlp = grid_search_mlp.predict(X_test)
print(classification_report(y_test, y_pred_mlp))

Best parameters for MLPClassifier: {'activation': 'relu', 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'solver': 'adam'}
Best score for MLPClassifier: 0.9740509413067553
              precision    recall  f1-score   support

           0       0.98      0.97      0.97       372
           1       0.96      0.97      0.97       273

    accuracy                           0.97       645
   macro avg       0.97      0.97      0.97       645
weighted avg       0.97      0.97      0.97       645



In [34]:
for model, accuracy in model_accuracies.items():
    print(f"{model} Accuracy score: {accuracy:.2f}%")