#### Data: Dry Bean

In [1]:
import pandas as pd
import numpy as np

from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
dry_bean = fetch_ucirepo(id=602) 
  
# data (as pandas dataframes) 
X = dry_bean.data.features 
y = dry_bean.data.targets 

In [2]:
X.shape, y.shape

((13611, 16), (13611, 1))

In [3]:
# concating the two data extracted from the source
data = pd.concat([X,y], axis = 1)

In [4]:
data.shape

(13611, 17)

In [5]:
data.head(3)

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER
1,28734,638.018,200.524796,182.734419,1.097356,0.411785,29172,191.272751,0.783968,0.984986,0.887034,0.953861,0.006979,0.003564,0.909851,0.99843,SEKER
2,29380,624.11,212.82613,175.931143,1.209713,0.562727,29690,193.410904,0.778113,0.989559,0.947849,0.908774,0.007244,0.003048,0.825871,0.999066,SEKER


In [6]:
data.columns

Index(['Area', 'Perimeter', 'MajorAxisLength', 'MinorAxisLength',
       'AspectRatio', 'Eccentricity', 'ConvexArea', 'EquivDiameter', 'Extent',
       'Solidity', 'Roundness', 'Compactness', 'ShapeFactor1', 'ShapeFactor2',
       'ShapeFactor3', 'ShapeFactor4', 'Class'],
      dtype='object')

In [7]:
data.isnull().sum()

Area               0
Perimeter          0
MajorAxisLength    0
MinorAxisLength    0
AspectRatio        0
Eccentricity       0
ConvexArea         0
EquivDiameter      0
Extent             0
Solidity           0
Roundness          0
Compactness        0
ShapeFactor1       0
ShapeFactor2       0
ShapeFactor3       0
ShapeFactor4       0
Class              0
dtype: int64

In [8]:
data.Class.value_counts()

Class
DERMASON    3546
SIRA        2636
SEKER       2027
HOROZ       1928
CALI        1630
BARBUNYA    1322
BOMBAY       522
Name: count, dtype: int64

In [9]:
unseen_data = data.sample(frac=0.15, random_state=42)
df = data.drop(unseen_data.index)

In [10]:
unseen_data.to_csv('./Project/drybean_unseen.csv', index=False)
df.to_csv('./Project/Dry_Bean.csv', index=False)

In [11]:
data = pd.read_csv('./Project/Dry_Bean.csv')

In [12]:
data.head(1)

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER


In [13]:
from pycaret.classification import ClassificationExperiment
s = ClassificationExperiment()
s.setup(data, target = 'Class', session_id = 42)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Class
2,Target type,Multiclass
3,Target mapping,"BARBUNYA: 0, BOMBAY: 1, CALI: 2, DERMASON: 3, HOROZ: 4, SEKER: 5, SIRA: 6"
4,Original data shape,"(11569, 17)"
5,Transformed data shape,"(11569, 17)"
6,Transformed train set shape,"(8098, 17)"
7,Transformed test set shape,"(3471, 17)"
8,Numeric features,16
9,Preprocess,True


<pycaret.classification.oop.ClassificationExperiment at 0x21df2d104d0>

In [14]:
best = s.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.9302,0.0,0.9302,0.9308,0.9302,0.9155,0.9157,16.261
lightgbm,Light Gradient Boosting Machine,0.928,0.0,0.928,0.9287,0.9281,0.9128,0.913,1.252
gbc,Gradient Boosting Classifier,0.9262,0.0,0.9262,0.9269,0.9262,0.9106,0.9107,10.861
rf,Random Forest Classifier,0.9221,0.0,0.9221,0.9227,0.922,0.9056,0.9058,0.863
et,Extra Trees Classifier,0.9195,0.0,0.9195,0.9201,0.9195,0.9025,0.9026,0.23
qda,Quadratic Discriminant Analysis,0.909,0.0,0.909,0.9138,0.9095,0.8901,0.8912,0.023
lda,Linear Discriminant Analysis,0.9023,0.0,0.9023,0.9138,0.9042,0.8818,0.8838,0.022
dt,Decision Tree Classifier,0.8971,0.0,0.8971,0.8981,0.8971,0.8755,0.8758,0.075
lr,Logistic Regression,0.8668,0.0,0.8668,0.8686,0.8668,0.8385,0.8388,1.383
ridge,Ridge Classifier,0.8558,0.0,0.8558,0.8679,0.8481,0.8238,0.8314,0.021


In [15]:
s.models()

Unnamed: 0_level_0,Name,Reference,Turbo
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
lr,Logistic Regression,sklearn.linear_model._logistic.LogisticRegression,True
knn,K Neighbors Classifier,sklearn.neighbors._classification.KNeighborsCl...,True
nb,Naive Bayes,sklearn.naive_bayes.GaussianNB,True
dt,Decision Tree Classifier,sklearn.tree._classes.DecisionTreeClassifier,True
svm,SVM - Linear Kernel,sklearn.linear_model._stochastic_gradient.SGDC...,True
rbfsvm,SVM - Radial Kernel,sklearn.svm._classes.SVC,False
gpc,Gaussian Process Classifier,sklearn.gaussian_process._gpc.GaussianProcessC...,False
mlp,MLP Classifier,sklearn.neural_network._multilayer_perceptron....,False
ridge,Ridge Classifier,sklearn.linear_model._ridge.RidgeClassifier,True
rf,Random Forest Classifier,sklearn.ensemble._forest.RandomForestClassifier,True


In [16]:
lightgbm = s.create_model('lightgbm')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9259,0.0,0.9259,0.9287,0.9265,0.9103,0.9107
1,0.9346,0.0,0.9346,0.9352,0.9347,0.9207,0.9208
2,0.9333,0.0,0.9333,0.9333,0.9332,0.9192,0.9193
3,0.9346,0.0,0.9346,0.9344,0.9345,0.9208,0.9209
4,0.9136,0.0,0.9136,0.9146,0.9139,0.8954,0.8955
5,0.9235,0.0,0.9235,0.9238,0.9236,0.9074,0.9074
6,0.9346,0.0,0.9346,0.9348,0.9343,0.9207,0.9209
7,0.9136,0.0,0.9136,0.914,0.9133,0.8953,0.8956
8,0.9184,0.0,0.9184,0.9186,0.9184,0.9011,0.9012
9,0.9481,0.0,0.9481,0.9498,0.9485,0.9371,0.9373


In [17]:
lightgbm

In [18]:
tune_lightgbm = s.tune_model(lightgbm)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9358,0.0,0.9358,0.9379,0.9363,0.9222,0.9225
1,0.9383,0.0,0.9383,0.9392,0.9385,0.9252,0.9253
2,0.9309,0.0,0.9309,0.9307,0.9306,0.9163,0.9164
3,0.9346,0.0,0.9346,0.9344,0.9342,0.9208,0.9209
4,0.9074,0.0,0.9074,0.9078,0.9076,0.888,0.888
5,0.9173,0.0,0.9173,0.9184,0.9175,0.8999,0.9
6,0.9333,0.0,0.9333,0.9339,0.9329,0.9192,0.9196
7,0.921,0.0,0.921,0.9218,0.9205,0.9043,0.9047
8,0.9221,0.0,0.9221,0.9223,0.9219,0.9056,0.9057
9,0.9493,0.0,0.9493,0.9505,0.9497,0.9386,0.9387


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [19]:
final_lightgbm = s.finalize_model(tune_lightgbm)

In [20]:
final_lightgbm

In [21]:
s.evaluate_model(final_lightgbm)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [22]:
data = s.predict_model(final_lightgbm)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.9381,0.9958,0.9381,0.9381,0.938,0.925,0.925




In [23]:
data[data['Class'] !=data['prediction_label']]

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class,prediction_label,prediction_score
11080,37307,728.892029,267.065674,179.108551,1.491083,0.741771,37886,217.946655,0.741631,0.984717,0.882417,0.816079,0.007159,0.001959,0.665985,0.993038,DERMASON,SIRA,0.5022
6304,34189,687.252014,249.062119,174.993652,1.423264,0.711575,34559,208.640335,0.730066,0.989294,0.909629,0.837704,0.007285,0.002213,0.701748,0.998772,SIRA,DERMASON,0.9604
3280,45666,789.770020,274.863342,211.885132,1.297228,0.636987,46074,241.130173,0.748378,0.991145,0.920029,0.877273,0.006019,0.002199,0.769608,0.998357,CALI,SEKER,0.9971
6139,63083,993.172974,384.437622,210.901138,1.822833,0.836087,63902,283.407440,0.752996,0.987184,0.803660,0.737200,0.006094,0.001110,0.543464,0.990644,HOROZ,CALI,0.5806
11243,38471,726.153992,266.091248,184.475708,1.442419,0.720668,38875,221.320572,0.806791,0.989608,0.916824,0.831747,0.006917,0.002042,0.691803,0.997870,DERMASON,SIRA,0.5391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5800,57371,905.573975,352.953033,209.091858,1.688029,0.805639,58240,270.272125,0.674921,0.985079,0.879133,0.765745,0.006152,0.001305,0.586366,0.989802,HOROZ,SIRA,0.3459
4766,41486,753.172974,278.307251,190.358139,1.462019,0.729496,41862,229.829529,0.805366,0.991018,0.919014,0.825812,0.006708,0.001925,0.681966,0.997047,HOROZ,SIRA,0.5735
11113,37514,725.846985,269.881165,177.418228,1.521158,0.753547,37920,218.550476,0.793309,0.989293,0.894773,0.809803,0.007194,0.001908,0.655780,0.997545,DERMASON,SIRA,0.5074
6285,73451,1081.600952,421.208191,222.836227,1.890214,0.848597,74535,305.811584,0.644714,0.985457,0.788993,0.726034,0.005735,0.000983,0.527126,0.996380,HOROZ,BARBUNYA,0.6689


In [24]:
data = pd.read_csv('./Project/Dry_Bean.csv')

In [25]:
data.head(1)

Unnamed: 0,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRatio,Eccentricity,ConvexArea,EquivDiameter,Extent,Solidity,Roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,Class
0,28395,610.291,208.178117,173.888747,1.197191,0.549812,28715,190.141097,0.763923,0.988856,0.958027,0.913358,0.007332,0.003147,0.834222,0.998724,SEKER


In [26]:
from pycaret.classification import setup, models, create_model, tune_model, finalize_model, save_model, evaluate_model
cls = setup(data, target='Class', session_id = 42)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,Class
2,Target type,Multiclass
3,Target mapping,"BARBUNYA: 0, BOMBAY: 1, CALI: 2, DERMASON: 3, HOROZ: 4, SEKER: 5, SIRA: 6"
4,Original data shape,"(11569, 17)"
5,Transformed data shape,"(11569, 17)"
6,Transformed train set shape,"(8098, 17)"
7,Transformed test set shape,"(3471, 17)"
8,Numeric features,16
9,Preprocess,True


In [27]:
lightgbm = create_model('lightgbm')
lightgbm

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9259,0.0,0.9259,0.9287,0.9265,0.9103,0.9107
1,0.9346,0.0,0.9346,0.9352,0.9347,0.9207,0.9208
2,0.9333,0.0,0.9333,0.9333,0.9332,0.9192,0.9193
3,0.9346,0.0,0.9346,0.9344,0.9345,0.9208,0.9209
4,0.9136,0.0,0.9136,0.9146,0.9139,0.8954,0.8955
5,0.9235,0.0,0.9235,0.9238,0.9236,0.9074,0.9074
6,0.9346,0.0,0.9346,0.9348,0.9343,0.9207,0.9209
7,0.9136,0.0,0.9136,0.914,0.9133,0.8953,0.8956
8,0.9184,0.0,0.9184,0.9186,0.9184,0.9011,0.9012
9,0.9481,0.0,0.9481,0.9498,0.9485,0.9371,0.9373


In [28]:
tuned_lightgbm = tune_model(lightgbm)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9358,0.0,0.9358,0.9379,0.9363,0.9222,0.9225
1,0.9383,0.0,0.9383,0.9392,0.9385,0.9252,0.9253
2,0.9309,0.0,0.9309,0.9307,0.9306,0.9163,0.9164
3,0.9346,0.0,0.9346,0.9344,0.9342,0.9208,0.9209
4,0.9074,0.0,0.9074,0.9078,0.9076,0.888,0.888
5,0.9173,0.0,0.9173,0.9184,0.9175,0.8999,0.9
6,0.9333,0.0,0.9333,0.9339,0.9329,0.9192,0.9196
7,0.921,0.0,0.921,0.9218,0.9205,0.9043,0.9047
8,0.9221,0.0,0.9221,0.9223,0.9219,0.9056,0.9057
9,0.9493,0.0,0.9493,0.9505,0.9497,0.9386,0.9387


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [29]:
final_lightgbm = finalize_model(tuned_lightgbm)
final_lightgbm

In [30]:
save_model(final_lightgbm, './Project/Dry_Bean_Unseen')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('label_encoding',
                  TransformerWrapperWithInverse(exclude=None, include=None,
                                                transformer=LabelEncoder())),
                 ('numerical_imputer',
                  TransformerWrapper(exclude=None,
                                     include=['Area', 'Perimeter',
                                              'MajorAxisLength',
                                              'MinorAxisLength', 'AspectRatio',
                                              'Eccentricity', 'ConvexArea',
                                              'EquivDiameter', 'Extent',
                                              'Solidity', 'Roundn...
                                 boosting_type='gbdt', class_weight=None,
                                 colsample_bytree=1.0, feature_fraction=1.0,
                                 importance_type='split', learning_rate=0.1,
                      

In [31]:
import joblib

from sklearn.metrics import classification_report
test = pd.read_csv('./Project/drybean_unseen.csv')

In [32]:
test.shape

(2042, 17)

In [33]:
cls = joblib.load('./Project/Dry_Bean_Unseen.pkl')
cls

In [34]:
unseen_X = test.drop('Class', axis=1)
y = test['Class']

In [35]:
y_pred = cls.predict(unseen_X)
y_pred



0          SEKER
1       BARBUNYA
2          SEKER
3          SEKER
4       DERMASON
          ...   
2037    BARBUNYA
2038       HOROZ
2039        SIRA
2040       SEKER
2041       SEKER
Name: Class, Length: 2042, dtype: object

In [36]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

    BARBUNYA       0.92      0.91      0.91       204
      BOMBAY       1.00      1.00      1.00        82
        CALI       0.95      0.95      0.95       243
    DERMASON       0.89      0.94      0.92       494
       HOROZ       0.98      0.97      0.98       303
       SEKER       0.97      0.94      0.95       306
        SIRA       0.89      0.86      0.87       410

    accuracy                           0.93      2042
   macro avg       0.94      0.94      0.94      2042
weighted avg       0.93      0.93      0.93      2042



This classification result is showing the performance of a model in predicting different classes in Dry Bean dataset. Here's the explanationa of each metric:

Precision: It measures how many of the predicted instances of a class are actually correct. i.e, for class BARBUNYA, the precision is 0.92, which means that out of all instances predicted as BARBUNYA, 92% were correct.

Recall: It measures how many of the actual instances of a class were predicted correctly by the model. therefore, for class DERMASON, the recall is 0.94, which means that out of all actual instances of DERMASON, 94% were predicted correctly by the model.

F1-score: It is the harmonic mean of precision and recall, providing a single score that balances both metrics. It's useful when we want to consider both precision and recall together.

Support: It indicates the number of actual instances of each class in Dry Bean dataset.

Accuracy: It measures the overall correctness of the model across all classes. Here, the accuracy is 0.93, which means that the model correctly predicted 93% of all instances in Dry Bean dataset.

The "macro avg" and "weighted avg" are the averages of precision, recall, and F1-score across all classes in Dry Bean dataset. "Macro avg" gives equal weight to each class, while "weighted avg" takes into account the class imbalance by weighting each class's score by its support.

Overall, this classification result suggests that the model has good performance, with high precision, recall, and F1-score for most classes, and a high overall accuracy of 93%.