# Handling Class Imbalance
This notebook deals with handling class imbalance in our data.

In [1]:
import os
import cv2
from skimage.feature import greycomatrix, greycoprops
from skimage.segmentation import slic
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
import numpy as np
import pandas as pd

In [2]:
feature_files_folder = r'E:\Btech project\leaf_disease\feature_files'

In [3]:
os.listdir(feature_files_folder)

['training.csv',
 'features(binary_classify_RGB_equalized).csv',
 'features(binary_classify).csv',
 'features(disease).csv',
 'features(multiclass_classify).csv',
 'features(otsu).csv',
 'features.csv',
 'Image_path_labels.csv',
 'Leaf_data.csv',
 'Leaf_disease_path.csv',
 'Mask_path_labels.csv',
 'mask_prediction_features_and_labels.csv',
 'Balanced_binary_features(RGB).csv',
 '.ipynb_checkpoints',
 'Balanced_multiclass_features(RGB).csv',
 'features(multiclass_classify_RGB_equalized).csv',
 'features(binary_classify)(RGB).csv',
 'features(multiclass_classify)(RGB).csv']

## 1. Trying to equalize the class distributions
Use: `sklearn.utils.resample` to equalize the class distributions by udersampling the majority class and makign all classes equal to the clas with the least number of examples.

In [4]:
binary_features = pd.read_csv(feature_files_folder + '\\features(binary_classify)(RGB).csv') 

In [5]:
binary_features.head()

Unnamed: 0,name,contrast,dissimilarity,homogeneity,ASM,energy,label
0,DSC_0100_Brown spot_10_binary.npy,2.761285,0.125333,0.999509,0.978454,0.989168,0
1,DSC_0100_Brown spot_11_binary.npy,3.17254,0.144,0.999435,0.975907,0.98788,0
2,DSC_0100_Brown spot_12_binary.npy,2.526282,0.114667,0.99955,0.979848,0.989873,0
3,DSC_0100_Brown spot_13_binary.npy,2.585033,0.117333,0.99954,0.97911,0.9895,0
4,DSC_0100_Brown spot_14_binary.npy,2.643783,0.12,0.999529,0.978127,0.989003,0


In [6]:
binary_features.loc[:, 'contrast':'energy'] = binary_features.loc[:, 'contrast':'energy']/binary_features.loc[:, 'contrast':'energy'].max(axis=0)

In [7]:
binary_features.head()

Unnamed: 0,name,contrast,dissimilarity,homogeneity,ASM,energy,label
0,DSC_0100_Brown spot_10_binary.npy,0.225653,0.11864,0.999565,0.988419,0.994193,0
1,DSC_0100_Brown spot_11_binary.npy,0.259261,0.136309,0.999492,0.985846,0.992898,0
2,DSC_0100_Brown spot_12_binary.npy,0.206448,0.108543,0.999607,0.989827,0.9949,0
3,DSC_0100_Brown spot_13_binary.npy,0.211249,0.111067,0.999596,0.989082,0.994526,0
4,DSC_0100_Brown spot_14_binary.npy,0.216051,0.113591,0.999586,0.988089,0.994026,0


In [8]:
binary_features = binary_features.sample(frac=1).reset_index(drop=True)

In [9]:
binary_features.head()

Unnamed: 0,name,contrast,dissimilarity,homogeneity,ASM,energy,label
0,DSC_0406_Bacterial leaf blight_42_binary.npy,0.016406,0.034532,0.992309,0.976344,0.988101,0
1,DSC_0316_Leaf smut_82_binary.npy,0.052344,0.033285,0.998278,0.98442,0.992179,0
2,DSC_0397_Bacterial leaf blight_36_binary.npy,0.059599,0.040517,0.997362,0.985846,0.992898,0
3,DSC_0395_Bacterial leaf blight_50_binary.npy,0.018247,0.062973,0.986909,0.965891,0.982798,1
4,DSC_0303_Brown spot_35_binary.npy,0.029912,0.021672,0.9981,0.991275,0.995628,0


In [10]:
binary_features['label'].value_counts()

0    6943
1    2481
Name: label, dtype: int64

In [11]:
diseased = binary_features[binary_features['label'] == 1]
non_diseased = binary_features[binary_features['label'] == 0]

In [12]:
diseased.head()

Unnamed: 0,name,contrast,dissimilarity,homogeneity,ASM,energy,label
3,DSC_0395_Bacterial leaf blight_50_binary.npy,0.018247,0.062973,0.986909,0.965891,0.982798,1
10,DSC_0314_Leaf smut_49_binary.npy,0.009354,0.031648,0.992564,0.984426,0.992182,1
14,DSC_0402_Bacterial leaf blight_54_binary.npy,0.021815,0.050722,0.990764,0.972096,0.985949,1
17,DSC_0121_Brown spot_38_binary.npy,0.047268,0.065976,0.994243,0.994123,0.997057,1
18,DSC_0308_Leaf smut_49_binary.npy,0.018807,0.037463,0.994134,0.987832,0.993897,1


In [13]:
non_diseased.head()

Unnamed: 0,name,contrast,dissimilarity,homogeneity,ASM,energy,label
0,DSC_0406_Bacterial leaf blight_42_binary.npy,0.016406,0.034532,0.992309,0.976344,0.988101,0
1,DSC_0316_Leaf smut_82_binary.npy,0.052344,0.033285,0.998278,0.98442,0.992179,0
2,DSC_0397_Bacterial leaf blight_36_binary.npy,0.059599,0.040517,0.997362,0.985846,0.992898,0
4,DSC_0303_Brown spot_35_binary.npy,0.029912,0.021672,0.9981,0.991275,0.995628,0
5,DSC_0313_Leaf smut_66_binary.npy,0.083058,0.052962,0.997063,0.974817,0.987328,0


In [14]:
?resample

[1;31mSignature:[0m [0mresample[0m[1;33m([0m[1;33m*[0m[0marrays[0m[1;33m,[0m [1;33m**[0m[0moptions[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Resample arrays or sparse matrices in a consistent way

The default strategy implements one step of the bootstrapping
procedure.

Parameters
----------
*arrays : sequence of indexable data-structures
    Indexable data-structures can be arrays, lists, dataframes or scipy
    sparse matrices with consistent first dimension.

Other Parameters
----------------
replace : boolean, True by default
    Implements resampling with replacement. If False, this will implement
    (sliced) random permutations.

n_samples : int, None by default
    Number of samples to generate. If left to None this is
    automatically set to the first dimension of the arrays.
    If replace is False it should not be larger than the length of
    arrays.

random_state : int, RandomState instance or None, optional (default=None)
    The seed

In [15]:
non_diseased_downsampled = resample(non_diseased, 
                                    n_samples=len(diseased), 
                                    replace=True, random_state=42)

In [16]:
len(non_diseased_downsampled)

2481

In [17]:
non_diseased_downsampled.head()

Unnamed: 0,name,contrast,dissimilarity,homogeneity,ASM,energy,label
1151,DSC_0302_Brown spot_39_binary.npy,0.028726,0.043466,0.993142,0.981461,0.990687,0
7312,DSC_0307_Brown spot_33_binary.npy,0.030539,0.053664,0.991589,0.977925,0.988901,0
7095,DSC_0376_Bacterial leaf blight_22_binary.npy,0.048636,0.032347,0.998258,0.990269,0.995123,0
7051,DSC_0397_Bacterial leaf blight_21_binary.npy,0.02642,0.019208,0.998351,0.992853,0.99642,0
5142,DSC_0121_Brown spot_23_binary.npy,0.112273,0.059776,0.999673,0.987073,0.993515,0


In [18]:
label_df = pd.concat([non_diseased_downsampled, diseased])

In [19]:
label_df.head()

Unnamed: 0,name,contrast,dissimilarity,homogeneity,ASM,energy,label
1151,DSC_0302_Brown spot_39_binary.npy,0.028726,0.043466,0.993142,0.981461,0.990687,0
7312,DSC_0307_Brown spot_33_binary.npy,0.030539,0.053664,0.991589,0.977925,0.988901,0
7095,DSC_0376_Bacterial leaf blight_22_binary.npy,0.048636,0.032347,0.998258,0.990269,0.995123,0
7051,DSC_0397_Bacterial leaf blight_21_binary.npy,0.02642,0.019208,0.998351,0.992853,0.99642,0
5142,DSC_0121_Brown spot_23_binary.npy,0.112273,0.059776,0.999673,0.987073,0.993515,0


In [20]:
len(label_df)

4962

In [21]:
label_df.head(20)

Unnamed: 0,name,contrast,dissimilarity,homogeneity,ASM,energy,label
1151,DSC_0302_Brown spot_39_binary.npy,0.028726,0.043466,0.993142,0.981461,0.990687,0
7312,DSC_0307_Brown spot_33_binary.npy,0.030539,0.053664,0.991589,0.977925,0.988901,0
7095,DSC_0376_Bacterial leaf blight_22_binary.npy,0.048636,0.032347,0.998258,0.990269,0.995123,0
7051,DSC_0397_Bacterial leaf blight_21_binary.npy,0.02642,0.019208,0.998351,0.992853,0.99642,0
5142,DSC_0121_Brown spot_23_binary.npy,0.112273,0.059776,0.999673,0.987073,0.993515,0
4219,DSC_0305_Brown spot_67_binary.npy,0.03088,0.046416,0.992939,0.979632,0.989764,0
7774,DSC_0100_Brown spot_18_binary.npy,0.211249,0.111067,0.999596,0.98934,0.994656,0
8494,DSC_0506_Leaf smut_64_binary.npy,0.156381,0.083089,0.9995,0.99277,0.996378,0
625,DSC_0308_Leaf smut_38_binary.npy,0.064744,0.042643,0.997443,0.986388,0.993171,0
7232,DSC_0700_Bacterial leaf blight_3_binary.npy,0.050518,0.02656,0.999946,0.991916,0.99595,0


In [22]:
label_df = label_df.sample(frac=1).reset_index(drop=True)

In [23]:
label_df.head(20)

Unnamed: 0,name,contrast,dissimilarity,homogeneity,ASM,energy,label
0,DSC_0703_Bacterial leaf blight_30_binary.npy,0.055018,0.035129,0.998448,0.98717,0.993564,0
1,DSC_0500_Leaf smut_43_binary.npy,0.375537,0.330866,0.990392,0.982297,0.991109,1
2,DSC_0302_Brown spot_63_binary.npy,0.025859,0.029368,0.996936,0.994946,0.99747,0
3,DSC_0405_Bacterial leaf blight_14_binary.npy,0.070272,0.048437,0.996582,0.980302,0.990102,1
4,DSC_0400_Bacterial leaf blight_50_binary.npy,0.025616,0.063429,0.990068,0.981858,0.990887,1
5,DSC_0328_Leaf smut_1_binary.npy,0.040343,0.030795,0.997109,0.976517,0.988189,1
6,DSC_0314_Leaf smut_49_binary.npy,0.009354,0.031648,0.992564,0.984426,0.992182,1
7,DSC_0301_Brown spot_72_binary.npy,0.025807,0.022078,0.997766,0.988204,0.994085,0
8,DSC_0335_Leaf smut_50_binary.npy,0.016223,0.028696,0.995579,0.98925,0.99461,1
9,DSC_0501_Leaf smut_51_binary.npy,0.347389,0.375902,0.987102,0.977708,0.988791,1


In [24]:
label_df.to_csv(feature_files_folder + '\\features(binary_classify_RGB_equalized).csv')

In [25]:
multiclass_features = pd.read_csv(feature_files_folder + '\\features(multiclass_classify)(RGB).csv')

In [26]:
multiclass_features.head()

Unnamed: 0,name,contrast,dissimilarity,homogeneity,ASM,energy,label
0,DSC_0100_Brown spot_34_multiclass.npy,3.223639,0.170918,0.998858,0.974272,0.987052,1
1,DSC_0100_Brown spot_35_multiclass.npy,2.065076,0.112742,0.999272,0.980139,0.99002,1
2,DSC_0100_Brown spot_43_multiclass.npy,2.726757,0.145004,0.99868,0.974958,0.987399,1
3,DSC_0100_Brown spot_45_multiclass.npy,0.822008,0.128753,0.987073,0.963744,0.981705,1
4,DSC_0100_Brown spot_46_multiclass.npy,1.567647,0.160889,0.988569,0.968246,0.983995,1


In [27]:
blb = multiclass_features[multiclass_features['label'] == 0]
bs = multiclass_features[multiclass_features['label'] == 1]
ls = multiclass_features[multiclass_features['label'] == 2]

In [28]:
multiclass_features['label'].value_counts()

2    1116
1     761
0     604
Name: label, dtype: int64

In [29]:
ls_downsampled = resample(ls, n_samples=len(blb), replace=True, random_state=42)
bs_downsampled = resample(bs, n_samples=len(blb), replace=True, random_state=42)

In [30]:
assert len(ls_downsampled) == len(bs_downsampled) == len(blb)

In [31]:
multiclass_features = pd.concat([ls_downsampled, bs_downsampled, blb])

In [32]:
multiclass_features.head()

Unnamed: 0,name,contrast,dissimilarity,homogeneity,ASM,energy,label
2122,DSC_0503_Leaf smut_70_multiclass.npy,0.304603,0.071268,0.992627,0.972032,0.985917,2
2357,DSC_0515_Leaf smut_4_multiclass.npy,0.451353,0.072484,0.996129,0.973151,0.986484,2
2306,DSC_0512_Leaf smut_43_multiclass.npy,1.162301,0.168514,0.987514,0.960598,0.980101,2
813,DSC_0313_Leaf smut_33_multiclass.npy,0.26012,0.043005,0.995596,0.98564,0.992794,2
1180,DSC_0328_Leaf smut_62_multiclass.npy,0.91479,0.059909,0.996419,0.964109,0.98189,2


In [33]:
assert len(multiclass_features) == len(blb)*3

In [34]:
multiclass_features['label'].value_counts()

2    604
1    604
0    604
Name: label, dtype: int64

In [35]:
multiclass_features = multiclass_features.sample(frac=1).reset_index(drop=True)

In [36]:
multiclass_features.loc[:, 'contrast':'energy'] = multiclass_features.loc[:, 'contrast':'energy']/multiclass_features.loc[:, 'contrast':'energy'].max(axis=0)

In [37]:
multiclass_features.head(20)

Unnamed: 0,name,contrast,dissimilarity,homogeneity,ASM,energy,label
0,DSC_0105_Brown spot_43_multiclass.npy,0.339724,0.37481,0.986995,0.977902,0.988889,1
1,DSC_0502_Leaf smut_67_multiclass.npy,0.558719,0.50289,0.992058,0.987643,0.993802,2
2,DSC_0516_Leaf smut_28_multiclass.npy,0.079,0.292625,0.972375,0.923497,0.960987,2
3,DSC_0317_Leaf smut_53_multiclass.npy,0.016351,0.039045,0.99574,0.992301,0.996143,2
4,DSC_0379_Bacterial leaf blight_61_multiclass.npy,0.022831,0.044952,0.995465,0.987238,0.993599,0
5,DSC_0377_Bacterial leaf blight_38_multiclass.npy,0.034834,0.053678,0.996154,0.990227,0.995101,0
6,DSC_0306_Brown spot_72_multiclass.npy,0.144055,0.12137,0.991662,0.976495,0.988177,1
7,DSC_0383_Bacterial leaf blight_37_multiclass.npy,0.070546,0.176224,0.983544,0.957146,0.978338,0
8,DSC_0114_Brown spot_43_multiclass.npy,0.390102,0.348134,0.993678,0.99221,0.996097,1
9,DSC_0312_Leaf smut_39_multiclass.npy,0.071044,0.127323,0.989363,0.969824,0.984796,2


In [38]:
multiclass_features.to_csv(feature_files_folder + '\\features(multiclass_classify_RGB_equalized).csv')

### Results: 
1. Binary Classification: 69% (`CatBoost`)
2. Multi-class Classification: 72% (`Extra Trees`)

## 2. SMOTE Oversampling 

In [39]:
from imblearn.over_sampling import SMOTE

In [40]:
?SMOTE

[1;31mInit signature:[0m
[0mSMOTE[0m[1;33m([0m[1;33m
[0m    [0msampling_strategy[0m[1;33m=[0m[1;34m'auto'[0m[1;33m,[0m[1;33m
[0m    [0mrandom_state[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mk_neighbors[0m[1;33m=[0m[1;36m5[0m[1;33m,[0m[1;33m
[0m    [0mn_jobs[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Class to perform over-sampling using SMOTE.

This object is an implementation of SMOTE - Synthetic Minority
Over-sampling Technique as presented in [1]_.

Read more in the :ref:`User Guide <smote_adasyn>`.

Parameters
----------
sampling_strategy : float, str, dict or callable, default='auto'
    Sampling information to resample the data set.

    - When ``float``, it corresponds to the desired ratio of the number of
      samples in the minority class over the number of samples in the
      majority class after resampling. Therefore, the ratio is expressed as
      