In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew, boxcox, yeojohnson
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [2]:
file_path = 'DryBeanDataSet.xlsx'
df = pd.read_excel(file_path)
df.replace('?', np.nan, inplace=True)
df.shape

(13611, 22)

In [3]:
# Impute missing values for categorical features using the most frequent value (mode)
mode_imputer_colour = SimpleImputer(strategy='most_frequent')
df['Colour'] = mode_imputer_colour.fit_transform(df[['Colour']])

# Verify imputation
print("Categorical Imputation:")
print(df[['Colour', 'Class']].isnull().sum())

Categorical Imputation:
Colour     0
Class     17
dtype: int64


In [4]:
# Impute missing values using median
imputer_shape_factor = SimpleImputer(strategy='median')
df['ShapeFactor6'] = imputer_shape_factor.fit_transform(df[['ShapeFactor6']])

# Verify imputation
print("ShapeFactor6 Imputation:")
print(df['ShapeFactor6'].isnull().sum())

# Impute missing values using mean (or median if preferred)
imputer_compactness = SimpleImputer(strategy='mean')
df['Compactness'] = imputer_compactness.fit_transform(df[['Compactness']])

# Verify imputation
print("\nCompactness Imputation:")
print(df['Compactness'].isnull().sum())

# Impute missing values using mean
imputer_extent = SimpleImputer(strategy='mean')
df['Extent'] = imputer_extent.fit_transform(df[['Extent']])

# Verify imputation
print("\nExtent Imputation:")
print(df['Extent'].isnull().sum())

# Check for any remaining missing values
missing_values = df.isnull().sum()

# Print results
print("Missing Values After Imputation:")
print(missing_values)

ShapeFactor6 Imputation:
0

Compactness Imputation:
0

Extent Imputation:
0
Missing Values After Imputation:
Area                0
Perimeter           0
MajorAxisLength     0
MinorAxisLength     0
AspectRation        0
Eccentricity        0
ConvexArea          0
Constantness        0
EquivDiameter       0
Colour              0
Extent              0
Solidity            0
roundness           0
Compactness         0
ShapeFactor1        0
ShapeFactor2        0
ShapeFactor3        0
ShapeFactor4        0
ShapeFactor5        0
ShapeFactor6        0
Class              17
Sort order          0
dtype: int64


In [5]:
# Drop rows where 'Class' is missing
df_cleaned = df.dropna(subset=['Class'])

# Check the result
print(df_cleaned.info())
df = df_cleaned
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 13594 entries, 0 to 13610
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             13594 non-null  int64  
 1   Perimeter        13594 non-null  float64
 2   MajorAxisLength  13594 non-null  float64
 3   MinorAxisLength  13594 non-null  float64
 4   AspectRation     13594 non-null  float64
 5   Eccentricity     13594 non-null  float64
 6   ConvexArea       13594 non-null  int64  
 7   Constantness     13594 non-null  int64  
 8   EquivDiameter    13594 non-null  float64
 9   Colour           13594 non-null  object 
 10  Extent           13594 non-null  float64
 11  Solidity         13594 non-null  float64
 12  roundness        13594 non-null  float64
 13  Compactness      13594 non-null  float64
 14  ShapeFactor1     13594 non-null  float64
 15  ShapeFactor2     13594 non-null  float64
 16  ShapeFactor3     13594 non-null  float64
 17  ShapeFactor4

In [6]:
# Assuming you've already loaded the DataFrame as df
class_counts = df['Class'].value_counts()

# Print the counts for each class
print(class_counts)

DERMASON    3542
SIRA        2634
SEKER       2025
HOROZ       1927
CALI        1628
BARBUNYA    1317
BOMBAY       521
Name: Class, dtype: int64


In [7]:
# Check for any remaining missing values
missing_values = df.isnull().sum()

# Print results
print("Missing Values After Imputation and Deletion:")
print(missing_values)

Missing Values After Imputation and Deletion:
Area               0
Perimeter          0
MajorAxisLength    0
MinorAxisLength    0
AspectRation       0
Eccentricity       0
ConvexArea         0
Constantness       0
EquivDiameter      0
Colour             0
Extent             0
Solidity           0
roundness          0
Compactness        0
ShapeFactor1       0
ShapeFactor2       0
ShapeFactor3       0
ShapeFactor4       0
ShapeFactor5       0
ShapeFactor6       0
Class              0
Sort order         0
dtype: int64


In [8]:
df.shape

(13594, 22)

In [9]:
# Initialize the OneHotEncoder
encoder = OneHotEncoder(drop='first', sparse=False)

# Apply OneHotEncoder to the 'Colour' column
encoded_features = encoder.fit_transform(df[['Colour']])

# Get the names of the new columns
encoded_feature_names = encoder.get_feature_names_out(['Colour'])

# Create a DataFrame with the encoded features
encoded_df = pd.DataFrame(encoded_features, columns=encoded_feature_names)

# Reset indices of both DataFrames to ensure proper alignment
encoded_df.reset_index(drop=True, inplace=True)
df.reset_index(drop=True, inplace=True)

# Combine the encoded features with the original dataset (excluding the original 'Colour' column)
final_df = pd.concat([encoded_df, df.drop(['Colour'], axis=1)], axis=1)

# Check the shape of the final DataFrame
print(final_df.shape)

(13594, 24)


In [10]:
df.shape

(13594, 22)

In [11]:
final_df.shape

(13594, 24)

In [12]:
df = final_df
df.shape

(13594, 24)

In [13]:
# Initialize the LabelEncoder
label_encoder = LabelEncoder()

# Fit and transform the 'Class' column to numeric values
df['Class'] = label_encoder.fit_transform(df['Class'])

# To see the mapping of original values to integers
class_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Class Mapping:", class_mapping)
df

Class Mapping: {'BARBUNYA': 0, 'BOMBAY': 1, 'CALI': 2, 'DERMASON': 3, 'HOROZ': 4, 'SEKER': 5, 'SIRA': 6}


Unnamed: 0,Colour_brown,Colour_green,Colour_white,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,...,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,ShapeFactor5,ShapeFactor6,Class,Sort order
0,1.0,0.0,0.0,84648,1132.054,439.915710,248.961941,1.767000,0.824453,86779,...,0.830027,0.746266,0.005197,0.000994,0.556914,3.487008,0.984066,50.809833,2,0.134791
1,0.0,0.0,0.0,39704,736.873,271.659919,186.481404,1.456767,0.727175,40106,...,0.918880,0.827650,0.006842,0.001980,0.685004,1.242335,0.997891,148.508874,3,0.898848
2,0.0,1.0,0.0,35835,687.914,237.704180,192.176759,1.236904,0.588537,36190,...,0.951588,0.898611,0.006633,0.002668,0.807503,2.813489,0.998803,82.107117,5,0.539684
3,1.0,0.0,0.0,223035,1817.278,680.812555,419.994347,1.621004,0.787040,225889,...,0.848673,0.782734,0.003052,0.000707,0.612672,3.074472,0.993144,147.347735,1,0.149130
4,1.0,0.0,0.0,41957,767.143,287.962974,186.166695,1.546802,0.762918,42310,...,0.895904,0.802640,0.006863,0.001757,0.644230,2.213954,0.996497,22.893826,6,0.447635
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13589,0.0,0.0,0.0,23018,578.382,203.761000,144.297607,1.412089,0.706041,23465,...,0.864665,0.840171,0.008852,0.002721,0.705888,2.197446,0.996776,85.821969,3,0.764478
13590,1.0,0.0,0.0,69773,1095.057,387.116310,231.153291,1.674717,0.802155,71055,...,0.731179,0.769941,0.005548,0.001203,0.592809,1.004792,0.992787,152.980591,0,0.679735
13591,1.0,0.0,0.0,55773,968.988,347.920243,204.491191,1.701395,0.809040,57047,...,0.746444,0.765927,0.006238,0.001324,0.586644,2.219590,0.998113,102.437378,0,0.946849
13592,0.0,0.0,1.0,64235,1005.875,409.304636,201.686606,2.029409,0.870169,65018,...,0.797799,0.698706,0.006372,0.000937,0.488190,2.078504,0.990736,174.754039,4,0.046135


In [14]:
# Identify skewed features
skewed_features = df.apply(lambda x: skew(x.dropna()))

# Print the skewness before transformation
print("Skewness before transformation:")
print(skewed_features)

# Define thresholds for skewness
highly_pos_skewed = skewed_features[skewed_features > 2]
highly_neg_skewed = skewed_features[skewed_features < -2]

Skewness before transformation:
Colour_brown         0.204982
Colour_green         1.973463
Colour_white         2.055047
Area                 2.954367
Perimeter            1.627059
MajorAxisLength      1.357995
MinorAxisLength      2.239976
AspectRation         0.582001
Eccentricity        -1.062552
ConvexArea           2.942211
Constantness        -2.719986
EquivDiameter      116.568639
Extent              -0.896482
Solidity            -2.551501
roundness           -0.636396
Compactness          0.037389
ShapeFactor1        -0.534861
ShapeFactor2         0.300939
ShapeFactor3         0.242613
ShapeFactor4         0.007912
ShapeFactor5        -2.759326
ShapeFactor6         0.006011
Class               -0.324392
Sort order          -0.010739
dtype: float64


In [15]:
# Print highly skewed features with their skewness values
print("\nHighly Skewed POSITIVE Features:")
if not highly_pos_skewed.empty:
    for feature in highly_pos_skewed.index:
        print(f"Feature: {feature}, Skewness: {highly_pos_skewed[feature]:.2f}")
else:
    print("No features with positive skewness greater than 2.")

print("\nHighly Skewed NEGATIVE Features:")
if not highly_neg_skewed.empty:
    for feature in highly_neg_skewed.index:
        print(f"Feature: {feature}, Skewness: {highly_neg_skewed[feature]:.2f}")
else:
    print("No features with negative skewness less than -2.")
    
# Define thresholds for skewness
highly_pos_skewed = skewed_features[skewed_features > 2].index
highly_neg_skewed = skewed_features[skewed_features < -2].index    


Highly Skewed POSITIVE Features:
Feature: Colour_white, Skewness: 2.06
Feature: Area, Skewness: 2.95
Feature: MinorAxisLength, Skewness: 2.24
Feature: ConvexArea, Skewness: 2.94
Feature: EquivDiameter, Skewness: 116.57

Highly Skewed NEGATIVE Features:
Feature: Constantness, Skewness: -2.72
Feature: Solidity, Skewness: -2.55
Feature: ShapeFactor5, Skewness: -2.76


In [16]:
# Apply transformations
# Log Transform for positive skew
for feature in highly_pos_skewed:
    df[feature] = np.log1p(df[feature])

# Yeo-Johnson Transform for negative skew
for feature in highly_neg_skewed:
    df[feature], _ = yeojohnson(df[feature])

# Verify the effect of transformation
new_skewed_features = df.apply(lambda x: skew(x.dropna()))

  result = getattr(ufunc, method)(*inputs, **kwargs)
  x = um.multiply(x, x, out=x)


In [17]:
comparison_df = pd.DataFrame({
    'Feature': skewed_features.index,
    'Skewness Before': skewed_features.values,
    'Skewness After': new_skewed_features[skewed_features.index].values
})

# Print the comparison DataFrame
print("\nComparison of Skewness Before and After Transformation:")
print(comparison_df)


Comparison of Skewness Before and After Transformation:
            Feature  Skewness Before  Skewness After
0      Colour_brown         0.204982        0.204982
1      Colour_green         1.973463        1.973463
2      Colour_white         2.055047        2.055047
3              Area         2.954367        1.071932
4         Perimeter         1.627059        1.627059
5   MajorAxisLength         1.357995        1.357995
6   MinorAxisLength         2.239976        1.313619
7      AspectRation         0.582001        0.582001
8      Eccentricity        -1.062552       -1.062552
9        ConvexArea         2.942211        1.066529
10     Constantness        -2.719986       -2.719986
11    EquivDiameter       116.568639        5.563415
12           Extent        -0.896482       -0.896482
13         Solidity        -2.551501       -0.133281
14        roundness        -0.636396       -0.636396
15      Compactness         0.037389        0.037389
16     ShapeFactor1        -0.534861      

In [18]:
# Separate features and target variable
X = df.drop('Class', axis=1)
y = df['Class']

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit and transform the features
X_scaled = scaler.fit_transform(X)

# Convert the scaled features back to a DataFrame
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Combine scaled features with the original target variable
df_scaled = pd.concat([X_scaled_df, y.reset_index(drop=True)], axis=1)

In [19]:
# Initialize the MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the features
X_scaled = scaler.fit_transform(X)

# Convert the scaled features back to a DataFrame
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

# Combine scaled features with the original target variable
df_scaled = pd.concat([X_scaled_df, y.reset_index(drop=True)], axis=1)

In [20]:
df = df_scaled
df.shape

(13594, 24)

In [24]:
df

Unnamed: 0,Colour_brown,Colour_green,Colour_white,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,...,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,ShapeFactor5,ShapeFactor6,Sort order,Class
0,1.0,0.0,0.0,0.563551,0.415791,0.461613,0.535084,0.528043,0.874406,0.563724,...,0.679368,0.304822,0.315253,0.138709,0.259688,0.853507,0.065144,0.283876,0.134716,2
1,0.0,0.0,0.0,0.263520,0.145236,0.158590,0.316767,0.307306,0.733928,0.260302,...,0.856695,0.539541,0.529654,0.456737,0.486626,0.167176,0.726351,0.829727,0.898852,3
2,0.0,1.0,0.0,0.222888,0.111717,0.097437,0.339482,0.150868,0.533720,0.219913,...,0.921972,0.744203,0.502438,0.678501,0.703657,0.647572,0.850971,0.458736,0.539651,5
3,1.0,0.0,0.0,0.947516,0.884918,0.895459,0.930770,0.424164,0.820379,0.939812,...,0.716579,0.409998,0.035771,0.045995,0.358475,0.727370,0.318028,0.823240,0.149057,1
4,1.0,0.0,0.0,0.285394,0.165960,0.187952,0.315492,0.371367,0.785544,0.281333,...,0.810842,0.467409,0.532411,0.384714,0.414387,0.464258,0.570006,0.127907,0.447592,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13589,0.0,0.0,0.0,0.047462,0.036728,0.036307,0.123296,0.275516,0.703408,0.049590,...,0.748496,0.575654,0.791621,0.695523,0.523625,0.459211,0.598342,0.479491,0.764468,3
13590,1.0,0.0,0.0,0.486961,0.390461,0.366523,0.478984,0.462382,0.842206,0.485136,...,0.482093,0.373101,0.361026,0.205929,0.323283,0.094545,0.298848,0.854711,0.679717,0
13591,1.0,0.0,0.0,0.398203,0.304150,0.295932,0.386387,0.481364,0.852149,0.398816,...,0.512557,0.361525,0.450940,0.245139,0.312361,0.465982,0.754892,0.572323,0.946859,0
13592,0.0,0.0,1.0,0.454186,0.329404,0.406483,0.375957,0.714753,0.940424,0.450231,...,0.615049,0.167651,0.468383,0.120162,0.137929,0.422843,0.209039,0.976361,0.046051,4


In [27]:
# Check for NaN values
print("Checking for NaN values:")
print(df.isna().sum())
# Initialize the imputer with median strategy
imputer = SimpleImputer(strategy='median')

# Impute missing values
df['ConvexArea'] = imputer.fit_transform(df[['ConvexArea']])
print(df.isna().sum())

Checking for NaN values:
Colour_brown       0
Colour_green       0
Colour_white       0
Area               0
Perimeter          0
MajorAxisLength    0
MinorAxisLength    0
AspectRation       0
Eccentricity       0
ConvexArea         0
Constantness       0
EquivDiameter      0
Extent             0
Solidity           0
roundness          0
Compactness        0
ShapeFactor1       0
ShapeFactor2       0
ShapeFactor3       0
ShapeFactor4       0
ShapeFactor5       0
ShapeFactor6       0
Sort order         0
Class              0
dtype: int64
Colour_brown       0
Colour_green       0
Colour_white       0
Area               0
Perimeter          0
MajorAxisLength    0
MinorAxisLength    0
AspectRation       0
Eccentricity       0
ConvexArea         0
Constantness       0
EquivDiameter      0
Extent             0
Solidity           0
roundness          0
Compactness        0
ShapeFactor1       0
ShapeFactor2       0
ShapeFactor3       0
ShapeFactor4       0
ShapeFactor5       0
ShapeFactor6     

In [28]:
df.shape

(13594, 24)

In [29]:

X = df.drop(['Class'], axis=1)
y = df['Class']

from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline

# Initialize SMOTETomek
smote_tomek = SMOTETomek(random_state=42)

# Resample the dataset
X_resampled, y_resampled = smote_tomek.fit_resample(X, y)

In [30]:
# Create a DataFrame with the resampled data
resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
resampled_df['Class'] = y_resampled

In [31]:
df = resampled_df
df.shape

(24778, 24)

In [32]:
class_counts = df['Class'].value_counts()

# Print the results
print("Number of observations per class:")
print(class_counts)

Number of observations per class:
3    3542
5    3542
1    3542
4    3542
2    3538
6    3537
0    3535
Name: Class, dtype: int64


In [33]:
# Save the resampled data to an Excel file
output_file_path = 'PROCESSED_DryBeanDataSet.xlsx'
resampled_df.to_excel(output_file_path, index=False, engine='openpyxl')

In [34]:
df

Unnamed: 0,Colour_brown,Colour_green,Colour_white,Area,Perimeter,MajorAxisLength,MinorAxisLength,AspectRation,Eccentricity,ConvexArea,...,roundness,Compactness,ShapeFactor1,ShapeFactor2,ShapeFactor3,ShapeFactor4,ShapeFactor5,ShapeFactor6,Sort order,Class
0,1.0,0.0,0.0,0.563551,0.415791,0.461613,0.535084,0.528043,0.874406,0.563724,...,0.679368,0.304822,0.315253,0.138709,0.259688,0.853507,0.065144,0.283876,0.134716,2
1,0.0,0.0,0.0,0.263520,0.145236,0.158590,0.316767,0.307306,0.733928,0.260302,...,0.856695,0.539541,0.529654,0.456737,0.486626,0.167176,0.726351,0.829727,0.898852,3
2,0.0,1.0,0.0,0.222888,0.111717,0.097437,0.339482,0.150868,0.533720,0.219913,...,0.921972,0.744203,0.502438,0.678501,0.703657,0.647572,0.850971,0.458736,0.539651,5
3,1.0,0.0,0.0,0.947516,0.884918,0.895459,0.930770,0.424164,0.820379,0.939812,...,0.716579,0.409998,0.035771,0.045995,0.358475,0.727370,0.318028,0.823240,0.149057,1
4,1.0,0.0,0.0,0.285394,0.165960,0.187952,0.315492,0.371367,0.785544,0.281333,...,0.810842,0.467409,0.532411,0.384714,0.414387,0.464258,0.570006,0.127907,0.447592,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24773,1.0,0.0,0.0,0.287562,0.164524,0.179253,0.333024,0.328060,0.751961,0.285199,...,0.830546,0.513371,0.512609,0.417506,0.460161,0.577602,0.494199,0.510934,0.880818,6
24774,1.0,0.0,0.0,0.309608,0.179557,0.192633,0.357810,0.321075,0.744666,0.306551,...,0.829118,0.518394,0.486970,0.405174,0.465465,0.683431,0.295885,0.803042,0.957172,6
24775,1.0,0.0,0.0,0.340549,0.201465,0.213344,0.386105,0.322223,0.746904,0.337226,...,0.825842,0.518442,0.454303,0.382393,0.465288,0.177490,0.394451,0.511542,0.501793,6
24776,1.0,0.0,0.0,0.305506,0.177047,0.195320,0.343775,0.345986,0.766270,0.301628,...,0.826982,0.493782,0.500194,0.389715,0.440580,0.457582,0.498552,0.070115,0.680242,6
