In [1]:
import pandas  as pd
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import  SMOTENC

In [2]:
# Load data (last 10 columns) and drop missing values
df=pd.read_csv("sample_dataset.csv").iloc[:,-10:].dropna()


In [3]:
X=df.iloc[:,0:-1]# Features (all columns except last)
X

Unnamed: 0,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
5,23.75,103.40,741.6,0.1791,0.52490,0.5355,0.17410,0.3985,0.12440
7,28.14,110.60,897.0,0.1654,0.36820,0.2678,0.15560,0.3196,0.11510
9,40.68,97.65,711.4,0.1853,1.05800,1.1050,0.22100,0.4366,0.20750
17,31.48,136.80,1315.0,0.1789,0.42330,0.4784,0.20730,0.3706,0.11420
18,30.88,186.80,2398.0,0.1512,0.31500,0.5372,0.23880,0.2768,0.07615
...,...,...,...,...,...,...,...,...,...
507,20.74,76.08,411.1,0.1662,0.20310,0.1256,0.09514,0.2780,0.11680
517,25.23,160.50,1646.0,0.1417,0.33090,0.4185,0.16130,0.2549,0.09136
528,15.38,94.52,653.3,0.1394,0.13640,0.1559,0.10150,0.2160,0.07253
548,25.59,69.10,364.2,0.1199,0.09546,0.0935,0.03846,0.2552,0.07920


In [4]:
y=df["target"]# Target variable (last column)
y

5      0
7      0
9      0
17     0
18     0
      ..
507    1
517    0
528    1
548    1
564    0
Name: target, Length: 96, dtype: int64

In [5]:
X_scaled=StandardScaler().fit_transform(X) #Normalizes features to have mean=0 and standard deviation=1.
X_scaled

array([[-3.05034029e-01, -2.02384749e-01, -3.22364633e-01,
         2.20913302e+00,  1.59793897e+00,  1.23949554e+00,
         8.75488391e-01,  1.55847947e+00,  2.00473023e+00],
       [ 4.01342922e-01, -2.52999008e-03, -7.27065039e-02,
         1.53237320e+00,  6.51568391e-01, -2.58626085e-02,
         5.92251199e-01,  4.03051354e-01,  1.53166756e+00],
       [ 2.41910305e+00, -3.61990980e-01, -3.70882494e-01,
         2.51540389e+00,  4.81753153e+00,  3.93139493e+00,
         1.59353295e+00,  2.11642385e+00,  6.23177409e+00],
       [ 9.38768666e-01,  7.24719272e-01,  5.98832093e-01,
         2.19925332e+00,  9.84338134e-01,  9.69596587e-01,
         1.38378432e+00,  1.14990603e+00,  1.48588730e+00],
       [ 8.42225119e-01,  2.11259954e+00,  2.33872755e+00,
         8.30914106e-01,  3.30273467e-01,  1.24753106e+00,
         1.86605305e+00, -2.23720807e-01, -4.49600288e-01],
       [-1.02750157e+00, -3.05087889e-01, -3.71203804e-01,
         4.75244709e-01, -5.01348922e-01, -1.619937

In [6]:
y.value_counts() #Shows the count of samples in each class (likely imbalanced).
y

5      0
7      0
9      0
17     0
18     0
      ..
507    1
517    0
528    1
548    1
564    0
Name: target, Length: 96, dtype: int64

# SMOTE with numeric features

In [7]:
resampler = SMOTENC(categorical_features=[1, 2], random_state=0)# Define SMOTENC (assuming columns 1 & 2 are categorical)

In [8]:
X_res,y_res=resampler.fit_resample(X_scaled,y)

In [9]:
y_res.value_counts()

target
0    61
1    61
Name: count, dtype: int64

# SMOTE with categorical feature

In [10]:
df=pd.read_csv("sample_dataset.csv").loc[:,['mean radius','mean texture','mean perimeter','area error','target']].dropna()

In [11]:
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'area error',
       'target'],
      dtype='object')

In [12]:
df

Unnamed: 0,mean radius,mean texture,mean perimeter,area error,target
1,20.57,17.77,132.90,A,0
2,19.69,21.25,130.00,A,0
3,11.42,20.38,77.58,A,0
7,13.71,20.83,90.20,A,0
8,13.00,21.82,87.50,A,0
...,...,...,...,...,...
561,11.20,29.37,70.67,A,1
563,20.92,25.09,143.00,A,0
564,21.56,22.39,142.00,A,0
566,16.60,28.08,108.30,A,0


In [13]:
X=df.iloc[:,0:-1]# Selected features
y=df['target'] # Target


In [14]:
resampler=SMOTENC(random_state=0,categorical_features=[3],k_neighbors=10)

# categorical_features=[3]: 

# Specifies that the 4th column (area error) is categorical (index starts at 0).
# SMOTENC will handle it differently from continuous features.


#k_neighbors=10:

#Uses 10 nearest neighbors (default=5) to generate synthetic samples.
#Higher k_neighbors = more diverse but potentially noisier samples.


#y_res.value_counts() shows balanced classes.

In [15]:
X_res,y_res=resampler.fit_resample(X,y)

In [16]:
y_res.value_counts()

target
0    211
1    211
Name: count, dtype: int64

# SMOTENC vs SMOTE:

SMOTE: Only works with continuous features.

SMOTENC: Handles mixed data (continuous + categorical).


## When to Use categorical_features:

If any feature is non-numeric (e.g., categories like "High/Low").

If a numeric feature should be treated as discrete (e.g., area error in this case).

## Avoid Data Leakage:

Always apply SMOTENC only on the training set, not validation/test data.

## Example Output (Before & After SMOTENC)


Class	Before SMOTENC	After SMOTENC
0	    900	900
1	    100	900

Now both classes have equal samples, improving model fairness