In [31]:
import numpy as np
import pandas as pd

In [194]:
# Load the uci-secom CSV file into a DataFrame.
csv_path = '/content/uci-secom.csv'
df = pd.read_csv(csv_path)

In [195]:
# Preview the first two rows to see the column layout.
df.head(2)

Unnamed: 0,Time,0,1,2,3,4,5,6,7,8,...,581,582,583,584,585,586,587,588,589,Pass/Fail
0,2008-07-19 11:55:00,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,...,,0.5005,0.0118,0.0035,2.363,,,,,-1
1,2008-07-19 12:32:00,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,...,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045,-1


In [196]:
# Number of rows and columns in the frame as loaded.
df.shape

(1567, 592)

In [197]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

np.int64(0)

In [198]:
# Remove the Time column from the frame.
df = df.drop(columns=['Time'])

In [199]:
# Missing-value count per column.
df.isna().sum()

Unnamed: 0,0
0,6
1,7
2,14
3,14
4,14
...,...
586,1
587,1
588,1
589,1


In [200]:
# Per-column mean of the numeric columns; these are the values used for imputation in the next cell.
df.mean(numeric_only=True)

Unnamed: 0,0
0,3014.452896
1,2495.850231
2,2200.547318
3,1396.376627
4,4.197013
...,...
586,0.021458
587,0.016475
588,0.005283
589,99.670066


In [201]:
# Impute every missing value with the mean of its own column.
# NOTE(review): the means are computed over all rows, before the later
# train/test split, so imputed values are influenced by rows that end up
# in the test set.
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)


In [202]:
# Re-count missing values per column after the mean fill.
df.isna().sum()

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
586,0
587,0
588,0
589,0


In [203]:
# Dimensions after dropping the Time column and filling missing values.
df.shape

(1567, 591)

In [204]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import VarianceThreshold

In [205]:
# Feature matrix: every column except the last one.
x = df.iloc[:, :-1]

In [206]:
# Target vector: the last column of the frame.
y = df.iloc[:,-1]

In [207]:
# Number of rows per target class.
y.value_counts()

Unnamed: 0_level_0,count
Pass/Fail,Unnamed: 1_level_1
-1,1463
1,104


In [208]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [209]:
# Learn the per-feature mean and standard deviation from the training rows only,
# then apply that same scaling to both splits.
scaler = StandardScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

# Check accuracy without feature selection

### 1. KNN without oversampling

In [210]:
# Baseline: 5-nearest-neighbours classifier trained on the training split as-is.
# fit() returns the fitted estimator, so construction and training are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)

y_pred = knn.predict(x_test)

In [211]:
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(y_test,y_pred)

0.9394904458598726

In [212]:
# Confusion matrix: rows are true classes, columns are predicted classes.
confusion_matrix(y_test,y_pred)

array([[293,   1],
       [ 18,   2]])

In [213]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left untouched.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(x_train, y_train)
x_train_resample, y_train_resample = resampled



In [214]:
# Class counts in the training labels after oversampling.
y_train_resample.value_counts()

Unnamed: 0_level_0,count
Pass/Fail,Unnamed: 1_level_1
-1,1169
1,1169


### Applying KNN after resampling

In [218]:

# 2-nearest-neighbours classifier trained on the oversampled training data,
# evaluated on the untouched test split.
knn = KNeighborsClassifier(n_neighbors=2).fit(x_train_resample, y_train_resample)
y_pred = knn.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_mat)


Accuracy: 0.6178343949044586
Confusion Matrix:
 [[181 113]
 [  7  13]]


# Goal: reach the same accuracy with fewer columns

In [219]:
# Dimensions of the full frame (features plus target) before feature selection.
df.shape

(1567, 591)

### 1. Variance Threshold

In [220]:

# Keep only the features whose variance exceeds 0.05.
# NOTE(review): x holds the unscaled features here, so the effect of the
# threshold depends on each column's units.
var = VarianceThreshold(threshold=0.05)
var.fit(x)

x_select = var.transform(x)


In [221]:
# Number of features that passed the variance filter.
var.get_support().sum()

np.int64(285)

In [222]:
# Names of the columns the variance filter kept.
columns = x.columns[var.get_support()]
columns

Index(['0', '1', '2', '3', '4', '6', '12', '14', '15', '16',
       ...
       '570', '571', '572', '573', '574', '576', '577', '581', '585', '589'],
      dtype='object', length=285)

In [223]:

# Rebuild a labelled DataFrame from the filtered values, using the kept column names.
x = pd.DataFrame(data=x_select, columns=columns)


In [225]:
# Dimensions of the feature table after the variance filter.
x.shape

(1567, 285)

# Correlation: check for multicollinearity

In [224]:
# Pearson correlation between every pair of remaining feature columns.
corr_matrix = x.corr(method='pearson')

In [226]:
# Inspect the pairwise correlation matrix.
corr_matrix

Unnamed: 0,0,1,2,3,4,6,12,14,15,16,...,570,571,572,573,574,576,577,581,585,589
0,1.000000,-0.143840,0.004756,-0.007613,-0.011014,0.002270,0.010368,-0.007058,0.030675,-0.005749,...,-0.018953,-0.023166,0.013678,-0.002067,0.015206,0.013228,0.008601,-0.017120,0.023589,0.004174
1,-0.143840,1.000000,0.005767,-0.007568,-0.001636,-0.025564,0.034062,-0.037667,-0.087315,-0.001878,...,-0.009000,-0.037932,0.001753,-0.011467,0.001303,0.002570,-0.010145,0.054006,0.002273,0.044797
2,0.004756,0.005767,1.000000,0.298935,0.095891,-0.136225,0.018326,0.006476,0.006115,-0.000788,...,-0.037070,-0.015600,-0.000518,-0.030674,0.001342,0.002592,-0.028705,-0.007054,0.015752,-0.032890
3,-0.007613,-0.007568,0.298935,1.000000,-0.058483,-0.685835,-0.028223,-0.019827,-0.013157,-0.004596,...,0.002231,-0.017820,0.007634,0.013163,0.006822,0.008216,0.016438,-0.071311,0.026019,-0.080341
4,-0.011014,-0.001636,0.095891,-0.058483,1.000000,-0.074368,-0.002707,-0.017523,0.011435,-0.001763,...,0.005273,-0.081983,-0.012024,-0.016533,-0.012264,-0.012163,-0.004070,0.088167,-0.001616,0.050910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
576,0.013228,0.002570,0.002592,0.008216,-0.012163,0.007409,0.035743,-0.000985,-0.023509,-0.014167,...,-0.360498,-0.136232,0.994772,0.790026,0.991738,1.000000,0.859278,-0.012419,-0.017147,-0.022567
577,0.008601,-0.010145,-0.028705,0.016438,-0.004070,-0.012342,0.031434,0.009505,-0.019152,-0.004396,...,-0.247655,-0.121115,0.863768,0.957874,0.851784,0.859278,1.000000,-0.009745,-0.023910,-0.024766
581,-0.017120,0.054006,-0.007054,-0.071311,0.088167,0.042446,-0.038481,0.045729,0.014444,-0.001357,...,0.011714,-0.001168,-0.012384,-0.010192,-0.011594,-0.012419,-0.009745,1.000000,-0.000471,0.585288
585,0.023589,0.002273,0.015752,0.026019,-0.001616,-0.039517,0.000523,0.002535,0.017745,0.002643,...,0.010143,0.006716,-0.017179,-0.022662,-0.016812,-0.017147,-0.023910,-0.000471,1.000000,-0.003800


In [227]:
# Column labels of the correlation matrix, used to address column pairs in the next cell.
columns = corr_matrix.columns

In [228]:
# Flag the later column of every pair of features that is highly correlated.
# A pair counts as collinear when the *absolute* correlation exceeds the
# threshold, so strongly negative pairs (r < -0.90) are caught as well as
# strongly positive ones.
CORR_THRESHOLD = 0.90

abs_corr = corr_matrix.abs().to_numpy()

# k=1 keeps the strict upper triangle only: each pair (i, j) with i < j is
# visited once and the diagonal (self-correlation of 1.0) is ignored.
# NaN entries compare as False, so they are never flagged.
high_pairs = np.triu(abs_corr > CORR_THRESHOLD, k=1)
_, later_idx = np.nonzero(high_pairs)

# One entry per offending pair, so a column may appear several times here.
columns_to_drop = [columns[j] for j in later_idx]

print(len(columns_to_drop))

216


In [229]:
# Collapse repeated entries: a column flagged by several pairs only needs to be dropped once.
columns_to_drop = set(columns_to_drop)

In [230]:
# Number of distinct columns flagged for removal.
len(columns_to_drop)

109

In [231]:
# Remove the flagged columns from the feature table.
x = x.drop(columns=list(columns_to_drop))

In [232]:
# Dimensions of the feature table after removing the flagged columns.
x.shape

(1567, 176)

### 3. ANOVA

In [233]:
from sklearn.feature_selection import SelectKBest, f_classif

# Score each remaining feature against the target with f_classif and keep the 150 best.
# NOTE(review): the selector is fitted on all rows and their labels, before
# the train/test split in a later cell.
sel = SelectKBest(score_func=f_classif, k=150)
sel.fit(x, y)

In [234]:
# Names of the features the selector kept.
columns = x.columns[sel.get_support()]

In [235]:
# Keep only the selected features (the column labels are re-attached in a later cell).
x = sel.transform(x)

In [236]:

# Dimensions after the SelectKBest step.
x.shape

(1567, 150)

In [237]:
# Wrap the selected values in a DataFrame again, labelled with the kept column names.
x = pd.DataFrame(data=x, columns=columns)

In [238]:
# Display the reduced feature table.
x


Unnamed: 0,0,3,4,6,14,18,21,22,23,24,...,551,555,561,562,564,569,571,572,573,581
0,3030.93,1411.1265,1.3602,97.6133,7.955800,192.396300,-5419.00,2916.50,-4043.75,751.00,...,0.78,39.8842,42.3877,262.729683,6.444985,21.117674,2.1113,8.95,0.3157,97.934373
1,3095.78,1463.6606,0.8294,102.3433,10.154800,191.287200,-5441.50,2604.25,-3498.75,-1640.25,...,1.33,53.1836,18.1087,262.729683,6.444985,21.117674,2.4335,5.92,0.2653,208.204500
2,2932.61,1698.0172,1.5102,95.4878,9.515700,192.703500,-5447.75,2701.75,-4047.00,-1916.50,...,0.85,23.0713,24.7524,267.064000,1.100000,68.848900,2.0293,11.21,0.1882,82.860200
3,2988.72,909.7926,1.3204,104.2367,9.605200,192.155700,-5468.25,2648.25,-4515.00,-1657.25,...,39.33,161.4081,62.7572,268.228000,7.320000,25.036300,2.0253,9.33,0.1738,73.843200
4,3032.24,1326.5200,1.5334,100.3967,10.566100,191.603700,-5476.25,2635.25,-3987.50,117.00,...,1.98,70.9706,22.0500,262.729683,6.444985,21.117674,2.0275,8.83,0.2224,97.934373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,2899.41,3085.3781,1.4843,82.2467,11.769200,193.747000,-5418.75,2608.00,-6228.25,356.00,...,0.80,85.1806,32.3812,264.272000,4.980000,15.466200,2.0153,7.98,0.2363,203.172000
1563,3052.31,1124.6595,0.8763,98.4689,9.162000,193.788900,-6408.75,2277.50,-3675.50,339.00,...,1.33,27.0176,32.1048,266.832000,4.560000,20.911800,2.1814,5.48,0.3891,97.934373
1564,2978.81,1110.4967,0.8236,99.4122,9.005371,190.047354,-5153.25,2707.00,-4102.00,-1226.00,...,1.50,74.1541,13.0316,256.730000,11.090000,29.095400,2.3435,6.49,0.4154,43.523100
1565,2894.92,1183.7287,1.5726,98.7978,9.735400,187.381800,-5271.75,2676.50,-4001.50,394.75,...,1.33,27.0176,18.9966,264.272000,4.980000,15.466200,1.9098,9.13,0.3669,93.494100


In [239]:
# Split the reduced feature table with the same test_size and random_state as the earlier split.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [240]:
# Display the training rows of the reduced feature table.
x_train

Unnamed: 0,0,3,4,6,14,18,21,22,23,24,...,551,555,561,562,564,569,571,572,573,581
743,2996.90,1041.1557,0.8479,107.2622,8.4330,187.9085,-5685.00,2803.00,-3576.00,188.25,...,1.230712,57.746537,37.5266,261.994000,10.100000,12.248600,2.3488,9.4400,0.4671,97.934373
418,2946.70,1657.3518,1.6603,100.8022,6.0355,187.6157,-5532.25,2743.00,-5756.75,-6160.25,...,0.930000,67.738000,17.5662,265.268000,3.990000,25.367100,2.3705,8.5699,0.3475,97.934373
957,2999.04,1009.0221,1.3187,101.9178,9.9037,192.3538,-5501.00,2551.75,-4632.00,1713.00,...,0.870000,15.604000,70.6158,268.800000,8.450000,34.481200,2.2421,7.7000,0.5593,103.352000
564,3108.50,1312.3198,0.8286,100.3633,7.0014,191.4793,-5242.25,3108.00,-4059.00,115.00,...,1.230712,57.746537,20.7785,262.729683,6.444985,21.117674,1.9955,8.8800,0.2739,97.934373
696,3046.63,1663.7024,1.0203,100.4456,10.9501,192.5712,-6529.75,2916.25,-1767.25,1271.75,...,1.230712,57.746537,32.9353,262.729683,6.444985,21.117674,2.0007,8.0900,0.3016,97.934373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,2968.06,1632.2734,0.8396,98.1311,10.5176,190.6087,-6463.25,3096.25,-5185.75,1522.50,...,1.380000,81.980700,10.3089,249.514000,5.480000,16.676600,1.9110,8.6500,0.1435,70.558800
1558,3012.30,1032.2836,1.4802,101.3511,8.6079,187.5852,-6499.00,2241.75,-3660.25,176.50,...,1.330000,27.017600,57.0019,270.314000,6.860000,3.896700,2.1510,9.1600,0.5035,176.678300
493,3037.38,1792.7115,1.6513,100.7244,10.0095,190.1128,-5723.50,3112.25,-3240.00,461.25,...,1.230712,57.746537,23.0054,262.729683,6.444985,21.117674,1.8394,7.2700,0.1347,102.165200
527,2987.43,1180.2820,0.8465,100.7978,8.3687,191.0420,-6235.25,3027.75,-3967.75,1559.50,...,1.160000,165.651000,76.4208,252.686000,8.270000,21.055400,2.0118,7.8400,0.2580,81.947200


In [241]:
# Fit a fresh scaler on the training rows of the reduced feature set,
# then apply that same scaling to both splits.
scaler = StandardScaler()
scaler.fit(x_train)

x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [242]:
# Dimensions of the scaled training data.
x_train.shape

(1253, 150)

In [243]:

# Repeat the oversampling + 2-nearest-neighbours evaluation on the reduced feature set.
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(x_train, y_train)
x_train_resample, y_train_resample = resampled

knn = KNeighborsClassifier(n_neighbors=2).fit(x_train_resample, y_train_resample)
y_pred = knn.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
print("Accuracy:", accuracy)
print("Confusion Matrix:\n", conf_mat)


Accuracy: 0.6719745222929936
Confusion Matrix:
 [[201  93]
 [ 10  10]]


### 4. Chi-square