In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('ML_Project_data.csv')
data_model = data.drop(['label'], axis=1)
labels = data['label']

In [3]:
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit(data_model)
data_model_scaled = scaler.transform(data_model)

In [4]:
features = list(data_model.columns)
selectModel = SelectKBest(chi2, k=50)
selectModel.fit(data_model_scaled, labels)
selected_features = selectModel.get_feature_names_out(features)
selected_features

array(['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10',
       'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19',
       'f20', 'f23', 'f24', 'f25', 'f26', 'f27', 'f28', 'f29', 'f30',
       'f31', 'f33', 'f34', 'f37', 'f40', 'f41', 'f42', 'f44', 'f46',
       'f50', 'f52', 'f56', 'f60', 'f61', 'f62', 'f63', 'f64', 'f65',
       'f66', 'f67', 'f68'], dtype=object)

In [5]:
deleted_cols = dict()
selected_cols = []
for i in range(len(selected_features)):
    col = int(selected_features[i][1:])
    deleted_cols[col] = 0

for i in range(len(data_model.columns)):
    if i not in deleted_cols:
        selected_cols.append(i)
    

data_model_scaled = np.delete(data_model_scaled, selected_cols, 1)
data_model_scaled.shape

(12240, 50)

In [6]:
trainData, testData, trainLabels, testLabels = train_test_split(data_model_scaled, labels, test_size=0.1,  random_state=42)                                                    
# trainData, validData, trainLabels, validLabels = train_test_split(trainData, trainLabels, test_size=0.15,  random_state=42)

In [7]:
from sklearn.cluster import KMeans

k_means = KMeans(n_clusters=5, random_state=42, n_init=10, algorithm='elkan')
k_means.fit(trainData, trainLabels)

predicted_trainLabels = k_means.predict(trainData)
print(accuracy_score(np.array(trainLabels), predicted_trainLabels))

predicted_testLabels = k_means.predict(testData)
print(accuracy_score(np.array(testLabels), predicted_testLabels))

0.27723311546840956
0.2867647058823529


In [8]:
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(eps=0.3, min_samples=50, metric='manhattan')
predicted_testLabels = dbscan.fit_predict(trainData, trainLabels)

# accuracy_score(np.array(trainLabels), predicted_testLabels)

0.0

In [9]:
from sklearn.cluster import MiniBatchKMeans

MB_k_means = MiniBatchKMeans(n_clusters=5, random_state=42, max_iter=400, batch_size=1024)
MB_k_means.fit(trainData, trainLabels)

predicted_trainLabels = MB_k_means.predict(trainData)
print(accuracy_score(np.array(trainLabels), predicted_trainLabels))

predicted_testLabels = MB_k_means.predict(testData)
print(accuracy_score(np.array(testLabels), predicted_testLabels))

0.2648874364560639
0.2769607843137255


In [10]:
from sklearn.cluster import Birch

brc = Birch(n_clusters=5, threshold=0.2, branching_factor=10)
brc.fit(trainData, trainLabels)

predicted_trainLabels = brc.predict(trainData)
print(accuracy_score(np.array(trainLabels), predicted_trainLabels))

predicted_testLabels = brc.predict(testData)
print(accuracy_score(np.array(testLabels), predicted_testLabels))

0.24355482933914308
0.24918300653594772


In [11]:
from sklearn.cluster import SpectralCoclustering

spc = SpectralCoclustering(n_clusters=2, random_state=42)
spc.fit(trainData, trainLabels)

predicted_trainLabels = spc.row_labels_ 
print(accuracy_score(np.array(trainLabels), predicted_trainLabels))

# predicted_testLabels = spc.predict(testData)
# print(accuracy_score(np.array(testLabels), predicted_testLabels))

0.22785039941902688


In [12]:
from sklearn.cluster import AffinityPropagation

aff = AffinityPropagation()
aff.fit(trainData, trainLabels)
predicted_trainLabels = aff.labels_ 
print(accuracy_score(np.array(trainLabels), predicted_trainLabels))

0.0011801016702977488
