# 2440023002 - Andreas Christianto

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import sklearn.cluster as cluster
import sklearn.metrics as metrics

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report

In [2]:
url = 'https://raw.githubusercontent.com/AndreChristianto/csv-UAS-ML/main/experiment.csv'
df = pd.read_csv(url)

In [3]:
df.isna().any()

F1       False
F2       False
F3       False
F4       False
F5       False
         ...  
F197     False
F198     False
F199     False
F200     False
Class    False
Length: 201, dtype: bool

In [4]:
df = df.dropna()

## Visualization

In [None]:
print(df.head)

plt.figure(figsize=(25, 8))

df.hist()
plt.subplots_adjust(hspace = 1, wspace = 1)
plt.rcParams["figure.figsize"] = [32, 18]
plt.show()

<bound method NDFrame.head of             F1        F2        F3        F4   F5        F6        F7  \
0     0.006711  0.000000  0.013423  0.006711  0.0  0.006711  0.006711   
1     0.000000  0.000000  0.000000  0.007246  0.0  0.000000  0.000000   
2     0.011696  0.000000  0.005848  0.000000  0.0  0.005848  0.000000   
3     0.000000  0.000000  0.020833  0.000000  0.0  0.000000  0.010417   
4     0.000000  0.000000  0.034483  0.000000  0.0  0.000000  0.000000   
...        ...       ...       ...       ...  ...       ...       ...   
4075  0.000000  0.016667  0.000000  0.000000  0.0  0.000000  0.000000   
4076  0.000000  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
4077  0.014925  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   
4078  0.000000  0.017544  0.017544  0.000000  0.0  0.000000  0.000000   
4079  0.017241  0.000000  0.000000  0.000000  0.0  0.000000  0.000000   

            F8        F9       F10  ...      F192      F193      F194  \
0     0.020134  0.02

<Figure size 1800x576 with 0 Axes>

Bisa dilihat tabel-tabel di atas bahwa didapatkan banyak sekali kolom data (atribut). Bisa ditarik sebuah kesimpulan bahwa akan ada kesulitan dalam memproses data sebanyak itu.

## Pre-Processing

### Outlier Detection

Dilakukan karena ada banyak sekali data, supaya cepat diproses bisa menggunakan outlier detection.

In [None]:
z_scores = stats.zscore(df)
z_scores

In [None]:
abs_z_scores = np.abs(z_scores)
filtered_entries = (abs_z_scores < 3).all(axis=1)
df_wo_outliers = df[filtered_entries]

In [None]:
df_wo_outliers.shape

In [None]:
df_wo_outliers

### Feature Selection dengan Univariate

In [None]:
X = df_wo_outliers.drop(['Class'], axis=1)
y = df_wo_outliers['Class']

uni = SelectKBest(score_func = f_classif, k = 10)
fit = uni.fit(X, y)

In [None]:
X.columns[fit.get_support(indices=True)].tolist()

In [None]:
df_wo_outliers = df_wo_outliers[['F47', 'F72', 'F80', 'F92', 'F102', 'F110', 'F132', 'F145', 'F163', 'F186', 'Class']]

### Feature Selection dengan HeatMap

In [None]:
plt.figure(figsize=(10,10))
sns.heatmap(df_wo_outliers.corr(), annot=True,cmap='Reds')
plt.title("Correlation Matrix")

Bisa dilihat di heatmap di atas bahwa urutan korelasi masing-masing kolom dengan Class adalah seperti berikut :
*   F92 -(0.37)
*   F102 -(0.36)
*   F132 (0.29) & F163 -(0.29)
*   F110 (0.28)
*   F47 (0.25) & F186 (0.25)
*   F72 (0.18) & F145 (0.18)
*   F80 -(0.065)

Setelah dicoba-coba dengan metode KNN di bawah, didapatkan bahwa akurasi tertinggi terjadi saat 5 yang terendah dihapus.

In [None]:
new_X = df_wo_outliers.drop(['F47', 'F72', 'F80', 'F145', 'F186', 'Class'], axis=1)
new_y = df_wo_outliers['Class']

In [None]:
print(new_X.shape)
print(new_y.shape)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(new_X, new_y, test_size=0.2, random_state=0)

In [None]:
print(X_train.shape)
print(X_test.shape)

## Machine Learning Method

### K-Nearest Neighbor

In [None]:
for i in range(2,13):
    labels=cluster.KMeans(n_clusters=i,init="k-means++", random_state=200).fit(df_wo_outliers).labels_
    print ("Silhouette score for k(clusters) = "+str(i)+" is "
           +str(metrics.silhouette_score(df_wo_outliers, labels,metric="euclidean", sample_size=1000, random_state=200)))

k optimal adalah 5

In [None]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test, y_pred))

### SVM with Linear Kernel

In [None]:
svc_pipe = Pipeline([('scaler', StandardScaler()), ('svc', SVC())])

In [None]:
svc_lin_params = {'svc__C': [0.01, 0.1, 1, 10], 'svc__kernel': ['linear']}

In [None]:
svc_lin = GridSearchCV(svc_pipe, svc_lin_params, n_jobs=-1)

In [None]:
svc_lin.fit(X_train, y_train)

In [None]:
svc_lin.best_params_

In [None]:
lin_pred = svc_lin.predict(X_test)

In [None]:
print(classification_report(y_test, lin_pred))

## Evaluation & Comparison

### Comparison

In [None]:
print(classification_report(y_test, y_pred))

In [None]:
print(classification_report(y_test, lin_pred))

### Evaluation / Conclusion

Dari kedua hasil di atas, bisa disimpulkan bahwa akurasi menggunakan KNN lebih baik dengan skor 0.64 dibandingkan Support Vector Machine (SVM) dengan kernel Linear yang mempunyai skor 0.59.