In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the pickled feature sets (indexed and iterated as a list further down).
# NOTE(review): pickle.load can execute arbitrary code -- only open this file
# if it comes from a trusted source.
with open('../data/features_list.bin', 'rb') as f:
    # The with-statement closes the file on exit; no explicit close() is needed.
    features_list = pickle.load(f)

FileNotFoundError: [Errno 2] No such file or directory: '../data/features_list.bin'

In [None]:
# Single StandardScaler instance; the scaling loop below re-fits it on every
# feature set via fit_transform.
scaler = StandardScaler()

In [None]:
# Standardize each feature set on its own: the shared scaler is re-fitted for
# every entry, and the results are collected in the same order as features_list.
scaled_features_list = [scaler.fit_transform(features) for features in features_list]

In [None]:
# Seed NumPy's global RNG so code that draws from it is repeatable on a fresh
# top-to-bottom run.
# NOTE(review): estimators created without an explicit random_state presumably
# fall back to this global state -- confirm.
np.random.seed(32)

In [None]:
# Cluster every scaled feature set with KMeans (k=3). Two parallel lists are
# built, index-aligned with features_list: the labelled frames and the fitted models.
features_scaled_list = []
kmeans_list = []
# Iterate over the data itself instead of a hard-coded range(5), so the cell
# neither raises IndexError on fewer sets nor silently skips additional ones.
for features, scaled_values in zip(features_list, scaled_features_list):
    # Rebuild a DataFrame around the scaled values, reusing the original
    # index and column labels.
    features_scaled = pd.DataFrame(data=scaled_values,
                                   index=features.index,
                                   columns=features.columns)
    kmeans = KMeans(n_clusters=3, random_state=12)
    kmeans.fit(features_scaled)
    # The label column is added only after fitting, so it is never used as a
    # clustering feature.
    features_scaled['label'] = kmeans.labels_
    features_scaled_list.append(features_scaled)
    kmeans_list.append(kmeans)

In [None]:
def cluster_centers_visualize(idx):
    """Plot the KMeans cluster centers of one feature set as stacked bar charts.

    One subplot is drawn per cluster; each bar is the center's value (in the
    standardized feature space the model was fitted on) for one feature column.

    Parameters
    ----------
    idx : int
        Position of the feature set in ``features_list`` / ``kmeans_list``.
    """
    model = kmeans_list[idx]
    columns = features_list[idx].columns
    positions = np.arange(columns.size)

    # Size the grid from the fitted model rather than a hard-coded 3 so it always
    # matches the number of centers; squeeze=False keeps `axes` an array even
    # when there is a single cluster.
    fig, axes = plt.subplots(model.n_clusters, 1, figsize=(12, 6),
                             sharex=True, sharey=True, squeeze=False)

    # No figure-level plt.title() here: it would target the last subplot and be
    # overwritten by that subplot's own 'Cluster i' title.
    for i, (ax, center) in enumerate(zip(axes.ravel(), model.cluster_centers_)):
        ax.set_title(f'Cluster {i}')
        # seaborn >= 0.12 no longer accepts x/y positionally; pass them by keyword.
        sns.barplot(x=positions, y=center, ax=ax)
        ax.set_xticks(positions)
        ax.set_xticklabels(columns, rotation=90)

    # Lay out only after the rotated tick labels exist so they are accounted
    # for, then apply the fixed vertical spacing between subplots.
    fig.tight_layout()
    fig.subplots_adjust(hspace=0.4)

In [None]:
# Cluster sizes for feature set 3: number of samples assigned to each KMeans label.
features_scaled_list[3]['label'].value_counts()

In [None]:
# Bar charts of the cluster centers for feature set 1.
cluster_centers_visualize(1)

In [None]:
# Bar charts of the cluster centers for feature set 2.
cluster_centers_visualize(2)

In [None]:
# Bar charts of the cluster centers for feature set 3.
cluster_centers_visualize(3)

In [None]:
# Bar charts of the cluster centers for feature set 4.
cluster_centers_visualize(4)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

In [None]:
# Supervised task on feature set 0: predict the KMeans cluster label from the
# scaled features. Columns are selected by name rather than by position, so the
# split stays correct even if 'label' is not the last column.
X = features_scaled_list[0].drop(columns='label')
y = features_scaled_list[0]['label']

In [None]:
# Hold out 30% of the samples, stratified on the label so class proportions are
# preserved in both splits; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=10,
)

In [None]:
# Shallow decision tree (depth 2, at least 2 samples per leaf) fitted on the
# training split.
# random_state is pinned so re-running this cell yields the same tree; without
# it the fit draws from NumPy's global RNG, whose state depends on which cells
# ran before this one.
dt_clf = DecisionTreeClassifier(max_depth=2, min_samples_leaf=2, random_state=32)
dt_clf.fit(X_train, y_train)

In [None]:
# Decision-tree predictions for the held-out split.
pred = dt_clf.predict(X_test)

In [None]:
# Accuracy of the decision tree on the held-out split.
accuracy_score(y_test, pred)

In [None]:
# Confusion matrix of true vs. predicted labels for the decision tree.
confusion_matrix(y_test, pred)

In [None]:
# Random forest of 200 depth-2 trees fitted on the training split.
# random_state is pinned because bootstrap sampling and feature sub-sampling are
# random: without it, results change every time the cell is re-run and depend on
# the global RNG state left behind by earlier cells.
rf_clf = RandomForestClassifier(n_estimators=200, max_depth=2, random_state=32)
rf_clf.fit(X_train, y_train)

In [None]:
# Random-forest predictions for the held-out split (overwrites the previous `pred`).
pred = rf_clf.predict(X_test)

In [None]:
# Accuracy of the random forest on the held-out split.
accuracy_score(y_test, pred)

In [None]:
# Confusion matrix of true vs. predicted labels for the random forest.
confusion_matrix(y_test, pred)

In [None]:
# Linear SVM with hinge loss and C=1, fitted on the training split.
svm_clf = LinearSVC(C=1, loss='hinge')
svm_clf.fit(X_train, y_train)

In [None]:
# Linear-SVM predictions for the held-out split (overwrites the previous `pred`).
pred = svm_clf.predict(X_test)

In [None]:
# Score the linear SVM on the held-out split. Precision and recall are averaged
# over classes weighted by support. The printed labels are Korean for
# accuracy / precision / recall.
accuracy, precision, recall = (
    accuracy_score(y_test, pred),
    precision_score(y_test, pred, average='weighted'),
    recall_score(y_test, pred, average='weighted'),
)
print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}'.format(accuracy, precision, recall))

In [None]:
# Kernel SVM with a degree-3 polynomial kernel (coef0=1, C=1).
poly_svm_clf = SVC(C=1, coef0=1, kernel='poly', degree=3)

In [None]:
# Fit the polynomial-kernel SVM on the training split.
poly_svm_clf.fit(X_train, y_train)

In [None]:
# Polynomial-kernel SVM predictions for the held-out split (overwrites the previous `pred`).
pred = poly_svm_clf.predict(X_test)

In [None]:
# Score the polynomial-kernel SVM on the held-out split. Precision and recall
# are support-weighted averages over classes. The printed labels are Korean for
# accuracy / precision / recall.
accuracy, precision, recall = (
    accuracy_score(y_test, pred),
    precision_score(y_test, pred, average='weighted'),
    recall_score(y_test, pred, average='weighted'),
)
print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}'.format(accuracy, precision, recall))

In [None]:
# Logistic regression using the liblinear solver, fitted on the training split.
lr_clf = LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)

In [None]:
# Logistic-regression predictions for the held-out split (overwrites the previous `pred`).
pred = lr_clf.predict(X_test)

In [None]:
# Score the logistic regression on the held-out split. Precision and recall are
# support-weighted averages over classes. The printed labels are Korean for
# accuracy / precision / recall.
accuracy, precision, recall = (
    accuracy_score(y_test, pred),
    precision_score(y_test, pred, average='weighted'),
    recall_score(y_test, pred, average='weighted'),
)
print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}'.format(accuracy, precision, recall))