In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
import xlsxwriter
from itertools import combinations
import numpy as np
from sklearn.cluster import KMeans
from awesome_doanh_ml.helper import cal_num_combinations, flat_arr, cal_NC1_twoSet
from awesome_doanh_ml.utils.numpy_utils import TwoDIndexObject

In [2]:
# Download the Iris dataset (UCI ML repository id 53)
iris = fetch_ucirepo(id=53)

# Feature table and target table (as pandas dataframes)
X, y = iris.data.features, iris.data.targets
# Feature names as a NumPy array so they can be picked by lists of integer positions
X_columns = np.array(X.columns.tolist())

# Cluster counts to evaluate
cluster_nums = [2, 3, 4]

# Dataset-level metadata
print(iris.metadata)

# Per-variable information
print(iris.variables)

{'uci_id': 53, 'name': 'Iris', 'repository_url': 'https://archive.ics.uci.edu/dataset/53/iris', 'data_url': 'https://archive.ics.uci.edu/static/public/53/data.csv', 'abstract': 'A small classic dataset from Fisher, 1936. One of the earliest known datasets used for evaluating classification methods.\n', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 150, 'num_features': 4, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1936, 'last_updated': 'Tue Sep 12 2023', 'dataset_doi': '10.24432/C56C76', 'creators': ['R. A. Fisher'], 'intro_paper': {'ID': 191, 'type': 'NATIVE', 'title': 'The Iris data set: In search of the source of virginica', 'authors': 'A. Unwin, K. Kleinman', 'venue': 'Significance, 2021', 'year': 2021, 'journal': 'Significance, 2021', 'DOI': '1740-9713.01589', 'URL': 'https://www.semanticscholar.org

In [None]:
# Two-cluster k-means on all features, seeded so the labels are reproducible
kmeans = KMeans(n_clusters=2, random_state=0, n_init="auto")
kmeans.fit(X)

# Cluster label assigned to each sample
predictions = kmeans.labels_

# Displayed as the cell output: one centroid per row
kmeans.cluster_centers_

In [49]:
# Split the samples by assigned cluster: cluster_dfs[k] holds the rows of X whose
# k-means label is k (all feature columns, original index labels kept).
# A boolean mask selects each group in one step. This replaces growing the frames
# one row at a time with pd.concat, and removes the transposed empty placeholder
# frames, which could leave all-NaN rows labelled 'sepal length' / 'sepal width'
# in every cluster other than the one the first sample belongs to.
cluster_dfs = [X.loc[predictions == label] for label in range(kmeans.n_clusters)]

In [None]:
# Pick two individual samples (index labels 0 and 5) ...
a, b = X.loc[0], X.loc[5]
# ... and stack them into a two-row frame, one row per sample
c = pd.concat([sample.to_frame().T for sample in (a, b)])

In [None]:
# Sepal length / sepal width arrays of the first cluster.
# NOTE: this rebinds `y`, which previously held the iris target table.
x, y = (cluster_dfs[0][column].values for column in ('sepal length', 'sepal width'))

In [None]:
# Scatter sepal length against sepal width, one series (and legend entry) per cluster.
# The values are passed straight to scatter so the module-level `x` / `y` names
# (and in particular the target table `y`) are not overwritten here.
for inx, cluster_df in enumerate(cluster_dfs):
    plt.scatter(
        cluster_df['sepal length'].values,
        cluster_df['sepal width'].values,
        label=f'cluster {inx}',
    )

# Axis labels and legend are identical for every cluster, so set them once
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend()

In [None]:
# Silhouette score first, Davies-Bouldin score second, one per line
print(
    silhouette_score(X, predictions),
    davies_bouldin_score(X, predictions),
    sep="\n",
)

In [None]:
class ModelMetrics:
    """Plain container for two families of evaluation scores and their summaries.

    The `db_*` / `si_*` prefixes presumably stand for Davies-Bouldin and
    silhouette, the two metrics imported in this notebook -- TODO confirm.

    Attributes (stored exactly as passed in, no copying or validation):
        db_scores, si_scores: the individual scores.
        db_mean, si_mean: mean of the corresponding scores.
        db_dev, si_dev: deviation of the corresponding scores.
    """

    def __init__(self, db_scores, si_scores, db_mean, si_mean, db_dev, si_dev):
        # Individual scores
        self.db_scores, self.si_scores = db_scores, si_scores
        # Summary statistics
        self.db_mean, self.si_mean = db_mean, si_mean
        self.db_dev, self.si_dev = db_dev, si_dev

In [None]:
def _write_score_table(worksheet, start_row, score_header, scores, mean, dev):
    """Write one per-fold score table and return the first row index after it.

    Layout, all starting at `start_row`: a header row ("Fold number" plus
    `score_header`), one row per score (fold index, score), then one row for
    the average and one for the deviation.
    """
    worksheet.write(start_row, 0, "Fold number")
    worksheet.write(start_row, 1, score_header)

    row = start_row + 1
    for fold, score in enumerate(scores):
        # Values are written as text, not as numeric cells
        worksheet.write(row, 0, f"{fold}")
        worksheet.write(row, 1, f"{score}")
        row += 1

    worksheet.write(row, 0, f"Average value: {mean}")
    worksheet.write(row + 1, 0, f"Standard Derivation: {dev}")
    return row + 2


def save_evalMetrics_to_excel(self):
    """Write accuracy and F1 results to "Result.xlsx", one worksheet per model.

    Reads `self.model_metrics_s`; each item must expose `model.name` (used as
    the worksheet name), `acc_scores`/`acc_mean`/`acc_dev` and
    `f1_scores`/`f1_mean`/`f1_dev`. Each worksheet holds the accuracy table
    followed, after three blank rows, by the F1 table.

    NOTE(review): the `ModelMetrics` class in this notebook stores `db_*` and
    `si_*` attributes, so its instances cannot be passed here as-is.

    The workbook is opened as a context manager so it is closed even when
    writing fails part-way (e.g. a metrics object lacks an attribute).

    Returns:
        None
    """
    with xlsxwriter.Workbook("Result.xlsx") as workbook:
        for model_metrics in self.model_metrics_s:
            worksheet_fold = workbook.add_worksheet(model_metrics.model.name)

            next_row = _write_score_table(
                worksheet_fold,
                0,
                "Accuracy score",
                model_metrics.acc_scores,
                model_metrics.acc_mean,
                model_metrics.acc_dev,
            )

            # Three blank rows separate the two tables
            _write_score_table(
                worksheet_fold,
                next_row + 3,
                "F1 Score",
                model_metrics.f1_scores,
                model_metrics.f1_mean,
                model_metrics.f1_dev,
            )

    return

In [4]:
# Prepare the feature-column combinations from the number of feature columns;
# each entry is used further down as a group of positions into X_columns
column_combination_indexes = cal_num_combinations(X.shape[1])

In [None]:
# Fixed cluster count (k=3), one k-means run per combination of feature columns.
# Iterates `column_combination_indexes` directly: it is the name this notebook
# defines, and its entries are already usable as position groups into X_columns.
for indexes in column_combination_indexes:
    # Seeded so repeated runs give the same labels and scores
    kmeans = KMeans(n_clusters=3, random_state=0, n_init="auto")
    column_names = list(X_columns[list(indexes)])
    data_prepared = X[column_names]
    print(f'Columns; {data_prepared.columns.values}')
    # Fit and score on the selected columns only; using the full X here would
    # make every iteration report the same all-feature clustering
    kmeans.fit(data_prepared)
    predictions = kmeans.labels_
    print(silhouette_score(data_prepared, predictions))
    print(davies_bouldin_score(data_prepared, predictions))

In [5]:
# Result table addressed by (cluster count, feature combination), holding the
# silhouette and Davies-Bouldin score of each run
kmean_result = TwoDIndexObject(cluster_nums, column_combination_indexes, columns = ['sil_score', 'db_score'])
# NOTE(review): sort_index() returns a new frame and the result is discarded here,
# so this line does not reorder twod_df -- confirm whether an in-place sort was intended
kmean_result.twod_df.sort_index()

# NOTE(review): these two writes look like scratch checks of the assignment API;
# the sweep below assigns the (2, (0, 1)) entry again -- confirm they can be dropped
kmean_result.assign_value(2, (0, 1), 10)
kmean_result.twod_df.loc[pd.IndexSlice[2, (0,1)], :] = 4

# Sweep every cluster count against every feature combination
for cluster_num in cluster_nums:
    for column_combination_index in column_combination_indexes:
        # Seeded so repeated runs give the same labels and scores
        kmeans = KMeans(n_clusters=cluster_num, random_state=0, n_init="auto")
        column_names = list(X_columns[list(column_combination_index)])
        data_prepared = X[column_names]
        print(f'Columns; {data_prepared.columns.values}')
        # Fit and score on the selected columns only; fitting on the full X would
        # store the same all-feature scores under every combination
        kmeans.fit(data_prepared)
        predictions = kmeans.labels_
        sil_score = silhouette_score(data_prepared, predictions)
        db_score = davies_bouldin_score(data_prepared, predictions)
        kmean_result.assign_value(cluster_num, column_combination_index, [sil_score, db_score])

Columns; ['sepal length']
Columns; ['sepal width']
Columns; ['petal length']
Columns; ['petal width']
Columns; ['sepal length' 'sepal width']
Columns; ['sepal length' 'petal length']
Columns; ['sepal length' 'petal width']
Columns; ['sepal width' 'petal length']
Columns; ['sepal width' 'petal width']
Columns; ['petal length' 'petal width']
Columns; ['sepal length' 'sepal width' 'petal length']
Columns; ['sepal length' 'sepal width' 'petal width']
Columns; ['sepal length' 'petal length' 'petal width']
Columns; ['sepal width' 'petal length' 'petal width']
Columns; ['sepal length']
Columns; ['sepal width']
Columns; ['petal length']
Columns; ['petal width']
Columns; ['sepal length' 'sepal width']
Columns; ['sepal length' 'petal length']
Columns; ['sepal length' 'petal width']
Columns; ['sepal width' 'petal length']
Columns; ['sepal width' 'petal width']
Columns; ['petal length' 'petal width']
Columns; ['sepal length' 'sepal width' 'petal length']
Columns; ['sepal length' 'sepal width' 'pet

In [None]:
# importing libraries
import matplotlib.pyplot as plt
import numpy as np
import math
import seaborn as sns

# Sample points over one full period. A dedicated name is used here: `X` holds the
# iris feature table, and overwriting it would break any re-run of the clustering cells.
angles = np.arange(0, math.pi*2, 0.05)

# Using built-in trigonometric functions we can directly compute
# the curves for the given angles
Y1 = np.sin(angles)
Y2 = np.cos(angles)
Y3 = np.tan(angles)
Y4 = np.tanh(angles)

# (title, values) per panel, in row-major order of the 2x2 grid
panels = [
    ("Sine Function", Y1),
    ("Cosine Function", Y2),
    ("Tangent Function", Y3),
    ("Tanh Function", Y4),
]

# Initialise the subplot grid using number of rows and columns
figure, axis = plt.subplots(2, 2)

# Draw every panel the same way: curve, title, right-aligned unrotated x tick labels
for ax, (title, values) in zip(axis.flat, panels):
    ax.plot(angles, values, label='test label')
    ax.set_title(title)
    plt.setp(ax.get_xticklabels(), rotation=0, horizontalalignment='right')

# Only the sine panel carries a legend and a grid
axis[0, 0].legend(loc='upper right')
axis[0, 0].grid(True)

# Combine all the operations and display
figure.tight_layout()