## Imports

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.preprocessing import MinMaxScaler
from memory_profiler import profile
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import random

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting memory_profiler
  Downloading memory_profiler-0.60.0.tar.gz (38 kB)
Building wheels for collected packages: memory-profiler
  Building wheel for memory-profiler (setup.py) ... [?25l[?25hdone
  Created wheel for memory-profiler: filename=memory_profiler-0.60.0-py3-none-any.whl size=31284 sha256=f8a08bf70e54acbc858910107ab9846bd27a7df2e89975880048c7527b21243d
  Stored in directory: /root/.cache/pip/wheels/67/2b/fb/326e30d638c538e69a5eb0aa47f4223d979f502bbdb403950f
Successfully built memory-profiler
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.60.0


## Defining global variables

In [None]:
# Tunable parameters shared by the cells below.
n_clusters = 5          # number of K-means clusters / per-cluster samples
sampling_ratio = 0.75   # target sample size as a fraction of each initial cluster
change_ratio = 0.9      # fraction of a sample's size; reaching that many replacements triggers a re-ranking
ratio = [0.1, 0.3, 0.5, 0.7, 0.9]   # candidate learning-base fractions; only ratio[0] is used here
N = int(ratio[0] * 22544)   # N is the number of initial rows to be clustered.


## Defining utility functions

In [None]:
# Function to create list of lists.

def init_list_of_objects(size):
    """Return a list holding `size` distinct empty lists.

    Each inner list is a separate object, so appending to one never
    affects the others.
    """
    return [[] for _ in range(0, size)]

# Function to create a list of dictionaries.

def init_list_of_dict(size):
    """Return a list holding `size` distinct empty dictionaries.

    Each dictionary is a separate object, so writing a key into one never
    affects the others.
    """
    return [{} for _ in range(0, size)]

# Defining our sampling probability. 

def probability(e, center):
    """Return the sampling probability (as an integer percentage) of element `e`.

    The value is the relative difference between the Euclidean norm of `e`
    and the norm of `center`, i.e. ``int(|1 - ||e|| / ||center||| * 100)``.
    It is not capped, so it can exceed 100 when ``||e||`` is more than twice
    ``||center||``.

    Parameters
    ----------
    e : array-like
        The element to score.
    center : array-like
        The centroid the element is compared against.

    Returns
    -------
    int
        The truncated percentage. When `center` has zero norm the ratio is
        undefined; 0 is returned if `e` also has zero norm (identical norms)
        and 100 otherwise, instead of failing on ``int(inf)`` / ``int(nan)``.
    """
    norm_e = np.linalg.norm(e)
    norm_center = np.linalg.norm(center)
    if norm_center == 0:
        # Dividing by a zero norm yields inf/NaN, which int() cannot convert.
        return 0 if norm_e == 0 else 100
    return int(abs(1 - norm_e / norm_center) * 100)

# Defining our error function.

def calculateErrorWithSample(e, sample, init_mean):
    """Return the error of `sample` against `init_mean` once `e` is added to it.

    Parameters
    ----------
    e : pandas.DataFrame or pandas.Series
        Row(s) to add to the sample. A Series is treated as a single row.
    sample : pandas.DataFrame
        The current sample; it is not modified.
    init_mean : pandas.Series
        Per-column reference mean.

    Returns
    -------
    float
        Same measure as `calculateErrorWithoutSample`, computed on the
        sample extended with `e`.
    """
    if isinstance(e, pd.Series):
        e = e.to_frame().T
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # supported equivalent and also returns a new frame.
    extended = pd.concat([sample, e], ignore_index=True)
    return calculateErrorWithoutSample(extended, init_mean)

def calculateErrorWithoutSample(sample, init_mean):
    """Return the mean relative deviation (in percent) of `sample` from `init_mean`.

    For every column the deviation ``(init_mean - sample_mean) / init_mean``
    is computed; the absolute value of the average of these deviations,
    times 100, is returned.

    Columns whose reference mean is 0 produce +inf or -inf; both are counted
    as 0. The original code only replaced +inf, so a single -inf made the
    whole error infinite. A 0/0 column yields NaN and is skipped by
    ``Series.mean``.
    """
    sample_mean = calculateMean(sample)
    deviation = init_mean - sample_mean
    result = deviation.div(init_mean).replace([np.inf, -np.inf], 0)
    return abs(result.mean()) * 100

# Calculating the mean of a given cluster.

def calculateMean(df):
    """Return the column-wise mean of `df`."""
    return df.mean(axis = 0)

# flatten a list of lists

def flatten(t):
    """Concatenate the items of every sub-iterable of `t` into one flat list.

    Only one level of nesting is removed.
    """
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

# Simple random sampling function.

def simpleRandomSampling(prob):
    """Return True with a probability of `prob` percent.

    A uniform integer is drawn from 1..100 and accepted when it does not
    exceed `prob`. The comparison is inclusive so that the acceptance rate is
    exactly ``prob / 100``: ``prob = 100`` always accepts and ``prob = 1``
    accepts once in a hundred draws (a strict ``<`` would give
    ``(prob - 1) / 100``). Values of `prob` at or below 0 never accept and
    values at or above 100 always accept.
    """
    draw = random.randint(1, 100)
    return bool(draw <= prob)

def calculateChi(observed, expected):
    """Return ``sum(sqrt((observed - expected)**2 / expected))``.

    Terms whose expected value is 0 contribute 0 instead of dividing by zero.
    Note that the square root is taken term by term, so this is not the
    textbook chi-square statistic, and a negative expected value yields NaN.

    Inputs are converted to float first: with integer inputs the output
    buffer built by ``np.zeros_like`` would be integer and ``np.divide``
    would refuse to write its float result into it.

    Parameters
    ----------
    observed, expected : scalar or array-like
        Broadcast-compatible observed and expected values.
    """
    expected_f = np.asarray(expected, dtype=float)
    squared_diff = np.square(np.subtract(observed, expected_f))
    quotient = np.divide(squared_diff, expected_f,
                         out=np.zeros_like(squared_diff),
                         where=expected_f != 0)
    return np.sum(np.sqrt(quotient))

def calculateOS(observed, expected):
    """Return ``sum(|observed - expected| / |expected|)``, the summed relative deviation.

    Terms whose expected value is 0 contribute 0 instead of dividing by zero.

    Inputs are converted to float first: with integer inputs the output
    buffer built by ``np.zeros_like`` would be integer and ``np.divide``
    would refuse to write its float result into it.

    Parameters
    ----------
    observed, expected : scalar or array-like
        Broadcast-compatible observed and expected values.
    """
    expected_f = np.asarray(expected, dtype=float)
    diff = np.subtract(observed, expected_f)
    quotient = np.divide(diff, expected_f,
                         out=np.zeros_like(diff),
                         where=expected_f != 0)
    return np.sum(np.abs(quotient))



## Reading Data

In [None]:
# Reading and adjusting the data through header renaming and feature selection

# Load the header-less, comma-separated file and name its 43 columns
# "feature 1" .. "feature 43".
data = pd.read_csv("KDDTrain.txt", sep=',', header=None)
data.columns = ["feature {}".format(position) for position in range(1, 44)]

# Feature selection: discard 30 columns by position, keeping 13.
# "feature 42" (position 41) is kept and ends up as the last column.
data = data.drop(
    columns=data.columns[[0, 1, 2, 3, 8, 9, 11, 12, 14, 15, 16, 17, 18,
                          19, 20, 21, 24, 25, 26, 28, 30, 31, 32, 33,
                          34, 36, 37, 39, 40, 42]]
)

## Calculating Expected Standard Deviation, Mean and Median

In [None]:
# Reference statistics of the full data set, one value per column,
# computed on every column except 'feature 42'.
feature_frame = data.loc[:, data.columns != 'feature 42']
expected_mean = np.array(feature_frame.mean())
expected_median = np.array(feature_frame.median())
expected_std = np.array(feature_frame.std())

472786.4310880747 10395.450230660043 54.0


## Data Normalization Using Min-Max

In [None]:
# Scale every column except 'feature 42' to the [0, 1] range.
# The column names are taken from the same selection that is scaled, so the
# labels stay correct even if 'feature 42' is not the last column
# (the original used data.columns[:-1], which silently assumes it is).
feature_frame = data.loc[:, data.columns != 'feature 42']
scaler = MinMaxScaler()
norm_array = scaler.fit_transform(feature_frame)
norm_data = pd.DataFrame(norm_array, columns = feature_frame.columns)

## Running the pseudo-code

In [None]:
# Taking N first rows of data for K-means algorithm.

# Learning base: the first N normalized rows.
# .copy() gives an independent frame, so adding the 'cluster' column below
# no longer writes into a slice of norm_data (SettingWithCopyWarning).
sampled_df = norm_data.head(N).copy()

# A fixed random_state makes the clustering, and everything derived from it,
# reproducible across kernel restarts.
kmeans = KMeans(n_clusters = n_clusters, random_state = 10)
kmeans.fit(sampled_df)
labels = kmeans.predict(sampled_df)
centroids = kmeans.cluster_centers_

# Attach the label of each row only after fitting, so the label is never
# used as a feature.
sampled_df['cluster'] = kmeans.labels_
print('Clusters: ')
print(sampled_df['cluster'].value_counts())

Clusters: 
1    1222
2     424
3     195
4     194
0     135
5      84
Name: cluster, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [None]:
# Variables mapper:
# initial_means => holds the mean value for each initial cluster obtained from learning base.
# clusters => the initial clusters obtained from the learning base.
# cluster_sample => the sampled clusters obtained.
# Adding a timer for testing purposes

import timeit

start = timeit.default_timer()

# Group the row positions of the learning base by K-means label:
# clusters[c] lists the positions of the rows assigned to cluster c.

clusters_size = sampled_df['cluster'].value_counts()
clusters = init_list_of_objects(n_clusters)

# Walk the labels themselves instead of range(N), so a learning base that is
# shorter than N rows cannot raise a KeyError.
for row_position, label in enumerate(sampled_df['cluster']):
    clusters[label].append(row_position)

# Storing the initial mean values for all clusters.

initial_means = []

for i in range(n_clusters):
    initial_means.append(calculateMean(norm_data.iloc[clusters[i]]))

# Initializing the sampled clusters.

cluster_sample = init_list_of_objects(n_clusters)

# First block of the algorithm: fill each cluster's sample until it reaches
# sampling_ratio of the cluster size.
# Each pass over the cluster (a "round", counted by k) doubles the acceptance
# probability, so every element with a non-zero probability is eventually taken.
# Elements whose probability is 0 can never be drawn; after round 20 they are
# appended unconditionally so the loop is guaranteed to terminate.

for i in range(n_clusters):
    j = 0
    k = 0
    while len(cluster_sample[i]) < int(sampling_ratio * len(clusters[i])):
        if (clusters[i][j] not in cluster_sample[i]):
            if k > 20:
                cluster_sample[i].append(clusters[i][j])
            # elif (not a second if): an element that was just force-added
            # must not be appended a second time by the random draw.
            elif simpleRandomSampling(probability(norm_data.iloc[clusters[i][j]], centroids[i]) * (2**k)):
                cluster_sample[i].append(clusters[i][j])
        j = j + 1
        if j == len(clusters[i]):
            print("round " + str(k))
            k += 1
            j = 0

# Initialize the rank table: one dict per cluster, mapping a sampled row
# position to the error obtained when that row is left out of the sample.

rank_table = init_list_of_dict(n_clusters)

# Calculating the mean of each sample, as well as storing its size and the total changes.

samples_means = []
cluster_sizes = []
total_changes = []

for i in range(n_clusters):
    samples_means.append(calculateMean(norm_data.iloc[cluster_sample[i]]))
    cluster_sizes.append(len(cluster_sample[i]))
    total_changes.append(0)

# Leave-one-out error of every sampled element against the mean of its sample.
# This recomputes a mean per element, so the cost is quadratic in the sample size.

for i in range(n_clusters):
    for j in range(len(cluster_sample[i])):
        rank_table[i][cluster_sample[i][j]] = calculateErrorWithoutSample(norm_data.iloc[(cluster_sample[i][:j] + cluster_sample[i][j+1:])],
                                                                          samples_means[i])

# Sorting the rank table by ascending error, so the first key of each dict is
# the element whose removal changes the sample mean the least.

for i in range(n_clusters):
    rank_table[i] = dict(
        sorted(rank_table[i].items(), key=lambda item: item[1]))

# M is reset to N at the start of algo(); otherwise re-running the cell would resume from the last value of M.


# For each remaining element, find its cluster and check if it can be sampled.
@profile
def algo():
  """Process the rows of norm_data from position N onward, one at a time.

  Each row is assigned to a cluster with the fitted K-means model. If adding
  the row to that cluster's sample lowers the error against the cluster's
  initial mean by enough, and a random draw accepts it, the row replaces the
  first (lowest-error) entry of that cluster's rank table.

  The function takes no arguments and returns nothing; it reads and mutates
  the module-level cluster_sample, rank_table and total_changes in place.
  """
  M = N
  while M < len(norm_data):
      # Cluster of the incoming row (pred is a one-element array).
      pred = kmeans.predict(norm_data.iloc[M: M+1])
      # Error of the cluster's sample with and without the incoming row.
      error = calculateErrorWithSample(
          norm_data.iloc[M: M+1], norm_data.iloc[cluster_sample[pred[0]]], initial_means[pred[0]])
      error_out = calculateErrorWithoutSample(
          norm_data.iloc[cluster_sample[pred[0]]], initial_means[pred[0]])
      # Accept only if the error drops below 99.7% of its current value.
      # When error_out is 0 the first test fails and the division is skipped.
      if error < error_out and error/error_out < 0.997:
        if simpleRandomSampling(probability(norm_data.iloc[M], centroids[pred[0]])):
            cluster_sample[pred[0]].append(M)
            total_changes[pred[0]] = total_changes[pred[0]] + 1
            # Evict the first key of the sorted rank table. M itself gets no
            # rank-table entry until the next full recomputation below.
            cluster_sample[pred[0]].remove(list(rank_table[pred[0]].keys())[0])
            del rank_table[pred[0]][list(rank_table[pred[0]].keys())[0]]

      M = M + 1
      # NOTE(review): `start` is local to algo() and is reset on every
      # iteration, so the time printed below covers only the recomputation
      # that follows, not the whole run.
      start = timeit.default_timer()

      # Once the number of replacements in this cluster reaches
      # change_ratio * sample size, rebuild the rank tables of all clusters.
      # NOTE(review): if int(cluster_sizes[...] * change_ratio) is 0 this is
      # true on every iteration for that cluster - confirm that tiny samples
      # cannot occur.
      if int(cluster_sizes[pred[0]] * change_ratio) == total_changes[pred[0]]:
          print("Entered !")
          print("Cluster {0} size: {1} with {2} total changes. Entered at iteration {3}".format(
              pred[0], cluster_sizes[pred[0]], total_changes[pred[0]], M))
          print(M)
          total_changes[pred[0]] = 0
          # This assignment makes samples_means a local of algo(); the
          # module-level list of the same name is not updated.
          samples_means = []
          for i in range(n_clusters):
              samples_means.append(calculateMean(
                  norm_data.iloc[cluster_sample[i]]))

          # Leave-one-out error of every sampled element against the
          # refreshed mean of its sample.
          for i in range(n_clusters):
              for j in range(len(cluster_sample[i])):
                  rank_table[i][cluster_sample[i][j]] = calculateErrorWithoutSample(norm_data.iloc[(cluster_sample[i][:j] + cluster_sample[i][j+1:])],
                                                                                    samples_means[i])

          # Re-sort each rank table by ascending error.
          for i in range(n_clusters):
              rank_table[i] = dict(
                  sorted(rank_table[i].items(), key=lambda item: item[1]))

          stop = timeit.default_timer()
          execution_time = stop - start

          # Execution time in sec
          print("Program Executed in " + str(execution_time))

algo()

round 0
round 1
round 2
round 0
round 0
round 1
round 2
round 3
round 4
round 0
round 1
round 2
round 3
round 4
round 5
round 6
round 7
round 8
round 9
round 10
round 11
round 12
round 13
round 14
round 15
round 16
round 17
round 18
round 19
round 20
round 0
round 1
round 2
round 3
round 0
round 1
round 2
ERROR: Could not find file <ipython-input-13-29a5ff08db47>
NOTE: %mprun can only be used on functions defined in physical files, and not in the IPython environment.
Entered !
Cluster 1 size: 916 with 824 total changes. Entered at iteration 6200
6200
Program Executed in 9.541888153999992
peak memory: 228.12 MiB, increment: 0.18 MiB


In [None]:
# Rows of the raw data whose index appears in any cluster sample.
final_cluster = data[data.index.isin(flatten(cluster_sample))]

# Every column except 'feature 42', selected once and reused below.
sampled_features = final_cluster.loc[:, data.columns != 'feature 42']

observed_mean = np.array(sampled_features.mean())
observed_median = np.array(sampled_features.median())
observed_std = np.array(sampled_features.std())

print(observed_mean)

# (observed, expected) pairs, in the order their contributions are summed.
stat_pairs = [
    (observed_mean, expected_mean),
    (observed_std, expected_std),
    (observed_median, expected_median),
]

# Per column: sum of the chi / OS measure over mean, std and median.
shi_square = [
    sum(calculateChi(observed[col], expected[col]) for observed, expected in stat_pairs)
    for col in range(len(observed_mean))
]
OS = [
    sum(calculateOS(observed[col], expected[col]) for observed, expected in stat_pairs)
    for col in range(len(observed_mean))
]

df = pd.DataFrame({
    'features': sampled_features.std().index,
    'mean': observed_mean,
    'median': observed_median,
    'standard deviation': observed_std,
    'OS': OS,
    'shi_square': shi_square,
})

df.to_csv('file_name.csv', index=False)


In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Collect the normalized row of every sampled element, cluster by cluster,
# together with the index of the sample it belongs to.
cluster_labels = []
X = []

for cluster_id, members in enumerate(cluster_sample[:n_clusters]):
    for position in members:
        X.append(norm_data.iloc[position].values)
        cluster_labels.append(cluster_id)

range_n_clusters = [n_clusters]

for n in range_n_clusters:
    fig, (ax1) = plt.subplots(1, 1)
    fig.set_size_inches(18, 7)
    ax1.set_xlim([-0.1, 1])
    # Leave a 10-unit gap between the silhouette bands of the n clusters.
    ax1.set_ylim([0, len(X) + (n + 1) * 10])

    # NOTE(review): the sampled points are re-clustered here, so the labels
    # collected above are overwritten and the score describes this new
    # K-means fit, not the sample-to-cluster assignment - confirm intent.
    clusterer = KMeans(n_clusters=n, random_state=10)
    cluster_labels = clusterer.fit_predict(X)
    silhouette_avg = silhouette_score(X, cluster_labels)
    print("silhouette score ", silhouette_avg)
    sample_silhouette_values = silhouette_samples(X, cluster_labels)
    y_lower = 10
    # Iterate over the n clusters of THIS fit (the original used the global
    # n_clusters, which is only correct while range_n_clusters == [n_clusters]).
    for i in range(n):

        ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
        ith_cluster_silhouette_values.sort()
        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i
        color = cm.nipy_spectral(float(i) / n)
        ax1.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Label the band with its cluster number at mid-height.
        ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
        y_lower = y_upper + 10

    ax1.set_title("The silhouette plot for the various clusters.")
    ax1.set_xlabel("The silhouette coefficient values")
    ax1.set_ylabel("Cluster label")
    # Dashed red line: average silhouette score over all points.
    ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
    ax1.set_yticks([])
    ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

plt.show()