In [None]:
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
import copy
import math
from scipy import stats


CLASSES_PATH = os.path.dirname(os.path.abspath('D:/Code/Classes'))
if not (CLASSES_PATH in sys.path):
    sys.path.append(CLASSES_PATH)
from Classes.Files_Handler_Class import Files_Handler
from Classes.CSV_Files_Class import CSV_Files
from Classes.Bcolors_Class import Bcolors as bcolors


In [None]:
# Matplotlib colour names handed to plot_data_classes for drawing the clusters;
# black is not listed because that function uses it for the noise label.
colors_list = [
    'b', 'g', 'r', 'c', 'm',
    'y', 'peru', 'tan', 'gold', 'lime',
    'teal', 'aqua', 'navy', 'plum', 'pink',
]

In [None]:
# Names of the layer-level measures (presumably dataset column names - verify
# against the CSV headers).
layer_centrality = [
    'layer_density',
    'layer_degree_histogram',
    'layer_edge_weight',
    'layer_sombor_index',
    'layer_nodes_weight',
    'layer_k_shell_weight',
]

# Names of the node-level measures (same caveat as above).
node_centrality = [
    'degree',
    'clustering',
    'nip',
    'sombor_index',
    'ego_density',
    'ego_degree',
    'ego_k_shell',
    'ego_degree_mean',
    'kss',
    'vote_power',
]

# Passed to the CSV loader and to scale_data further down in the notebook.
drop_centrality = [
    'layer_id',
    'node_id',
    'k_shell',
    'k_shell_itr',
]

In [None]:
def gaussian_kde_func(dataset:pd.DataFrame, clean_data_path:str,
                      stage:str='before outlier detection'):
    """Plot, save and show a Gaussian kernel-density estimate of every column.

    For each column a ``scipy.stats.gaussian_kde`` is fitted and evaluated on
    100 evenly spaced points between the column minimum and maximum. The filled
    curve is saved as a PNG and then shown.

    Args:
        dataset: Frame whose columns are plotted one figure at a time.
        clean_data_path: Prefix the PNG file name is appended to (plain string
            concatenation, so it must end with a path separator).
        stage: Text appended to each file name. The default reproduces the
            historical name ``gaussian_kde <column> before outlier detection.png``;
            pass another value when plotting data from a different stage.

    Returns:
        None. The function only produces figures and files.
    """
    for column in list(dataset):
        values = dataset[column]
        # gaussian_kde cannot be fitted to a column with a single distinct
        # value (zero variance); skip it instead of aborting the whole loop.
        if values.nunique() < 2:
            print(f"Skipping gaussian_kde for '{column}': fewer than two distinct values")
            continue

        lower, upper = values.min(), values.max()
        kde = stats.gaussian_kde(values)
        x = np.linspace(lower, upper, 100)
        y = kde(x)

        fig = plt.figure(figsize=(10,6))
        plt.plot(x, y)
        plt.fill_between(x, y, alpha=0.5)
        plt.title(f'gaussian_kde {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')
        # Widen the x-range to whole numbers around the data.
        plt.xlim(math.floor(lower), math.ceil(upper))
        plt.savefig(clean_data_path + f'gaussian_kde {column} {stage}.png')
        plt.show()
        # One figure per column: release it so figures do not pile up.
        plt.close(fig)

def NearestNeighbors_func(dataset:np.array, n_neighbors:int, clean_data_path:str):
    """Draw the averaged neighbour-distance curve and a zoomed version of it.

    A ``NearestNeighbors`` model is fitted on ``dataset`` and queried with the
    same data. Every column of the returned distance matrix is sorted
    ascending, and columns 1.. are averaged per row. The resulting curve is
    saved twice: in full, and cut to the stretch just before the first value
    that exceeds 50 times the curve mean.

    Args:
        dataset: 2-D array of samples.
        n_neighbors: Number of neighbours requested from the model; also
            written into the file names.
        clean_data_path: Prefix the PNG file names are appended to.

    Returns:
        None. Figures are left open (no ``plt.show()``/``close`` here), so a
        notebook displays them when the cell finishes.
    """
    model = NearestNeighbors(n_neighbors=n_neighbors).fit(dataset)
    distances, _ = model.kneighbors(dataset)
    # Columns are sorted independently of each other, not row by row.
    distances = np.sort(distances, axis=0)
    # Column 0 is left out of the average (presumably each point's zero
    # distance to itself - verify against the estimator's documentation).
    distances_ = np.mean(distances[:,1:],axis=1)

    figure(figsize=(8, 6), dpi=256)
    plt.grid(visible=True)
    plt.plot(distances_)
    plt.savefig(clean_data_path + f'NearestNeighbors n_neighbors={n_neighbors}.png')

    # Locate the first value above 50x the mean and keep only the stretch
    # leading up to it; without such a value the full curve is drawn again.
    dist_mean = distances_.mean()
    exceeding = np.flatnonzero(distances_ > (dist_mean * 50))
    if exceeding.size:
        i = int(exceeding[0])
        start_i = int(i * (i / len(distances_))/1.5)
        distances_ = distances_[start_i:i]

    figure(figsize=(8, 6), dpi=256)
    plt.grid(visible=True)
    plt.plot(distances_)
    plt.savefig(clean_data_path + f'NearestNeighbors zoomed n_neighbors={n_neighbors}.png')

def DBSCAN_func(dataset:np.array, esp:float=50, min_samples:int=500, inp_x:int=85000):
    """Run DBSCAN independently on consecutive row chunks of ``dataset``.

    The rows are cut into parts of ``inp_x`` rows (the last part may be
    shorter) and a separate ``DBSCAN`` model is fitted on each part. The
    per-part labels and core-sample masks are concatenated in row order.

    Note that labels are NOT renumbered between parts: label ``k`` from one
    part and label ``k`` from another part come from unrelated fits but share
    the same value in the returned array.

    Args:
        dataset: 2-D array of samples.
        esp: Value passed to DBSCAN as ``eps``.
        min_samples: Value passed to DBSCAN as ``min_samples``.
        inp_x: Number of rows per part; must be positive.

    Returns:
        Tuple ``(labels, core_samples_mask, n_clusters)``:
        ``labels`` is an int array with one label per row (-1 for noise),
        ``core_samples_mask`` a bool array marking core samples, and
        ``n_clusters`` the number of distinct non-noise label values.

    Raises:
        ValueError: If ``inp_x`` is not positive.
    """
    if inp_x <= 0:
        raise ValueError("inp_x must be a positive number of rows per part")

    total_rows = len(dataset)
    print(f"Dataset parts count: {math.ceil(total_rows/inp_x)}")
    dataset_parts = [dataset[start:start + inp_x] for start in range(0, total_rows, inp_x)]

    label_chunks = []
    mask_chunks = []
    for i, item in enumerate(dataset_parts):
        print(f"DBSCAN Started on part {i+1}th {item.shape}: ")
        db = DBSCAN(eps=esp, min_samples=min_samples).fit(item)
        temp_labels = db.labels_
        temp_core_samples_mask = np.zeros_like(temp_labels, dtype=bool)
        temp_core_samples_mask[db.core_sample_indices_] = True
        label_chunks.append(temp_labels)
        mask_chunks.append(temp_core_samples_mask)
        print(np.histogram(temp_labels))

        # Number of clusters in this part, ignoring noise if present.
        n_clusters_ = len(set(temp_labels)) - (1 if -1 in temp_labels else 0)
        print('number of clusters: %d' % n_clusters_)
        print(f"DBSCAN Finished part {i+1}th \n")

    if label_chunks:
        labels = np.concatenate(label_chunks).astype(int)
        core_samples_mask = np.concatenate(mask_chunks).astype(bool)
    else:
        # Empty dataset: nothing was fitted, return empty results.
        labels = np.asarray([], dtype=int)
        core_samples_mask = np.asarray([], dtype=bool)

    # Noise must be looked up in the combined labels; checking only the last
    # part miscounts when noise occurs in an earlier part but not the last one.
    distinct_labels = np.unique(labels)
    n_clusters = int(distinct_labels.size) - (1 if -1 in distinct_labels else 0)

    return labels, core_samples_mask, n_clusters

def plot_data_classes(dataset:np.ndarray, labels:np.array, core_samples_mask:np.array,
                       n_clusters:np.array, colors_list:list, clean_data_path:str, n_neighbors:int, min_samples:int,
                       x_col:int=0, y_col:int=8):
    """Scatter-plot two columns of ``dataset`` coloured by cluster label.

    Every distinct label is drawn: core samples as 'o' markers, the remaining
    members as '.' markers. Label -1 (noise) is drawn in black; all other
    labels take colours from ``colors_list`` in ascending label order, wrapping
    around when there are more labels than colours.

    Args:
        dataset: 2-D array of samples.
        labels: One label per row of ``dataset``; -1 marks noise.
        core_samples_mask: Boolean mask, True where the row is a core sample.
        n_clusters: Number shown in the figure title.
        colors_list: Matplotlib colours used for the non-noise labels.
        clean_data_path: Prefix the PNG file name is appended to.
        n_neighbors: Value written after ``esp=`` in the file name.
        min_samples: Value written after ``min_samples=`` in the file name.
        x_col: Column index plotted on the x-axis (default 0).
        y_col: Column index plotted on the y-axis (default 8).

    Returns:
        None. The figure is saved and shown.
    """
    labels = np.asarray(labels)
    core_samples_mask = np.asarray(core_samples_mask, dtype=bool)
    # Fall back to a single colour so an empty palette cannot break the plot.
    palette = list(colors_list) or ['b']
    markersize = 3

    figure(figsize=(8, 6), dpi=256)
    cluster_position = 0
    for k in np.unique(labels):
        if k == -1:
            # Black used for noise.
            col = 'k'
        else:
            # Cycle through the palette so no cluster is ever left undrawn.
            col = palette[cluster_position % len(palette)]
            cluster_position += 1

        class_member_mask = (labels == k)

        xy = dataset[class_member_mask & core_samples_mask]
        plt.plot(xy[:, x_col], xy[:, y_col], 'o', markerfacecolor=col,
                markeredgecolor='w',
                markersize=markersize)

        xy = dataset[class_member_mask & ~core_samples_mask]
        plt.plot(xy[:, x_col], xy[:, y_col], '.', markerfacecolor=col,
                markeredgecolor='k',
                markersize=markersize)

    plt.title('number of clusters: %d' % n_clusters)
    plt.savefig(clean_data_path + f'DBSCAN esp={n_neighbors} min_samples={min_samples}.png')
    plt.show()

In [None]:
select_file_object = Files_Handler()
csv_files_object = CSV_Files()
multiple_selecion = False

data_path = select_file_object.select_files("text files", ".csv", 'Select Datasets', multiple_selecion)
path = ""
networks_content = []
# Defined in both branches: the null-check cell further down indexes
# networks_name regardless of the selection mode.
networks_name = []
if multiple_selecion:
    # Several files picked: the data folder is the folder of the first file.
    path = data_path[0][:data_path[0].rfind("/")] + "/"
    for item in data_path:
        networks_name.append(item[item.rfind("/") + 1:])
        # NOTE(review): files are read verbatim here, while the other branch
        # hands drop_centrality to the loader - confirm this is intended.
        networks_content.append(pd.read_csv(item))
else:
    # Single file picked: load every file found in that file's folder.
    path = data_path[:data_path.rfind("/")] + "/"
    networks_name = select_file_object.get_files_in_path(path)
    print(len(networks_name))
    networks_content = csv_files_object.get_content_of_csv_files(path, networks_name, drop_centrality)

# print(networks_content)


In [None]:
# `path` ends with "/", so removing the last two "/"-separated pieces yields
# the parent of the dataset folder; the output folder is created there.
path_parts = path.split('/')
root_path = '/'.join(path_parts[:-2]) + '/'
clean_data_path = select_file_object.make_dir(root_path, 'Clean Data')

In [None]:
# Report (1-based position and file name) every loaded frame containing a null.
for position, frame in enumerate(networks_content, start=1):
    has_null = frame.isnull().values.any()
    if has_null:
        print(position, networks_name[position - 1])
    

In [None]:
# Stack all loaded networks into one frame and free the per-file list.
data = pd.concat(networks_content, axis=0)
del networks_content
data = csv_files_object.scale_data(data, drop_centrality)
# Shuffle with a fixed seed: DBSCAN_func cuts the rows into consecutive parts,
# so the row order influences the clustering and must be reproducible.
RANDOM_SEED = 42
data = data.sample(frac=1, random_state=RANDOM_SEED)
# Independent copy that still has every column (including 'SIR') for later.
orgin_data = data.copy(deep=True)

In [None]:
# Size of the combined frame, followed by a preview of its first five rows.
print(data.shape)
data.head()


In [None]:
# gaussian_kde_func(data, clean_data_path)

In [None]:
# Keep the 'SIR' column as a separate array and remove it from the features.
y_data = data['SIR'].values
data = data.drop(columns=['SIR'])
data.head()

In [None]:
# Plain array view of the feature frame, and its row count.
dataset = data.values
print(data.shape[0])

In [None]:
# Distances from every sample to its n_neighbors nearest neighbours; each
# column is then sorted ascending on its own.
n_neighbors = 500
neighbors = NearestNeighbors(n_neighbors=n_neighbors)
neighbors_fit = neighbors.fit(dataset)
distances, indices = neighbors_fit.kneighbors(dataset)
distances.sort(axis=0)

In [None]:
# Persist the sorted distance matrix so it can be reloaded from disk.
np.savetxt(f"{clean_data_path}distances.csv", distances)

In [None]:
import numpy as np
# Reload the distance matrix from disk. The machine-specific default folder is
# used only when no output folder was set earlier in this session, so a
# top-to-bottom run keeps reading and writing in the folder it created.
if 'clean_data_path' not in globals():
    clean_data_path = "D:/Masters thesis/Networks Dataset/Results/Multilayer/Clean Data/"
distances = np.loadtxt(clean_data_path + 'distances.csv')

In [None]:
n_neighbors = 500

# Per-row average over columns 1.. of the sorted distance matrix. Computed
# directly: copying the whole matrix first would only double the memory use.
distances_ = np.mean(distances[:,1:],axis=1)
figure(figsize=(8, 6), dpi=256)
plt.xlabel("Samples")
plt.ylabel("Distance Average")

plt.grid(visible=True)
plt.plot(distances_)
plt.savefig(clean_data_path + f'NearestNeighbors n_neighbors={n_neighbors}.png')


In [None]:
# Zoom on the stretch of the averaged curve just before its first value that
# exceeds 20x the mean.
dist_mean = distances_.mean()
start_i = 0
# Without such a value the window covers the whole curve rather than
# collapsing to the empty range (0, 0).
j = len(distances_)
exceeding = np.flatnonzero(distances_ > (dist_mean * 20))
if exceeding.size:
    j = int(exceeding[0])
    start_i = int(j * (j / len(distances_))/1.5)
distances_t = distances_[start_i:j]

figure(figsize=(8, 6), dpi=256)
plt.xlabel("Samples")
plt.ylabel("Distance Average")
plt.xlim((start_i,j))
plt.grid(visible=True)
# Plot the averaged curve inside the window (not the 2-D distance matrix) so
# the y-axis scales to the visible values.
plt.plot(np.arange(start_i, j), distances_t)
plt.savefig(clean_data_path + f'NearestNeighbors zoomed n_neighbors={n_neighbors}.png')


In [None]:
# First DBSCAN pass over the full dataset; min_samples follows n_neighbors.
esp = 0.1
min_samples = n_neighbors
x = 300000
labels, core_samples_mask, n_clusters = DBSCAN_func(
    dataset,
    esp=esp,
    min_samples=min_samples,
    inp_x=x,
)
np.histogram(labels)

In [None]:
# The 7th argument is what plot_data_classes writes after "esp=" in the file
# name, so pass the eps value that was actually used (not n_clusters again).
plot_data_classes(dataset, labels, core_samples_mask, n_clusters, colors_list, clean_data_path, esp, min_samples)

In [None]:
# Attach the labels positionally, then split rows into kept vs. noise (-1).
orgin_data = orgin_data.assign(class_type=pd.Series(labels).values)
is_noise = orgin_data['class_type'] == -1
non_outlier_data = orgin_data.loc[~is_noise]
orgin_outlier_data = orgin_data.loc[is_noise]



In [None]:
# del orgin_data, data, dataset

In [None]:
# Second pass works only on the rows flagged as noise by the first pass.
orgin_data = orgin_outlier_data.copy(deep=True)
data = orgin_outlier_data.drop(columns=['SIR', 'class_type'])
print(data.shape)
data.head()

In [None]:
# Array form of the first-pass noise rows, used as input for the second DBSCAN pass.
dataset = data.values

In [None]:
# Second DBSCAN pass; esp, min_samples and x are reused from the first pass.
# Alternative settings left disabled:
# esp = 0.07
# min_samples = n_neighbors
# x = 20000
labels, core_samples_mask, n_clusters = DBSCAN_func(
    dataset,
    esp=esp,
    min_samples=min_samples,
    inp_x=x,
)
np.histogram(labels)

In [None]:
# Pass the eps value for the "esp=" part of the file name. plot_data_classes
# appends the file name to the given prefix, so the extra "second pass " text
# keeps this figure from overwriting the first-pass figure.
plot_data_classes(dataset, labels, core_samples_mask, n_clusters, colors_list, clean_data_path + 'second pass ', esp, min_samples)

In [None]:
# Attach the second-pass labels positionally and split again on the noise label.
orgin_data = orgin_data.assign(class_type=pd.Series(labels).values)
is_noise = orgin_data['class_type'] == -1
new_non_outlier_data = orgin_data.loc[~is_noise]
orgin_outlier_data = orgin_data.loc[is_noise]



In [None]:
# Combine the rows kept by both passes.
non_outlier_data = pd.concat([non_outlier_data, new_non_outlier_data], axis=0)
# drop() returns a new frame; keep the result so the helper label column does
# not end up in the exported CSV or in the density plots.
non_outlier_data = non_outlier_data.drop('class_type', axis=1)
non_outlier_data.head(5)

In [None]:
# Size of the combined non-outlier frame.
print(non_outlier_data.shape)

In [None]:
# Write the cleaned data as UTF-8 CSV with a header row and no index column.
non_outlier_data.to_csv(
    f"{clean_data_path}data.csv",
    index=False,
    header=True,
    encoding='utf-8',
)

In [None]:
# Density plots of the cleaned (non-outlier) data.
# NOTE(review): the saved file names end in "before outlier detection" even
# though this call plots the data that remains after outlier removal.
gaussian_kde_func(non_outlier_data, clean_data_path)