# Гиперпараметры DBSCAN 


Давайте посмотрим, какие гиперпараметры есть в DBSCAN и как они влияют на результаты работы модели!

## DBSCAN и примеры кластеризации

# Сделать запись в разрезе курьеров

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import DBSCAN

In [2]:
# Load the dataset and keep only its first 3000 rows.
df = pd.read_csv("../data/data.csv").iloc[:3000]

# Cap how many columns and rows pandas prints for a frame.
pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 40)

In [3]:
from IPython.display import display

def find_bool_cols(df):
    """Return the labels of the columns of ``df`` whose minimum is 0 and maximum is 1.

    Only the extremes of each column are inspected, so a column that also
    holds values strictly between 0 and 1 is selected as well.
    """
    col_max = df.max()
    col_min = df.min()
    max_is_one = col_max[col_max == 1].index
    min_is_zero = col_min[col_min == 0].index
    return min_is_zero.intersection(max_is_one)

def display_clusters(df, cluser_count):
    """Display per-cluster column means, ordered by how much they vary between clusters.

    Two tables are displayed with pandas' row/column display limits lifted:
    first a one-row frame holding the spread of every column across the
    clusters, sorted in descending order, then the per-cluster means with the
    columns arranged in that same order.

    The spread of a column is the standard deviation of its cluster means
    divided by the mean of those cluster means. For the columns reported by
    ``find_bool_cols`` the divisor is forced to 1, i.e. the plain standard
    deviation is used.

    The display limits are lifted only while the tables are rendered and are
    then put back to whatever they were before the call, even if rendering
    fails.

    Args:
        df: DataFrame that contains a ``Cluster<cluser_count>`` label column.
        cluser_count: Suffix of the label column to group the rows by.

    Raises:
        KeyError: If ``df`` has no ``Cluster<cluser_count>`` column.
    """
    cluster_col = "Cluster" + str(cluser_count)
    try:
        cluster_labels = df[cluster_col]
    except KeyError as e:
        # A missing column surfaces as KeyError (not TypeError), so this is
        # the exception that has to be caught to report it.
        raise KeyError(f"Dataframe has no column named {cluster_col}") from e

    # One row of column means per distinct cluster label.
    rows = [df[cluster_labels == cl].mean() for cl in cluster_labels.unique()]

    df_cluster_means = pd.DataFrame(rows)
    # The ClusterN label columns themselves are not features; drop them all.
    df_cluster_means = df_cluster_means[
        df_cluster_means.columns.drop(list(df.filter(regex=r"Cluster\d+")))
    ]

    temp_mean = df_cluster_means.mean()
    # Columns with min 0 and max 1 are not normalised by their mean.
    cols = temp_mean.index.intersection(find_bool_cols(df))
    temp_mean[cols] = 1

    df_std = (
        pd.DataFrame(df_cluster_means.std() / temp_mean)
        .sort_values(by=[0], ascending=False)
        .T
    )

    with pd.option_context("display.max_columns", None, "display.max_rows", None):
        display(df_std)
        display(df_cluster_means[df_std.columns])

In [4]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale all columns of df and wrap the result back into a DataFrame
# that carries the original column names.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df)
df_scaled = pd.DataFrame(scaled_values, columns=df.columns)

In [5]:
# Show the scaled frame (the notebook renders the value of the cell's last expression).
df_scaled

Unnamed: 0,Delivery_person_deliveries,Delivery_person_Age,Delivery_person_Ratings,Restaurant_latitude,Restaurant_longitude,Delivery_location_latitude,Delivery_location_longitude,Road_traffic_density,Vehicle_condition,multiple_deliveries,...,Type_of_order_Buffet,Type_of_order_Drinks,Type_of_order_Meal,Type_of_order_Snack,Type_of_vehicle_electric_scooter,Type_of_vehicle_motorcycle,Type_of_vehicle_scooter,City_Metropolitian,City_Semi-Urban,City_Urban
0,0.919355,0.894737,0.96,0.867852,0.199413,0.607396,0.198946,0.666667,1.0,0.000000,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1,0.822581,0.736842,0.80,0.708801,0.313731,0.145889,0.319616,1.000000,1.0,0.333333,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
2,0.790323,0.894737,0.76,0.708801,0.313731,0.141617,0.313903,0.666667,0.0,0.666667,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.919355,0.421053,0.96,0.781895,0.359510,0.355159,0.358159,0.000000,0.5,0.333333,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,0.887097,0.894737,0.84,0.877790,0.801567,0.638933,0.800948,0.000000,0.0,0.333333,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,0.532258,0.210526,0.96,0.842174,0.000000,0.532520,0.001270,1.000000,0.0,0.000000,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2996,0.967742,0.526316,0.88,0.842174,0.000003,0.536318,0.006352,0.000000,0.5,0.333333,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2997,0.903226,0.263158,0.84,0.935249,0.192810,0.810869,0.199997,1.000000,0.5,0.666667,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2998,0.854839,0.684211,0.68,0.678022,0.267111,0.050347,0.266271,0.000000,0.0,0.000000,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [6]:
from IPython.display import clear_output

# Grid search over the DBSCAN hyperparameters. outlier_percent ends up with
# one row per min_samples value and one entry per eps value; every entry is
# either (number of clusters, outlier percentage, spread of cluster sizes)
# or NaN when the fit produced at most one cluster.
outlier_percent = []
x_n = np.arange(3, 50)  # min_samples candidates
y_n = x_n
z_n = np.arange(0.5, 2, 0.1)  # eps candidates

for n in x_n:
    temparr = []
    for m in z_n:
        # Build and fit the model for this (eps, min_samples) pair
        dbscan = DBSCAN(eps=m, min_samples=n)
        dbscan.fit(df_scaled)
        labels = dbscan.labels_

        # Share of outlier points (label -1) as a percentage of all points
        perc_outliers = 100 * np.sum(labels == -1) / len(labels)

        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

        # Cluster sizes without the noise label. They are counted on a
        # standalone Series so that df_scaled is never modified: a temporary
        # column left behind by an interrupted run would silently become an
        # extra feature in every later fit.
        d = dict(pd.Series(labels).value_counts())
        d.pop(-1, None)

        # (number of clusters | outlier percentage | spread of cluster sizes)
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        res = np.nan
        if n_clusters_ > 1:
            res = (
                n_clusters_,
                round(perc_outliers, 1),
                round(np.std(list(d.values()))),
            )

        temparr.append(res)

    # Progress indicator: replace the previous output with the current min_samples
    clear_output(wait=True)
    print(n)

    outlier_percent.append(temparr)

49


In [7]:
# Lay the grid-search results out as a table: one row per min_samples value,
# one column per eps value (rounded to one decimal for readable labels).
eps_columns = [round(eps, 1) for eps in z_n]
temp_df = pd.DataFrame(data=outlier_percent, index=x_n, columns=eps_columns)
temp_df.head(10)

Unnamed: 0,0.5,0.6,0.7,0.8,0.9,1.0,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9
3,"(55, 93.4, 1)","(153, 76.9, 2)","(179, 51.3, 9)","(154, 33.2, 15)","(136, 19.4, 18)","(132, 11.1, 19)","(125, 6.3, 19)","(123, 3.7, 19)","(126, 2.1, 19)","(127, 1.4, 19)","(2, 0.6, 1488)",,,,
4,"(14, 98.0, 1)","(58, 89.0, 3)","(101, 64.5, 9)","(99, 42.3, 16)","(110, 25.6, 18)","(114, 14.2, 19)","(113, 8.4, 19)","(117, 4.5, 19)","(119, 3.0, 19)","(120, 2.1, 19)","(2, 0.8, 1480)",,,,
5,,"(28, 94.1, 2)","(72, 74.8, 7)","(70, 49.9, 16)","(89, 31.4, 18)","(101, 18.0, 19)","(106, 10.9, 19)","(109, 6.6, 19)","(114, 3.9, 19)","(117, 2.5, 19)","(2, 1.0, 1476)",,,,
6,,"(8, 98.3, 1)","(49, 82.9, 6)","(59, 57.1, 14)","(69, 37.6, 17)","(83, 23.0, 19)","(97, 13.3, 19)","(104, 8.0, 19)","(109, 4.9, 19)","(110, 3.8, 19)","(3, 1.3, 1387)",,,,
7,,,"(31, 89.1, 5)","(54, 62.6, 13)","(57, 42.3, 16)","(72, 27.0, 18)","(86, 16.7, 19)","(95, 10.6, 19)","(101, 6.9, 19)","(104, 5.0, 19)","(3, 1.6, 1381)",,,,
8,,,"(18, 93.9, 4)","(43, 70.0, 12)","(52, 46.0, 15)","(61, 31.4, 17)","(75, 20.6, 19)","(85, 13.5, 19)","(95, 8.6, 19)","(97, 6.8, 19)","(2, 2.3, 1454)",,,,
9,,,"(6, 97.4, 5)","(37, 75.8, 10)","(49, 49.4, 15)","(57, 33.5, 16)","(68, 23.3, 18)","(76, 16.5, 18)","(86, 11.5, 19)","(92, 8.2, 19)","(2, 3.1, 1444)",,,,
10,,,"(5, 97.9, 4)","(30, 80.6, 9)","(41, 53.7, 12)","(55, 35.6, 16)","(59, 26.5, 16)","(69, 19.2, 18)","(79, 13.8, 18)","(83, 11.2, 19)","(2, 3.6, 1436)",,,,
11,,,,"(18, 86.7, 7)","(40, 56.9, 13)","(52, 39.0, 16)","(58, 27.6, 16)","(63, 22.0, 17)","(74, 15.9, 18)","(78, 13.1, 18)","(4, 4.2, 1232)",,,,
12,,,,"(14, 90.9, 7)","(38, 61.0, 12)","(48, 42.4, 16)","(55, 29.7, 16)","(59, 23.7, 16)","(66, 18.9, 17)","(74, 14.6, 18)",,,,,


In [16]:
from collections import OrderedDict

# Flatten the grid into a {(row label, column label): result tuple} mapping,
# order the entries by the first tuple element (the number of clusters) and
# turn them into a frame with one row per entry and one column per tuple field.
stacks = temp_df.stack().to_dict()
entries_by_cluster_count = sorted(stacks.items(), key=lambda entry: entry[1][0])
df_params = pd.DataFrame(OrderedDict(entries_by_cluster_count)).T

In [18]:
# Columns of df_params:
# 0 = number of clusters | 1 = outlier percentage | 2 = spread of cluster sizes
cluster_count_ok = df_params[0] < 50
outlier_share_ok = df_params[1] < 30
size_spread_ok = df_params[2] < 500
df_params[cluster_count_ok & outlier_share_ok & size_spread_ok]

Unnamed: 0,Unnamed: 1,0,1,2
20,1.4,49.0,27.7,14.0
21,1.4,49.0,27.8,15.0


In [19]:
# Long-form view of the grid, sorted by the result tuples themselves.
stacks = temp_df.stack()
stacks = stacks.sort_values(key=lambda values: values)
stacks.head(5)

3  1.5    (2, 0.6, 1488)
4  1.5    (2, 0.8, 1480)
5  1.5    (2, 1.0, 1476)
8  1.5    (2, 2.3, 1454)
9  1.5    (2, 3.1, 1444)
dtype: object

In [20]:
# Inspect the column labels of the grid table (the rounded eps values).
temp_df.columns

Float64Index([0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7,
              1.8, 1.9],
             dtype='float64')

In [21]:
# Fit DBSCAN with a single hyperparameter pair and show the label assigned to every row.
dbscan = DBSCAN(min_samples=8, eps=1.5)
dbscan.fit_predict(df_scaled)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
labels = dbscan.labels_

# Boolean mask that is True at the positions of the model's core samples.
core_samples_mask = np.zeros(labels.shape, dtype=bool)
core_samples_mask[dbscan.core_sample_indices_] = True

# Noise points carry the label -1; they are counted separately and are not
# treated as a cluster of their own.
n_noise_ = list(labels).count(-1)
n_clusters_ = len(set(labels)) - (1 if n_noise_ else 0)

print('Estimated number of clusters: %d' % n_clusters_)
print('Estimated number of outliers: %d' % n_noise_)

Estimated number of clusters: 2
Estimated number of outliers: 70


In [23]:
# Attach the cluster labels to the unscaled data.
cluster_col = "Cluster2"
df[cluster_col] = labels

# Spread (population standard deviation) of the cluster sizes, noise excluded.
sizes_by_label = df[cluster_col].value_counts()
d = dict(sizes_by_label)
d.pop(-1, None)
np.std(list(d.values()))

1454.0

In [24]:
# Spread of the label counts computed with pandas. Unlike the np.std cell, the
# counts here still include the noise label (-1) when it is present, and
# Series.std() is the sample standard deviation (ddof=1), so the two numbers
# are not expected to match.
df[cluster_col].value_counts().std()

1662.1645526240775

In [25]:
# Keep only the rows whose cluster label is non-negative (label -1 is excluded).
df_rem = df.loc[df[cluster_col].ge(0)]

In [26]:
# Display the per-cluster summary tables for the "Cluster2" label column.
display_clusters(df_rem, 2)

Unnamed: 0,Road_traffic_density,Distance,Type_of_vehicle_electric_scooter,City_Urban,City_Metropolitian,Weather conditions_Fog,Vehicle_condition,Type_of_vehicle_motorcycle,Time_taken (min),Type_of_order_Snack,Type_of_order_Drinks,Type_of_vehicle_scooter,multiple_deliveries,Weather conditions_Stormy,Weather conditions_Sandstorms,Type_of_order_Buffet,Weather conditions_Windy,Weather conditions_Sunny,Type_of_order_Meal,Delivery_location_latitude,Delivery_person_deliveries,Restaurant_latitude,Weather conditions_Cloudy,Time_to_pick,Delivery_person_Age,Festival,Delivery_person_Ratings,Restaurant_longitude,Delivery_location_longitude,City_Semi-Urban
0,0.848014,0.640326,0.60237,0.556916,0.556674,0.531635,0.510528,0.427316,0.420072,0.405096,0.177322,0.175053,0.135743,0.129358,0.124513,0.116673,0.114823,0.112158,0.111101,0.065639,0.064237,0.060726,0.050783,0.043882,0.02183,0.012839,0.006839,0.000463,0.000419,0.000242


Unnamed: 0,Road_traffic_density,Distance,Type_of_vehicle_electric_scooter,City_Urban,City_Metropolitian,Weather conditions_Fog,Vehicle_condition,Type_of_vehicle_motorcycle,Time_taken (min),Type_of_order_Snack,Type_of_order_Drinks,Type_of_vehicle_scooter,multiple_deliveries,Weather conditions_Stormy,Weather conditions_Sandstorms,Type_of_order_Buffet,Weather conditions_Windy,Weather conditions_Sunny,Type_of_order_Meal,Delivery_location_latitude,Delivery_person_deliveries,Restaurant_latitude,Weather conditions_Cloudy,Time_to_pick,Delivery_person_Age,Festival,Delivery_person_Ratings,Restaurant_longitude,Delivery_location_longitude,City_Semi-Urban
0,1.452895,24.402912,0.057211,0.212402,0.787256,0.157246,0.93902,0.604317,17.276807,0.245289,0.250771,0.338472,0.771497,0.182939,0.176088,0.25591,0.162384,0.158616,0.24803,18.84696,48.311408,18.650805,0.162727,9.825283,29.721823,0.018157,4.627749,76.890224,76.954965,0.000343
1,0.363636,9.191893,0.909091,1.0,0.0,0.909091,2.0,0.0,9.363636,0.818182,0.0,0.090909,0.636364,0.0,0.0,0.090909,0.0,0.0,0.090909,17.175034,52.909091,17.115034,0.090909,10.454545,28.818182,0.0,4.672727,76.940557,77.000557,0.0
