### DB Scan Clustering

Problem Statement: Cluster the Following Wholesale Data

Dataset--> Wholesale customers data.csv

In [1]:
#Loading the Required libraries
from sklearn.cluster import DBSCAN  # for DBScan
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#Loading the Dataset
df = pd.read_csv("C:/Users/Akaash/Downloads/Wholesale customers data.csv");
df.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


In [3]:
#Checking For null values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Channel           440 non-null    int64
 1   Region            440 non-null    int64
 2   Fresh             440 non-null    int64
 3   Milk              440 non-null    int64
 4   Grocery           440 non-null    int64
 5   Frozen            440 non-null    int64
 6   Detergents_Paper  440 non-null    int64
 7   Delicassen        440 non-null    int64
dtypes: int64(8)
memory usage: 27.6 KB


Inference: There are no NA Values in the Dataset

In [4]:
df['Channel'].var()

0.2190722716918627

In [5]:
df['Region'].var()

0.5994978256367766

Inference: There is very less Variabilty in 'Channel' & 'Region' column so Droping them 

In [6]:
#Droping
df.drop(['Channel','Region'], axis = 1 , inplace = True)
df

Unnamed: 0,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,12669,9656,7561,214,2674,1338
1,7057,9810,9568,1762,3293,1776
2,6353,8808,7684,2405,3516,7844
3,13265,1196,4221,6404,507,1788
4,22615,5410,7198,3915,1777,5185
...,...,...,...,...,...,...
435,29703,12051,16027,13135,182,2204
436,39228,1431,764,4510,93,2346
437,14531,15488,30243,437,14841,1867
438,10290,1981,2232,1038,168,2125


In [7]:
#Converting the df in array for standard scalar
array = df.values
array

array([[12669,  9656,  7561,   214,  2674,  1338],
       [ 7057,  9810,  9568,  1762,  3293,  1776],
       [ 6353,  8808,  7684,  2405,  3516,  7844],
       ...,
       [14531, 15488, 30243,   437, 14841,  1867],
       [10290,  1981,  2232,  1038,   168,  2125],
       [ 2787,  1698,  2510,    65,   477,    52]], dtype=int64)

In [8]:
#Standardizing the Value
stscaler = StandardScaler().fit(array)
X = stscaler.transform(array)
X

array([[ 0.05293319,  0.52356777, -0.04111489, -0.58936716, -0.04356873,
        -0.06633906],
       [-0.39130197,  0.54445767,  0.17031835, -0.27013618,  0.08640684,
         0.08915105],
       [-0.44702926,  0.40853771, -0.0281571 , -0.13753572,  0.13323164,
         2.24329255],
       ...,
       [ 0.20032554,  1.31467078,  2.34838631, -0.54337975,  2.51121768,
         0.12145607],
       [-0.13538389, -0.51753572, -0.60251388, -0.41944059, -0.56977032,
         0.21304614],
       [-0.72930698, -0.5559243 , -0.57322717, -0.62009417, -0.50488752,
        -0.52286938]])

### Building the DBScan Algorithms

In [9]:
dbscan = DBSCAN(eps=0.8, min_samples=7)
dbscan.fit(X)

DBSCAN(eps=0.8, min_samples=7)

Inference:eps is the epsilon and min_samples is the min_pts parameter

In [10]:
#Noisy samples are given the label -1.
dbscan.labels_

array([ 0,  0, -1,  0, -1,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,
       -1,  0,  0,  0,  0, -1, -1, -1,  0,  0,  0, -1,  0,  0,  0,  0, -1,
        0,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0, -1,  0, -1,  0, -1,  0,
        0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0,  0,  0, -1,  0,  0,
        0,  0,  0, -1,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
       -1, -1, -1,  0,  0,  0,  0, -1, -1,  0,  0,  0,  0,  0,  0, -1,  0,
        0, -1,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0, -1, -1,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0, -1,  0,  0,  0,  0,
        0, -1,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1,  0, -1,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0, -1, -1,  0,
        0,  0,  0,  0,  0, -1,  0, -1,  0,  0,  0,  0, -1,  0, -1,  0,  0,
        0,  0,  0,  0,  0

Inference: The -1 values are outliers and by industry standard noise/outliers must be betwwen 1% to 30%, in this case it can be less then 30%.

In [11]:
#Creating a DataFrame of these Cluster Value
cl=pd.DataFrame(dbscan.labels_,columns=['cluster'])
cl.head()

Unnamed: 0,cluster
0,0
1,0
2,-1
3,0
4,-1


In [12]:
#Appending cl Dataframe to the original Dataset
final = pd.concat([df,cl],axis =1)
final

Unnamed: 0,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,cluster
0,12669,9656,7561,214,2674,1338,0
1,7057,9810,9568,1762,3293,1776,0
2,6353,8808,7684,2405,3516,7844,-1
3,13265,1196,4221,6404,507,1788,0
4,22615,5410,7198,3915,1777,5185,-1
...,...,...,...,...,...,...,...
435,29703,12051,16027,13135,182,2204,-1
436,39228,1431,764,4510,93,2346,0
437,14531,15488,30243,437,14841,1867,-1
438,10290,1981,2232,1038,168,2125,0


In [13]:
#List of Outliers Datapoint
list(np.where(final.cluster==-1))

[array([  2,   4,  12,  17,  22,  23,  24,  28,  33,  39,  40,  45,  47,
         49,  56,  61,  65,  71,  77,  85,  86,  87,  92,  93, 100, 103,
        109, 125, 141, 142, 145, 163, 165, 171, 176, 181, 183, 196, 201,
        202, 209, 211, 216, 218, 228, 239, 251, 253, 254, 258, 259, 265,
        277, 282, 284, 304, 309, 312, 319, 325, 331, 333, 338, 343, 351,
        357, 358, 372, 376, 382, 384, 403, 409, 411, 413, 425, 427, 430,
        431, 435, 437], dtype=int64)]

Inference: It gives the index of the Outliers Datapoints.