### Dissimilarity Matrix
---

In [24]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.spatial import distance_matrix

data = pd.read_csv('garments_worker_productivity.csv')
data.head()
df = pd.DataFrame(data)

# Filter rows with non-null and non-zero values
subset = pd.DataFrame(df[df.notna()].head(10))
print(len(subset))
subset


10


Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity
0,1/1/2015,Quarter1,sweing,Thursday,8,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725
1,1/1/2015,Quarter1,finishing,Thursday,1,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865
2,1/1/2015,Quarter1,sweing,Thursday,11,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
3,1/1/2015,Quarter1,sweing,Thursday,12,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057
4,1/1/2015,Quarter1,sweing,Thursday,6,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382
5,1/1/2015,Quarter1,sweing,Thursday,7,0.8,25.9,984.0,6720,38,0.0,0,0,56.0,0.800125
6,1/1/2015,Quarter1,finishing,Thursday,2,0.75,3.94,,960,0,0.0,0,0,8.0,0.755167
7,1/1/2015,Quarter1,sweing,Thursday,3,0.75,28.08,795.0,6900,45,0.0,0,0,57.5,0.753683
8,1/1/2015,Quarter1,sweing,Thursday,2,0.75,19.87,733.0,6000,34,0.0,0,0,55.0,0.753098
9,1/1/2015,Quarter1,sweing,Thursday,1,0.75,28.08,681.0,6900,45,0.0,0,0,57.5,0.750428


In [25]:
def numeric(column_name):
    data = subset[column_name]
    length = len(data)
    arr = []
    d = max(data) - min(data)
    for i in range(length):
        col = []  
        for j in range(length):
            dij = abs(data[i] - data[j]) / d
            col.append(dij)
        arr.append(col)

    return arr

M1 = numeric('targeted_productivity')
M1
# print(len(M1))
# print(len(M1[0]))


[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
 [1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
 [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0],
 [1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0]]

In [26]:
def nominal(column_name):
    data = subset[column_name]
    length = len(data)
    arr = []
    for i in range(length):
        col = []
        for j in range(length):
            if data[i] == data[j]:
                col.append(0)
            else:
                col.append(1)
        arr.append(col)

    return arr

# Test the nominal function
M2 = nominal('department')
M2
# print(len(M2))
# print(len(M2[0]))

[[0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 [1, 0, 1, 1, 1, 1, 0, 1, 1, 1],
 [0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 [1, 0, 1, 1, 1, 1, 0, 1, 1, 1],
 [0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 1, 0, 0, 0],
 [0, 1, 0, 0, 0, 0, 1, 0, 0, 0]]

In [27]:
def ordinal(column_name):
    data = subset[column_name]
    arr = []
    r = data.rank(method='dense')
    max_r = max(r)
    for i in range(len(data)):
        dij = (r - 1) / (max_r - 1)
        arr.append(dij.tolist())
    return arr

M3 = ordinal('incentive')
M3
# print(len(M3))
# print(len(M3[0]))

[[1.0, 0.0, 0.8, 0.8, 0.8, 0.4, 0.0, 0.6, 0.2, 0.6],
 [1.0, 0.0, 0.8, 0.8, 0.8, 0.4, 0.0, 0.6, 0.2, 0.6],
 [1.0, 0.0, 0.8, 0.8, 0.8, 0.4, 0.0, 0.6, 0.2, 0.6],
 [1.0, 0.0, 0.8, 0.8, 0.8, 0.4, 0.0, 0.6, 0.2, 0.6],
 [1.0, 0.0, 0.8, 0.8, 0.8, 0.4, 0.0, 0.6, 0.2, 0.6],
 [1.0, 0.0, 0.8, 0.8, 0.8, 0.4, 0.0, 0.6, 0.2, 0.6],
 [1.0, 0.0, 0.8, 0.8, 0.8, 0.4, 0.0, 0.6, 0.2, 0.6],
 [1.0, 0.0, 0.8, 0.8, 0.8, 0.4, 0.0, 0.6, 0.2, 0.6],
 [1.0, 0.0, 0.8, 0.8, 0.8, 0.4, 0.0, 0.6, 0.2, 0.6],
 [1.0, 0.0, 0.8, 0.8, 0.8, 0.4, 0.0, 0.6, 0.2, 0.6]]

In [28]:
def check_nan(col_name):
    if subset[col_name].isnull().values.any() is True & subset[col_name].isna().values.any() is True & (subset[col_name] != 0): return 0 # if null value
    else: return 1 # no null value
delta_M1 = check_nan('targeted_productivity')
delta_M2 = check_nan('department')
delta_M3 = check_nan('incentive')
print(f"Delta of targeted_productivity: {delta_M1}, department: {delta_M2} and incentive: {delta_M3}")

Delta of targeted_productivity: 1, department: 1 and incentive: 1


In [29]:
# Dissimilarity Matrix
M1 = np.array(M1)
M2 = np.array(M2)
M3 = np.array(M3)

dij = ((delta_M1 * M1) + (delta_M2 * M2) + (delta_M3 * M3)) / (delta_M1 + delta_M2 + delta_M3)
print("Dissimilarity Matrix")
dij

Dissimilarity Matrix


array([[0.33333333, 0.66666667, 0.26666667, 0.26666667, 0.26666667,
        0.13333333, 0.66666667, 0.53333333, 0.4       , 0.53333333],
       [1.        , 0.        , 0.93333333, 0.93333333, 0.93333333,
        0.8       , 0.        , 0.53333333, 0.4       , 0.53333333],
       [0.33333333, 0.66666667, 0.26666667, 0.26666667, 0.26666667,
        0.13333333, 0.66666667, 0.53333333, 0.4       , 0.53333333],
       [0.33333333, 0.66666667, 0.26666667, 0.26666667, 0.26666667,
        0.13333333, 0.66666667, 0.53333333, 0.4       , 0.53333333],
       [0.33333333, 0.66666667, 0.26666667, 0.26666667, 0.26666667,
        0.13333333, 0.66666667, 0.53333333, 0.4       , 0.53333333],
       [0.33333333, 0.66666667, 0.26666667, 0.26666667, 0.26666667,
        0.13333333, 0.66666667, 0.53333333, 0.4       , 0.53333333],
       [1.        , 0.        , 0.93333333, 0.93333333, 0.93333333,
        0.8       , 0.        , 0.53333333, 0.4       , 0.53333333],
       [0.66666667, 0.33333333, 0.6      