## Fuzzy C-Means Clustering

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("country_wise_latest.csv")

In [3]:
df.head()

Unnamed: 0,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,Deaths / 100 Cases,Recovered / 100 Cases,Deaths / 100 Recovered,Confirmed last week,1 week change,1 week % increase,WHO Region
0,Afghanistan,36263,1269,25198,9796,106,10,18,3.5,69.49,5.04,35526,737,2.07,Eastern Mediterranean
1,Albania,4880,144,2745,1991,117,6,63,2.95,56.25,5.25,4171,709,17.0,Europe
2,Algeria,27973,1163,18837,7973,616,8,749,4.16,67.34,6.17,23691,4282,18.07,Africa
3,Andorra,907,52,803,52,10,0,0,5.73,88.53,6.48,884,23,2.6,Europe
4,Angola,950,41,242,667,18,1,0,4.32,25.47,16.94,749,201,26.84,Africa


#### Initial Variable

In [4]:
MAX_CLUSTER = 5
MAX_ITERATION = 100
MIN_ERROR = 0.00001
FIRST_ITERATION = 1

In [5]:
def train_data(cols: list):
    return df[cols]

In [6]:
def create_random(rows, max_cluster):
    np.random.seed(0)
    return np.random.dirichlet(np.ones(max_cluster),size=rows)

### Select Data for Clustering

In [7]:
label = "Country/Region"
columns = ["New cases", "New deaths", "New recovered"]
train_data = df[columns]

In [8]:
c_df = pd.DataFrame(create_random(len(train_data), MAX_CLUSTER))

In [9]:
c_colname = ["C" + str(i + 1) for i in range(MAX_CLUSTER)]
x_colname = ["X" + str(i + 1) for i in range(len(columns))]
v_colname = ["V" + str(i + 1) for i in range(MAX_CLUSTER)]
l_colname = ["L" + str(i + 1) for i in range(MAX_CLUSTER)]
cluster_name = ["CLUSTER_" + str(i + 1) for i in range(MAX_CLUSTER)]

#### Random Number

In [10]:
c_df.columns = c_colname
train_data.columns = x_colname
c_df.head()

Unnamed: 0,C1,C2,C3,C4,C5
0,0.184517,0.291178,0.214042,0.182506,0.127756
1,0.135961,0.075372,0.2912,0.434132,0.063334
2,0.268972,0.129038,0.143912,0.445445,0.012633
3,0.016742,0.003751,0.32827,0.276536,0.374701
4,0.498615,0.208154,0.080257,0.196651,0.016322


In [11]:
result = pd.DataFrame()
result["label"] = df[label]
for i, c in enumerate(c_colname):
    result[cluster_name[i]] = np.where(c_df[c] == c_df.max(axis=1), "1", "")

In [12]:
pd.set_option('max_rows', None)
result.head(187)

Unnamed: 0,label,CLUSTER_1,CLUSTER_2,CLUSTER_3,CLUSTER_4,CLUSTER_5
0,Afghanistan,,1.0,,,
1,Albania,,,,1.0,
2,Algeria,,,,1.0,
3,Andorra,,,,,1.0
4,Angola,1.0,,,,
5,Antigua and Barbuda,,,1.0,,
6,Argentina,,1.0,,,
7,Armenia,,,,1.0,
8,Australia,,,1.0,,
9,Austria,1.0,,,,


In [13]:
c_df.head()

Unnamed: 0,C1,C2,C3,C4,C5
0,0.184517,0.291178,0.214042,0.182506,0.127756
1,0.135961,0.075372,0.2912,0.434132,0.063334
2,0.268972,0.129038,0.143912,0.445445,0.012633
3,0.016742,0.003751,0.32827,0.276536,0.374701
4,0.498615,0.208154,0.080257,0.196651,0.016322


#### Cluster

In [14]:

all_cluster = {}

div = []

for i, cluster in enumerate(cluster_name):
    all_cluster[cluster] = {}
    all_cluster[cluster]["DATA"] = pd.DataFrame()
    
    all_cluster[cluster]["DATA"][c_colname[i] + "^2"] = np.power(c_df[c_colname[i]], 2)
    all_cluster[cluster]["SUM_" + c_colname[i] + "^2"] = np.sum(all_cluster[cluster]["DATA"][c_colname[i] + "^2"])
    div.append([])
    for j, x in enumerate(x_colname):
        all_cluster[cluster]["DATA"][c_colname[i] + "*" + x] = all_cluster[cluster]["DATA"][c_colname[i] + "^2"] * train_data[x]
        
        all_cluster[cluster]["SUM_" + c_colname[i] + "*" + x] = np.sum(all_cluster[cluster]["DATA"][c_colname[i] + "*" + x])
        
        all_cluster[cluster]["SUM_" + c_colname[i] + "*" + x + "_DIV_" + "SUM_" + c_colname[i] + "^2"] = all_cluster[cluster]["SUM_" + c_colname[i] + "*" + x] / all_cluster[cluster]["SUM_" + c_colname[i] + "^2"]
        
        div[i].append(all_cluster[cluster]["SUM_" + c_colname[i] + "*" + x + "_DIV_" + "SUM_" + c_colname[i] + "^2"])

In [15]:
all_cluster["CLUSTER_1"]["DATA"].head()

Unnamed: 0,C1^2,C1*X1,C1*X2,C1*X3
0,0.034047,3.608944,0.340466,0.61284
1,0.018485,2.162789,0.110912,1.164579
2,0.072346,44.564978,0.578766,54.186962
3,0.00028,0.002803,0.0,0.0
4,0.248617,4.475109,0.248617,0.0


In [16]:
all_cluster["CLUSTER_2"]["DATA"].head()

Unnamed: 0,C2^2,C2*X1,C2*X2,C2*X3
0,0.084785,8.987157,0.847845,1.526121
1,0.005681,0.66467,0.034086,0.357899
2,0.016651,10.256974,0.133207,12.471548
3,1.4e-05,0.000141,0.0,0.0
4,0.043328,0.779903,0.043328,0.0


In [17]:
all_cluster["CLUSTER_3"]["DATA"].head()

Unnamed: 0,C3^2,C3*X1,C3*X2,C3*X3
0,0.045814,4.856287,0.45814,0.824652
1,0.084798,9.921332,0.508786,5.342256
2,0.020711,12.757842,0.165686,15.512376
3,0.107761,1.077613,0.0,0.0
4,0.006441,0.115943,0.006441,0.0


In [18]:
all_cluster["CLUSTER_4"]["DATA"].head()

Unnamed: 0,C4^2,C4*X1,C4*X2,C4*X3
0,0.033309,3.530713,0.333086,0.599555
1,0.188471,22.051099,1.130826,11.873669
2,0.198421,122.227398,1.587369,148.617404
3,0.076472,0.764719,0.0,0.0
4,0.038672,0.696091,0.038672,0.0


In [19]:
all_cluster["CLUSTER_5"]["DATA"].head()

Unnamed: 0,C5^2,C5*X1,C5*X2,C5*X3
0,0.016322,1.730098,0.163217,0.29379
1,0.004011,0.469313,0.024067,0.252707
2,0.00016,0.098303,0.001277,0.119528
3,0.140401,1.404012,0.0,0.0
4,0.000266,0.004796,0.000266,0.0


In [20]:
# print(all_cluster["CLUSTER_2"]["SUM_C2*X2_DIV_SUM_C2^2"])

#### Pusat Cluster

In [21]:
pusat_cluster = pd.DataFrame(np.array(div))
pusat_cluster.columns = x_colname
pusat_cluster.index = cluster_name
pusat_cluster.head()

Unnamed: 0,X1,X2,X3
CLUSTER_1,3419.390481,58.529735,2317.235424
CLUSTER_2,1154.220716,34.099551,794.405869
CLUSTER_3,524.323202,12.084287,484.056412
CLUSTER_4,1206.833865,30.14561,860.561005
CLUSTER_5,543.697542,19.099963,597.000924


In [22]:
cluster = {}
for i, c in enumerate(cluster_name):
    cluster[c] = pd.DataFrame()
    for x in x_colname:
        cluster[c][ "(" + x + "-" + v_colname[i] + ")^2"] = np.power(train_data[x] - pusat_cluster["X1"][i], 2)
    cluster[c]["SUM"] = cluster[c].sum(axis=1)

In [23]:
cluster["CLUSTER_2"].head()

Unnamed: 0,(X1-V2)^2,(X2-V2)^2,(X3-V2)^2,SUM
0,1098767.0,1309241.0,1290998.0,3699005.0
1,1075827.0,1318411.0,1190763.0,3585000.0
2,289681.5,1313822.0,164203.8,1767707.0
3,1309241.0,1332225.0,1332225.0,3973692.0
4,1290998.0,1329918.0,1332225.0,3953141.0


#### Objective Function

In [24]:
obj_function = np.power(c_df, 2)
obj_function.columns = [c + "^2" for c in c_colname]

In [25]:
for i, l in enumerate(l_colname):
    obj_function[l] = np.abs(cluster[cluster_name[i]]["SUM"]) * obj_function[obj_function.columns[i]]
obj_function["TOTAL"] = obj_function[l_colname].sum(axis=1)

In [26]:
obj_function.head()

Unnamed: 0,C1^2,C2^2,C3^2,C4^2,C5^2,L1,L2,L3,L4,L5,TOTAL
0,0.034047,0.084785,0.045814,0.033309,0.016322,1163440.0,313618.323622,31881.331834,135152.104781,12286.469003,1656379.0
1,0.018485,0.005681,0.084798,0.188471,0.004011,625219.4,20366.160225,54897.210193,742216.586709,2816.918957,1445516.0
2,0.072346,0.016651,0.020711,0.198421,0.00016,1926391.0,29433.974079,6740.820201,396028.504158,53.356393,2358648.0
3,0.00028,1.4e-05,0.107761,0.076472,0.140401,9812.224,55.91392,87756.26776,332294.11248,122998.060016,552916.6
4,0.248617,0.043328,0.006441,0.038672,0.000266,8688445.0,171281.459843,5186.147303,167209.252362,230.847181,9032353.0


In [27]:
TOTAL = np.sum(obj_function["TOTAL"])
LAST_ERROR = np.abs(TOTAL-0)

print(TOTAL)
print(LAST_ERROR)

3924558652.92076
3924558652.92076


#### Matrix Partition U

In [28]:
# matrix_partition = np.power(cluster["CLUSTER_1"][cluster["CLUSTER_1"].columns[0:len(x_colname)]].sum(axis=1), -1)

In [29]:
matrix_partition = pd.DataFrame()
for i, c in enumerate(cluster_name):
    matrix_partition[l_colname[i]] = np.power(cluster[c][cluster[c].columns[0:len(x_colname)]].sum(axis=1), -1)
matrix_partition["LT"] = matrix_partition.sum(axis=1)

In [30]:
matrix_partition.head()

Unnamed: 0,L1,L2,L3,L4,L5,LT
0,2.926376e-08,2.70343e-07,1e-06,2.464528e-07,1e-06,3e-06
1,2.956623e-08,2.7894e-07,2e-06,2.539298e-07,1e-06,4e-06
2,3.755506e-08,5.657045e-07,3e-06,5.010273e-07,3e-06,7e-06
3,2.856457e-08,2.516551e-07,1e-06,2.301333e-07,1e-06,3e-06
4,2.861469e-08,2.529634e-07,1e-06,2.312775e-07,1e-06,3e-06


#### Update C

In [31]:
for i, c in enumerate(c_colname):
    c_df[c] = matrix_partition[l_colname[i]]/matrix_partition["LT"]

In [32]:
c_df.head()

Unnamed: 0,C1,C2,C3,C4,C5
0,0.008837,0.081638,0.433947,0.074423,0.401155
1,0.008373,0.078996,0.437449,0.071913,0.40327
2,0.00524,0.078925,0.428656,0.069901,0.417278
3,0.009919,0.087386,0.426404,0.079913,0.396378
4,0.009837,0.08696,0.426962,0.079505,0.396736


#### Cluster Result

In [33]:
result = pd.DataFrame()
result["label"] = df[label]
for i, c in enumerate(c_colname):
    result[cluster_name[i]] = np.where(c_df[c] == c_df.max(axis=1), "1", "")

In [34]:
pd.set_option('max_rows', None)
result.head(187)

Unnamed: 0,label,CLUSTER_1,CLUSTER_2,CLUSTER_3,CLUSTER_4,CLUSTER_5
0,Afghanistan,,,1.0,,
1,Albania,,,1.0,,
2,Algeria,,,1.0,,
3,Andorra,,,1.0,,
4,Angola,,,1.0,,
5,Antigua and Barbuda,,,1.0,,
6,Argentina,1.0,,,,
7,Armenia,,,1.0,,
8,Australia,,,1.0,,
9,Austria,,,1.0,,
