In [10]:
import numpy as np
import pandas as pd
import itertools

# Q1

In [5]:
# Document Term matrix: one row per item, one column per term
doc_term = np.array([[3, 0, 2, 2, 0, 0, 4, 3],
                     [0, 0, 4, 3, 2, 0, 0, 2],
                     [2, 2, 0, 2, 2, 1, 0, 0],
                     [0, 1, 0, 2, 2, 0, 1, 0],
                     [0, 0, 0, 0, 0, 2, 0, 0],
                     [2, 1, 3, 4, 2, 2, 0, 2]])

# Item/Item matrix: entry (a, b) is the dot product of rows a and b of doc_term.
# Multiplying by the transpose produces every pairwise dot product in one step;
# the diagonal (an item compared with itself) is then reset to zero.
item_matrix = np.dot(doc_term, doc_term.T).astype(float)
np.fill_diagonal(item_matrix, 0.0)

print(item_matrix)

[[ 0. 20. 10.  8.  0. 26.]
 [20.  0. 10. 10.  0. 32.]
 [10. 10.  0. 10.  2. 20.]
 [ 8. 10. 10.  0.  0. 13.]
 [ 0.  0.  2.  0.  0.  4.]
 [26. 32. 20. 13.  4.  0.]]


In [8]:
# Item Relationship matrix: 1 where the Item/Item similarity is 8 or higher, 0 otherwise
relationship_matrix = (item_matrix >= 8).astype(int)
relationship_matrix

array([[0, 1, 1, 1, 0, 1],
       [1, 0, 1, 1, 0, 1],
       [1, 1, 0, 1, 0, 1],
       [1, 1, 1, 0, 0, 1],
       [0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0]])

In [12]:
# Item Relationship matrix: 1 where the Item/Item similarity is 11 or higher, 0 otherwise.
# Note: this rebinds relationship_matrix, replacing the threshold-8 version, so any
# cell executed after this one works on the threshold-11 matrix.
relationship_matrix = (item_matrix >= 11).astype(int)
relationship_matrix

array([[0, 1, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 0, 0]])

In [11]:
# Collect every group of 3 or more items in which each pair of members is
# related according to the currently bound relationship_matrix. Groups are
# listed by increasing size, and subsets of larger groups are included too.
n_items = len(relationship_matrix)
cliques = [
    members
    for size in range(3, n_items + 1)
    for members in itertools.combinations(range(n_items), size)
    if all(relationship_matrix[a][b] for a, b in itertools.combinations(members, 2))
]

# print the resulting cliques, numbered from 1
for number, members in enumerate(cliques, start=1):
    print(f"Clique {number}: {members}")

Clique 1: (0, 1, 2)
Clique 2: (0, 1, 3)
Clique 3: (0, 1, 5)
Clique 4: (0, 2, 3)
Clique 5: (0, 2, 5)
Clique 6: (0, 3, 5)
Clique 7: (1, 2, 3)
Clique 8: (1, 2, 5)
Clique 9: (1, 3, 5)
Clique 10: (2, 3, 5)
Clique 11: (0, 1, 2, 3)
Clique 12: (0, 1, 2, 5)
Clique 13: (0, 1, 3, 5)
Clique 14: (0, 2, 3, 5)
Clique 15: (1, 2, 3, 5)
Clique 16: (0, 1, 2, 3, 5)


# Q2

In [85]:
# Question 2

# Document Term matrix: one row per item, one column per term
doc_term = np.array([[3, 0, 2, 2, 0, 0, 4, 3],
                     [0, 0, 4, 3, 2, 0, 0, 2],
                     [2, 2, 0, 2, 2, 1, 0, 0],
                     [0, 1, 0, 2, 2, 0, 1, 0],
                     [0, 0, 0, 0, 0, 2, 0, 0],
                     [2, 1, 3, 4, 2, 2, 0, 2]])

# Existing clusters: cluster name -> 0-based row indices of its member items
clusters = {"CL1": [0], "CL2": [2], "CL3": [4]}

# Number of reassignment passes to run (fixed; there is no convergence check)
N_ITERATIONS = 5
SEPARATOR = '\n' + '-'*50 + '\n'


def compute_centroids(doc_term, clusters):
    """Return ``{cluster name: centroid}`` for the given membership.

    A centroid is the element-wise mean of the ``doc_term`` rows listed for
    the cluster. A cluster with no members gets an all-zero centroid instead
    of dividing by zero, so its similarity to every item is 0.
    """
    n_terms = doc_term.shape[1]
    return {
        name: doc_term[members].mean(axis=0) if len(members) else np.zeros(n_terms)
        for name, members in clusters.items()
    }


def assign_items(doc_term, centroids):
    """Assign each item to the cluster whose centroid it is most similar to.

    Similarity is the dot product between the centroid and the item's row.
    When several clusters tie, the one listed first in ``centroids`` wins
    (``argmax`` returns the first maximum). Returns
    ``{cluster name: ascending list of 0-based item indices}``.
    """
    names = list(centroids)
    # similarity[k][j] = centroid k . item j
    similarity = np.array([[np.dot(centroids[name], item) for item in doc_term]
                           for name in names])
    nearest = similarity.argmax(axis=0)
    return {name: np.flatnonzero(nearest == k).tolist()
            for k, name in enumerate(names)}


def print_centroids(centroids):
    """Print one ``name: vector`` line per cluster centroid."""
    for name, centroid in centroids.items():
        print(f"{name}: {centroid}")


def print_clusters(clusters):
    """Print one ``name: [items]`` line per cluster, using 1-based item numbers."""
    for name, members in clusters.items():
        print(f"{name}: {[item + 1 for item in members]}")


centroids = compute_centroids(doc_term, clusters)
# Individual names kept in case other cells refer to CL1/CL2/CL3 directly
CL1, CL2, CL3 = (centroids[name] for name in ("CL1", "CL2", "CL3"))

print(">>> Cluster Centroids before the start of the iterations:")
print_centroids(centroids)

print(">>> Clusters before the start of the iterations:")
print_clusters(clusters)

print(SEPARATOR)

for iteration in range(1, N_ITERATIONS + 1):
    # Recompute each centroid from the current membership
    centroids = compute_centroids(doc_term, clusters)
    CL1, CL2, CL3 = (centroids[name] for name in ("CL1", "CL2", "CL3"))

    print(f">>> Cluster Centroids at the start of iteration {iteration}")
    print_centroids(centroids)

    # Move every item to its most similar centroid
    clusters = assign_items(doc_term, centroids)

    print(f"\n>>> Clusters at the end of iteration {iteration}:")
    print_clusters(clusters)

    print(SEPARATOR)


>>> Cluster Centroids before the start of the iterations:
CL1: [3. 0. 2. 2. 0. 0. 4. 3.]
CL2: [2. 2. 0. 2. 2. 1. 0. 0.]
CL3: [0. 0. 0. 0. 0. 2. 0. 0.]
>>> Clusters before the start of the iterations:
CL1: [1]
CL2: [3]
CL3: [5]

--------------------------------------------------

>>> Cluster Centroids at the start of iteration 1
CL1: [3. 0. 2. 2. 0. 0. 4. 3.]
CL2: [2. 2. 0. 2. 2. 1. 0. 0.]
CL3: [0. 0. 0. 0. 0. 2. 0. 0.]

>>> Clusters at the end of iteration 1:
CL1: [1, 2, 6]
CL2: [3, 4]
CL3: [5]

--------------------------------------------------

>>> Cluster Centroids at the start of iteration 2
CL1: [1.66666667 0.33333333 3.         3.         1.33333333 0.66666667
 1.33333333 2.33333333]
CL2: [1.  1.5 0.  2.  2.  0.5 0.5 0. ]
CL3: [0. 0. 0. 0. 0. 2. 0. 0.]

>>> Clusters at the end of iteration 2:
CL1: [1, 2, 4, 6]
CL2: [3]
CL3: [5]

--------------------------------------------------

>>> Cluster Centroids at the start of iteration 3
CL1: [1.25 0.5  2.25 2.75 1.5  0.5  1.25 1.75]
CL2:

In [19]:
# New clusters: a list of three placeholder arrays, each a single all-zero row
# with as many columns as doc_term
new_CLs = [np.zeros((1, doc_term.shape[1])) for i in range(3)]

# Assign each doc_term point to its nearest cluster
# NOTE(review): no assignment code follows this comment in the cell as seen here;
# the rest of the cell appears to be missing — confirm against the notebook.


new_CL1: [[3. 0. 2. 2. 0. 0. 4. 3.]]
new_CL2: [[0. 0. 4. 3. 2. 0. 0. 2.]
 [2. 2. 0. 2. 2. 1. 0. 0.]
 [0. 1. 0. 2. 2. 0. 1. 0.]
 [2. 1. 3. 4. 2. 2. 0. 2.]]
new_CL3: [[0. 0. 0. 0. 0. 2. 0. 0.]]
