# Part 2 — Finding document duplicates and near-duplicates using WebSty

## Demo dataset



## Uploading the dataset

## Getting the similarity matrix

In [None]:
import json

import numpy as np
import requests

# Address of the REST service and the path of the "cluto" request whose
# results are downloaded below.
url_base = "http://ws.clarin-pl.eu/nlprest2/base"
request = "/requests/cluto/81f393ce-6544-415e-9bef-038b574039a5"

# File names noted by the author -- presumably everything that can be fetched
# from the request's download path (TODO confirm against the service):
# clusters.json  clutoout.txt  data.json  distance.json  labels.json
# matrix.txt  result.clustering  result.json  result.png  result.xlsx
# rowlabels.pkl  similarity.json  weighted.json

url = url_base + "/download" + request + "/similarity.json"
print(url)

# Bound the wait so a stalled server cannot hang the cell, and fail loudly on
# an HTTP error instead of passing an error page on as if it were the JSON.
response = requests.get(url, timeout=60)
response.raise_for_status()

# "utf-8-sig" drops a leading byte-order mark, if present, so the text can be
# handed straight to the JSON parser.
data = response.content.decode("utf-8-sig")
print("Data size: %d" % len(data))

In [None]:
# Decode the downloaded text; the payload provides the row labels and the
# pairwise similarity array.
parsed = json.loads(data)
rowlabels = parsed["rowlabels"]
similarities = parsed["arr"]

# One (similarity, row, column) entry per pair below the diagonal with a
# positive score: each unordered pair appears once and self-pairs are skipped.
document_similarity = [
    (similarities[x][y], x, y)
    for x in range(len(rowlabels))
    for y in range(x)
    if similarities[x][y] > 0.0
]

# Most similar pairs first.
document_similarity_sorted = sorted(
    document_similarity, key=lambda entry: entry[0], reverse=True
)

# Show the 20 highest-scoring pairs.
for score, row, col in document_similarity_sorted[:20]:
    print("%6.4f %5d %5d" % (score, row, col))

## Postprocessing the similarity matrix

In [None]:
import numpy as np

def create_neighbourhood_matrix(similarities, threshold):
    """Build a 0/1 matrix flagging pairs whose similarity reaches *threshold*.

    Args:
        similarities: Square table of pairwise similarity scores (nested
            lists or a 2-D array).
        threshold: Minimum score, inclusive, for a pair to be flagged.

    Returns:
        A float numpy array of shape ``(n, n)`` with ``n = len(similarities)``
        that holds 1 where ``similarities[x][y] >= threshold`` and 0
        elsewhere.
    """
    # Size the result from the argument itself rather than from module-level
    # state, so the function works for any square input.
    size = len(similarities)
    matrix = np.zeros(shape=(size, size))
    if size:
        # One vectorised comparison replaces the element-by-element loop;
        # only the leading ``size`` columns are considered.
        scores = np.asarray(similarities, dtype=float)[:, :size]
        matrix[scores >= threshold] = 1
    return matrix

# Flag every pair whose similarity is at least 0.7.
matrix = create_neighbourhood_matrix(similarities, threshold=0.7)

In [None]:
def get_row_index_with_highest_sum(matrix):
    """Return the index of the row whose entries add up to the largest total.

    When several rows share the largest total, the lowest index is returned.
    An empty matrix raises ``ValueError``.
    """
    row_totals = [np.sum(row) for row in matrix]
    largest = max(row_totals)
    return row_totals.index(largest)
 
def add_into_group(matrix, ind):
    """Collect every index reachable from row *ind* through entries equal to 1.

    The group starts with the columns holding 1 in row ``ind`` and then keeps
    absorbing the columns holding 1 in the row of each member, until no new
    index turns up.

    Args:
        matrix: Square 2-D numpy array; an entry equal to 1 links its row
            index to its column index.
        ind: Index of the row to start from.

    Returns:
        A list of plain ``int`` indices in discovery order (columns of the
        starting row first, in ascending order). ``ind`` itself is included
        only if some visited row links to it.
    """
    size = len(matrix)

    def linked_columns(row):
        # Ascending column indices holding exactly 1 in ``row``; converted to
        # plain ints so the result does not carry numpy scalar types.
        return np.flatnonzero(matrix[row, :size] == 1).tolist()

    indexes = linked_columns(ind)
    seen = set(indexes)  # O(1) membership checks instead of scanning the list

    # ``indexes`` doubles as the work queue: ``cursor`` walks it once while
    # newly discovered members are appended behind it, so every member's row
    # is scanned exactly once.
    cursor = 0
    while cursor < len(indexes):
        for col in linked_columns(indexes[cursor]):
            if col not in seen:
                seen.add(col)
                indexes.append(col)
        cursor += 1
    return indexes
 
def reset_rows_and_cols(matrix, indexes):
    """Zero, in place, every row and column of *matrix* listed in *indexes*.

    Returns the same matrix object that was passed in.
    """
    positions = np.asarray(list(indexes), dtype=int)
    matrix[positions, :] = 0
    matrix[:, positions] = 0
    return matrix

def cluster_matrix(matrix):
    """Split the indices of a 0/1 neighbourhood matrix into groups.

    Repeatedly picks the row with the largest sum, gathers every index linked
    to it (directly or through a chain of entries equal to 1), records that
    group, and clears the group's rows and columns before looking for the
    next one.

    Args:
        matrix: Square 2-D array in which an entry equal to 1 links its row
            index to its column index. It is not modified.

    Returns:
        A list of groups, each a list of indices, in the order the groups
        were extracted.
    """
    # Work on a private copy: extraction zeroes rows and columns, and the
    # caller's matrix must stay usable (e.g. for running the grouping again).
    remaining = np.array(matrix, copy=True)
    groups = []
    while np.sum(remaining) > 0:
        seed = get_row_index_with_highest_sum(remaining)
        members = add_into_group(remaining, seed)
        if not members:
            # Positive total but no entry equal to 1 in the chosen row:
            # nothing would be cleared, so stop rather than loop forever.
            break
        groups.append(members)
        remaining = reset_rows_and_cols(remaining, members)
    return groups

# Group the indices of `matrix`: each group gathers indices linked, directly
# or through a chain, by entries equal to 1.
groups = cluster_matrix(matrix)



In [None]:
print("Number of distinct groups: %d" % len(groups))

# List only the groups that actually pair documents up, numbered by their
# 1-based position among all groups.
print("Groups with more than one element:")
for number, members in enumerate(groups, start=1):
    if len(members) <= 1:
        continue
    print("  {}) {}".format(number, sorted(members)))

[Back to agenda](agenda.ipynb)