# Find Duplicates
The following example shows how to identify entities that are likely to be duplicates of one another through the __ampligraph.discovery.find_duplicates__ API.

In [1]:
# Runtime configuration for TensorFlow. The environment variables are set
# BEFORE `import tensorflow` below; keep this ordering when editing the cell
# (presumably TensorFlow reads them at import/initialisation time — confirm
# against the TensorFlow documentation).
import sys
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'  # presumably restricts TensorFlow to GPU device 0 — confirm
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'  # presumably switches off oneDNN optimisations — confirm
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # presumably silences TensorFlow's native (C++) log output — confirm
import tensorflow as tf
# Only let the Python-side TensorFlow logger emit messages of level ERROR and above.
tf.get_logger().setLevel('ERROR')

In [2]:
import pandas as pd
import re
import numpy as np

# The IMDB dataset used here is part of the Movies5 dataset found on:
# The Magellan Data Repository (https://sites.google.com/site/anhaidgroup/projects/data)
import requests
import tarfile

url = 'http://pages.cs.wisc.edu/~anhai/data/784_data/movies5.tar.gz'

# Download the archive. Fail loudly on an HTTP error instead of silently saving
# an error page to disk and crashing later inside `tarfile`; the timeout keeps
# the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()

# The context manager guarantees the file is flushed and closed before it is
# re-opened for extraction below.
with open('movies5.tar.gz', 'wb') as archive_file:
    archive_file.write(response.content)

# Unpack into the current working directory (creates the `movies5/` folder
# that the next step reads from).
# NOTE(review): `extractall()` trusts the member paths stored in the archive;
# only use it on archives from a trusted source.
with tarfile.open('movies5.tar.gz', "r:gz") as tar:
    tar.extractall()

# Load the IMDB movie table and replace missing values with explicit
# placeholder strings, so every row can later be split and parsed as text.
imdb = pd.read_csv("movies5/csv_files/imdb.csv")

missing_value_placeholders = {
    "directors": "UnknownDirector",
    "actors": "UnknownActor",
    "genre": "UnknownGenre",
    "duration": "0",
}
for column_name, placeholder in missing_value_placeholders.items():
    imdb[column_name] = imdb[column_name].fillna(placeholder)

# Creating knowledge graph triples from tabular dataset.
# Every movie row becomes a set of (subject, predicate, object) triples:
#   (ID<id>, hasDirector, <director>)   one per comma-separated director
#   (ID<id>, hasActor,    <actor>)      one per comma-separated actor
#   (ID<id>, hasGenre,    <genre>)      one per comma-separated genre
#   (ID<id>, hasDuration, Duration<n>)  n = numeric duration // DURATION_BIN_SIZE
DURATION_BIN_SIZE = 30  # durations are bucketed so similar lengths share one entity

imdb_triples = []

for _, row in imdb.iterrows():
    movie_id = "ID" + str(row["id"])

    # Keep only the digits of the duration text. The raw string avoids the
    # invalid "\D" escape-sequence warning; a value without any digit falls
    # back to 0 (the same value used for missing durations) instead of
    # raising ValueError on int("").
    duration_digits = re.sub(r"\D", "", row["duration"])
    duration_value = int(duration_digits) if duration_digits else 0
    duration = "Duration" + str(duration_value // DURATION_BIN_SIZE)

    imdb_triples.extend((movie_id, "hasDirector", d) for d in row["directors"].split(","))
    imdb_triples.extend((movie_id, "hasActor", a) for a in row["actors"].split(","))
    imdb_triples.extend((movie_id, "hasGenre", g) for g in row["genre"].split(","))
    imdb_triples.append((movie_id, "hasDuration", duration))

# One row per triple, with columns (subject, predicate, object).
imdb_triples = np.array(imdb_triples)

In [3]:
from ampligraph.latent_features import ScoringBasedEmbeddingModel

# Step 1 - create the embedding model with the ComplEx scoring function.
model = ScoringBasedEmbeddingModel(scoring_type='ComplEx', k=300, eta=5)

# Step 2 - compile it with the Adam optimizer and the multiclass NLL loss.
model.compile(loss='multiclass_nll', optimizer='adam')

# Step 3 - fit on the IMDB triples. The call is left as the final expression
# of the cell so that the returned History object is still displayed.
model.fit(imdb_triples, epochs=50, batch_size=10000, verbose=False)


Metal device set to: Apple M1 Pro

systemMemory: 32.00 GB
maxCacheSize: 10.67 GB



<tensorflow.python.keras.callbacks.History at 0x16b2e90c0>

In [4]:
# Finding duplicate movies (entities)
from ampligraph.discovery import find_duplicates

# get the unique movies - in this case all subject entities are movies
entities = np.unique(imdb_triples[:, 0])

# find duplicate movies; `dups` is iterated below as a collection of groups
# of entity names that were judged to be duplicates of each other
dups, _ = find_duplicates(entities, model, mode='e', tolerance=0.45)

# Entity names were built as "ID" + <id>, so dropping the two-character
# prefix recovers the value of the `id` column.
id_list = [int(entity[2:]) for group in dups for entity in group]

# Look the movies up by their `id` VALUE. Positional `iloc` would only be
# correct if every `id` happened to equal its row position in the DataFrame.
movies_by_id = imdb.set_index("id")
print(movies_by_id.loc[id_list[:6], ['movie_name', 'year']])


              movie_name  year
2115           White Air  2007
2116           White Air  2007
2992  Chor Machaaye Shor  2002
2991  Chor Machaaye Shor  2002
5856    The Golden Lotus  1974
5857    The Golden Lotus  1974
