In [None]:
!pip install sentence_transformers

In [135]:
from sentence_transformers import  SentenceTransformer
import numpy as np
import json

# Sentence-embedding model shared by the helper functions in this notebook;
# both model.encode(...) and model.similarity(...) are called on this object.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [136]:
def model_similarity(vec1, vec2):
    """Return the model's similarity score between two embedding vectors.

    The two vectors are stacked into a two-row matrix, the model's pairwise
    similarity of that matrix with itself is computed, and the off-diagonal
    entry (row 0 against row 1) is returned via ``.item()``.
    """
    stacked = np.vstack((np.array(vec1), np.array(vec2)))
    pairwise = model.similarity(stacked, stacked)
    # Entry [0][1] compares the first vector with the second one.
    first_vs_second = pairwise[0][1]
    return first_vs_second.item()

def binarize_vector(vec, thresh=0.0):
    """Convert a numeric vector into an integer array of 0s and 1s.

    Entries greater than or equal to ``thresh`` become 1; all others become 0.
    """
    at_or_above = np.greater_equal(np.array(vec), thresh)
    return at_or_above.astype(int)

def xor_difference(vec1, vec2, thresh=0.0):
    """Binarize both vectors at ``thresh`` and return their element-wise XOR.

    The result holds 1 wherever exactly one of the two inputs is at or above
    ``thresh`` and 0 everywhere else.
    """
    first_bits, second_bits = (binarize_vector(v, thresh) for v in (vec1, vec2))
    return np.bitwise_xor(first_bits, second_bits)

def get_child_key(level):
    """Return the dict key under which an entity at ``level`` stores its children.

    Gives ``None`` for any level without an entry (for example ``'subject'``).
    """
    children_key_by_level = dict(
        univ='faculties',
        faculty='departments',
        dep='specialities',
        speciality='subjects',
    )
    return children_key_by_level.get(level, None)

def next_level(level):
    """
    Return the name of the level directly beneath ``level``.

    The hierarchy runs univ -> faculty -> dep -> speciality -> subject;
    ``None`` is returned for 'subject' and for any unrecognised level.
    """
    hierarchy = ('univ', 'faculty', 'dep', 'speciality', 'subject')
    # Pair every level with the one that follows it in the hierarchy.
    successor = dict(zip(hierarchy, hierarchy[1:]))
    return successor.get(level)

def compare_entities(ent1, ent2, level, sim_thresh=0.4, bin_thresh=0.0):
    """
    Recursively compare two hierarchy entities by the similarity of their names.

    Returns a dict with 'level', 'name1', 'name2' and 'similarity'. At the
    'subject' level a 'binary' flag is added (1 when the similarity exceeds
    ``sim_thresh``, otherwise 0). At other levels, when both entities carry
    the child list for this level, 'children_comparison' holds the comparison
    of every child of ``ent1`` against every child of ``ent2``; nothing is
    filtered out. ``bin_thresh`` is only forwarded to the recursive calls.
    """
    # Embed both names in one call, then score the pair.
    vec_a, vec_b = model.encode([ent1['name'], ent2['name']]).tolist()
    score = model_similarity(vec_a, vec_b)

    result = dict(
        level=level,
        name1=ent1.get('name'),
        name2=ent2.get('name'),
        similarity=score,
    )

    # Leaf nodes only get the thresholded flag; there is nothing to descend into.
    if level == 'subject':
        result['binary'] = int(score > sim_thresh)
        return result

    child_key = get_child_key(level)
    if not child_key or child_key not in ent1 or child_key not in ent2:
        return result

    # All-pairs comparison of the children, one level further down.
    child_level = next_level(level)
    result['children_comparison'] = [
        compare_entities(left, right, child_level, sim_thresh, bin_thresh)
        for left in ent1[child_key]
        for right in ent2[child_key]
    ]
    return result


def extract_subjects_recursive(data, sim_thresh=0.4):
    """Flatten a nested comparison result into a list of subject-level records.

    Walks dicts and lists depth-first. Every dict whose 'level' is "subject"
    contributes a record with 'name1', 'name2', 'similarity' (0 when absent)
    and 'binary' (1 when the similarity exceeds ``sim_thresh``, otherwise 0).
    Values of any other type are ignored.
    """
    def walk(node):
        if isinstance(node, dict):
            if node.get("level") == "subject":
                score = node.get("similarity", 0)
                yield {
                    "name1": node.get("name1"),
                    "name2": node.get("name2"),
                    "similarity": score,
                    "binary": int(score > sim_thresh),
                }
            # Keep descending even below a subject node, into every value.
            for value in node.values():
                yield from walk(value)
        elif isinstance(node, list):
            for element in node:
                yield from walk(element)

    return list(walk(data))



def return_mismatch(ent1, ent2, similarity_threshold=0.5):
    """Return the subject names of ``ent1`` that have no similar subject in ``ent2``.

    The two entities are compared from the "univ" level down, the subject-level
    pair records are collected, and for every subject name on the ``ent1`` side
    the highest similarity over all of its pairs is tracked. A name is reported
    when that best similarity does not exceed ``similarity_threshold``.

    Args:
        ent1: University dict whose subjects are checked for a counterpart.
        ent2: University dict searched for counterparts.
        similarity_threshold: Similarity a pair must exceed to count as a
            match. It is passed to both helper calls and used for the final
            filter, so the value supplied by the caller takes effect.

    Returns:
        set: ``name1`` values whose best similarity is <= ``similarity_threshold``.
    """
    comparison_result = compare_entities(
        ent1, ent2, "univ", sim_thresh=similarity_threshold, bin_thresh=0.0
    )
    subject_pairs = extract_subjects_recursive(
        comparison_result, sim_thresh=similarity_threshold
    )

    # Best similarity seen for each ent1-side subject name across all its pairs.
    best_similarity = {}
    for pair in subject_pairs:
        name = pair["name1"]
        sim = pair["similarity"]
        if name not in best_similarity or sim > best_similarity[name]:
            best_similarity[name] = sim

    # If even the best pair is at or below the threshold, every pair for that
    # name is unmatched, so the name is a mismatch.
    return {
        name
        for name, best in best_similarity.items()
        if best <= similarity_threshold
    }

In [137]:
# NOTE(review): univ1 and univ2 are not defined in any cell visible here; this
# call presumably relies on variables left over in the kernel — confirm, or
# define them above so the notebook survives Restart & Run All.
return_mismatch(univ1, univ2, similarity_threshold = 0.5)

{'kiram'}

In [138]:
import json

# Load the university records from the JSON file into `data`.
# NOTE(review): the absolute path looks Colab-specific ("/content/...") —
# confirm, and consider making it a configurable constant.
with open('/content/UnivData.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# print(data)

In [140]:
# Split the loaded records into Algerian universities and all others, using a
# case-insensitive match on the 'country' field. The `foriegn` spelling is kept
# because a later cell refers to the list by that name.
algerian, foriegn = [], []
for univ in data:
    bucket = algerian if univ['country'].lower() == "algeria" else foriegn
    bucket.append(univ)

In [143]:
# Peek at the first Algerian record to check the nested
# faculties -> departments -> specialities -> subjects layout.
algerian[0]

{'name': 'University of Algiers 1',
 'country': 'Algeria',
 'faculties': [{'name': 'Faculty of Science',
   'departments': [{'name': 'Department of Mathematics',
     'specialities': [{'name': 'Pure Mathematics',
       'subjects': [{'name': 'Real Analysis'},
        {'name': 'Abstract Algebra'},
        {'name': 'Topology'},
        {'name': 'Complex Analysis'},
        {'name': 'Differential Geometry'}]},
      {'name': 'Applied Mathematics',
       'subjects': [{'name': 'Numerical Methods'},
        {'name': 'Mathematical Modeling'},
        {'name': 'Optimization'},
        {'name': 'Statistics'},
        {'name': 'Computational Mathematics'}]}]},
    {'name': 'Department of Physics',
     'specialities': [{'name': 'Theoretical Physics',
       'subjects': [{'name': 'Quantum Mechanics'},
        {'name': 'General Relativity'},
        {'name': 'Statistical Mechanics'},
        {'name': 'Field Theory'},
        {'name': 'Mathematical Methods'}]},
      {'name': 'Experimental Physics

In [144]:
# Compare every Algerian university with every foreign one. The foreign
# university is passed first, and return_mismatch collects names from its
# first argument, so each stored set lists foreign-side subject names.
# NOTE(review): results[alg['name']] is overwritten on every inner iteration,
# so only the result for the last foreign university is kept per Algerian
# university — confirm whether results should be kept per pair or merged.
results = dict()


for alg in algerian:
  for forg in foriegn:
    mismatch = return_mismatch(forg, alg, similarity_threshold = 0.4)
    results[alg['name']] = mismatch

In [145]:
# Display the mismatch sets, keyed by Algerian university name.
results

{'University of Algiers 1': {'Bridge Engineering',
  'Concrete Structures',
  'Data Structures',
  'Earthquake Geotechnics',
  'Foundation Engineering',
  'Ground Improvement',
  'Knowledge Representation',
  'Robotics',
  'Seismic Design',
  'Slope Stability',
  'Soil Mechanics',
  'Steel Structures'},
 'University of Oran 2': {'Approximation Algorithms',
  'Computational Complexity',
  'Computer Vision',
  'Graph Algorithms',
  'Ground Improvement',
  'Knowledge Representation',
  'Machine Learning',
  'Natural Language Processing',
  'Randomized Algorithms',
  'Robotics',
  'Slope Stability',
  'Soil Mechanics'},
 'University of Constantine 1': {'Approximation Algorithms',
  'Bridge Engineering',
  'Computational Complexity',
  'Computer Vision',
  'Concrete Structures',
  'Data Structures',
  'Earthquake Geotechnics',
  'Foundation Engineering',
  'Graph Algorithms',
  'Ground Improvement',
  'Knowledge Representation',
  'Machine Learning',
  'Natural Language Processing',
  'Rand