In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import re
import gensim
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
import transformers
from transformers import LlamaTokenizer, LlamaForCausalLM
from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer
import torch
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string
from transformers import BitsAndBytesConfig
import concurrent.futures
from math import log
from sklearn.decomposition import PCA
import torch.nn.functional as F

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
def text_clear(text):
    """Normalise a free-text field to lowercase ASCII words separated by single spaces.

    Steps, in order: fold accented letters to their base letter, drop
    parenthesised spans, split camelCase boundaries, lowercase, replace every
    character that is not a-z, 0-9 or whitespace with a space, and collapse
    runs of whitespace.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for example
            NaN or None) is treated as missing.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    import unicodedata  # local import so this cell does not depend on the import cell

    if not isinstance(text, str):
        return ""

    # Fold diacritics first ("Dún" -> "Dun"). Without this the ASCII-only filter
    # below would replace the accented letter with a space and split the word.
    decomposed = unicodedata.normalize('NFKD', text)
    text = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))

    # Remove parenthesised spans, e.g. "(Irish: ...)".
    text = re.sub(r'\([^)]*\)', '', text)

    # Insert a space at lower->upper boundaries so "PopulatedPlace" becomes two words.
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

    text = text.lower()

    # Anything other than lowercase letters, digits and whitespace becomes a space.
    text = re.sub(r"[^a-z0-9\s]", " ", text)

    # Collapse whitespace runs and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()

    return text

In [None]:
# Load the places CSV into `df`; the path is relative to the current working directory.
df = pd.read_csv('Irish_populated_places.csv')

In [None]:
# Strip the DBpedia category URI prefix from `subject` and turn underscores into
# spaces. Missing values are filled with '' first and the underscore replacement
# uses the vectorised `.str` accessor, so a NaN subject no longer raises
# AttributeError (a float has no `.replace`).
df['subject'] = df['subject'].fillna('').str.replace(
    'http://dbpedia.org/resource/Category:', '', regex=False
).str.replace('_', ' ', regex=False)

In [None]:
# Clean `broaderSubject` in place: missing values become '', the DBpedia category
# URI prefix is removed as a literal (regex=False), and underscores become spaces.
df['broaderSubject'] = df['broaderSubject'].fillna('').str.replace(
    'http://dbpedia.org/resource/Category:', '', regex=False
).str.replace('_', ' ')

In [None]:
# Clean `country` in place: missing values become '', the DBpedia resource URI
# prefix is removed as a literal (regex=False), and underscores become spaces.
df['country'] = df['country'].fillna('').str.replace(
    'http://dbpedia.org/resource/', '', regex=False
).str.replace('_', ' ')

In [None]:
# Clean `placeType` in place: missing values become '', the DBpedia resource URI
# prefix is removed as a literal (regex=False), and underscores become spaces.
df['placeType'] = df['placeType'].fillna('').str.replace(
    'http://dbpedia.org/resource/', '', regex=False
).str.replace('_', ' ')

In [None]:
# Clean `broaderPlaceType` in place: missing values become '' and the DBpedia
# ontology URI prefix is removed as a literal (regex=False). Unlike the other
# columns, underscores are not replaced here.
df['broaderPlaceType'] = df['broaderPlaceType'].fillna('').str.replace(
    'http://dbpedia.org/ontology/', '', regex=False
)

In [None]:
# Preview the first rows of `df`.
df.head()

Unnamed: 0,place,placeName,abstract,country,subject,broaderSubject,placeType,broaderPlaceType,population
0,"http://dbpedia.org/resource/Holywell,_Swords","Holywell, Swords",Holywell (Irish: Tobar Naofa) is a neighbourho...,Republic of Ireland,"Neighbourhoods in Swords, Dublin","Swords, Dublin","Swords, Dublin",Place,2479
1,"http://dbpedia.org/resource/Holywell,_Swords","Holywell, Swords",Holywell (Irish: Tobar Naofa) is a neighbourho...,Republic of Ireland,"Neighbourhoods in Swords, Dublin","Swords, Dublin","Swords, Dublin",Location,2479
2,"http://dbpedia.org/resource/Holywell,_Swords","Holywell, Swords",Holywell (Irish: Tobar Naofa) is a neighbourho...,Republic of Ireland,"Neighbourhoods in Swords, Dublin","Swords, Dublin","Swords, Dublin",PopulatedPlace,2479
3,"http://dbpedia.org/resource/Holywell,_Swords","Holywell, Swords",Holywell (Irish: Tobar Naofa) is a neighbourho...,Republic of Ireland,"Neighbourhoods in Swords, Dublin","Swords, Dublin","Swords, Dublin",Settlement,2479
4,"http://dbpedia.org/resource/Holywell,_Swords","Holywell, Swords",Holywell (Irish: Tobar Naofa) is a neighbourho...,Republic of Ireland,"Neighbourhoods in Swords, Dublin","Swords, Dublin","Swords, Dublin",Town,2479


In [None]:
# Run `text_clear` over every listed text column, overwriting each column of
# `df` in place with its cleaned version.
columns_to_clean = ['placeName', 'abstract', 'country', 'subject', 'broaderSubject', 'placeType', 'broaderPlaceType']
for col in columns_to_clean:
    df[col] = df[col].apply(text_clear)

In [None]:
def _join_unique(values):
    """Join a group's distinct, non-empty values with "; " in sorted order.

    Sorting makes the result independent of set iteration order, which varies
    between interpreter sessions for strings. Empty strings are skipped so they
    do not produce dangling "; " separators.
    """
    distinct = {v for v in values.dropna().astype(str) if v}
    return "; ".join(sorted(distinct))


# Collapse the rows of each `place` into one row: scalar attributes take the
# first value in the group, multi-valued attributes are merged by _join_unique.
agg_df = df.groupby("place", as_index=False).agg({
    "placeName": "first",
    "abstract": "first",
    "country": "first",
    "subject": _join_unique,
    "broaderSubject": _join_unique,
    "placeType": _join_unique,
    "broaderPlaceType": _join_unique,
    "population": "first"
})

In [None]:
# Build one text field per row: the place name followed by "label:value"
# segments for each attribute, concatenated left to right in a fixed order.
agg_df['text'] = (
    agg_df['placeName']
    + " abstract:" + agg_df['abstract']
    + " country:" + agg_df['country']
    + " subject:" + agg_df['subject']
    + " broaderSubject:" + agg_df['broaderSubject']
    + " placeType:" + agg_df['placeType']
    + " broaderPlaceType:" + agg_df['broaderPlaceType']
    + " population:" + agg_df['population'].astype(str)
)

In [None]:
# Preview the first rows of the aggregated frame, including the new `text` column.
agg_df.head()

Unnamed: 0,place,placeName,abstract,country,subject,broaderSubject,placeType,broaderPlaceType,population,text
0,http://dbpedia.org/resource/Abbeyleix,abbeyleix,abbeyleix is a town in county laois ireland lo...,republic of ireland,towns and villages in county laois; planned co...,towns and villages in the republic of ireland ...,,,1770,abbeyleix abstract:abbeyleix is a town in coun...
1,http://dbpedia.org/resource/Achill_Island,achill island,achill island in county mayo is the largest of...,republic of ireland,islands of county mayo; achill island; gaeltac...,wikipedia categories named after islands; land...,,,2569,achill island abstract:achill island in county...
2,http://dbpedia.org/resource/Achillbeg,achillbeg,acaill bheag is a small island in county mayo ...,republic of ireland,ghost towns in europe; achill island; islands ...,wikipedia categories named after islands; land...,,,1,achillbeg abstract:acaill bheag is a small isl...
3,http://dbpedia.org/resource/Aghamore,aghamore,aghamore is a townland in county leitrim irela...,republic of ireland,towns and villages in county leitrim,towns and villages in the republic of ireland ...,,,620,aghamore abstract:aghamore is a townland in co...
4,http://dbpedia.org/resource/Aglish,aglish,aglish is a village in west county waterford i...,republic of ireland,towns and villages in county waterford,towns and villages in the republic of ireland ...,,,333,aglish abstract:aglish is a village in west co...


MiniLM

In [None]:
# Load the all-MiniLM-L6-v2 sentence-transformers checkpoint once at module level;
# `miniLM_embeddings` below reads this global.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
def miniLM_embeddings(text):
  """Encode `text` with the global `model` and return the result as a float tensor.

  The encoder is asked for NumPy output with `normalize_embeddings=True`; the
  array is then copied into a `torch.float` tensor.
  """

  embeddings = model.encode(text, convert_to_numpy=True, normalize_embeddings=True)

  return torch.tensor(embeddings, dtype=torch.float)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Embed every row's `text` and store the resulting tensor in a new column.
# NOTE(review): `.apply` calls the encoder once per row; passing the whole list
# to the encoder in one call may be faster. Confirm before changing.
agg_df['text_embedding'] = agg_df['text'].apply(miniLM_embeddings)

In [None]:
# Map each `place` value to its embedding tensor.
text_emb_dict = dict(zip(agg_df['place'], agg_df['text_embedding']))

In [None]:
def compute_similar_places(entity_repr, top_k=10):
    """Return, for every entity, the other entities ranked by cosine similarity.

    Args:
        entity_repr: dict mapping an entity key to a 1-D embedding tensor. All
            tensors must have the same shape so they can be stacked.
        top_k: Maximum number of neighbours to return per entity.

    Returns:
        dict mapping each key to a list of other keys ordered by decreasing
        cosine similarity. The list holds ``top_k`` keys, or fewer when there
        are not that many other entities. An entity is never its own neighbour.
        An empty input yields an empty dict.
    """
    entities = list(entity_repr.keys())
    if not entities:
        return {}

    # torch.topk raises if k exceeds the row length, so clamp to the number of
    # *other* entities available.
    n_neighbours = max(0, min(top_k, len(entities) - 1))
    if n_neighbours == 0:
        return {e: [] for e in entities}

    all_embeddings = torch.stack([entity_repr[e] for e in entities])  # [N, D]

    # Cosine similarity as a matrix product of unit vectors: this needs only an
    # [N, N] result instead of broadcasting [N, 1, D] against [1, N, D].
    unit = F.normalize(all_embeddings, p=2, dim=-1, eps=1e-8)
    sims = unit @ unit.T  # [N, N]

    # Mask self-similarity so an entity can never be selected as its own neighbour.
    sims.fill_diagonal_(float('-inf'))

    neighbour_idx = torch.topk(sims, k=n_neighbours, dim=-1).indices.tolist()
    return {
        e: [entities[j] for j in row]
        for e, row in zip(entities, neighbour_idx)
    }

In [None]:
# Rank, for every place, its 10 most similar other places by embedding cosine similarity.
similar_places = compute_similar_places(text_emb_dict, top_k=10)

In [None]:
# Load the ground-truth pairs and group them into a dict that maps each
# `geographical_entity_1` value to the list of its `geographical_entity_2` values.
ground_truth = pd.read_csv("./ground_truth.csv")
y_true_dict = ground_truth.groupby('geographical_entity_1')['geographical_entity_2'].apply(list).to_dict()

In [None]:
def evluation(k, y_true_dict, similar_places_dict):
    """Score top-k predictions against ground truth, print and return the averages.

    For every key of ``y_true_dict`` the first ``k`` predictions are compared
    with the set of relevant items. A key is counted as invalid (and skipped)
    when it has no predictions, fewer than ``k`` predictions, or no relevant
    items. Relevant items are de-duplicated, and a prediction repeated in the
    ranking earns credit only at its first occurrence.

    Args:
        k: Cut-off rank.
        y_true_dict: Maps a place id to an iterable of relevant place ids.
        similar_places_dict: Maps a place id to its ranked list of predicted ids.

    Returns:
        dict with keys "MAP", "NDCG", "Recall", "Precision", "Hits" and "MRR".
        Each value is the mean over all scored places multiplied by 100, or NaN
        when no place could be scored.
    """
    invalid_places = []
    precisions, recalls, ndcgs, hits, map_scores, mrrs = [], [], [], [], [], []

    for pid, relevant in y_true_dict.items():
        predictions = similar_places_dict.get(pid)
        if predictions is None or len(predictions) < k:
            invalid_places.append(pid)
            continue

        pred_list = predictions[:k]
        if len(pred_list) == 0:
            continue

        # A set gives O(1) membership tests and stops duplicate ground-truth
        # rows from inflating the recall / MAP denominators.
        rel_set = set(relevant)
        if not rel_set:
            # Nothing to retrieve: every ratio below would divide by zero.
            invalid_places.append(pid)
            continue

        # is_hit[i] is True when the prediction at rank i is relevant and has
        # not already appeared earlier in the ranking.
        seen = set()
        is_hit = []
        for p in pred_list:
            is_hit.append(p in rel_set and p not in seen)
            seen.add(p)

        hit_num = float(sum(is_hit))

        # DCG with binary gains; rank i (0-based) is discounted by log2(i + 2).
        dcg = sum(1. / (log(i + 2) / log(2)) for i, h in enumerate(is_hit) if h)
        # Ideal DCG: all relevant items (up to the list length) ranked first.
        idcg = sum(1. / (log(i + 2) / log(2))
                   for i in range(min(len(rel_set), len(pred_list))))

        ndcg = dcg / idcg
        recall = hit_num / len(rel_set)
        precision = hit_num / len(pred_list)
        hit = 1.0 if hit_num > 0.0 else 0.0

        # Average precision, normalised by the best achievable number of hits at k.
        running_hits = 0.0
        score = 0.0
        for i, h in enumerate(is_hit):
            if h:
                running_hits += 1.0
                score += running_hits / (i + 1.0)
        map_score = score / min(len(rel_set), k)

        # Reciprocal rank of the first relevant prediction (0 when there is none).
        first_hit = next((i for i, h in enumerate(is_hit) if h), None)
        rr = 0.0 if first_hit is None else 1.0 / (first_hit + 1.0)

        ndcgs.append(ndcg)
        recalls.append(recall)
        precisions.append(precision)
        hits.append(hit)
        map_scores.append(map_score)
        mrrs.append(rr)

    def _mean_pct(values):
        # np.mean([]) emits a RuntimeWarning; report NaN explicitly instead.
        return float(np.mean(values)) * 100 if values else float("nan")

    avg_precision = _mean_pct(precisions)
    avg_recall = _mean_pct(recalls)
    avg_ndcg = _mean_pct(ndcgs)
    avg_hit = _mean_pct(hits)
    avg_map = _mean_pct(map_scores)
    avg_mrr = _mean_pct(mrrs)

    print("invalid places:", str(len(invalid_places)))
    print('MAP={:.3f} | NDCG={:.3f} |  Recall={:.3f} | Precision={:.3f} | Hits={:.3f} | MRR={:.3f}'.format(
            avg_map, avg_ndcg, avg_recall, avg_precision, avg_hit, avg_mrr))

    return {
        "MAP": avg_map,
        "NDCG": avg_ndcg,
        "Recall": avg_recall,
        "Precision": avg_precision,
        "Hits": avg_hit,
        "MRR": avg_mrr,
    }

In [None]:
# Score the top-10 neighbours of every ground-truth place and print the averaged metrics.
evluation(k=10, y_true_dict=y_true_dict, similar_places_dict=similar_places)

invalid places: 0
MAP=21.571 | NDCG=35.957 |  Recall=16.567 | Precision=32.526 | Hits=94.198 | MRR=63.574
