In [1]:
model_id = "sentence-transformers/all-MiniLM-L6-v2"
hf_token = "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"

In [2]:
import requests
from retry import retry

api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

### Generacion de embeddings

In [3]:
@retry(tries=3, delay=10)
def query(texts):
    response = requests.post(api_url, headers=headers, json={"inputs": texts})
    result = response.json()
    print(result)
    if isinstance(result, list):
      return result
    elif list(result.keys())[0] == "error":
      raise RuntimeError(
          "The model is currently loading, please re-run the query."
          )

In [4]:
from json_cleaner_and_converter import json_to_dataframe
df = json_to_dataframe('web-scrapping/scraper/extraction/*.json')
df["embedding"] = query(df["content"].tolist())


[[-0.036524586379528046, 0.0752684697508812, 0.024583466351032257, 0.00961547065526247, 0.02466585487127304, 0.09877210110425949, -0.020703932270407677, -0.003950054757297039, 0.012347975745797157, 0.018478170037269592, 0.05401242524385452, -0.0027850314509123564, -0.020499495789408684, -0.03614186495542526, 0.018398011103272438, -0.01606239378452301, -0.023429786786437035, -0.006483232602477074, -0.016796614974737167, -0.011474143713712692, -0.03718096762895584, 0.013095933943986893, -0.010470084846019745, -0.025561360642313957, 0.04945765808224678, -0.10572347790002823, 0.02245912328362465, 0.02750973217189312, -0.03634612262248993, 0.02851942554116249, 0.011938045732676983, 0.04035051539540291, -0.044626735150814056, 0.08014127612113953, 0.04465274140238762, -0.10473640263080597, -0.021513493731617928, 0.018873687833547592, 0.05764938145875931, -0.040044721215963364, 0.026437539607286453, -0.1258319616317749, -0.01939552091062069, -0.1418180763721466, -0.02508978731930256, 0.0102749

In [5]:
df

Unnamed: 0,title,content,embedding
0,Capitalizing on Vietnam's Healthy and Sustaina...,The Social Security Agreement between Vietnam ...,"[-0.036524586379528046, 0.0752684697508812, 0...."
1,Russian Orthodox priests face persecution from...,Partly cloudy. Low 76F. Winds light and variab...,"[0.0486292764544487, 0.025456130504608154, -0...."
2,HOME - Archyde,"The independent manufacturer ZH Studio, in par...","[-0.11916355043649673, 0.041927848011255264, -..."
3,Samia uses Magufuli-style tactics on critics o...,Tanzanian President Samia Suluhu Hassan. PHOTO...,"[-0.09568692743778229, 0.043381985276937485, -..."
4,Saudi Arabia appoints first Palestinian envoy ...,I accept the JTA Privacy Policy. By submitting...,"[0.0034141235519200563, 0.01724512316286564, 0..."
...,...,...,...
92,Nursing Detroit Training Center Opens at Durfe...,Designed to meet demand in the health care sec...,"[-0.08147545158863068, -0.12105432152748108, -..."
93,Worcester’s Indian Lake closing Tuesday for al...,"Update: Due to rain, the lake will be reopened...","[-0.04935750737786293, 0.023076243698596954, 0..."
94,GOP debate: Where each candidate stands on the...,The Republican National Committee wants qualif...,"[-0.00406273640692234, -0.0035951111931353807,..."
95,"Tinubu, Mbah meet over Enugu’s development – T...","From Juliana Taiwo-Obalonye, Abuja President B...","[0.007244040723890066, -0.007231647614389658, ..."


In [6]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
reference_sentence = "news about to Arabia and Palestinian"
reference_embedding = query(reference_sentence)

[0.05621788650751114, 0.04397282004356384, -0.004763108678162098, 0.0699581652879715, 0.004715069197118282, -0.029321392998099327, -0.04310661554336548, -0.13302476704120636, -0.016067614778876305, 0.03801357373595238, 0.03166257217526436, 0.03286885470151901, 0.037485454231500626, 0.008457213640213013, 0.04927690327167511, -0.045170217752456665, -0.014165444299578667, -0.026988083496689796, -0.04117928445339203, -0.06771251559257507, -0.05520980805158615, 0.007074193097651005, 0.06679452210664749, 0.026798876002430916, -0.010229019448161125, 0.02822473831474781, 0.00444214791059494, -0.08221668750047684, -0.03236991539597511, -0.04152069240808487, -0.025796031579375267, -0.03147623687982559, -0.039885252714157104, -0.048236068338155746, 0.04579707235097885, 0.08556392043828964, 0.02167722024023533, 0.03511171415448189, 0.038712695240974426, -0.033641450107097626, 0.17432968318462372, -0.10454002022743225, 0.07279040664434433, -0.03787936270236969, -0.0002782545634545386, -0.0188127495

In [8]:
# Convertir cada embedding de la columna 'embedding' a un array de NumPy
df["embedding"] = df["embedding"].apply(lambda x: np.array(x))

# Ahora puedes convertir toda la columna en un array 2D de NumPy
news_embeddings_np = np.array(df["embedding"].tolist())  # Asegura que sea un array 2D de floats

query_embedding_np = np.array(reference_embedding)

# Calcular la similitud del coseno entre la consulta y todas las noticias
similarities = cosine_similarity([query_embedding_np], news_embeddings_np)[0]


In [9]:
top_k = 10  # Número de resultados que quieres obtener

# Obtener los índices de los top_k resultados más similares
top_k_indices = similarities.argsort()[-top_k:][::-1]  # Orden de mayor a menor

# Mostrar los índices y las similitudes correspondientes
for idx in top_k_indices:
    print(f"Noticia {idx} - Similaridad: {similarities[idx]}")

# Si tienes una lista de noticias, puedes recuperar las noticias más similares así:
top_k_noticias = [df["title"][idx] for idx in top_k_indices]


Noticia 4 - Similaridad: 0.5549048803763158
Noticia 60 - Similaridad: 0.5416101985831872
Noticia 70 - Similaridad: 0.4708427892441013
Noticia 33 - Similaridad: 0.43841085565541915
Noticia 89 - Similaridad: 0.41637760061432905
Noticia 12 - Similaridad: 0.40488993661805034
Noticia 15 - Similaridad: 0.35260824088268383
Noticia 24 - Similaridad: 0.3275227377564065
Noticia 3 - Similaridad: 0.3265500193758707
Noticia 28 - Similaridad: 0.3045268569638364


In [10]:
top_k_noticias

['Saudi Arabia appoints first Palestinian envoy as Saudi-Israel normalization talks continue - Jewish Telegraphic Agency',
 "Iran announces FM's visit to Saudi Arabia in near future | Al Mayadeen English",
 'Palestinians Complain Biden Admin Has Done Nothing For Them | Frontpage Mag',
 'Syrian Civilians Struggle between Deadly Israeli Air Strikes and Equally Deadly US Sanctions | HUMAN WRONGS WATCH',
 'How Rabaa and its symbol changed Turkish-Egyptian relations | Middle East Eye',
 'US breaking pledge to help prominent political prisoner in Egypt, family says | Middle East Eye',
 'Niger Says 17 Soldiers Killed in Ambush - allAfrica.com',
 'Two years after Taliban takeover, many Afghans who helped Canada’s military remain in limbo - The Globe and Mail',
 'Samia uses Magufuli-style tactics on critics of controversial Dar port deal - The East African',
 'Caretaker PM vows to ensure protection of minorities in Pakistan']