In [3]:
!pip install "Elasticsearch==8.7.0"
!pip install sqlalchemy
!pip install psycopg2-binary

Collecting psycopg2-binary
  Downloading psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: psycopg2-binary
Successfully installed psycopg2-binary-2.9.10


In [44]:
import pandas as pd
from sqlalchemy import create_engine

# Conexión a PostgreSQL
user = "userGESTDB"
password = "passGESTDB"
host = "postgres_db"  # nombre del servicio en docker-compose
port = "5432"
db = "GESTDB"

engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{db}")

# Leer una tabla completa
df = pd.read_sql("SELECT * FROM grado;", engine)
df.head()

Unnamed: 0,id,nombre,id_area,descripcion,salidas
0,1,Grado en Ingeniería Informática,1,"Formación en programación, sistemas y redes","Desarrollador, analista, consultor IT"
1,2,Grado en Matemáticas,2,Formación avanzada en álgebra y estadística,"Docencia, investigación, finanzas"
2,3,Grado en Biología,3,Estudio de organismos vivos y ecosistemas,"Investigador, docente, técnico de laboratorio"
3,4,Grado en Derecho,4,Formación jurídica en distintas ramas del Derecho,"Abogacía, judicatura, asesoría legal"
4,5,Grado en Economía,5,Análisis económico y financiero,"Economista, analista, consultor"


In [45]:
from elasticsearch import Elasticsearch

es = Elasticsearch("http://elasticsearch:9200")

# Verifica la conexión
print(es.info().body)

{'name': '2e4dd1d218b7', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'EEfTU5MFTH2Fc3SU6wXKkw', 'version': {'number': '8.7.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '09520b59b6bc1057340b55750186466ea715e30e', 'build_date': '2023-03-27T16:31:09.816451435Z', 'build_snapshot': False, 'lucene_version': '9.5.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [46]:
df.fillna({
        'nombre': 'Unknown', 
        'descripcion':'Este grado no contiene descripción',
        'salidas':'Este grado no contiene salidas'
        },
    inplace=True)

In [47]:
mapping = {
    "mappings": {
        "properties": {
            "id": { "type": "integer" },        
            "nombre": { "type": "keyword" },     
            "descripcion": { "type": "text" },
            "id_area": { "type": "integer" },
            "salidas": { "type": "text" },
        }
    }
}

index_name = 'salidas'
if es.indices.exists(index=index_name):
    es.indices.delete(index=index_name)

es.indices.create(index=index_name, mappings=mapping["mappings"])
print(f"Index '{index_name}' created with mapping.")

Index 'salidas' created with mapping.


In [48]:
import json
from elasticsearch.helpers import bulk

def bulk_index_data(es, data, index_name):
    batch_size = 50  # Reducir el tamaño del lote a 50
    for i in range(0, len(data), batch_size):
        batch = data[i:i+batch_size]
        actions = []
        for doc in batch:
            actions.append({
                "_index": index_name,
                "_id": doc['id'],
                "_source": doc
            })
        # Capturar la respuesta para verificar errores
        resp = bulk(es, actions, raise_on_error=True)
        print("Indexed:", resp[0], "Errors:", resp[1])

salidas = df.to_dict(orient='records')
bulk_index_data(es, salidas, index_name)


Indexed: 10 Errors: []


In [49]:
import json

# Query 1

res = es.search(
    index="salidas",
    size=3,
    query={
        "term": {
            "nombre": "Grado en Derecho"
        }
    }
)
print(json.dumps(res.body, indent=4))

{
    "took": 21,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 1,
            "relation": "eq"
        },
        "max_score": 1.9924302,
        "hits": [
            {
                "_index": "salidas",
                "_id": "4",
                "_score": 1.9924302,
                "_source": {
                    "id": 4,
                    "nombre": "Grado en Derecho",
                    "id_area": 4,
                    "descripcion": "Formaci\u00f3n jur\u00eddica en distintas ramas del Derecho",
                    "salidas": "Abogac\u00eda, judicatura, asesor\u00eda legal"
                }
            }
        ]
    }
}


In [50]:
# Query
res = es.search(
    index="salidas",
    size=3,
    query={
        "match": {
            "salidas": "docencia",
            "salidas": "investigación"
        }
    }
)
print(json.dumps(res.body, indent=4))

{
    "took": 13,
    "timed_out": false,
    "_shards": {
        "total": 1,
        "successful": 1,
        "skipped": 0,
        "failed": 0
    },
    "hits": {
        "total": {
            "value": 2,
            "relation": "eq"
        },
        "max_score": 1.5900146,
        "hits": [
            {
                "_index": "salidas",
                "_id": "2",
                "_score": 1.5900146,
                "_source": {
                    "id": 2,
                    "nombre": "Grado en Matem\u00e1ticas",
                    "id_area": 2,
                    "descripcion": "Formaci\u00f3n avanzada en \u00e1lgebra y estad\u00edstica",
                    "salidas": "Docencia, investigaci\u00f3n, finanzas"
                }
            },
            {
                "_index": "salidas",
                "_id": "8",
                "_score": 1.5900146,
                "_source": {
                    "id": 8,
                    "nombre": "Grado en Filolog\u00eda Hi

In [21]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting transformers<5.0.0,>=4.41.0
  Downloading transformers-4.46.3-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting torch>=1.11.0
  Downloading torch-2.4.1-cp38-cp38-manylinux1_x86_64.whl (797.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m797.1/797.1 MB[0m [31m327.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:21[0m
[?25hCollecting huggingface-hub>=0.20.0
  Downloading huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m566.1/566.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting fsspec>=2023.5.0
  Downloading fsspe

In [24]:
# indice semántico

vector_index_mapping = {
    "properties": {
        "id": { "type": "integer" },        
        "nombre": { "type": "keyword" },     
        "id_area": { "type": "integer" },
        "salidas": { "type": "text" },
        "descripcion": { "type": "text" },
        "descripcion_vector": {
            "type": "dense_vector",
            "dims": 3,  # Dimensionalityof the embeddings
            "index": True,
            "similarity": "cosine"
        }
    }
}

vector_index_name = "vector_descripcion"
if es.indices.exists(index=vector_index_name):
    es.indices.delete(index=vector_index_name)
es.indices.create(
    index=vector_index_name,
    mappings=vector_index_mapping
)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'vector_example'})

In [None]:
# creamos los embedings de la descripcion de titulo
from sentence_transformers import SentenceTransformer

# Load a pre-trained Sentence Transformer model
model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')

# Generate embeddings (vector representations) for the sentences
embeddings = model.encode(df["descripcion"].values.tolist())

# Print the vector for the first sentence
print(embeddings[0])

In [None]:
import json
from elasticsearch.helpers import bulk

def bulk_index_data(es, data, index_name):
    batch_size = 50  # Reducir el tamaño del lote a 50
    for i in range(0, len(data), batch_size):
        batch = data[i:i+batch_size]
        actions = []
        for doc in batch:
            actions.append({
                "_index": index_name,
                "_id": doc['id'],
                "_source": doc
            })
        # Capturar la respuesta para verificar errores
        resp = bulk(es, actions, raise_on_error=True)
        print("Indexed:", resp[0], "Errors:", resp[1])

descripciones = df.to_dict(orient='records')
bulk_index_data(es, descripciones, vector_index_name)

In [None]:
# Query semantica

query_sentence = "nuevas tecnologías"
query_vector = model.encode([query_sentence])[0]

parameters = {
     "field":"descripcion_vector",
     "query_vector": query_sentence,
     "k":5,
     "num_candidates":100
}
res = es.search(
    index=vector_index_name, 
    knn=parameters)

print("Search Results:")
for hit in res['hits']['hits']:
     print(f"Document ID: {hit['_id']}, Nombre Título: {hit['_source']['nombre']}, Description: {hit['_source']['descripcion']}, Score: {hit['_score']}")