# LLM and Embeddings (requires an LLM API key), by:

- Daniela Martínez Quiroga
- María Isabella Rodríguez Arévalo

In [None]:
!pip install PyPDF2
!pip install -U langchain langchain-community sentence-transformers
!pip install -U gradio
!pip install openai

In [None]:
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers.util import pytorch_cos_sim
from google.colab import userdata

import torch
import gradio as gr
import openai

# Example 1:
## Load PDF

In [None]:
def text_pdf(path):
  """Return the text of every page of the PDF at *path*, concatenated.

  Args:
    path: Location of the PDF file; passed unchanged to ``PdfReader``.

  Returns:
    One string holding the extracted text of all pages, in page order,
    with no separator inserted between pages.
  """
  reader = PdfReader(path)
  # join() builds the result in one pass instead of re-allocating the string
  # on every ``+=``; ``or ""`` keeps a page without extractable text from
  # breaking the concatenation.
  return "".join(page.extract_text() or "" for page in reader.pages)

# Extract the full text of each of the three books once. The resulting
# strings are module-level names used by the splitting step.
# NOTE(review): the paths are absolute and hard-coded; confirm the PDFs are
# uploaded to /text before running this cell.
book_1 = text_pdf("/text/1er libro - Harry Potter.pdf")
book_2 = text_pdf("/text/2do libro - Harry Potter.pdf")
book_3 = text_pdf("/text/3er libro - Harry Potter.pdf")

## Split text

In [None]:
# One splitter shared by all books: chunk_size = 900 with chunk_overlap = 150
# (presumably measured in characters, the splitter's default length function —
# TODO confirm), so neighbouring chunks share some text.
# NOTE(review): `text_plitter` looks like a typo of `text_splitter`; it is kept
# because other cells may reference this name.
text_plitter = RecursiveCharacterTextSplitter(chunk_size = 900, chunk_overlap = 150)
# One list of text chunks per book.
chunks_book_1 = text_plitter.split_text(book_1)
chunks_book_2 = text_plitter.split_text(book_2)
chunks_book_3 = text_plitter.split_text(book_3)

## Embedding model

In [None]:
# Sentence-embedding model shared by the documents and, later, the user query.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
# Embed every chunk of every book. Position i in vectors_book_N is used later
# as the index of the matching text in chunks_book_N, so the two lists must
# stay aligned.
vectors_book_1 = embedding_model.embed_documents(chunks_book_1)
vectors_book_2 = embedding_model.embed_documents(chunks_book_2)
vectors_book_3 = embedding_model.embed_documents(chunks_book_3)

## Similarity between documents

In [None]:
def similitude(reference, list_em, n):
  """Return the indexes and scores of the embeddings most similar to *reference*.

  Args:
    reference: Query embedding; a tensor or anything ``torch.tensor`` accepts.
    list_em: Candidate embeddings; a tensor or anything ``torch.tensor``
      accepts (e.g. a list of vectors).
    n: Maximum number of matches to return.

  Returns:
    ``(indexes, scores)``: two 1-D tensors holding the positions in *list_em*
    of the best matches and their cosine similarities. Fewer than *n* entries
    are returned when there are fewer candidates; both tensors are empty when
    there are no candidates or *n* is not positive.
  """
  if not isinstance(reference, torch.Tensor):
    reference = torch.tensor(reference)
  if not isinstance(list_em, torch.Tensor):
    list_em = torch.tensor(list_em)

  # Nothing to rank: return empty results instead of failing inside the
  # similarity computation.
  if n <= 0 or list_em.numel() == 0:
    return torch.empty(0, dtype=torch.long), torch.empty(0)

  # pytorch_cos_sim returns a 2-D tensor; row 0 holds the scores of the
  # single query against every candidate.
  score_similar = pytorch_cos_sim(reference, list_em)[0]
  # torch.topk raises when k exceeds the number of elements, so clamp it.
  k = min(n, score_similar.numel())
  best = torch.topk(score_similar, k)
  return best.indices, best.values

## OpenAI and interface

In [None]:
def question(question, results=10):
  """Build the retrieval context for *question* from the three books.

  The question is embedded with the module-level ``embedding_model`` and
  compared against the chunk vectors of each book.

  Args:
    question: The user's question as plain text.
    results: How many chunks to retrieve from each book (default 10).

  Returns:
    The retrieved chunks joined with single spaces: the matches from book 1
    first, then book 2, then book 3, each group ordered as returned by
    ``similitude``.
  """
  question_emb = embedding_model.embed_query(question)

  fragmets = []
  for chunks, vectors in (
      (chunks_book_1, vectors_book_1),
      (chunks_book_2, vectors_book_2),
      (chunks_book_3, vectors_book_3),
  ):
    indexes, _scores = similitude(question_emb, vectors, results)
    # tolist() turns the index tensor into plain ints for list indexing.
    fragmets.extend(chunks[i] for i in indexes.tolist())

  return " ".join(fragmets)

def _format_history(history):
    """Render the chat history as ``User: ...`` / ``AI: ...`` lines."""
    lines = []
    for turn in history:
        if isinstance(turn, dict):
            # Role/content message format: one entry per message.
            speaker = "User" if turn.get("role") == "user" else "AI"
            lines.append(f"{speaker}: {turn.get('content')}")
        else:
            # Pair format: (user_message, ai_message) per turn.
            lines.append(f"User: {turn[0]}\nAI: {turn[1]}")
    return "\n".join(lines)


def chatGPT(query, history=None):
    """Answer *query* with gpt-4o-mini, using book fragments as context.

    Args:
        query: The user's current message.
        history: Previous conversation turns, either as
            ``(user_message, ai_message)`` pairs or as dicts with ``role``
            and ``content`` keys. ``None`` means no previous turns.

    Returns:
        The text content of the model's first choice.
    """
    # A ``[]`` default would be one list shared by every call; use None instead.
    history_text = _format_history(history or [])
    context = question(query)
    prompt = f"Contexto:\n{context}\n\nPregunta: {query}"

    # The API key is read from the Colab secret named 'ChatGPT'.
    client = openai.OpenAI(api_key=userdata.get('ChatGPT'))
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Eres un asistente útil."},
            # Earlier turns are sent as text inside the single user message.
            {"role": "user", "content": history_text + "\nUser: " + prompt},
        ],
    )
    return response.choices[0].message.content

# Bind the name to the ChatInterface itself: chaining .launch() onto the
# constructor would store launch()'s return value instead of the interface.
interface = gr.ChatInterface(chatGPT)
interface.launch(debug=True)

# Example 2

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import gradio as gr
from PIL import Image
import os
from tqdm import tqdm


In [None]:
import os
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from PIL import Image
import gradio as gr

# Feature-vector model loaded from TensorFlow Hub (the URL names MobileNetV2).
# `model` is a module-level name called by extract_features below.
model_url = "https://tfhub.dev/google/tf2-preview/mobilenet_v2/feature_vector/4"
model = hub.load(model_url)

# Load reference images
def load_images_from_folder(folder):
    """Load every JPEG/PNG image located directly inside *folder*.

    Args:
        folder: Path of the directory to scan; sub-directories are not
            descended into.

    Returns:
        A dict mapping each image's file name to its content as an RGB
        PIL image. Files with other extensions are ignored.
    """
    images = {}
    # sorted() makes the load order independent of the OS listing order.
    for filename in sorted(os.listdir(folder)):
        # Case-insensitive match so "PHOTO.JPG" and ".jpeg" files are found.
        if not filename.lower().endswith((".jpg", ".jpeg", ".png")):
            continue
        img_path = os.path.join(folder, filename)
        # Skip directories that happen to carry an image-like name.
        if not os.path.isfile(img_path):
            continue
        # convert() returns an independent, fully loaded copy, so the file
        # handle can be released as soon as the with-block ends.
        with Image.open(img_path) as img:
            images[filename] = img.convert('RGB')
    return images

# Extract features using MobileNetV2
def extract_features(image):
    """Return the feature vector the module-level ``model`` produces for *image*.

    Args:
        image: A PIL image of any size or mode.

    Returns:
        A NumPy array: the first (and only) row of the model output for the
        one-image batch.
    """
    # Force 3 channels so grayscale or RGBA inputs still produce a
    # (224, 224, 3) array after resizing.
    image = image.convert("RGB").resize((224, 224))
    # Scale pixel values to [0, 1], then cast to float32.
    image_array = (np.array(image) / 255.0).astype(np.float32)
    # Add the leading batch dimension.
    image_array = np.expand_dims(image_array, axis=0)

    features = model(image_array)
    # Drop the batch dimension again.
    return np.array(features)[0]


# Find most similar image
def find_most_similar(image, reference_images):
    """Return the reference image whose features are closest to *image*.

    Closeness is the cosine similarity between the vectors returned by
    ``extract_features``.

    Args:
        image: The query PIL image.
        reference_images: Dict mapping file name to PIL image.

    Returns:
        The best-matching image from *reference_images*, or ``None`` when
        there are no reference images or any step fails (the error is
        printed rather than raised).
    """
    if not reference_images:
        # Report the real cause instead of an opaque max() failure.
        print("Error en find_most_similar:", "no hay imágenes de referencia")
        return None

    try:
        image_features = extract_features(image)
        # Loop-invariant: compute the query norm once.
        image_norm = np.linalg.norm(image_features)
        similarities = {}

        for filename, ref_img in reference_images.items():
            ref_features = extract_features(ref_img)
            denominator = image_norm * np.linalg.norm(ref_features)
            if denominator == 0:
                # An all-zero vector has no direction; score it 0 instead of
                # letting a NaN reach max().
                similarities[filename] = 0.0
            else:
                similarities[filename] = np.dot(image_features, ref_features) / denominator

        best_match = max(similarities, key=similarities.get)
        print(f"Mejor coincidencia: {best_match} con similitud {similarities[best_match]}")

        return reference_images[best_match]
    except Exception as e:
        # Best-effort boundary: the caller treats None as "no match".
        print("Error en find_most_similar:", str(e))
        return None


# Load the reference images once at start-up; match_image compares every
# uploaded image against this module-level dict.
# NOTE(review): the folder is an absolute, hard-coded path; confirm it exists
# in the runtime environment.
image_folder = "/imgs/images"
reference_images = load_images_from_folder(image_folder)

def match_image(image):
    """Gradio callback: return the reference image closest to *image*.

    Looks the match up in the module-level ``reference_images``. Any failure,
    including a result that is not a PIL image, is printed and reported to
    the caller as ``None``.
    """
    try:
        candidate = find_most_similar(image, reference_images)
        if isinstance(candidate, Image.Image):
            return candidate
        raise TypeError("La función no devuelve una imagen PIL.")
    except Exception as err:
        print("Error en match_image:", str(err))
    return None

# Gradio interface: one PIL image in, one PIL image out, served by match_image.
# NOTE: this rebinds the module-level name `interface` that Example 1 also
# assigns.
interface = gr.Interface(
    fn=match_image,
    inputs=gr.Image(type="pil"),
    outputs=gr.Image(type="pil"),
    title="Búsqueda de Imagen Más Parecida",
    description="Sube una imagen y encuentra la más parecida en el conjunto de referencia."
)

# Start the web UI only when this code runs as the main module.
if __name__ == "__main__":
    interface.launch()