# Kikiola Multilingual Embedding with Hugging Face 🤗

## Installing Required Libraries

In [None]:
!pip install PyPDF2 requests sentence-transformers

## Starting the Server

Before running the Kikiola Multilingual Embedding code, start the server by running the following command in your terminal. The server handles storage of the generated embeddings; the code below sends them to `http://localhost:3400/vectors`.

```sh
go run cmd/main.go
```

## Kikiola Multilingual Embedding Code

In [None]:
import io
import os
import requests
import PyPDF2
import re
from sentence_transformers import SentenceTransformer

class KikiolaEmbedding:
    """Extract text from a PDF, embed it, and post the result to a vector server.

    Intended call order: ``load_pdf`` -> ``clean_text`` ->
    ``generate_embeddings`` -> ``store_embeddings``.

    Attributes:
        model: The loaded ``SentenceTransformer`` used to encode the text.
        pdf_text: Text extracted from the most recently loaded PDF.
        embeddings: Result of the last ``generate_embeddings`` call
            (an empty list until then).
    """

    DEFAULT_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
    DEFAULT_VECTOR_ID = "83635f86-56b3-4bdd-a9bf-428dcebb8674"
    DEFAULT_VECTORS_URL = "http://localhost:3400/vectors"
    # Seconds passed as ``timeout`` to requests so a dead host cannot hang forever.
    REQUEST_TIMEOUT = 30

    # Ordered (pattern, replacement) rules for literal backslash escapes that
    # appear in the text. Order matters: specific escapes are rewritten first,
    # and any backslash still left over becomes a space.
    _ESCAPE_RULES = (
        (re.compile(r'\\[ntr]'), ' '),  # literal "\n", "\t", "\r"
        (re.compile(r'\\\.'), '.'),     # dot is escaped so only "\." matches
        (re.compile(r'\\,'), ','),
        (re.compile(r"\\'"), "'"),
        (re.compile(r'\\'), ' '),
    )

    def __init__(self, model_name=DEFAULT_MODEL_NAME):
        """Load the sentence-transformers model and reset state.

        Args:
            model_name: Name or path handed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        self.pdf_text = ""
        self.embeddings = []

    def load_pdf(self, pdf_path_or_url):
        """Read every page of a PDF into ``self.pdf_text``.

        Args:
            pdf_path_or_url: An ``http://`` / ``https://`` URL to download, or
                a path to a local PDF file.

        Raises:
            requests.HTTPError: If the download returns an error status.
            OSError: If the local file cannot be opened.
        """
        # Match the full scheme so a local file such as "http_notes.pdf" is
        # not mistaken for a URL.
        if pdf_path_or_url.startswith(("http://", "https://")):
            response = requests.get(pdf_path_or_url, timeout=self.REQUEST_TIMEOUT)
            # Fail here rather than feeding an HTML error page to the parser.
            response.raise_for_status()
            pdf_file = io.BytesIO(response.content)
        else:
            pdf_file = open(pdf_path_or_url, 'rb')

        # The context manager closes the stream even if parsing raises.
        with pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            # "or ''" guards against extract_text() yielding None for a page
            # with no extractable text.
            self.pdf_text = "".join(
                page.extract_text() or "" for page in reader.pages
            )

    def clean_text(self):
        """Normalise ``self.pdf_text`` in place.

        Literal backslash escapes are rewritten (``\\n``, ``\\t``, ``\\r`` and
        stray backslashes become spaces; ``\\.``, ``\\,`` and ``\\'`` lose the
        backslash), then every run of whitespace is collapsed to one space
        and leading/trailing whitespace is dropped.
        """
        text = self.pdf_text
        for pattern, replacement in self._ESCAPE_RULES:
            text = pattern.sub(replacement, text)
        self.pdf_text = ' '.join(text.split())

    def generate_embeddings(self):
        """Encode ``self.pdf_text`` with the model into ``self.embeddings``."""
        # NOTE(review): the whole document is encoded as a single input; the
        # model presumably truncates long inputs to its maximum sequence
        # length, so only the start of a long PDF may be represented - confirm.
        self.embeddings = self.model.encode(self.pdf_text)

    def store_embeddings(self, vector_id=None, url=None):
        """POST the embedding, its metadata and the source text to the server.

        ``generate_embeddings`` must have been called first; until then
        ``self.embeddings`` is a plain list without ``tolist``.

        Args:
            vector_id: ID to store the vector under. Defaults to
                ``DEFAULT_VECTOR_ID``; pass a distinct value per document,
                since every call otherwise reuses the same ID.
            url: Endpoint to POST to. Defaults to ``DEFAULT_VECTORS_URL``.
        """
        vector_data = {
            "ID": self.DEFAULT_VECTOR_ID if vector_id is None else vector_id,
            "Embedding": self.embeddings.tolist(),
            "Metadata": {
                "name": "PDF Embeddings",
                "category": "document"
            },
            "Text": self.pdf_text
        }

        response = requests.post(
            self.DEFAULT_VECTORS_URL if url is None else url,
            json=vector_data,
            timeout=self.REQUEST_TIMEOUT,
        )
        # Only claim success when the server actually accepted the request.
        if response.ok:
            print(f"Embeddings stored. Status code: {response.status_code}")
        else:
            print(f"Failed to store embeddings. Status code: {response.status_code}")

# Run the full pipeline once on a sample document.
# The value does not start with "http", so load_pdf opens it as a local file.
pdf_path_or_url = "les_contes_de_canterbury.pdf"
# Constructing the class loads the sentence-transformers model.
embeddings_generator = KikiolaEmbedding()
# The steps below depend on each other and must run in this order:
# extract text -> clean it -> encode it -> POST it to the server.
embeddings_generator.load_pdf(pdf_path_or_url)
embeddings_generator.clean_text()
embeddings_generator.generate_embeddings()
# Sends the vector to http://localhost:3400/vectors; the server from the
# "Starting the Server" section must already be running.
embeddings_generator.store_embeddings()

print("Kikiola Embeddings Completed.")