# Kikiola Genome Embedding with Hugging Face 🤗

## Installing Required Libraries

In [None]:
# Install the third-party packages imported by the embedding code below
# (transformers, requests, torch).
# NOTE(review): `!pip` runs pip in a subshell; if the notebook kernel uses a
# different environment than that shell, `%pip install ...` may be needed
# instead -- confirm for your setup.
!pip install transformers requests torch

## Starting the Server

Before running the Kikiola Genome Embedding code, start the server by running the following command in a terminal, from the directory that contains `cmd/main.go`. The server stores the generated embeddings: the code below sends each one to `http://localhost:3400/vectors`, so keep the server running until the notebook has finished.

```sh
go run cmd/main.go
```

## Kikiola Genome Embedding Code

In [None]:
import requests
import sys
import torch
from transformers import AutoTokenizer, AutoModel

class KikiolaGenomeEmbedding:
    """Fetch gene-tree sequences from Ensembl, embed them, and store the vectors.

    Intended call order: ``load_genome_sequences`` -> ``generate_embeddings``
    -> ``store_embeddings``.

    Attributes:
        tokenizer: Hugging Face tokenizer loaded for the chosen model.
        model: Hugging Face model used to produce hidden states.
        document_text: Every sequence collected so far, each followed by a
            newline. It is appended to, never reset, by the loading methods.
        embeddings: One embedding (a list of floats) per text chunk, replaced
            on every ``generate_embeddings`` call.
    """

    # Defaults live on the class so an instance or subclass can override them
    # without editing the methods.
    MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
    ENSEMBL_SERVER = "http://rest.ensembl.org"
    VECTOR_STORE_URL = "http://localhost:3400/vectors"
    # Seconds. Without an explicit timeout, requests waits indefinitely on an
    # unresponsive server.
    REQUEST_TIMEOUT = 60

    def __init__(self, model_name=None):
        """Load the tokenizer and model and start with no text or embeddings.

        Args:
            model_name: Hugging Face model identifier. ``None`` (the default)
                selects ``MODEL_NAME``.
        """
        name = self.MODEL_NAME if model_name is None else model_name
        self.tokenizer = AutoTokenizer.from_pretrained(name)
        self.model = AutoModel.from_pretrained(name)
        self.document_text = ""
        self.embeddings = []

    def load_genome_sequences(self, gene_id):
        """Download the gene tree for *gene_id* and collect its sequences.

        The sequences found in the response are appended to
        ``document_text``.

        Args:
            gene_id: Identifier placed in the ``/genetree/member/id/`` path.

        Raises:
            requests.HTTPError: The server answered with an error status.
            requests.RequestException: The request failed or timed out.
        """
        url = f"{self.ENSEMBL_SERVER}/genetree/member/id/{gene_id}?"
        r = requests.get(
            url,
            headers={"Content-Type": "application/json"},
            timeout=self.REQUEST_TIMEOUT,
        )
        # raise_for_status() is a no-op on success and raises on any error
        # status, so no separate ``r.ok`` check or exit call is needed.
        r.raise_for_status()
        self.extract_genome_sequences(r.json())

    @staticmethod
    def _sequence_of(node):
        """Return ``node['sequence']['mol_seq']['seq']`` if it is a str, else None."""
        sequence = node.get('sequence')
        if not isinstance(sequence, dict):
            return None
        mol_seq = sequence.get('mol_seq')
        if not isinstance(mol_seq, dict):
            return None
        seq = mol_seq.get('seq')
        return seq if isinstance(seq, str) else None

    def extract_genome_sequences(self, data):
        """Append every sequence found in *data* to ``document_text``.

        *data* is walked depth-first in document order. A dict that carries
        ``sequence -> mol_seq -> seq`` contributes that string followed by a
        newline and is not searched any further; any other dict or list is
        searched through its values/items. Everything else is ignored.

        Args:
            data: Decoded JSON (arbitrarily nested dicts and lists).
        """
        found = []
        # Explicit stack instead of recursion so that deeply nested trees
        # cannot hit Python's recursion limit. Children are pushed reversed
        # so they are popped, and therefore visited, in their original order.
        pending = [data]
        while pending:
            node = pending.pop()
            if isinstance(node, dict):
                seq = self._sequence_of(node)
                if seq is not None:
                    found.append(seq)
                else:
                    pending.extend(reversed(list(node.values())))
            elif isinstance(node, list):
                pending.extend(reversed(node))
        # One join rather than repeated string concatenation per hit.
        self.document_text += "".join(f"{seq}\n" for seq in found)

    def generate_embeddings(self, chunk_size=8000):
        """Split ``document_text`` into chunks and embed each chunk.

        ``embeddings`` is replaced by one vector per chunk: the model's last
        hidden state averaged over dim 1.

        Args:
            chunk_size: Number of characters per chunk; must be positive.

        Raises:
            ValueError: *chunk_size* is not positive.
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        text = self.document_text
        self.embeddings = []
        for start in range(0, len(text), chunk_size):
            chunk = text[start:start + chunk_size]
            # NOTE(review): truncation=True lets the tokenizer cut the chunk
            # to the model's maximum input length, so text beyond that limit
            # does not influence the chunk's embedding. Consider a smaller
            # chunk_size if the whole text must be covered.
            encoded_input = self.tokenizer(chunk, return_tensors='pt', truncation=True, padding=True)
            with torch.no_grad():  # inference only, no gradients needed
                model_output = self.model(**encoded_input)
            pooled = model_output.last_hidden_state.mean(dim=1)
            self.embeddings.extend(pooled.tolist())

    def store_embeddings(self):
        """POST every embedding to the vector server and print the outcome.

        Each embedding is sent as its own JSON document whose id is
        ``genome_sequence_<index>``.

        Raises:
            requests.RequestException: The server is unreachable or the
                request timed out.
        """
        for i, embedding in enumerate(self.embeddings):
            vector_data = {
                "id": f"genome_sequence_{i}",
                "embedding": embedding,
                "metadata": {
                    "name": f"Genome Sequence Embeddings - Part {i+1}",
                    "category": "genome_sequence",
                },
            }
            print(f"Vector data for Part {i+1}: {vector_data}")
            response = requests.post(
                self.VECTOR_STORE_URL,
                json=vector_data,
                timeout=self.REQUEST_TIMEOUT,
            )
            # Only an exact 200 counts as success here.
            # NOTE(review): confirm the server never answers with another
            # 2xx code such as 201.
            if response.status_code == 200:
                print(f"Embeddings stored for Part {i+1}. Status code: {response.status_code}")
            else:
                print(f"Error storing embeddings for Part {i+1}. Status code: {response.status_code}")
                print(f"Error response: {response.text}")

# Script entry point: run the full pipeline once for a single gene.
if __name__ == "__main__":
    # Identifier passed to load_genome_sequences, which uses it in the
    # Ensembl gene-tree request path.
    gene_id = "ENSG00000157764"
    # Constructing the object loads the tokenizer and model.
    embeddings_generator = KikiolaGenomeEmbedding()
    # The three steps depend on each other and must run in this order:
    # fetch the sequences, embed the collected text, then POST the vectors.
    embeddings_generator.load_genome_sequences(gene_id)
    embeddings_generator.generate_embeddings()
    # Sends the vectors to the local server at http://localhost:3400/vectors.
    embeddings_generator.store_embeddings()
    print("Kikiola Embeddings Completed.")