## ECMWF Code4Earth
### Vector Embeddings from GitHub and Confluence
Embed documents with Sentence Transformers and upload them into a local Deep Lake database.

In [None]:
!pip install accelerate bitsandbytes xformers atlassian-python-api > /dev/null

In [None]:
!pip install langchain deeplake huggingface_hub > /dev/null

In [None]:
!pip install transformers sentence_transformers > /dev/null

In [None]:
import codecs
import itertools
import locale
import json
import os
import pickle
from tqdm import tqdm

In [None]:
import numpy as np
import shutil

In [None]:
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import DeepLake
from langchain.document_loaders import TextLoader
from transformers import pipeline
# Hugging Face Hub API token, exported through the environment. It is left blank
# here; presumably the Hub-backed embeddings client created later reads it, so
# fill in your own token before running (and never commit a real one).
os.environ["HUGGINGFACEHUB_API_TOKEN"] = ''

## Load repo content

In [None]:
# GitHub organisation prefix; a repository name is appended to it to form the clone URL.
base_space = 'https://github.com/ecmwf/'

# Repositories whose files are cloned and embedded.
repo_names = [
    'ecmwf-api-client',
    'ecmwf-opendata',
    'cdsapi',
    'eccodes',
    'cfgrib',
    'earthkit-data',
]

In [None]:
all_docs = list()
for repo_path in repo_names:
    this_repo = base_space + repo_path
    !git clone {this_repo}
    for dirpath, dirnames, filenames in os.walk(repo_path):
        for this_file in filenames:
            if '.git' in dirpath:
                print(f"skipping: {this_file}")
                continue
            try:
                loader = TextLoader(os.path.join(dirpath, this_file), encoding='utf-8')
                all_docs.extend(loader.load_and_split())
            except Exception as e:
                print(e)
                pass

In [None]:
from atlassian import Confluence

In [None]:
# Turn a pickled Confluence export into documents: each page is written to a
# temporary text file, loaded and split, and the file is removed again.
confluence_server = Confluence('https://confluence.ecmwf.int')
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open an export file that you produced yourself.
with open('/content/drive/MyDrive/Code4Earth/ecmwf_complete.pickle', 'rb') as fin:
    confluence = pickle.load(fin)
confluence_docs = []
for space in confluence:
    print(space['space'])
    for page in space['pages']:
        # page[0] is passed as the page title, page[1] is written out as the page text.
        try:
            link = confluence_server.get_page_by_title(space['space'],
                                                       page[0])['_links']['webui']
        except Exception:
            # The lookup failed or returned nothing usable: report and skip the
            # page. Exception (not a bare except) keeps Ctrl-C working.
            print(f"Something wrong in {space['space']} and {page}")
            continue
        # File name derived from the page's web link (leading '/' dropped,
        # remaining '/' replaced by '_').
        page_filename = '{}.txt'.format(link)[1:].replace('/', '_')
        # Write and CLOSE the file before loading it: reading while the writer
        # is still open can miss text that is still sitting in the write buffer.
        with codecs.open(page_filename, 'w', 'utf-8') as fout:
            fout.write(page[1])
        try:
            loader = TextLoader(page_filename, encoding='utf-8')
            confluence_docs.extend(loader.load_and_split())
        except Exception as e:
            # Best effort: report the page that failed to load and continue.
            print(e)
        finally:
            # The text file is only a hand-off to the loader; always clean it up.
            os.remove(page_filename)

In [None]:
# Re-chunk the GitHub and Confluence documents together
# (splitter configured with chunk_size=1000 and chunk_overlap=0).
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
combined_docs = all_docs + confluence_docs
repo_texts = text_splitter.split_documents(combined_docs)

In [None]:
# Embed the text chunks locally with the all-mpnet-base-v2 sentence-transformer.
# Result: custom_embeddings, one dict per chunk with keys
# 'id' ('id0', 'id1', ...), 'vector' (list of floats), 'metadata' and 'text'.
MAX_SAMPLE = len(repo_texts)  # number of chunks to embed; lower it for a quick trial run
model = SentenceTransformer('all-mpnet-base-v2', device='cuda')
sample_texts = repo_texts[:MAX_SAMPLE]
# Hand the whole list to encode() so the model processes it in batches instead
# of one forward pass per chunk; the returned rows follow the input order.
sample_vectors = model.encode([doc.page_content for doc in sample_texts],
                              show_progress_bar=True)
custom_embeddings = [
    {'id': f'id{i}',
     'vector': vector.tolist(),
     'metadata': doc.metadata,
     'text': doc.page_content}
    for i, (doc, vector) in enumerate(zip(sample_texts, sample_vectors))
]

In [None]:
# Open the Deep Lake vector store that will receive the embeddings.
# The embeddings class was used here without ever being imported (NameError).
from langchain.embeddings import HuggingFaceHubEmbeddings

DATASET_PATH = ''  # location of the Deep Lake dataset; fill in before running
dataset_path = DATASET_PATH
sentence_id = 'sentence-transformers/all-mpnet-base-v2'
embeddings = HuggingFaceHubEmbeddings(repo_id=sentence_id)
# Opened writable: the cells below append to db.ds and commit, which cannot
# work on a store opened with read_only=True.
db = DeepLake(dataset_path=dataset_path, read_only=False, embedding_function=embeddings)

In [None]:
# Convert every stored embedding to a float32 array and collect them into one
# NumPy array, in the same order as custom_embeddings.
float_rows = [np.asarray(record['vector'], dtype=np.float32) for record in custom_embeddings]
vectors = np.array(float_rows)

In [None]:
# Append every embedded chunk (text, metadata, id, vector) to the Deep Lake
# dataset behind db, then commit the result.
# The `with` block lets Deep Lake batch the writes instead of flushing after
# every single append.
with db.ds:
    for i, (record, vector) in enumerate(zip(custom_embeddings, vectors[:MAX_SAMPLE])):
        if i % 100 == 0:
            print(i)  # coarse progress indicator
        db.ds.text.append(record['text'])
        db.ds.metadata.append(record['metadata'])
        db.ds.ids.append(record['id'])
        db.ds.embedding.append(vector)
db.ds.commit()

In [None]:
# Show the dataset summary as a quick check of what was stored.
db.ds.summary()