In [1]:
# Install the notebook's dependencies quietly (-q); -U upgrades a package that is already present.
# PyTorch is fetched from the CUDA 11.7 wheel index rather than the default index.
!pip install -q -U torch --index-url https://download.pytorch.org/whl/cu117
# transformers is the only package pinned to an exact version.
!pip install -q -U transformers=="4.38.2"
!pip install -q accelerate
# bitsandbytes is installed with the package index set explicitly to pypi.org.
!pip install -q -i https://pypi.org/simple/ bitsandbytes
# Embedding model wrapper, nearest-neighbour search and Wikipedia client used further down.
!pip install -q -U sentence_transformers
!pip install -q -U scann
!pip install -q -U wikipedia-api


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip

In [2]:
import os
# Process-wide settings, applied before the heavy libraries are imported:
# restrict CUDA to device index 0 and set the tokenizers parallelism flag to "false".
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "0",
    "TOKENIZERS_PARALLELISM": "false",
})

In [3]:
import warnings
# Suppress every warning category for the rest of the session. This keeps the
# cell outputs short, but it also hides deprecation and runtime warnings.
warnings.filterwarnings("ignore")

In [4]:
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
import scann
import wikipediaapi

import torch

import transformers
from transformers import (AutoModelForCausalLM, 
                          AutoTokenizer, 
                          BitsAndBytesConfig,
                         )
from sentence_transformers import SentenceTransformer
import bitsandbytes as bnb

2024-04-22 20:41:33.882112: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-22 20:41:34.017589: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-22 20:41:34.483533: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.


In [5]:
def define_device():
    """Select the torch device to run on and report the choice.

    Prints the installed PyTorch version followed by the selected backend.
    Backends are tried in this order: MPS (Apple's Metal Performance Shaders),
    then CUDA, then CPU.

    Returns:
        torch.device: The selected device.
    """
    print(f"PyTorch version: {torch.__version__}", end=" -- ")

    # Guard clause: MPS takes priority whenever torch reports it as available
    if torch.backends.mps.is_available():
        print("using MPS device on MacOS")
        return torch.device("mps")

    # No MPS: use CUDA when a GPU is visible, otherwise stay on the CPU
    backend_name = "cuda" if torch.cuda.is_available() else "cpu"
    defined_device = torch.device(backend_name)
    print(f"using {defined_device}")
    return defined_device


In [6]:
def get_embedding(text, embedding_model):
    """Encode `text` with `embedding_model` and return the result as a plain list.

    Args:
        text: The input handed to `embedding_model.encode`.
        embedding_model: Any object exposing `encode(text, show_progress_bar=...)`
            whose return value has a `tolist()` method.

    Returns:
        The encoded vector converted with `tolist()`.
    """
    # The per-call progress bar is disabled; callers report progress themselves
    return embedding_model.encode(text, show_progress_bar=False).tolist()


def map2embeddings(data, embedding_model):
    """Embed every entry of `data`, one at a time, showing a progress bar.

    Args:
        data: An indexable, sized collection of texts.
        embedding_model: The model forwarded to `get_embedding`.

    Returns:
        list: One embedding (as produced by `get_embedding`) per entry, in order.
    """
    no_texts = len(data)
    print(f"Mapping {no_texts} pieces of information")

    # Entries are looked up by position so the result order matches `data`
    return [
        get_embedding(data[position], embedding_model)
        for position in tqdm(range(no_texts))
    ]

In [7]:
def clean_text(txt, EOS_TOKEN):
    """Strip generation artefacts and redundant spaces from model output.

    Removes, in order, the end-of-sequence token, markdown bold markers (`**`)
    and `<pad>` tokens, then collapses every run of two or more spaces into a
    single space and trims leading/trailing whitespace.

    Args:
        txt (str): The raw text to clean.
        EOS_TOKEN (str | None): The end-of-sequence marker to remove. A missing
            (`None`) or empty token is skipped.

    Returns:
        str: The cleaned text.
    """
    for token in (EOS_TOKEN, "**", "<pad>"):
        # Skip None/"" so a model without an EOS token cannot raise a TypeError
        if token:
            txt = txt.replace(token, "")

    # A single `.replace("  ", " ")` only halves each run of spaces; the regex
    # collapses runs of any length in one pass
    txt = re.sub(r" {2,}", " ", txt)

    return txt.strip()

In [8]:
def add_indefinite_article(role_name):
    """Prefix a role name with "a" or "an" unless it already starts with an article.

    Args:
        role_name (str): The role description, e.g. "expert" or "the teacher".

    Returns:
        str: `role_name` unchanged when it is empty/whitespace-only or already
        begins with "a", "an" or "the" (case-insensitive); otherwise
        `role_name` prefixed with "an" when its first letter is a vowel and
        with "a" in every other case.
    """
    words = role_name.split()

    # Nothing to inspect: return the input as-is instead of failing on words[0]
    if not words:
        return role_name

    articles = ("a", "an", "the")
    first_word = words[0].lower()
    if first_word not in articles:
        # The choice is based on spelling only (first letter), not pronunciation
        article = "an" if first_word[0] in "aeiou" else "a"
        role_name = f"{article} {role_name}"

    return role_name

In [9]:

def generate_summary_and_answer(question, data, searcher, embedding_model, model,
                                max_new_tokens=2048, temperature=0.4, role="expert"):
    """Answer a question in two generation steps: summarize context, then answer.

    The question is embedded and used to look up neighbouring entries of
    `data` through `searcher`. Those entries are joined into a context that the
    model first summarizes; the cleaned summary is then used as the context of
    a second prompt that asks for the final answer.

    Args:
        question (str): The user question.
        data: Indexable collection of texts; indexed with the neighbour
            positions returned by `searcher`.
        searcher: Object exposing `search_batched(matrix)` that returns a
            `(neighbors, distances)` pair.
        embedding_model: Model forwarded to `get_embedding`.
        model: Object exposing `generate_text(prompt, max_new_tokens, temperature)`
            that returns a list whose first item is the generated text. Its
            `tokenizer.eos_token` is used when available.
        max_new_tokens (int): Forwarded to `model.generate_text`.
        temperature (float): Forwarded to `model.generate_text`.
        role (str): The persona the answer should be written as.

    Returns:
        str: The cleaned text that follows "ANSWER:" in the model output.
    """
    # Embed the question as a single-row matrix before searching
    embeded_question = np.array(get_embedding(question, embedding_model)).reshape(1, -1)

    # Only the neighbour positions are needed; the distances are not used
    neighbors, _distances = searcher.search_batched(embeded_question)

    # Build the context from the knowledge-base entries at those positions
    context = " ".join(data[pos] for pos in np.ravel(neighbors))

    # Look up the end-of-sequence token without a bare `except`: models whose
    # `tokenizer` has no `eos_token` (or no tokenizer at all) get the default
    EOS_TOKEN = getattr(getattr(model, "tokenizer", None), "eos_token", None)
    if EOS_TOKEN is None:
        EOS_TOKEN = "<eos>"

    # Add an article in front of the role so it reads naturally in the prompts
    role = add_indefinite_article(role)

    # Step 1: ask the model to summarize the retrieved context
    prompt = f"""
             Summarize this context: "{context}" in order to answer the question "{question}" as {role}\
             SUMMARY:
             """.strip() + EOS_TOKEN

    results = model.generate_text(prompt, max_new_tokens, temperature)

    # Keep only the text after the "SUMMARY:" marker, mirroring how the answer
    # is extracted below. Interpolating `results` itself would put the repr of
    # a list into the next prompt.
    summary = clean_text(results[0].split("SUMMARY:")[-1], EOS_TOKEN)

    # Step 2: ask for the final answer using the summary as context
    prompt = f"""
             Here is the context: {summary}
             Using the relevant information from the context 
             and integrating it with your knowledge,
             provide an answer as {role} to the question: {question}.
             If the context doesn't provide
             any relevant information answer with 
             [I couldn't find a good match in my
             knowledge base for your question, 
             hence I answer based on my own knowledge] \
             ANSWER:
             """.strip() + EOS_TOKEN

    results = model.generate_text(prompt, max_new_tokens, temperature)

    # Keep only the text after the "ANSWER:" marker and clean it
    answer = clean_text(results[0].split("ANSWER:")[-1], EOS_TOKEN)

    return answer

In [10]:
# Compiled once at module level so repeated calls reuse the same pattern object.
# Matches a non-greedy `{...}` group on one line, or a stray closing brace.
BRACES_PATTERN = re.compile(r'\{.*?\}|\}')

def remove_braces_and_content(text):
    """Return `text` with every `{...}` group and leftover `}` deleted.

    A lone opening brace that has no closing brace is left in place, and a
    group is only matched when both braces are on the same line.
    """
    without_braces = BRACES_PATTERN.sub('', text)
    return without_braces

def clean_string(input_string):
    """Normalise whitespace in `input_string` and drop brace-delimited content.

    Returns:
        str: The text with all whitespace runs collapsed to single spaces and
        every `{...}` group / stray `}` removed.
    """
    # Split on any whitespace and rejoin with single spaces
    tokens = input_string.split()
    normalised = ' '.join(tokens)

    # Collapse runs of carriage returns into one. Note: the split above already
    # treats '\r' as whitespace, so no carriage return is left at this point.
    normalised = re.sub(r'\r+', '\r', normalised)

    # Brace removal runs last, so it may leave two adjacent spaces behind
    return remove_braces_and_content(normalised)

In [9]:
# Sanity check: pipe a single question into the gemma.cpp binary from the shell to confirm that the binary, tokenizer and weights paths work
!echo "Can you explain random forest?" | ./gemma_cpp/build/gemma -- --tokenizer /home/hunter/courses/fp/gemcp/4/tokenizer.spm --compressed_weights /home/hunter/courses/fp/gemcp/4/2b-it-sfp.sbs --model 2b-it --verbosity 0


[ Reading prompt ] ...............**Random Forest** is an ensemble learning algorithm that combines multiple decision trees to improve predictive performance. It is a supervised learning technique that can be used for both classification and regression tasks.

**Key Concepts:**

* **Ensemble learning:** Random forest combines multiple decision trees through a voting mechanism to improve overall accuracy and reduce overfitting.
* **Decision trees:** Each tree in the forest makes a decision based on a feature, and the final prediction is made by aggregating the results of all trees.
* **Feature randomness:** Features are randomly selected for each tree, ensuring that no single feature dominates the learning process.
* **Bagging:** Random forest uses a technique called bagging to create multiple training sets by randomly sampling with replacement from the original dataset. This helps to reduce the impact of overfitting.

**How it works:**

1. **Training:**
   - Split the training data in

In [11]:
import subprocess
import sys
import re

class GemmaCPP():
    """Wrapper that runs the C++ implementation of Gemma as a subprocess.

    Attributes:
        gemma_cpp: Path of the gemma.cpp executable.
        tokenizer: Path passed to the executable's `--tokenizer` option.
        compressed_weights: Path passed to `--compressed_weights`.
        model: Model name passed to `--model`.
    """

    def __init__(self, gemma_cpp, tokenizer, compressed_weights, model):
        self.gemma_cpp = gemma_cpp
        self.tokenizer = tokenizer
        self.compressed_weights = compressed_weights
        self.model = model

    def eliminate_long_dots(self, input_string):
        """Replace every run of two or more dots with a space and trim the result."""
        pattern = r'\.{2,}'
        output_string = re.sub(pattern, ' ', input_string)
        return output_string.strip()

    def beautify_string(self, input_string):
        """Trim leading non-letters and a dangling trailing fragment.

        Everything before the first ASCII letter is removed. Then, if a run of
        three or more whitespace characters is followed by a single-line
        remainder up to the end of the string, that run and the remainder are
        removed as well.
        """
        # Remove non-letter characters at the beginning of the string
        output_string = re.sub(r'^[^a-zA-Z]+', '', input_string.strip())

        # Remove the trailing fragment that follows a wide whitespace gap
        output_string = re.sub(r'\s{3,}(.+)\Z', '', output_string.strip())

        return output_string

    def generate_text(self, prompt, *args, **kwargs):
        """Generate text for `prompt` with the gemma.cpp executable.

        The prompt is fed to the executable on standard input and its output is
        read until the stream ends or a second "[ Reading prompt ]" marker
        shows up (i.e. the executable started processing another input line).

        Args:
            prompt (str): The prompt. Single and double quotes are removed
                before use, as in earlier versions of this wrapper, so the
                text echoed back in the result stays the same.
            *args, **kwargs: Accepted for interface compatibility with other
                model wrappers (e.g. max_new_tokens, temperature) and ignored.

        Returns:
            list[str]: A one-element list containing the (quote-stripped)
            prompt immediately followed by the cleaned generated text.

        Raises:
            OSError: If the executable cannot be started.
        """
        import tempfile  # local import: only needed to hand the prompt to the child

        prompt = prompt.replace('"', '').replace("'", "")

        # Use the configuration stored on this instance (not notebook globals)
        # and pass it as an argument list so no shell parses any of the values
        command = [
            self.gemma_cpp, "--",
            "--tokenizer", self.tokenizer,
            "--compressed_weights", self.compressed_weights,
            "--model", self.model,
            "--verbosity", "0",
        ]

        output_text = ""
        reading_block = "[ Reading prompt ]"

        # The prompt travels through a temporary file used as the child's stdin:
        # nothing in it is interpreted by a shell, and a long prompt cannot
        # block this process while the child is busy writing output.
        with tempfile.TemporaryFile() as prompt_file:
            prompt_file.write((prompt + "\n").encode("utf-8"))
            prompt_file.seek(0)

            # Text mode decodes incrementally, so multi-byte UTF-8 characters
            # in the model output no longer fail when read one unit at a time
            process = subprocess.Popen(
                command,
                stdin=prompt_file,
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                encoding="utf-8",
                errors="replace",
            )
            try:
                for single_char in iter(lambda: process.stdout.read(1), ''):
                    output_text += single_char
                    # Every 20 characters, check whether a second prompt-reading
                    # marker appeared; if so the first response is complete
                    if len(output_text) % 20 == 0 and output_text.count(reading_block) > 1:
                        break
            finally:
                # Stop the child if it is still generating and reap it so no
                # background process or open pipe is left behind
                if process.poll() is None:
                    process.kill()
                process.stdout.close()
                process.wait()

        # Remove the reading marker and long dot sequences, then beautify
        output_text = output_text.replace(reading_block, "")
        output_text = self.eliminate_long_dots(output_text)
        output_text = self.beautify_string(output_text)
        output_text = prompt + output_text

        return [output_text]

In [12]:

class AIAssistant():
    """Question-answering assistant backed by a text knowledge base.

    The knowledge base is embedded with a SentenceTransformer model, indexed
    with ScaNN for similarity search, and queried through
    `generate_summary_and_answer` using the supplied generation model.
    """

    def __init__(self, gemma_model, embeddings_name="thenlper/gte-large", temperature=0.4, role="expert"):
        """Store the settings and load the embedding model.

        Args:
            gemma_model: The text-generation model wrapper (transformer-based or any other).
            embeddings_name (str): Name handed to `SentenceTransformer`.
            temperature (float): Temperature forwarded on every query.
            role (str): Persona forwarded on every query.
        """
        self.gemma_model = gemma_model
        self.embeddings_name = embeddings_name
        self.temperature = temperature
        self.role = role

        # Empty until store_knowledge_base / learn_knowledge_base is called
        self.knowledge_base = []

        self.embedding_model = SentenceTransformer(self.embeddings_name)

    def store_knowledge_base(self, knowledge_base):
        """Keep a reference to the knowledge base without embedding it."""
        self.knowledge_base = knowledge_base

    def learn_knowledge_base(self, knowledge_base):
        """Store the knowledge base, embed every entry and build the search index."""
        self.store_knowledge_base(knowledge_base)

        print("Indexing and mapping the knowledge base:")
        raw_embeddings = map2embeddings(self.knowledge_base, self.embedding_model)

        # Kept as a float32 array; this is what gets indexed and saved
        self.embeddings = np.array(raw_embeddings).astype(np.float32)

        self.index_embeddings()

    def index_embeddings(self):
        """Build the ScaNN searcher over `self.embeddings`."""
        row_count = self.embeddings.shape[0]

        # Dot-product search returning 10 neighbours per query
        builder = scann.scann_ops_pybind.builder(
            db=self.embeddings, num_neighbors=10, distance_measure="dot_product"
        )
        # Partitioning tree: at most 1000 leaves, trained on every row
        builder = builder.tree(
            num_leaves=min(row_count // 2, 1000),
            num_leaves_to_search=100,
            training_sample_size=row_count,
        )
        builder = builder.score_ah(2, anisotropic_quantization_threshold=0.2)
        builder = builder.reorder(100)

        self.searcher = builder.build()

    def query(self, query):
        """Answer `query` from the knowledge base and print the answer."""
        response = generate_summary_and_answer(
            query,
            self.knowledge_base,
            self.searcher,
            self.embedding_model,
            self.gemma_model,
            temperature=self.temperature,
            role=self.role,
        )
        print(response)

    def set_temperature(self, temperature):
        """Change the temperature used for subsequent queries."""
        self.temperature = temperature

    def set_role(self, role):
        """Change the persona used for subsequent queries."""
        self.role = role

    def save_embeddings(self, filename="embeddings.npy"):
        """Write the current embeddings array to `filename` with `np.save`."""
        np.save(filename, self.embeddings)

    def load_embeddings(self, filename="embeddings.npy"):
        """Read embeddings from `filename` and rebuild the search index."""
        self.embeddings = np.load(filename)
        self.index_embeddings()

In [13]:
# Configuration for the assistant.
# NOTE: the tokenizer and weights paths are absolute paths on the author's
# machine and the executable path is relative to the working directory;
# adjust all three before running elsewhere.
embeddings_name = "thenlper/gte-large"  # name handed to SentenceTransformer
gemma_cpp = "./gemma_cpp/build/gemma"  # gemma.cpp executable
tokenizer = "/home/hunter/courses/fp/gemcp/4/tokenizer.spm"  # value for --tokenizer
compressed_weights = "/home/hunter/courses/fp/gemcp/4/2b-it-sfp.sbs"  # value for --compressed_weights
model = "2b-it"  # value for --model

# Create an instance of the class AIAssistant based on Gemma C++
# (constructing it also loads the embedding model)
gemma_ai_assistant = AIAssistant(
    gemma_model=GemmaCPP(gemma_cpp, tokenizer, compressed_weights, model),
    embeddings_name=embeddings_name
)

In [14]:

# Load the previously prepared knowledge base: one text per row, taken from
# the `wikipedia_text` column of the CSV
wikipedia_data_science_kb = pd.read_csv("wikipedia_data_science_kb.csv")
knowledge_base = wikipedia_data_science_kb.wikipedia_text.tolist()
# Hand the texts and the precomputed embeddings to the assistant. Loading the
# embeddings also rebuilds the ScaNN index. Neighbour positions returned by the
# index are later used to index `knowledge_base`, so the rows of embeddings.npy
# must line up with the entries of this list.
gemma_ai_assistant.store_knowledge_base(knowledge_base=knowledge_base)
gemma_ai_assistant.load_embeddings(filename="embeddings.npy")

2024-04-22 20:44:39.154420: I scann/partitioning/partitioner_factory_base.cc:59] Size of sampled dataset for training partition: 15968
2024-04-22 20:44:39.284236: W scann/utils/gmm_utils.cc:921] Could not normalize centroid due to zero norm or empty or zero-weight partition.
2024-04-22 20:44:40.336456: I ./scann/partitioning/kmeans_tree_partitioner_utils.h:88] PartitionerFactory ran in 1.181622412s.


In [15]:
# Example query; `query` prints the generated answer and returns nothing
gemma_ai_assistant.query("In short under 50 words , what is linear regression?")

Sure, here's a summary of the context:

Linear regression is a statistical method that relates one or more dependent variables (Y) to one or more independent variables (X). The goal is to find a linear relationship between the variables and to use this relationship to make predictions about the dependent variable.
