In [1]:
import os
import urllib.request


def download_file(file_link, filename):
    # Checks if the file already exists before downloading
    if not os.path.isfile(filename):
        urllib.request.urlretrieve(file_link, filename)
        print("File downloaded successfully.")
    else:
        print("File already exists.")

# Dowloading GGML model from HuggingFace
ggml_model_path = "https://huggingface.co/TheBloke/dolphin-2.6-mistral-7B-dpo-laser-GGUF/resolve/main/dolphin-2.6-mistral-7b-dpo-laser.Q4_K_M.gguf"
filename = "input/dolphin-2.6-mistral-7b-dpo-laser.Q4_K_M.gguf"

download_file(ggml_model_path, filename)

File downloaded successfully.


In [2]:
from llama_cpp import Llama

llm = Llama(model_path=filename, n_ctx=512, n_batch=126)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [3]:
def get_llm_generator(
    prompt,
    max_tokens=512,
    temperature=0,
    top_p=0.9,
    echo=False,
    stop=["[INST]"],
):
    text_generator = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        echo=echo,
        stream=True,
        stop=stop,
    )
    return text_generator

def get_llm_generation(
    prompt,
    max_tokens=512,
    temperature=0,
    top_p=0.9,
    echo=False,
    stop=["[INST]"],
):
    text_generation = llm(
        prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        echo=echo,
        stop=stop,
    )
    return text_generation["choices"][0]["text"].strip()

def generate_chat_prompt(prompt):
    system_message = (
        "You are a helpful bot that answers any questions the user may have. Only answer in short clear "
        "sentences."
    )
    chat_prompt_template = f"""<|im_start|>system
    {system_message}<|im_end|>
    <|im_start|>user
    {prompt}<|im_end|>
    <|im_start|>assistant"""
    return chat_prompt_template

def generate_skill_extraction_prompt(input):
    system = "You are a helpful bot that extracts a couple of skills the user will learn after learning what the user wants to learn. Only answer in a list of comma separated words in between curly brackets."
    chat_prompt_template = f"<s>[INST] {system} [/INST]</s>{input}"
    return chat_prompt_template

def process_skill_extraction_prompt(input, prompt):
    if "{" in input and "}" in input:
        return input[input.find("{")+1:input.find("}")]
    else:
        return prompt

In [7]:
# Test User Help Generator
prompt = generate_chat_prompt(
    "How can I learn more about AI?"
)

generator = get_llm_generator(prompt)

buffer = []
buffer_size = 10

for word in generator:
    buffer.append(word["choices"][0]["text"].strip())
    if len(buffer) >= buffer_size:
        print(" ".join(buffer))
        buffer.clear()

Llama.generate: prefix-match hit


  1 . Read books and articles on AI
.  2 . Take online courses or attend workshops
.  3 . Join AI communities and for ums
.  4 . Att end con ferences and sem
in ars .  5 . Learn programming languages used


In [105]:
# Test Skill Extraction
prompt = "I want to learn the most important languages in the world."
processed_prompt = generate_skill_extraction_prompt(
    prompt
)
output = get_llm_generation(processed_prompt)
processed_output = process_skill_extraction_prompt(output, prompt)
processed_output

Llama.generate: prefix-match hit


'English, Spanish, French, German, Italian, Portuguese, Russian, Chinese, Japanese, Korean'

In [115]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp310-cp310-win_amd64.whl.metadata (11 kB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)
Downloading scikit_learn-1.3.2-cp310-cp310-win_amd64.whl (9.3 MB)
   ---------------------------------------- 0.0/9.3 MB ? eta -:--:--
   - -------------------------------------- 0.4/9.3 MB 7.4 MB/s eta 0:00:02
   ----- ---------------------------------- 1.2/9.3 MB 12.7 MB/s eta 0:00:01
   ---------- ----------------------------- 2.5/9.3 MB 17.9 MB/s eta 0:00:01
   ---------------- ----------------------- 3.8/9.3 MB 20.1 MB/s eta 0:00:01
   ---------------- ----------------------- 3.9/9.3 MB 15.7 MB/s eta 0:00:01
   ---------------------------- ----------- 6.6/9.3 MB 20.9 MB/s eta 0:00:01
   -------------------------------------- - 8.8/9.3 MB 23.5 MB/s eta 0:00:01
   ---------------------------------------  9.2/9.3 MB 23.6 MB/s eta 0:00:01
   -------------------------

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

data = pd.read_csv("input/educations.csv")
data.head()

Unnamed: 0,EducationTitle,Description,Duration,SkillsAcquired,Level
0,Spanish Language Essentials,Immerse yourself in the basics of Spanish lang...,6 weeks,Spanish Vocabulary;Grammar;Conversational Skills,Beginner
1,Photography Fundamentals,"Learn the art of photography, including camera...",8 weeks,Camera Settings;Composition;Photo Editing,Intermediate
2,Healthy Cooking for Beginners,"Discover the fundamentals of healthy cooking, ...",10 weeks,Meal Planning;Nutrition Basics;Cooking Skills,Beginner
3,Introduction to Astronomy,"Explore the wonders of the universe, covering ...",12 weeks,Astronomy Basics;Stargazing Techniques;Celesti...,Beginner
4,Introduction to Sustainable Living,"Gain insights into sustainable practices, incl...",8 weeks,Sustainable Living Practices;Environmental Awa...,Beginner


In [21]:
# Create an word occurance matrix for each row in the dataset
# each row in the matrix corresponds to a row in the dictionary and each column corresponds to a word in the dictionary
# the value of each cell is the number of times the word appears in the sentence
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))

def get_word_occurrence_matrix(data):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data)
    df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    return df

def filter_out_stopwords(query):
    filtered_list = []
    for w in query.split():
        if w.lower() not in stop_words:
            filtered_list.append(w.lower())
    filtered_query = " ".join(filtered_list)
    return filtered_query

In [22]:
filter_out_stopwords("Introduction to Astronomy")

introduction astronomy


'introduction astronomy'

In [23]:
# Create a word occurance matrix for the dataset
data = pd.read_csv("input/educations.csv")
# create a list of a concatenation of the title and description
data["Temp"] = data["EducationTitle"] + "; " + data["SkillsAcquired"]
# filter out stopwords
data["Temp"] = data["Temp"].apply(filter_out_stopwords)
data = data["Temp"].tolist()
df = get_word_occurrence_matrix(data)
df.to_csv("input/word_matrix.csv", index=False)

spanish language essentials; spanish vocabulary;grammar;conversational skills
photography fundamentals; camera settings;composition;photo editing
healthy cooking beginners; meal planning;nutrition basics;cooking skills
introduction astronomy; astronomy basics;stargazing techniques;celestial navigation
introduction sustainable living; sustainable living practices;environmental awareness;green energy
graphic design essentials adobe creative cloud; graphic design principles;adobe creative cloud;design software
mindfulness meditation stress relief; mindfulness meditation techniques;stress management;relaxation skills
diy home gardening; plant care techniques;garden design;home vegetable cultivation
financial literacy young adults; personal budgeting;investment basics;credit management
introduction programming scratch; programming logic;scratch coding;algorithmic thinking
creative writing workshop; creative writing techniques;storytelling;genre exploration
fitness strength training; strengt