In [1]:
# !pip install scipy
# !pip install seaborn
# !pip install matplotlib
# !pip install spacy
# !pip install scikit-learn

In [1]:
import numpy as np
from numpy.linalg import norm

from llama_cpp.llama import Llama, LlamaGrammar
from langchain.embeddings import LlamaCppEmbeddings

import pickle
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.corpus import stopwords

# English stop words from NLTK, held in a set; embed_row tests tokens against it.
stop_words = set(stopwords.words("english"))

# load input/educations.csv into pandas
df = pd.read_csv("input/educations.csv")

# randomise the order of the rows
# (disabled here; generate_embeddings_with_checkpoints shuffles its own copy of the frame)
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Last expression of the cell: displays the first five rows as the cell output.
df.head()

Unnamed: 0,EducationTitle,Description,Duration,SkillsAcquired,Level
0,Spanish Language Essentials,Immerse yourself in the basics of Spanish lang...,6 weeks,Spanish Vocabulary;Grammar;Conversational Skills,Beginner
1,Photography Fundamentals,"Learn the art of photography, including camera...",8 weeks,Camera Settings;Composition;Photo Editing,Intermediate
2,Healthy Cooking for Beginners,"Discover the fundamentals of healthy cooking, ...",10 weeks,Meal Planning;Nutrition Basics;Cooking Skills,Beginner
3,Introduction to Astronomy,"Explore the wonders of the universe, covering ...",12 weeks,Astronomy Basics;Stargazing Techniques;Celesti...,Beginner
4,Introduction to Sustainable Living,"Gain insights into sustainable practices, incl...",8 weeks,Sustainable Living Practices;Environmental Awa...,Beginner


In [2]:
# Download URL of the GGUF model weights. It is not referenced again in the
# visible cells; presumably the file was fetched manually to `filename` — TODO confirm.
ggml_model_path = "https://huggingface.co/TheBloke/dolphin-2.6-mistral-7B-dpo-laser-GGUF/resolve/main/dolphin-2.6-mistral-7b-dpo-laser.Q4_K_M.gguf"
# Local path of the model file that is actually loaded below.
filename = "input/dolphin-2.6-mistral-7b-dpo-laser.Q4_K_M.gguf"

# Embedding model; embed_row calls llm.embed_query on each cleaned text.
llm = LlamaCppEmbeddings(model_path=filename)

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [3]:
def embed_row(i, df):
    """Embed the ``SkillsAcquired`` text of one row of ``df``.

    The skills string has its ``;`` separators turned into spaces, is
    lower-cased, and has every token found in the module-level ``stop_words``
    set removed. The cleaned text is printed as a progress line and passed to
    the module-level ``llm`` embedding model.

    Args:
        i: Index label of the row to embed.
        df: DataFrame with a ``SkillsAcquired`` column holding strings.

    Returns:
        Whatever ``llm.embed_query`` returns for the cleaned text.
    """
    raw_text = df.loc[i, 'SkillsAcquired']
    # Split the ";"-separated skills apart *before* filtering. If the
    # separators are replaced afterwards, a stop word glued to one
    # (e.g. "data;the") is never compared on its own and survives the filter.
    tokens = raw_text.replace(";", " ").lower().split()
    cleaned_text = " ".join(token for token in tokens if token not in stop_words)
    print(f"Index {i} - {len(df)}: {cleaned_text}")
    return llm.embed_query(cleaned_text)

In [4]:
def generate_embeddings_with_checkpoints(df, embed_function, checkpoint_interval=10, checkpoint_path="embeddings_checkpoint.pkl", random_state=42):
    """Embed every row of ``df``, periodically saving progress to a checkpoint.

    The rows are processed in a shuffled order. The shuffle is seeded so that
    a run resumed from a checkpoint sees exactly the same order as the run
    that wrote it; with an unseeded shuffle the resumed run would embed some
    rows twice and never reach others.

    Args:
        df: DataFrame to embed; the caller's frame is not modified.
        embed_function: Callable ``(i, shuffled_df) -> embedding`` invoked
            once per row index of the shuffled, re-indexed frame.
        checkpoint_interval: A checkpoint is written whenever the number of
            collected embeddings is a multiple of this value, and after the
            last row.
        checkpoint_path: Pickle file holding ``(embeddings, next_index)``.
        random_state: Seed forwarded to ``DataFrame.sample``. Use the same
            value when resuming. ``None`` restores the old non-reproducible
            shuffle and makes resuming from a checkpoint unreliable.

    Returns:
        List of embeddings in the order of
        ``df.sample(frac=1, random_state=random_state)`` — not in the order
        of the caller's ``df``.
    """
    import os  # local import: only needed for the atomic checkpoint rename

    embeddings = []
    checkpoint_index = 0

    # randomise the dataframe rows (deterministically, so resuming is consistent)
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Attempt to load from checkpoint if exists
    try:
        with open(checkpoint_path, 'rb') as file:
            # NOTE(security): pickle.load can execute arbitrary code; only
            # point checkpoint_path at files written by this function.
            embeddings, checkpoint_index = pickle.load(file)
        print(f"Resuming from checkpoint at index {checkpoint_index}")
    except FileNotFoundError:
        print("No checkpoint found, starting from scratch")

    # After reset_index the labels are 0..len(df)-1, so this is the final label.
    last_index = len(df) - 1

    for i in df.index[checkpoint_index:]:
        embeddings.append(embed_function(i, df))

        # Save a checkpoint every 'checkpoint_interval' embeddings and at the end
        if (len(embeddings) % checkpoint_interval == 0) or (i == last_index):
            # Write to a temporary file and rename it into place so that an
            # interruption mid-write cannot leave a truncated checkpoint.
            tmp_path = f"{checkpoint_path}.tmp"
            with open(tmp_path, 'wb') as file:
                pickle.dump((embeddings, i + 1), file)
            os.replace(tmp_path, checkpoint_path)
            print(f"Checkpoint saved at index {i + 1}")

    return embeddings

In [5]:
# Embed the skills text of every row via embed_row, writing a checkpoint every
# 10 embeddings (and after the last row) to input/embeddings_checkpoint_skills.pkl.
embeddings = generate_embeddings_with_checkpoints(df, embed_row, checkpoint_interval=10, checkpoint_path="input/embeddings_checkpoint_skills.pkl")

No checkpoint found, starting from scratch
Index 0 - 308: Portuguese Language Immersion Portuguese Conversational Skills Grammar Regional Accents
Index 1 - 308: Introduction to Data Visualization with D3.js D3.js Data Binding SVG Graphics Interactive Data Visualizations
Index 2 - 308: Learn to Play Chess Chess Rules Chess Strategies Tactical Maneuvers
Index 3 - 308: Introduction to Storyboarding for Animation Visual Storytelling for Animation Character Design Scene Composition for Animation
Index 4 - 308: Introduction to Social Psychology Social Behavior Attitudes Group Dynamics Social Psychology Basics
Index 5 - 308: Mandarin Chinese Film Analysis Chinese Film Analysis Mandarin for Film Critics Cinematic Terminology
Index 6 - 308: Introduction to Indoor Gardening Indoor Plant Care Container Gardening Herb Cultivation
Index 7 - 308: Introduction to American Sign Language (ASL) ASL Vocabulary Sign Language Grammar Conversational ASL
Index 8 - 308: Artificial Intelligence Ethics AI Ethic

KeyboardInterrupt: 

In [None]:
# Create a word occurrence matrix for the dataset:
# each row in the matrix corresponds to a row in the dataset and each column corresponds to a word in the vocabulary;
# the value of each cell is the number of times the word appears in that row's text
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from nltk.corpus import stopwords

# English stop words from NLTK; filter_out_stopwords tests tokens against this set.
stop_words = set(stopwords.words("english"))

def get_word_occurrence_matrix(data):
    """Build a word-count table for a collection of text documents.

    A fresh ``CountVectorizer`` is fitted on ``data``; the resulting counts
    are converted to a dense array and wrapped in a DataFrame whose columns
    are the vectorizer's feature names.

    Args:
        data: Iterable of strings, one per document.

    Returns:
        DataFrame of counts with one column per feature name.
    """
    counter = CountVectorizer()
    counts = counter.fit_transform(data)
    vocabulary = counter.get_feature_names_out()
    return pd.DataFrame(counts.toarray(), columns=vocabulary)

def filter_out_stopwords(query):
    """Return ``query`` lower-cased with stop words removed.

    The text is split on whitespace, each token is lower-cased, tokens found
    in the module-level ``stop_words`` set are dropped, and the remaining
    tokens are joined back together with single spaces.
    """
    lowered_tokens = (token.lower() for token in query.split())
    return " ".join(token for token in lowered_tokens if token not in stop_words)

In [None]:
# Create a word occurrence matrix for the dataset
data = pd.read_csv("input/educations.csv")
# build one text per row by concatenating the title and the acquired skills
data["Temp"] = data["EducationTitle"] + "; " + data["SkillsAcquired"]
# filter out stopwords (filter_out_stopwords also lower-cases the text)
data["Temp"] = data["Temp"].apply(filter_out_stopwords)
# NOTE: `data` is rebound here from a DataFrame to a plain list of strings
data = data["Temp"].tolist()
# NOTE: this rebinds `df`, which the earlier cells use for the educations table,
# to the word matrix; re-run the first cell before re-running the embedding cells.
df = get_word_occurrence_matrix(data)
# persist the matrix without the row index
df.to_csv("input/word_matrix.csv", index=False)