# Loading and preparing the dataset

In [1]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files="./vm_info.csv")
dataset

DatasetDict({
    train: Dataset({
        features: ['Unique_ID', 'Platform', 'Distributor', 'Description', 'Min_CPU', 'Min_RAM_GB', 'Min_Storage_GB', 'Installed_Software', 'Image_Version'],
        num_rows: 16
    })
})

In [2]:
columns = dataset.column_names
columns_to_keep = ['Platform', 'Distributor', 'Description', 'Min_CPU', 'Min_RAM_GB', 'Min_Storage_GB', 'Installed_Software', 'Image_Version']
columns_to_remove = set(columns_to_keep).symmetric_difference(columns['train'])
issues_dataset = dataset.remove_columns(columns_to_remove)
issues_dataset

DatasetDict({
    train: Dataset({
        features: ['Platform', 'Distributor', 'Description', 'Min_CPU', 'Min_RAM_GB', 'Min_Storage_GB', 'Installed_Software', 'Image_Version'],
        num_rows: 16
    })
})

In [3]:
issues_dataset.set_format(type="pandas")
df = issues_dataset['train'][:]

In [4]:
df.head()

Unnamed: 0,Platform,Distributor,Description,Min_CPU,Min_RAM_GB,Min_Storage_GB,Installed_Software,Image_Version
0,Windows,Windows Server,A general-purpose VM suitable for Windows-base...,2,4,50,"OS:Windows Server 2019, IIS:10.0, SQL Server:2019",2019
1,Linux,Ubuntu,A lightweight VM for Linux development environ...,1,2,20,"OS:Ubuntu 20.04, Apache:2.4, MySQL:8.0",20.04
2,Windows,Windows Pro,High-performance VM for Windows desktop applic...,4,8,100,"OS:Windows 10 Pro, Office:2019, Visual Studio:...",win10_pro
3,Linux,CentOS,Stable and secure VM for server applications,2,4,50,"OS:CentOS 8, Nginx:1.18, PostgreSQL:13",centos_8
4,MacOS,MacOS,High-end VM for macOS development and testing,4,16,200,"OS:macOS Catalina, Xcode:12.4, Homebrew:2.7",10.15


In [5]:
from datasets import Dataset

comments_dataset = Dataset.from_pandas(df)
comments_dataset

Dataset({
    features: ['Platform', 'Distributor', 'Description', 'Min_CPU', 'Min_RAM_GB', 'Min_Storage_GB', 'Installed_Software', 'Image_Version'],
    num_rows: 16
})

In [6]:
def concatenate_text(examples):
    return {
        "text": examples["Platform"] + " " + 
               examples["Distributor"] + " " + 
               examples["Description"] + " " + 
               "Min CPU: " + str(examples["Min_CPU"]) + " cores " + 
               "Min RAM: " + str(examples["Min_RAM_GB"]) + " GB " + 
               "Min Storage: " + str(examples["Min_Storage_GB"]) + " GB " +
               "Installed Software: " + examples["Installed_Software"] + " " + 
               "Image Version: " + examples["Image_Version"]
    }

In [7]:
comments_dataset = comments_dataset.map(concatenate_text)
comments_dataset

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Dataset({
    features: ['Platform', 'Distributor', 'Description', 'Min_CPU', 'Min_RAM_GB', 'Min_Storage_GB', 'Installed_Software', 'Image_Version', 'text'],
    num_rows: 16
})

In [8]:
from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [9]:
import torch

device = torch.device("cpu")
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [10]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [11]:
import pandas as pd

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [12]:
embeddings_dataset = comments_dataset.map(
    lambda x: {"embeddings": get_embeddings(x["text"])[0]}
)

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [13]:
embeddings_dataset.add_faiss_index(column="embeddings")

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['Platform', 'Distributor', 'Description', 'Min_CPU', 'Min_RAM_GB', 'Min_Storage_GB', 'Installed_Software', 'Image_Version', 'text', 'embeddings'],
    num_rows: 16
})

In [14]:
question = "How can I load a dataset offline?"
question_embedding = get_embeddings([question]).detach().numpy()
question_embedding.shape

(1, 768)

In [18]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [16]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [17]:
for _, row in samples_df.iterrows():
    print(f"COMMENT: {row.text}")
    print(f"SCORE: {row.scores}")
    print("=" * 50)
    print()

COMMENT: Windows Windows Server A general-purpose VM suitable for Windows-based workloads Min CPU: 2 cores Min RAM: 4 GB Min Storage: 50 GB Installed Software: OS:Windows Server 2019, IIS:10.0, SQL Server:2019 Image Version: 2019
SCORE: 57.859901428222656

COMMENT: Linux Ubuntu A lightweight VM for Linux development environments Min CPU: 1 cores Min RAM: 2 GB Min Storage: 20 GB Installed Software: OS:Ubuntu 20.04, Apache:2.4, MySQL:8.0 Image Version: 20.04
SCORE: 55.72771072387695

COMMENT: Linux CentOS Database server optimized for high-performance databases Min CPU: 4 cores Min RAM: 16 GB Min Storage: 100 GB Installed Software: OS:CentOS 8, MySQL:8.0, PostgreSQL:13 Image Version: 8
SCORE: 55.6778678894043

COMMENT: Linux CentOS Storage-optimized VM for data warehousing Min CPU: 2 cores Min RAM: 8 GB Min Storage: 500 GB Installed Software: OS:CentOS 8, Hadoop:3.2, Spark:3.0 Image Version: centos_8_storage
SCORE: 55.03107452392578

COMMENT: Linux CentOS Database server optimized for bi