# Overview

We are going to calculate the embeddings for the whole datasets.

In [1]:
!pip install -U -q transformers==4.39.3
!pip install -U -q datasets==2.18.0

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 24.4.1 requires cubinlinker, which is not installed.
cudf 24.4.1 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.4.1 requires ptxcompiler, which is not installed.
cuml 24.4.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 24.4.1 requires cupy-cuda11x>=12.0.0, which is not installed.
cudf 24.4.1 requires cuda-python<12.0a0,>=11.7.1, but you have cuda-python 12.5.0 which is incompatible.
distributed 2024.1.1 requires dask==2024.1.1, but you have dask 2024.5.2 which is incompatible.
gcsfs 2024.3.1 requires fsspec==2024.3.1, but you have fsspec 2024.2.0 which is incompatible.
rapids-dask-dependency 24.4.1a0 requires dask==2024.1.1, but you have dask 2024.5.2 which is incompatible.
rapids-dask-dependency 24.4.1a0 requires dask-expr==0.4.0, but you have dask-expr 1.1.2 which is

In [2]:
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

user_secrets = UserSecretsClient()
login(token=user_secrets.get_secret("HUGGINGFACE_TOKEN"))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
import pandas as pd
import numpy as np

df=pd.read_csv("/kaggle/input/rmit-2024-postgraduate-study-areas/RMIT 2024 Postgraduate Study Areas.csv")
df.head(2)

Unnamed: 0,Course Name,Study area,Study type,Location,Entry Score,Prerequisites,Student type,Learning mode,Duration,Fees,Next intake,Course Page
0,Master of Architecture,Architecture,PG,Melbourne City,,,International,On campus,Full-time 2 years,48000.0,"Februray, July",https://www.rmit.edu.au/study-with-us/levels-o...
1,Master of Interior Design,Architecture,PG,Melbourne City,,,International,On campus,Full-time 2 years,46080.0,"Februray, July",https://www.rmit.edu.au/study-with-us/levels-o...


In [4]:
def compile_text(x):
    text=f"""
        Course Name: {x.get("Course Name")},
        Study Area: {x.get("Study area")},
        Study type: {x.get("Study type")},
        Location: {x.get("Location")},
        Entry Score: {x.get("Entry Score")},
        Prerequisites: {x.get("Prerequisites")},
        Student type: {x.get("Student type")},
        Learning mode: {x.get("Learning mode")},
        Duration: {x.get("Duration")},
        Fees: {x.get("Fees")},
        Next intake: {x.get("Next intake")},
        Course Page: {x.get("Course Page")}
        """
    return text

df=df.apply(lambda x: compile_text(x), axis=1)
df.info()

<class 'pandas.core.series.Series'>
RangeIndex: 234 entries, 0 to 233
Series name: None
Non-Null Count  Dtype 
--------------  ----- 
234 non-null    object
dtypes: object(1)
memory usage: 2.0+ KB


In [5]:
import os
from transformers import AutoTokenizer

os.environ["MODEL_NAME"]="sentence-transformers/all-MiniLM-L6-v2"

# https://github.com/huggingface/transformers/blob/v4.40.1/src/transformers/tokenization_utils.py#L335
tokenizer=AutoTokenizer.from_pretrained(os.getenv("MODEL_NAME"))



tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [6]:
from transformers import AutoModel

# https://www.sbert.net/examples/applications/computing-embeddings/README.html#input-sequence-length
model=AutoModel.from_pretrained(os.getenv("MODEL_NAME"))
model.max_seq_length=200
model.to("cuda")

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
    

In [7]:
df_list=df.to_list()

encoded_input=tokenizer(df_list[0], padding=True, truncation=True, max_length=200, return_tensors='pt')
encoded_input.to('cuda')
type(encoded_input)

transformers.tokenization_utils_base.BatchEncoding

In [8]:
import torch

with torch.no_grad():
    output=model(**encoded_input)

In [9]:
def mean_pooling(model_output, attention_mask):
    # The first element of model_output contains all token embeddings
    token_embeddings=model_output[0]
    input_mask_expanded=attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings=torch.sum(token_embeddings*input_mask_expanded,1)
    sum_mask=torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings/sum_mask

sentence_embeddings=mean_pooling(output,encoded_input['attention_mask'])

print(sentence_embeddings.shape)

torch.Size([1, 384])


In [11]:
import torch
from tqdm import tqdm


def embedding_with_batch(df_list, batch_size):
    embedding_list=[]
    for i in tqdm(range(0, len(df_list), batch_size)):
        sentences=df_list[i:i+batch_size]
        encoded_input=tokenizer(sentences, padding=True, truncation=True, max_length=200, return_tensors='pt').to('cuda')
        with torch.no_grad():
            output=model(**encoded_input)
        sentence_embeddings=mean_pooling(output,encoded_input['attention_mask'])
        embedding_list.append(sentence_embeddings)
    return embedding_list

em_list=embedding_with_batch(df_list,50)

100%|██████████| 5/5 [00:00<00:00, 40.81it/s]


In [12]:
def build_embedding_tensor(embedding_list):
    combined_embedding=None
    
    for embedding in embedding_list:
        if combined_embedding is None:
            combined_embedding=embedding
        else:
            combined_embedding=torch.cat((combined_embedding, embedding), dim=0)
    return combined_embedding

combined_embedding=build_embedding_tensor(em_list)
combined_embedding.shape

torch.Size([234, 384])

In [13]:
combined_embedding.shape[0]==len(df_list)

True

In [14]:
combined_embedding=combined_embedding.to('cpu')

corpus_embedding=pd.DataFrame(combined_embedding.numpy())

In [15]:
# Combine the DataFrames using pd.concat
combined_df = pd.concat([df, corpus_embedding], axis=1)

In [17]:
combined_df.head(5)

Unnamed: 0,0,0.1,1,2,3,4,5,6,7,8,...,374,375,376,377,378,379,380,381,382,383
0,"\n Course Name: Master of Architecture,...",0.176837,-0.050511,-0.024134,-0.039144,-0.05636,-0.003315,-0.17175,-0.064779,-0.158276,...,-0.021173,0.122215,0.000262,-0.219264,-0.05892,-0.090593,0.221892,-0.183735,-0.304098,-0.021204
1,\n Course Name: Master of Interior Desi...,0.152464,-0.092725,0.066123,0.034264,-0.073175,0.027105,-0.137137,-0.041486,-0.187823,...,-0.056002,0.089968,0.004738,-0.191967,-0.076269,-0.077909,0.18284,-0.15041,-0.249122,0.033729
2,\n Course Name: Master of Landscape Arc...,0.161379,-0.00096,0.056005,-0.040425,-0.009669,-0.035189,-0.208946,-0.087116,-0.148352,...,-0.01974,0.141001,0.030829,-0.250756,-0.060055,-0.072253,0.180991,-0.188001,-0.299528,-0.012939
3,"\n Course Name: Master of Urban Design,...",0.151037,-0.085215,0.088451,0.013106,-0.023892,0.021178,-0.160584,-0.076862,-0.224178,...,-0.016173,0.093891,-0.021906,-0.216543,-0.113011,-0.091952,0.18927,-0.086224,-0.272116,0.042567
4,\n Course Name: Master of Design (Aechi...,0.086767,-0.007952,-0.006016,-0.008897,-0.024392,-0.047597,-0.15104,0.002068,-0.194113,...,-0.029882,0.125413,0.039139,-0.249836,-0.015872,-0.114328,0.204139,-0.081095,-0.279364,-0.012074


In [19]:
combined_df.to_csv('RMIT 2024 PD Study Areas.csv', index=False)

In [20]:
from datasets import load_dataset

ds=load_dataset('csv', data_files='/kaggle/working/RMIT 2024 PD Study Areas.csv')
ds.push_to_hub('aisuko/RMIT-2024-pd-study-areas')

Generating train split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/aisuko/RMIT-2024-pd-study-areas/commit/d4183ba616f9014255f23fcae025a16215e837a3', commit_message='Upload dataset', commit_description='', oid='d4183ba616f9014255f23fcae025a16215e837a3', pr_url=None, pr_revision=None, pr_num=None)