In [3]:
import pandas as pd
import requests 

print(pd.__version__)

2.2.3


In [4]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [9]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [None]:
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])

## Vector spaces

- turn the docs into vectors
- term-document matrix:
    - rows: documents
    - columns: wrods/tokens
- bag of words:
    - word order is lost
    - sparse matrix 

## Vector Spaces Example Start: -----------------------------------------------------------

In [58]:
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]
cv_vect_spaces_test = CountVectorizer(stop_words='english')
cv_vect_spaces_test.fit(docs_example)

In [59]:
cv_vect_spaces_test.get_feature_names_out()


array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [60]:
cv_vect_spaces_test.get_feature_names_out().shape

(19,)

In [61]:
X = cv_vect_spaces_test.transform(docs_example)

In [62]:
pd.DataFrame(X.todense(), columns=cv_vect_spaces_test.get_feature_names_out()).T

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


## Vector Spaces Example End: -----------------------------------------------------------

## TF-IDF Spaces Example Start: -----------------------------------------------------------

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv_tf_idf_test = TfidfVectorizer(stop_words='english')
X_tf_idf_test = cv_tf_idf_test.fit_transform(docs_example)

names_tf_idf_test = cv_tf_idf_test.get_feature_names_out()

df_docs = pd.DataFrame(X_tf_idf_test.toarray(), columns=names_tf_idf_test).T
df_docs
# df_docs.round(2)

Unnamed: 0,0,1,2,3,4
15th,0.463693,0.0,0.0,0.0,0.0
2024,0.463693,0.0,0.0,0.0,0.0
cloud,0.0,0.0,0.0,0.0,0.463693
course,0.374105,0.0,0.0,0.0,0.374105
date,0.0,0.0,0.5,0.0,0.0
github,0.0,0.57735,0.0,0.0,0.0
google,0.0,0.0,0.0,0.0,0.463693
homeworks,0.0,0.0,0.5,0.0,0.0
jan,0.463693,0.0,0.0,0.0,0.0
listed,0.0,0.57735,0.0,0.0,0.0


In [81]:
query_tf_idf_test = "Do I need to know python to sign up for the January course?"

q_tf_idf_test = cv.transform([query_tf_idf_test])
q_tf_idf_test.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]], shape=(1, 1333))

In [82]:
query_dict_tf_idf_test = dict(zip(names_tf_idf_test, q_tf_idf_test.toarray()[0]))
query_dict_tf_idf_test

{'15th': np.float64(0.0),
 '2024': np.float64(0.0),
 'cloud': np.float64(0.0),
 'course': np.float64(0.0),
 'date': np.float64(0.0),
 'github': np.float64(0.0),
 'google': np.float64(0.0),
 'homeworks': np.float64(0.0),
 'jan': np.float64(0.0),
 'listed': np.float64(0.0),
 'participation': np.float64(0.0),
 'prerequisites': np.float64(0.0),
 'python': np.float64(0.0),
 'registration': np.float64(0.0),
 'required': np.float64(0.0),
 'setup': np.float64(0.0),
 'start': np.float64(0.0),
 'starts': np.float64(0.0),
 'submit': np.float64(0.0)}

In [83]:
doc_dict_tf_idf_test = dict(zip(names_tf_idf_test, X_tf_idf_test.toarray()[1]))
doc_dict_tf_idf_test

{'15th': np.float64(0.0),
 '2024': np.float64(0.0),
 'cloud': np.float64(0.0),
 'course': np.float64(0.0),
 'date': np.float64(0.0),
 'github': np.float64(0.5773502691896258),
 'google': np.float64(0.0),
 'homeworks': np.float64(0.0),
 'jan': np.float64(0.0),
 'listed': np.float64(0.5773502691896258),
 'participation': np.float64(0.0),
 'prerequisites': np.float64(0.5773502691896258),
 'python': np.float64(0.0),
 'registration': np.float64(0.0),
 'required': np.float64(0.0),
 'setup': np.float64(0.0),
 'start': np.float64(0.0),
 'starts': np.float64(0.0),
 'submit': np.float64(0.0)}

## TF-IDF Spaces Example End: -----------------------------------------------------------

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer

cv = TfidfVectorizer(stop_words='english', min_df=5)
X = cv.fit_transform(df.text)

In [65]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23808 stored elements and shape (948, 1333)>

In [95]:
# query = "Do I need to know python to sign up for the January course?"
query = "I just discovered the course, is it too late to join?"

q = cv.transform([query])
q.toarray()

array([[0., 0., 0., ..., 0., 0., 0.]], shape=(1, 1333))

In [96]:
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

{'15th': np.float64(0.0),
 '2024': np.float64(0.0),
 'cloud': np.float64(0.0),
 'course': np.float64(0.0),
 'date': np.float64(0.0),
 'github': np.float64(0.0),
 'google': np.float64(0.0),
 'homeworks': np.float64(0.0),
 'jan': np.float64(0.0),
 'listed': np.float64(0.0),
 'participation': np.float64(0.0),
 'prerequisites': np.float64(0.0),
 'python': np.float64(0.0),
 'registration': np.float64(0.0),
 'required': np.float64(0.0),
 'setup': np.float64(0.0),
 'start': np.float64(0.0),
 'starts': np.float64(0.0),
 'submit': np.float64(0.0)}

In [97]:
doc_dict = dict(zip(names, X.toarray()[2]))
doc_dict

{'15th': np.float64(0.0),
 '2024': np.float64(0.0),
 'cloud': np.float64(0.0),
 'course': np.float64(0.0),
 'date': np.float64(0.0),
 'github': np.float64(0.0),
 'google': np.float64(0.0),
 'homeworks': np.float64(0.0),
 'jan': np.float64(0.0),
 'listed': np.float64(0.0),
 'participation': np.float64(0.0),
 'prerequisites': np.float64(0.0),
 'python': np.float64(0.0),
 'registration': np.float64(0.0),
 'required': np.float64(0.0),
 'setup': np.float64(0.0),
 'start': np.float64(0.0),
 'starts': np.float64(0.0),
 'submit': np.float64(0.0)}

In [98]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# X.dot(q.T).todense() # cosin similarity
score = cosine_similarity(X, q).flatten()
np.argsort(score)[-5:]

array([ 22, 448, 449, 440,   0])

In [104]:
df.iloc[449].text

'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.'

In [109]:
fields = ['section', 'question', 'text']

matrices = {}
vectorizers = {}

for f in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    X = cv.fit_transform(df[f])
    matrices[f] = X
    vectorizers[f] = cv

In [110]:
matrices

{'section': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3090 stored elements and shape (948, 66)>,
 'question': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 3431 stored elements and shape (948, 291)>,
 'text': <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 23808 stored elements and shape (948, 1333)>}

In [108]:
vectorizers

{'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

In [111]:
n = len(df)
n

948

In [133]:
score = np.zeros(n)

query = "I just discovered this course, is it too late to join?"
boosts = {
    'question': 3,
    # 'text': 0.5
}

for f in fields:
    q = vectorizers[f].transform([query])
    X = matrices[f]

    f_score = cosine_similarity(X, q).flatten()

    boost = boosts.get(f, 1.0)
    
    score = score + boost * f_score

In [134]:
filters = {
    'course': 'data-engineering-zoomcamp'
}

In [135]:
for field, value in filters.items():
    mask = (df[field] == value).astype(int).values
    score = score * mask

# mask
score

array([3.52985023, 3.49512426, 2.70735166, 2.96614194, 3.49512426,
       3.49512426, 1.93689291, 3.67069698, 2.67242848, 3.49512426,
       3.10198469, 2.46096752, 0.49512426, 0.49512426, 0.49512426,
       0.59193348, 0.49512426, 2.63772182, 0.57041627, 0.49512426,
       0.49512426, 0.49512426, 0.79499188, 0.60033101, 0.49512426,
       0.49512426, 0.49512426, 0.76959902, 0.62340833, 0.49512426,
       0.49512426, 0.49512426, 0.49512426, 1.78972334, 3.49512426,
       1.72080809, 0.49512426, 0.49512426, 0.49512426, 0.52668735,
       0.54427244, 2.00115141, 0.49512426, 0.53842198, 0.        ,
       0.        , 0.        , 0.        , 0.02804374, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.06739038, 0.        , 0.00980845,
       0.        , 0.        , 0.        , 0.        , 0.05820102,
       0.        , 0.        , 0.        , 0.        , 0.     

In [145]:
idx = np.argsort(-score)[:5]

In [146]:
df.iloc[idx]

Unnamed: 0,course,section,question,text
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
5,data-engineering-zoomcamp,General course-related questions,Course - how many Zoomcamps in a year?,"There are 3 Zoom Camps in a year, as of 2024. ..."


In [147]:
class TextSearch:

    def __init__(self, text_fields):
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params={}):
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost={}, filters={}):
        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        for field, value in filters.items():
            mask = (self.df[field] == value).values
            score = score * mask

        idx = np.argsort(-score)[:n_results]
        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [148]:
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(documents)

index.search(
    query='I just signed up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

# Apply SVD - Singula Value Decomposition to reduce the dimension of the X

In [160]:
X.shape

(948, 1333)

In [153]:
from sklearn.decomposition import TruncatedSVD

X = matrices['text']
cv = vectorizers['text']

In [154]:
cv

In [156]:
svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

In [159]:
X_emb.shape

(948, 16)

In [158]:
X_emb[0]

array([ 0.09652996, -0.08213711, -0.10111203, -0.07852638,  0.06875113,
       -0.05981069,  0.02876635, -0.13918519, -0.21409283,  0.29397564,
        0.04713034, -0.0279275 ,  0.0892118 ,  0.12138612, -0.05970725,
       -0.00743149])

In [161]:
query = 'I just signed up. Is it too late to join the course?'

Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.05790187, -0.03859532, -0.05610325, -0.02748895,  0.03992146,
       -0.06204425,  0.01604323, -0.09179959, -0.1507916 ,  0.18916727,
        0.03821981, -0.04277012,  0.06632205,  0.09439906, -0.03195253,
       -0.02218884])

In [164]:
Q.shape

(1, 1333)

In [165]:
Q_emb.shape

(1, 16)

In [166]:
np.dot(X_emb[0], Q_emb[0])

np.float64(0.1466227830805103)

In [168]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
df.loc[idx]

Unnamed: 0,course,section,question,text
764,machine-learning-zoomcamp,Projects (Midterm and Capstone),What If I submitted only two projects and fail...,If you have submitted two projects (and peer-r...
451,machine-learning-zoomcamp,General course-related questions,Can I submit the homework after the due date?,"No, it’s not possible. The form is closed afte..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
814,mlops-zoomcamp,+-General course questions,What if my answer is not exactly the same as t...,Please choose the closest one to your answer. ...
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
15,data-engineering-zoomcamp,General course-related questions,Homework - Are late submissions of homework al...,"No, late submissions are not allowed. But if t..."
11,data-engineering-zoomcamp,General course-related questions,Certificate - Can I follow the course in a sel...,"No, you can only get a certificate if you fini..."
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
436,machine-learning-zoomcamp,General course-related questions,Is it going to be live? When?,"The course videos are pre-recorded, you can st..."
440,machine-learning-zoomcamp,General course-related questions,"I filled the form, but haven't received a conf...","The process is automated now, so you should re..."


# Apply NMF -  Non-Negative Matrix Factorization to reduce the dimension of the X

In [169]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.129036  , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.00055247, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [170]:
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.08583686, 0.00235429, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.002463  , 0.        ,
       0.        ])

In [171]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
df.loc[idx]

Unnamed: 0,course,section,question,text
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
449,machine-learning-zoomcamp,General course-related questions,The course has already started. Can I still jo...,"Yes, you can. You won’t be able to submit some..."
814,mlops-zoomcamp,+-General course questions,What if my answer is not exactly the same as t...,Please choose the closest one to your answer. ...
11,data-engineering-zoomcamp,General course-related questions,Certificate - Can I follow the course in a sel...,"No, you can only get a certificate if you fini..."
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
764,machine-learning-zoomcamp,Projects (Midterm and Capstone),What If I submitted only two projects and fail...,If you have submitted two projects (and peer-r...
7,data-engineering-zoomcamp,General course-related questions,Course - Can I follow the course after it fini...,"Yes, we will keep all the materials after the ..."
451,machine-learning-zoomcamp,General course-related questions,Can I submit the homework after the due date?,"No, it’s not possible. The form is closed afte..."
436,machine-learning-zoomcamp,General course-related questions,Is it going to be live? When?,"The course videos are pre-recorded, you can st..."
450,machine-learning-zoomcamp,General course-related questions,When does the next iteration start?,The course is available in the self-paced mode...


# Using BERT (Bidirectional Encoder Representations from Transformers)

In [172]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [173]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
encoded_input

{'input_ids': tensor([[  101,  2748,  1010,  2057,  2097,  2562,  2035,  1996,  4475,  2044,
          1996,  2607, 12321,  1012,   102],
        [  101,  2017,  2064,  3582,  1996,  2607,  2012,  2115,  2219,  6393,
          2044,  2009, 12321,   102,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]])}

In [174]:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

In [175]:
hidden_states.shape

torch.Size([2, 15, 768])

In [178]:
hidden_states[0].shape

torch.Size([15, 768])

In [179]:
hidden_states[1].shape

torch.Size([15, 768])

In [180]:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([2, 768])

In [182]:
sentence_embeddings # Embeding with mean for hidden_states[0] and hidden_states[1]

tensor([[ 0.3600, -0.1607,  0.3545,  ...,  0.0429,  0.0348, -0.0382],
        [ 0.1785, -0.5000,  0.2528,  ..., -0.1141, -0.3361,  0.4110]])

In [183]:
X_emb = sentence_embeddings.numpy()

In [184]:
X_emb.shape

(2, 768)

In [185]:
X_emb

array([[ 0.35999215, -0.16072343,  0.35452336, ...,  0.04289271,
         0.03482307, -0.0382222 ],
       [ 0.17849936, -0.50002533,  0.25277546, ..., -0.11413094,
        -0.33608466,  0.4109513 ]], shape=(2, 768), dtype=float32)

In [186]:
sentence_embeddings_cpu = sentence_embeddings.cpu()

In [187]:
sentence_embeddings_cpu.shape

torch.Size([2, 768])

In [188]:
sentence_embeddings_cpu

tensor([[ 0.3600, -0.1607,  0.3545,  ...,  0.0429,  0.0348, -0.0382],
        [ 0.1785, -0.5000,  0.2528,  ..., -0.1141, -0.3361,  0.4110]])

In [189]:
def make_batches(seq, n):
    result = []
    for i in range(0, len(seq), n):
        batch = seq[i:i+n]
        result.append(batch)
    return result

In [190]:
from tqdm.auto import tqdm
texts = df['text'].tolist()
text_batches = make_batches(texts, 8)

all_embeddings = []

for batch in tqdm(text_batches):
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**encoded_input)
        hidden_states = outputs.last_hidden_state
        
        batch_embeddings = hidden_states.mean(dim=1)
        batch_embeddings_np = batch_embeddings.cpu().numpy()
        all_embeddings.append(batch_embeddings_np)

final_embeddings = np.vstack(all_embeddings)

  0%|          | 0/119 [00:00<?, ?it/s]

In [191]:
final_embeddings.shape

(948, 768)

In [192]:
final_embeddings

array([[-0.00456325, -0.11667515,  0.6274717 , ..., -0.03659188,
         0.10031687,  0.02927116],
       [-0.14233607, -0.19853897,  0.28455406, ..., -0.01139052,
        -0.15399775,  0.0953509 ],
       [ 0.19672215, -0.08461291,  0.28200477, ...,  0.11395848,
        -0.06448034, -0.01282625],
       ...,
       [-0.28217435, -0.3332434 ,  0.29785013, ..., -0.3504273 ,
         0.03266073,  0.09537268],
       [-0.4280712 , -0.39468765,  0.30942   , ..., -0.05943285,
        -0.12965178,  0.0788708 ],
       [-0.16892141, -0.25146288,  0.47843295, ..., -0.18535407,
        -0.16108926,  0.27272934]], shape=(948, 768), dtype=float32)

In [193]:
def compute_embeddings(texts, batch_size=8):
    text_batches = make_batches(texts, 8)
    
    all_embeddings = []
    
    for batch in tqdm(text_batches):
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    
        with torch.no_grad():
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state
            
            batch_embeddings = hidden_states.mean(dim=1)
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)
    
    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings

In [194]:
X_text = compute_embeddings(df['text'].tolist())

  0%|          | 0/119 [00:00<?, ?it/s]

In [195]:
X_text

array([[-0.00456325, -0.11667515,  0.6274717 , ..., -0.03659188,
         0.10031687,  0.02927116],
       [-0.14233607, -0.19853897,  0.28455406, ..., -0.01139052,
        -0.15399775,  0.0953509 ],
       [ 0.19672215, -0.08461291,  0.28200477, ...,  0.11395848,
        -0.06448034, -0.01282625],
       ...,
       [-0.28217435, -0.3332434 ,  0.29785013, ..., -0.3504273 ,
         0.03266073,  0.09537268],
       [-0.4280712 , -0.39468765,  0.30942   , ..., -0.05943285,
        -0.12965178,  0.0788708 ],
       [-0.16892141, -0.25146288,  0.47843295, ..., -0.18535407,
        -0.16108926,  0.27272934]], shape=(948, 768), dtype=float32)