In [1]:
# Embedding -> Representing a chunk of text as a vector.
# This allows each chunk to live in a vector space where we can determine the semantic "closeness" between vectors.
# semantic -> अर्थपूर्ण ("meaningful") -> connected with the meaning of words and sentences

In [2]:
# Load the `dotenv` IPython extension, then run its `%dotenv` magic.
# NOTE(review): presumably this reads a local .env file to supply the OpenAI API key
# used by OpenAIEmbeddings below — confirm the file exists and is not committed.
%load_ext dotenv
%dotenv

# Indexing : Text Embedding with OpenAI

In [3]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
import numpy as np

In [4]:
# Load the course transcript from a Word file; the relative path is resolved
# against the notebook's current working directory.
# `pages` is indexed as pages[0] further down, so only its first document is used.
loader_docx = Docx2txtLoader("Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()

In [5]:
# Split on Markdown headers: "#" headings are recorded under the "Course Title"
# metadata key and "##" headings under "Lecture Title".
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Course Title"),
        ("##", "Lecture Title"),
    ]
)

# Only the text of the first loaded document is split.
pages_md_split = md_splitter.split_text(pages[0].page_content)


In [6]:
# Number of header-delimited sections produced by the Markdown split.
len(pages_md_split)

2

In [7]:
# Collapse every run of whitespace (newlines, tabs, repeated spaces) in each
# section into a single space. The documents are updated in place; re-running
# the cell is harmless because already-normalised text is left unchanged.
for section in pages_md_split:
    section.page_content = " ".join(section.page_content.split())

In [8]:
# Second-stage splitter: break the text on "." using chunk_size=500 and
# chunk_overlap=50.
char_splitter = CharacterTextSplitter(separator=".", chunk_size=500, chunk_overlap=50)

In [9]:
# Apply the character splitter to each Markdown section; the resulting chunks
# keep the section's "Course Title" / "Lecture Title" metadata (see the output below).
pages_char_split = char_splitter.split_documents(pages_md_split)

In [10]:
# Display every chunk with its metadata.
# NOTE(review): this prints the whole list; a slice such as pages_char_split[:2]
# would keep the saved notebook smaller.
pages_char_split

[Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both. So, let’s clear this up, shall we? First, we will start with analysis'),
 Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='Consider the following… You have a huge dataset containing data of various types. Instead of tackling the entire dataset and running the risk of becoming overwhelmed, you separate it into easier to digest chunks and study them individu

In [11]:
# Total number of chunks after the character-level split.
len(pages_char_split)

20

In [12]:
# Embedding client for OpenAI's "text-embedding-ada-002" model; the vectors it
# returns further down have 1536 components.
# NOTE(review): credentials are presumably picked up from the environment loaded
# by %dotenv — confirm.
embedding = OpenAIEmbeddings(model = "text-embedding-ada-002")

In [13]:
# Chunk 3: from the "Analysis vs Analytics" lecture (defines analytics and
# introduces qualitative analytics).
pages_char_split[3]

Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='Analytics is essentially the application of logical and computational reasoning to the component parts obtained in an analysis. And in doing this you are looking for patterns and exploring what you could do with them in the future. Here, analytics branches off into two areas: qualitative analytics – this is using your intuition and experience in conjunction with the analysis to plan your next business move')

In [14]:
# Chunk 5: same "Analysis vs Analytics" lecture (worked example contrasting
# qualitative and quantitative analytics).
pages_char_split[5]

Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='You may use this intuition to decide on which styles of clothing to start selling. This would be qualitative analytics. But you might not know when to introduce the new collection. In that case, relying on past sales data and user experience data, you could predict in which month it would be best to do that. This is an example of using quantitative analytics')

In [24]:
# Chunk 18: from a different lecture ("Programming Languages & Software Employed
# in Data Science"), used as the off-topic comparison point.
pages_char_split[18]

Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Programming Languages & Software Employed in Data Science - All the Tools You Need'}, page_content='More importantly, it will be sufficient for your need to create quick and accurate analyses. However, if your theoretical preparation is strong enough, you will find yourself restricted by software. Knowing a programming language such as R and Python, gives you the freedom to create specific, ad-hoc tools for each project you are working on')

In [16]:
# Embed three chunks for comparison: chunks 3 and 5 come from the same lecture,
# chunk 18 from a different one. Each call returns one embedding vector.
vector1, vector2, vector3 = (
    embedding.embed_query(pages_char_split[idx].page_content)
    for idx in (3, 5, 18)
)

In [17]:
# Display the raw embedding (a plain list of floats).
# NOTE(review): this prints all 1536 components; vector1[:10] would show the
# shape of the data without flooding the output.
vector1

[0.0036942267324775457,
 0.010471823625266552,
 0.010045194067060947,
 -0.030458787456154823,
 -0.01976718381047249,
 0.012068454176187515,
 -0.024227404966950417,
 -0.01304453145712614,
 0.010484752245247364,
 -0.027097459882497787,
 0.004770497791469097,
 0.016690276563167572,
 -0.013128564693033695,
 0.006357431411743164,
 -0.01092431042343378,
 -0.016586851328611374,
 0.036095473915338516,
 -0.0008669958915561438,
 0.020258454605937004,
 -0.02367149293422699,
 -0.043645527213811874,
 0.02253381349146366,
 -0.008629558607935905,
 -0.027588730677962303,
 -0.010012873448431492,
 0.0037265471182763577,
 0.010342542082071304,
 -0.025028951466083527,
 0.0048642270267009735,
 -0.018202874809503555,
 0.009896519593894482,
 0.0057498072274029255,
 -0.007931437343358994,
 0.0004916747566312551,
 -0.009101437404751778,
 0.0028959119226783514,
 0.011215194128453732,
 0.015837017446756363,
 0.01533281896263361,
 0.01765989139676094,
 0.0151647524908185,
 0.0008993163355626166,
 -0.0341045334935

In [18]:
# Dimensionality of each embedding (number of components per vector).
tuple(len(vec) for vec in (vector1, vector2, vector3))

(1536, 1536, 1536)

In [19]:
# Pairwise dot products, in the order (1·2, 1·3, 2·3). A larger value means the
# two chunks are semantically closer.
tuple(
    np.dot(left, right)
    for left, right in (
        (vector1, vector2),
        (vector1, vector3),
        (vector2, vector3),
    )
)

(np.float64(0.8793274798918262),
 np.float64(0.8000963686459617),
 np.float64(0.793592946863269))

In [20]:
# vectors 1 and 2 are closer to each other (dot product ≈ 0.88) than either is to vector 3 (≈ 0.80 and ≈ 0.79) in this vector space

In [21]:
# Euclidean length of each embedding. Values of ~1.0 mean the vectors are
# unit-normalised, so a dot product between them equals their cosine similarity.
tuple(np.linalg.norm(vec) for vec in (vector1, vector2, vector3))

(np.float64(0.9999999683933797),
 np.float64(1.000000015618623),
 np.float64(0.9999999688261215))