In [1]:
%load_ext dotenv
%dotenv

In [2]:
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters.markdown import MarkdownHeaderTextSplitter
from langchain_text_splitters.character import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
import numpy as np

In [3]:
loader_docx = Docx2txtLoader("Introduction_to_Data_and_Data_Science_2.docx")
pages = loader_docx.load()



md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on = [('#', 'Course Title'),
                                                                ('##', 'Lecture Title')] )



pages_md_split = md_splitter.split_text(pages[0].page_content)



for i in range(len(pages_md_split)):
    pages_md_split[i].page_content = ' '.join(pages[0].page_content.split())



char_splitter = CharacterTextSplitter(
    separator = '.',
    chunk_size = 500,
    chunk_overlap = 50
)

pages_char_split = char_splitter.split_documents(pages_md_split)

In [4]:
pages_char_split


[Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='# Introduction to Data and Data Science ## Analysis vs Analytics Alright! So… Let’s discuss the not-so-obvious differences between the terms analysis and analytics. Due to the similarity of the words, some people believe they share the same meaning, and thus use them interchangeably. Technically, this isn’t correct. There is, in fact, a distinct difference between the two. And the reason for one often being used instead of the other is the lack of a transparent understanding of both'),
 Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='So, let’s clear this up, shall we? First, we will start with analysis. Consider the following… You have a huge dataset containing data of various types. Instead of tackling the entire dataset and running the risk of becoming overwhelmed, you

In [5]:
embedding = OpenAIEmbeddings(model = 'text-embedding-ada-002')

In [6]:
pages_char_split[18]

Document(metadata={'Course Title': 'Introduction to Data and Data Science', 'Lecture Title': 'Analysis vs Analytics'}, page_content='In terms of predictive analytics, EViews is mostly used for working with econometric time-series models, and Stata—for academic statistical and econometric research, where techniques like regression, cluster, and factor analysis are constantly applied. As a final note, remember the following. Should you have the relevant business and theoretical knowledge, learning a software tool is relatively easy as opposed to learning a programming language')

In [7]:
vector1 = embedding.embed_query(pages_char_split[3].page_content)
vector2 = embedding.embed_query(pages_char_split[5].page_content)
vector3 = embedding.embed_query(pages_char_split[18].page_content)

In [8]:
vector1

[-0.0009598906035535038,
 0.001555573078803718,
 0.00980286207050085,
 -0.02683161199092865,
 -0.022208597511053085,
 0.014477674849331379,
 -0.026391325518488884,
 -0.014322279952466488,
 0.002094601048156619,
 -0.01853090524673462,
 0.007187039125710726,
 0.022105000913143158,
 -0.008805741555988789,
 0.0011808434501290321,
 -0.009822286665439606,
 -0.02249348908662796,
 0.040972597897052765,
 -0.01149278786033392,
 0.01417983416467905,
 -0.019346732646226883,
 -0.035533756017684937,
 0.0159927811473608,
 -0.0017838100902736187,
 -0.024940967559814453,
 -0.01322803646326065,
 0.002340643899515271,
 0.009615092538297176,
 -0.026935208588838577,
 0.00833308044821024,
 -0.01048271730542183,
 0.007484880276024342,
 0.008825166150927544,
 -0.008864014409482479,
 -0.004597114864736795,
 -0.006562219932675362,
 -0.004917618352919817,
 0.013389906845986843,
 0.011097824200987816,
 0.017119398340582848,
 0.014529473148286343,
 0.006552507635205984,
 -0.0008109699701890349,
 -0.028515063226222

In [10]:
len(vector1)

1536

In [11]:
len(vector2)

1536

In [12]:
len(vector3)

1536

In [13]:
np.dot(vector1, vector2) , np.dot(vector1, vector3) , np.dot(vector2, vector3)

(0.8819651460848861, 0.8149513749284345, 0.8128024768485248)

In [14]:
np.linalg.norm(vector1), np.linalg.norm(vector2), np.linalg.norm(vector3)

(0.9999999970210938, 0.9999999432048748, 0.9999999243456454)