In [None]:
# required imports
import os
import langchain
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
import bs4
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
import pandas as pd
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
import numpy as np
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# Process the lecture transcripts into one record per text chunk.
data = []

# The splitter holds only configuration, so build it once instead of once per file.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

# Go through each weekly lecture folder ("01_week-1" ... "12_week-12") and collect metadata.
for i in range(1, 13):
    directory = f"{i:02d}_week-{i}"

    # os.listdir returns entries in arbitrary order; sorting keeps the row order of the
    # resulting dataframes (and the CSVs written from them) reproducible between runs.
    for file in sorted(os.listdir(directory)):
        if not file.endswith(".txt"):
            continue

        file_path = os.path.join(directory, file)

        try:
            # Extract the week number and lesson name from the filename.
            # Parsing lives inside the try so that one unexpectedly named file is
            # reported and skipped like any other unreadable file, not fatal to the run.
            parts = file.split("_", 3)
            week_number = int(parts[0])
            lesson_name = parts[-1].replace(".en.txt", "").replace("-", " ").split(" ", 1)[-1]

            # Load the text file and split it into overlapping chunks
            loader = TextLoader(file_path, encoding="utf-8")
            docs = loader.load()
            splits = text_splitter.split_documents(docs)
        except Exception as e:
            # Best effort: report the problem file and carry on with the rest.
            print(f"Error loading {file_path}: {str(e)}")
            continue

        # Add each chunk and its metadata to the record list
        data.extend(
            {"Week": week_number, "Lesson": lesson_name, "Chunk": split.page_content}
            for split in splits
        )

# Final dataframe containing week, lesson, and chunk columns.
# Naming the columns explicitly keeps the schema intact even when no chunks were collected,
# so the column selection below cannot fail with a KeyError on an empty frame.
content = pd.DataFrame(data, columns=["Week", "Lesson", "Chunk"])

content.to_csv("content.csv", index=False)

# Lessons dataframe with unique lessons for each week.
# .copy() makes this an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
lessons = content[["Week", "Lesson"]].drop_duplicates().copy()

# save to csv
lessons.to_csv("lessons.csv", index=False)

content

Unnamed: 0,Week,Lesson,Chunk
0,1,introduction video from previous semester,Hello welcome to CS410 DSO Text Information Sy...
1,1,introduction video from previous semester,"news articles, or Emails and other kind of doc..."
2,1,introduction video from previous semester,the Text Retrieval and Text Mining. And these ...
3,1,introduction video from previous semester,two steps corresponding to Text Retrieval and ...
4,1,introduction video from previous semester,because those books have covered a general tec...
...,...,...,...
1236,12,8 summary for exam 2,why statistical learning is important. We also...
1237,12,8 summary for exam 2,are interested in building practical text appl...
1238,12,8 summary for exam 2,the original text to verify that. And that is ...
1239,12,8 summary for exam 2,text retrieval and text mining. And text retri...


In [None]:
#set up huggingface embeddings
# Bind the maintained implementation explicitly. The import cell imports HuggingFaceEmbeddings
# twice, and the later import (from the deprecated `langchain.embeddings` path) is the one the
# bare name resolves to -- presumably the source of the warning emitted on construction.
from langchain_huggingface import HuggingFaceEmbeddings

device = "cpu"

model_name = "Alibaba-NLP/gte-large-en-v1.5"
# trust_remote_code is forwarded to the model loader together with the target device.
model_kwargs = {"device": device, "trust_remote_code": True}
# Options forwarded to the encoder on every embed call: vectors are left un-normalised
# and texts are encoded in batches of 512.
encode_kwargs = {
    "normalize_embeddings": False,
    "batch_size": 512
}
# initialize
ali = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

  ali = HuggingFaceEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# generate embeddings for Chunk column and save to csv
# embed_documents takes a list of strings, so pass a plain list rather than relying on a
# pandas Series happening to iterate the same way.
content["vectors"] = ali.embed_documents(content["Chunk"].tolist())
content.to_csv("content_vectors.csv", index=False)

In [None]:
# generate embeddings for Lesson column and save to csv
# `lessons` was derived from `content` by selection + drop_duplicates, so assigning a column
# onto it directly can raise SettingWithCopyWarning; assign() builds a fresh frame instead.
# embed_documents takes a list of strings, hence the explicit tolist().
lessons = lessons.assign(vectors=ali.embed_documents(lessons["Lesson"].tolist()))
lessons.to_csv("lessons_vectors.csv", index=False)

In [None]:
# function to create embeddings table used for vector store
# pc_col = custom user-defined column name for column that gets converted to embeddings within 'df'

def vector_store_faiss(df, embeddings, embeddings_model, pc_col = False, metadata_cols = False):
    """Build a FAISS vector store from a dataframe and its precomputed embeddings.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per document to store.
    embeddings : iterable
        One precomputed embedding vector per row of ``df``, in row order.
    embeddings_model :
        Passed straight through to ``FAISS.from_embeddings`` as its embedding argument.
    pc_col : column label, optional
        Column of ``df`` whose values are stored as the document text. ``False``
        (the default) or ``None`` selects the ``'Concatenated'`` column.
    metadata_cols : column label or list of labels, optional
        Column(s) of ``df`` stored as per-document metadata. ``False`` (the default)
        or ``None`` stores every column of ``df``.

    Returns
    -------
    The vector store returned by ``FAISS.from_embeddings``.

    Raises
    ------
    ValueError
        If ``df`` has no rows, or if the number of embeddings differs from the
        number of rows.
    """
    # Identity checks instead of `== False`: a column labelled 0 compares equal to False
    # and would be mistaken for "not provided", and an array-like of column names would
    # turn the comparison into an ambiguous element-wise result.
    use_default_text = pc_col is False or pc_col is None
    use_all_columns = metadata_cols is False or metadata_cols is None

    # converting metadata to dictionary for FAISS
    if use_all_columns:
        metadata_frame = df
    else:
        metadata_frame = df[metadata_cols]
        # A single label selects a Series; promote it so to_dict(orient='records') works.
        if metadata_frame.ndim == 1:
            metadata_frame = metadata_frame.to_frame()
    metadata = metadata_frame.to_dict(orient='records')

    # extract text data: 'Concatenated' column unless pc_col names another one
    text_col = 'Concatenated' if use_default_text else pc_col
    texts = df[text_col].tolist()

    # Materialise the vectors so their count can be checked; zip() would otherwise
    # silently drop whatever does not pair up.
    vectors = list(embeddings)
    if len(texts) != len(vectors):
        raise ValueError(
            f"Got {len(vectors)} embeddings for {len(texts)} rows; expected one embedding per row."
        )
    if not texts:
        raise ValueError("Cannot build a vector store from an empty dataframe.")

    # pair text data with embeddings
    text_embedding_pairs = list(zip(texts, vectors))

    # create table
    vector_store = FAISS.from_embeddings(text_embedding_pairs, embeddings_model, metadatas = metadata)

    return vector_store


In [None]:
# Build the two FAISS stores. In both cases the 'vectors' column is removed from the frame
# that supplies text and metadata, and is handed over separately as a plain list of embeddings.

# chunk-level store: stored text comes from the 'Chunk' column
vs_content = vector_store_faiss(
    df=content.drop(columns=['vectors']),
    embeddings=content['vectors'].tolist(),
    embeddings_model=ali,
    pc_col='Chunk',
)

# lesson-level store: stored text comes from the 'Lesson' column
vs_lessons = vector_store_faiss(
    df=lessons.drop(columns=['vectors']),
    embeddings=lessons['vectors'].tolist(),
    embeddings_model=ali,
    pc_col='Lesson',
)

In [None]:
# smoke test for the chunk-level store: look up the nearest chunks to a course-topic phrase
# (k=4 spells out the library default number of results)
vs_content.similarity_search(query='text retrieval and text mining', k=4)

[Document(metadata={'Week': 12, 'Lesson': '8 summary for exam 2', 'Chunk': "text retrieval and text mining. And text retrieval, as I explained,\nis to help convert big text data into a small amount of most relevant data for\na particular problem, and can also help providing knowledge provenance,\nhelp interpreting patterns later. Text mining has to do with further\nanalyzing the relevant data to discover the actionable knowledge that can be\ndirectly useful for decision making or many other tasks. So this course covers text mining. And there's a companion course\ncalled Text Retrieval and Search Engines that covers text retrieval. If you haven't taken that course,\nit would be useful for you to take it, especially if you are interested\nin building a text caching system. And taking both courses will give you\na complete set of practical skills for building such a system. So in [INAUDIBLE]\nI just would like to thank you for taking this course. I hope you have learned useful knowledge\n

In [None]:
# smoke test for the lesson-level store: query with an exact lesson title
# (k=4 spells out the library default number of results)
vs_lessons.similarity_search(query='introduction video from previous semester', k=4)

[Document(metadata={'Week': 1, 'Lesson': 'introduction video from previous semester'}, page_content='introduction video from previous semester'),
 Document(metadata={'Week': 5, 'Lesson': '5 6 link analysis part 1'}, page_content='5 6 link analysis part 1'),
 Document(metadata={'Week': 6, 'Lesson': '6 10 summary for exam 1'}, page_content='6 10 summary for exam 1'),
 Document(metadata={'Week': 9, 'Lesson': '9 latent dirichlet allocation lda part 1'}, page_content='9 latent dirichlet allocation lda part 1')]