# Data Preprocessing

## Loading the data from the `Data/` directory and defining a text-cleaning function

In [19]:
import os
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Fetch the NLTK resources the cleaning function relies on: a tokenizer model,
# the English stop-word list and WordNet for lemmatization.
# Recent NLTK releases load the tokenizer from 'punkt_tab' while older ones load
# 'punkt', so both are requested to keep word_tokenize working on either.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

def read_file(file, encoding='utf-8'):
    """Return the full contents of a text file as a single string.

    Args:
        file: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The decoded file contents.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in the given encoding.
    """
    with open(file, 'r', encoding=encoding) as f:
        return f.read()

def preprocess_text(text):
    """Normalize raw text into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. Drop every character that is not an ASCII letter, a digit or whitespace.
      2. Lower-case the text.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Remove English stop words.
      5. Lemmatize each remaining token with ``WordNetLemmatizer``.

    The text is cleaned as a whole. Splitting it into overlapping chunks and
    re-joining them would repeat every overlapped passage in the output.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces (an empty string when no
        token survives).
    """
    # Punctuation is removed before lower-casing/tokenizing, so contractions
    # such as "don't" become "dont" and are matched against the stop-word list
    # in that form.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)


[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
# Read the three raw note files first, then clean each one with preprocess_text.
introductory_notes, lecture_notes, pdf_texts = map(
    read_file,
    (
        "Data/introductory_notes.txt",
        "Data/lecture_notes.txt",
        "Data/pdf_extracted_texts.txt",
    ),
)

# `texts` keeps the cleaned documents in a fixed order:
# introductory notes, lecture notes, PDF extracts.
texts = [
    preprocess_text(raw_text)
    for raw_text in (introductory_notes, lecture_notes, pdf_texts)
]

# Individual names are kept because later cells write each cleaned text to disk.
(
    preprocessed_introductory_notes,
    preprocessed_lecture_notes,
    preprocessed_pdf_texts,
) = texts

In [21]:
import pandas as pd

# Load the LLM milestone table. The next cell reads its Model, Year,
# Institution, Paper Name, Authors and Abstract columns row by row.
llm_milestone_df = pd.read_csv("Data/llm_milestone.csv")

In [22]:
# Turn every CSV row into one labelled sentence and clean it with
# preprocess_text; the result has one entry per row, in row order.
llm_texts = [
    preprocess_text(
        f"Model: {record['Model']}, Year: {record['Year']}, Institution: {record['Institution']}, "
        f"Paper Name: {record['Paper Name']}, Authors : {record['Authors']}, "
        f"Abstract: {record['Abstract']}"
    )
    for _, record in llm_milestone_df.iterrows()
]

## Calculating Embeddings and preparing source files

In [30]:
from sentence_transformers import SentenceTransformer
from langchain_community.embeddings import HuggingFaceEmbeddings
import numpy as np
from langchain_community.vectorstores import FAISS

# FAISS.from_texts pairs texts and metadatas by position, so the metadata list
# is built entry-for-entry in the same order as `all_texts`. A hand-written
# list of a different length would silently attach the wrong source label.

# One entry per file-level document, in the order used by `texts`.
# NOTE(review): lecture_notes.txt and pdf_extracted_texts.txt are each embedded
# as a single document; the grouped labels assume they bundle the listed
# lectures/chapters - confirm, or split the files to attribute them separately.
file_metadata = [
    {'source' : 'Introductory Notes'},
    {'source' : 'Lecture Notes (Stanford Security Lecture, Stanford Training Lecture)'},
    {'source' : "PDF Extracted Text (MIT's Transformer Chapter, Stanford LLM Chapter, "
                "Princeton LLM Chapter, Stanford LLM and Applications Chapter)"},
]

# One entry per CSV row, labelled from the row's own Model value so the label
# always matches the text generated from that row.
csv_metadata = [
    {'source' : f"LLM Milestone CSV File Model {model_name}"}
    for model_name in llm_milestone_df['Model']
]

metadata = file_metadata + csv_metadata
all_texts = texts + llm_texts

# Fail loudly instead of indexing mislabelled documents (e.g. when `llm_texts`
# is stale relative to the DataFrame after out-of-order cell execution).
if len(metadata) != len(all_texts):
    raise ValueError(
        f"metadata has {len(metadata)} entries but there are {len(all_texts)} texts; "
        "they must match one-to-one"
    )

# NOTE(review): every text is embedded as a single vector. The all-MiniLM-L6-v2
# model card reports truncation of long inputs (256 word pieces), so large
# documents may be only partially represented - consider chunking; TODO confirm.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = FAISS.from_texts(all_texts, embeddings, metadatas=metadata)
vector_store.save_local("Data/faiss_index")



### Saving preprocessed texts

In [31]:
# Persist each cleaned document next to its raw counterpart in Data/.
for destination, cleaned_text in (
    ("Data/preprocessed_introductory_notes.txt", preprocessed_introductory_notes),
    ("Data/preprocessed_lecture_notes.txt", preprocessed_lecture_notes),
    ("Data/preprocessed_pdf_notes.txt", preprocessed_pdf_texts),
):
    with open(destination, "w") as handle:
        handle.write(cleaned_text)