In [1]:
#Importing all the important packages 

#For accessing and fetching info from the subtitle database file. Provides an SQL interface.
import sqlite3
#For creation of dataframe and saving subtitle info as csv for future retrieval.
import pandas as pd

import io
#For working with paths/directories.
import os
import glob

#Importing packages to use transformer models for embedding 
from sentence_transformers import SentenceTransformer, util

#For caching
import joblib
from joblib import Memory

In [5]:
#Loading a Sentence-BERT model; the checkpoint name is kept in one place.
MODEL_NAME = "all-MiniLM-L6-v2"
model = SentenceTransformer(MODEL_NAME)

In [6]:
#Chunking documents into mostly 240 characters 

#Defining a function
def chunking(list_, chunk_size=200, overlap=20):
    """Split every document into overlapping character chunks.

    Each document is cut at multiples of ``chunk_size``. Every chunk is then
    widened by ``overlap`` characters on both sides (clipped at the document
    boundaries), so with the defaults most chunks are 240 characters long.

    Parameters
    ----------
    list_ : iterable of str
        Documents to split. Each document is chunked independently.
    chunk_size : int, default 200
        Distance in characters between the starts of consecutive chunks.
    overlap : int, default 20
        Number of extra characters taken before and after each chunk.

    Returns
    -------
    list of str
        The chunks of all documents, flattened into a single list in
        document order. An empty document contributes no chunks.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive or ``overlap`` is negative.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if overlap < 0:
        raise ValueError("overlap must not be negative")

    chunks = []
    for content in list_:
        # All positions are derived from this document alone, so nothing
        # carries over from a previous document in the list.
        for start in range(0, len(content), chunk_size):
            # Clip the backward overlap at 0; slicing clips the forward
            # overlap at the end of the document, which also makes the last
            # window the "tail" chunk.
            begin = max(start - overlap, 0)
            end = start + chunk_size + overlap
            chunks.append(content[begin:end])
    return chunks

In [9]:
path_1 = "data-20240401T135418Z-001/subtitle cleaned/*.srt"

#names_[k] is the file name of the document whose chunks are in all_docs[k]
names_ = []
all_docs = []
#glob returns matches in a filesystem-dependent order; sorting makes the order
#(and therefore any later slice of these lists) reproducible between runs.
for filename in sorted(glob.glob(path_1)):
    #Keep only the file name. os.path.basename handles the separator that glob
    #actually produced instead of relying on a hard-coded path prefix.
    name = os.path.basename(filename)
    names_.append(name)
    #NOTE(review): no encoding is given, so the platform default is used -
    #confirm it matches the encoding of the cleaned subtitle files.
    with open(filename, 'r') as file:
        #Line breaks are removed so the whole subtitle becomes one string.
        text = file.read().replace('\n', '')
    #chunking expects a list of documents; each file is a single document.
    chunks = chunking([text])
    #The chunk list of every file is appended to the main list
    all_docs.append(chunks)


In [10]:
cachedir = '.cache'
#NOTE(review): this Memory object is created but not used by the code below.
memory = Memory(location=cachedir, verbose=0)

#Upper bound on the number of documents to embed, and how many go in a folder
MAX_DOCS = 30000
BATCH_SIZE = 1000

#Taking only the first MAX_DOCS documents
data_final = all_docs[0:MAX_DOCS]

#One output folder per BATCH_SIZE documents
for i in range(0, len(data_final), BATCH_SIZE):
    batch = data_final[i:i + BATCH_SIZE]
    #names_ is index-aligned with all_docs, so the same slice gives the names
    batch_name = names_[i:i + BATCH_SIZE]
    target_directory = "data-20240401T135418Z-001/embeddings/batch_" + str(i) + "to" + str(i + BATCH_SIZE) + "/"
    os.makedirs(target_directory, exist_ok=True)
    for doc, name in zip(batch, batch_name):
        #Reduce the name to its last path component whichever separator it
        #contains, so the output always lands directly in target_directory.
        sub_name = os.path.basename(name.replace('\\', '/'))
        #Embedding the document (a list of chunks): one row per chunk
        embedding = model.encode(doc)
        df = pd.DataFrame(embedding)
        #Embeddings saved as CSV text in a .txt file (header row, no index)
        df.to_csv(os.path.join(target_directory, sub_name + '.txt'), index=False)

In [5]:
#Collect the names of all embedding files, with '.txt' removed from each name

path_2 = 'data-20240401T135418Z-001/embeddings'
embedded_filenames = [
    stored_name.replace('.txt', '')
    for _root, _dirs, stored_names in os.walk(path_2)
    for stored_name in stored_names
]

In [4]:
#Sanity check: how many embedding file names were collected
len(embedded_filenames)

30000

In [6]:
#One-column frame ('content_name') holding the embedded file names
df = pd.DataFrame(data={'content_name': embedded_filenames})

#Previously saved .csv file containing IDs, names, and filenames of the subtitle files
connection_csv = 'data-20240401T135418Z-001/subtitle_connection.csv'
df_1 = pd.read_csv(connection_csv)

In [7]:
#Cleaning: keep only the first row for every content_name
df_1 = df_1.drop_duplicates(subset='content_name', keep='first')

In [8]:
#Attach the connection-table columns to every embedded file name (left join)
df_2 = df.merge(df_1, on='content_name', how='left')

In [9]:
#(rows, columns) of the merged frame
df_2.shape

(29990, 3)

In [10]:
#Missing values per column; a non-zero count means the left join found no match for some file names
df_2.isnull().sum()

content_name    0
ids             0
name            0
dtype: int64

In [10]:
#Values of the 'ids' column, in the row order of df_2
ids= df_2['ids'].values
#Number of ids extracted
print(len(ids))

29990


In [2]:
import chromadb

#Chroma client that persists its data in the local 'chroma' directory
CHROMA_PATH = 'chroma'
client = chromadb.PersistentClient(path=CHROMA_PATH)

In [3]:
#Creating the collection, or reopening it if it already exists.
#The client is persistent, so a plain create_collection fails when this cell
#is run again after the collection has been written to disk.
collection = client.get_or_create_collection(
    "final_embed_data",
    metadata={"hnsw:space": "cosine"},
)

In [8]:
import numpy as np

path_2 = 'data-20240401T135418Z-001/embeddings'

#File name (without '.txt') -> subtitle id, read row by row from df_2.
#Looking ids up by name keeps every file attached to its own id, however
#os.walk orders the directories and files.
id_by_name = dict(zip(df_2['content_name'], df_2['ids']))
#Embedding files whose name has no entry in df_2 (skipped, reported below)
unmatched_files = []

for root, dirs, files in os.walk(path_2):
    for file in files:
        #Same name derivation as the one used to build 'content_name'
        content_name = file.replace('.txt', '')
        if content_name not in id_by_name:
            unmatched_files.append(os.path.join(root, file))
            continue
        id_ = id_by_name[content_name]

        #'with' closes the file handle as soon as the text is read
        with open(os.path.join(root, file)) as handle:
            lines = handle.read().split('\n')
        #Skip the header row written by to_csv and any blank trailing line
        rows = [line for line in lines[1:] if line.strip()]
        if not rows:
            #Nothing to store for this file
            continue

        #One vector per chunk of the document
        vectors = [[float(y) for y in row.split(',')] for row in rows]
        chunk_indexes = range(len(vectors))
        #Each chunk gets its own id ("<subtitle id>_<chunk index>"): records
        #added under an id that already exists are not stored, so reusing the
        #subtitle id would keep only the first chunk of every file.
        #The metadata layout ({subtitle id: chunk index}) is unchanged.
        collection.add(
            embeddings=vectors,
            ids=[f"{id_}_{idx}" for idx in chunk_indexes],
            metadatas=[{str(id_): idx} for idx in chunk_indexes],
        )

if unmatched_files:
    print(f"Skipped {len(unmatched_files)} embedding file(s) with no matching id in df_2")
        