In [1]:
# Import necessary libraries
import os
import pandas as pd
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from dotenv import load_dotenv

# Load environment variables
load_dotenv()
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Function to extract text line by line from a PDF
def extract_text_from_pdf(pdf_path):
    reader = PdfReader(pdf_path)
    text_lines = []
    
    # Loop through the pages and extract text line by line
    for page in reader.pages:
        page_text = page.extract_text().split('\n')  # Split the text by lines
        text_lines.extend(page_text)
    
    return text_lines

In [3]:
def generate_embeddings(text_lines):
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    embedded_lines = [embeddings.embed_query(line) for line in text_lines]
    return embedded_lines

In [4]:
def process_pdfs_in_directory(directory_path):
    pdf_files = [f for f in os.listdir(directory_path) if f.endswith('.pdf')]
    all_data = []

    for pdf_file in pdf_files:
        pdf_path = os.path.join(directory_path, pdf_file)
        text_lines = extract_text_from_pdf(pdf_path)
        embeddings = generate_embeddings(text_lines)
        
        # Store data in a list of dictionaries
        for line, embedding in zip(text_lines, embeddings):
            all_data.append({"PDF_File": pdf_file, "Text_Line": line, "Embedding": embedding})
    
    return all_data

In [9]:
# Directory path containing PDFs
pdf_directory = r"./"  # Make sure to replace this with your actual directory path

# Process the PDFs and create a DataFrame
pdf_data = process_pdfs_in_directory(pdf_directory)
df = pd.DataFrame(pdf_data)

# Display the DataFrame
df.head()

Unnamed: 0,PDF_File,Text_Line,Embedding
0,Adarsh_Agarwal_resume__ML.pdf,Adarsh Agarwal,"[0.031593174, 0.005383688, -0.04268268, -0.026..."
1,Adarsh_Agarwal_resume__ML.pdf,/envel⌢peadarshagarwal932@gmail.com,"[0.042841185, 0.013480655, -0.043075725, -0.05..."
2,Adarsh_Agarwal_resume__ML.pdf,/githubGitHub Profile,"[0.057152078, -0.0010283713, -0.06052837, -0.0..."
3,Adarsh_Agarwal_resume__ML.pdf,♂phone+91-8509406302,"[0.056045964, 0.007374667, -0.050200704, -0.03..."
4,Adarsh_Agarwal_resume__ML.pdf,Education,"[0.037251502, -0.025072152, -0.050775126, 0.02..."


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   PDF_File   50 non-null     object
 1   Text_Line  50 non-null     object
 2   Embedding  50 non-null     object
dtypes: object(3)
memory usage: 1.3+ KB


In [14]:
df.shape

(50, 3)

In [15]:
df.isnull().sum()

PDF_File     0
Text_Line    0
Embedding    0
dtype: int64

In [17]:
df['Embedding_length'] = df['Embedding'].apply(lambda x: len(x))  # Length of each embedding
print("Basic statistics on Embedding lengths:\n", df['Embedding_length'].describe())

Basic statistics on Embedding lengths:
 count     50.0
mean     768.0
std        0.0
min      768.0
25%      768.0
50%      768.0
75%      768.0
max      768.0
Name: Embedding_length, dtype: float64
