# Memory project - Database vectorization

## Secrets

In [1]:
import os
from openai import OpenAI
from dotenv import load_dotenv
from pathlib import Path

# Load path from the environment variable
env_ih1 = os.getenv("ENV_IH1")

dotenv_path = Path(env_ih1)
load_dotenv(dotenv_path=dotenv_path)
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY= os.getenv('PINECONE_KEY')
SERPAPI_API_KEY = os.getenv('SERPAPI_API_KEY')
STEAMSHIP_API_KEY = os.getenv('STEAMSHIP_API_KEY')
LANGSMITH_API_KEY = os.getenv('LANGSMITH_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')
GEMINI_KEY = os.getenv('GEMINI_KEY')

os.environ['PATH'] += os.pathsep + '/usr/bin'

## Libraries

## Family safe 

In [2]:
from pinecone import Pinecone

## Creating Pinecone DB

In [3]:
import pinecone as pc
from pinecone import Pinecone, ServerlessSpec

spec = ServerlessSpec(
    cloud="aws", region="us-east-1"
)

# connect to pinecone environment
pc = Pinecone(
    api_key = PINECONE_API_KEY,
    environment='us-east-1'  # find next to API key in console
)

In [4]:
index_name = "memory-project"

# check if the memory project index exists
if index_name not in pc.list_indexes().names():
    # create the index if it does not exist
    pc.create_index(name=index_name, dimension=768, metric="cosine", spec=spec)
# connect to extractive-question-answering index we created
index = pc.Index(index_name)

## test with pages as chunks

In [None]:
import os
import json
import pandas as pd
from sentence_transformers import SentenceTransformer 
from pinecone import Pinecone
import shutil
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 

def process_json_files(data_dir='/Family safe', processed_dir=r'C:\Users\aurel\OneDrive\Documents\Python\IronHack\Project\Family safe\processed'):
    """
    Fetches all JSON files from the specified directory, 
    creates a DataFrame from the JSON data, 
    processes the data, and moves processed files to the specified directory.

    Args:
        data_dir: Path to the directory containing the JSON files.
        processed_dir: Path to the directory where processed files will be moved.

    Returns:
        A DataFrame containing the combined data from all JSON files.
    """

    all_data = []
    for filename in os.listdir(data_dir):
        if filename.endswith(".json"):
            filepath = os.path.join(data_dir, filename)
            with open(filepath, 'r') as f:
                json_data = json.load(f)
            all_data.extend(json_data)

            # Move processed file 
            processed_filepath = os.path.join(processed_dir, filename)
            shutil.move(filepath, processed_filepath) 

    df = pd.DataFrame(all_data)

    # Data cleaning and transformation
    df['Doc name'] = df['Name']  # Assuming 'Name' is in the JSON
    df['Type'] = df['Type'].fillna('unknown') 
    df['Full text'] = df['Pages'].apply(lambda x: ' '.join([page['Text'] for page in x]) if x else "") 

    # Generate summaries using a language model 
    model_name = "google/flan-t5-base"  # Replace with your preferred summarization model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    df['Summary'] = df['Full text'].apply(lambda x: generate_summary(x, model, tokenizer))

    df['Chunks'] = df['Pages'].apply(lambda x: [page['Text'] for page in x] if x else [])
    df['processed_address'] = processed_filepath 
    df = df[['Doc name', 'Type', 'Author', 'Date', 'Summary', 'Chunks', 'Full text', 'processed_address']] 

    return df

def generate_summary(text, model, tokenizer):
    """
    Generates a summary of the given text using a language model.

    Args:
        text: The input text.
        model: The language model.
        tokenizer: The tokenizer for the language model.

    Returns:
        The generated summary.
    """
    inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pt")
    summary_ids = model.generate(**inputs)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def get_embeddings(text):
    """
    Generates embeddings for the given text using a pre-trained model.

    Args:
        text: The text to generate embeddings for.

    Returns:
        A list of floats representing the embeddings.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2') 
    embeddings = model.encode(text)
    return embeddings

def upload_to_pinecone(df, index_name, dimension):
    """
    Uploads the DataFrame to a Pinecone index.

    Args:
        df: The DataFrame containing the data.
        index_name: The name of the Pinecone index.
        dimension: The dimensionality of the embeddings.
    """
    initialize(api_key="YOUR_API_KEY", environment="YOUR_ENVIRONMENT") 
    index = Pinecone.Index(index_name) 

    # Upsert both full text and summary to Pinecone
    for _, row in df.iterrows():
        if pd.isna(row['processed_address']): 
            doc_id = row['Doc name'] 
            full_text_vector = get_embeddings(row['Full text'])
            summary_vector = get_embeddings(row['Summary']) 
            index.upsert([
                (doc_id + "_full_text", full_text_vector), 
                (doc_id + "_summary", summary_vector)
            ]) 

# Get the final DataFrame
final_df = process_json_files()

# Upload the DataFrame to Pinecone
index_name = "your_index_name" 
dimension = 768  # Dimensionality of your embeddings
upload_to_pinecone(final_df, index_name, dimension)

## Test with chunks overlap

In [None]:
import os
import json
import pandas as pd
from sentence_transformers import SentenceTransformer 
from pinecone import Pinecone, Index
import shutil
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 

def process_json_files(data_dir=r'.\Family safe', processed_dir=r'.\Family_safe\processed', chunk_size=512, overlap=128):
    """
    Fetches all JSON files from the specified directory, 
    creates a DataFrame from the JSON data, 
    processes the data, and moves processed files to the specified directory.

    Args:
        data_dir: Path to the directory containing the JSON files.
        processed_dir: Path to the directory where processed files will be moved.
        chunk_size: Size of each chunk in tokens.
        overlap: Size of the overlap between chunks.

    Returns:
        A DataFrame containing the combined data from all JSON files.
    """

    all_data = []
    for filename in os.listdir(data_dir):
        if filename.endswith(".json"):
            filepath = os.path.join(data_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            for doc in json_data:
                if isinstance(doc, dict) and 'Type' in doc and doc['Type'] == 'scan':
                    doc['Chunks'] = create_overlapping_chunks(doc['Pages'], chunk_size, overlap)
                elif isinstance(doc, dict) and 'Pages' in doc:
                    doc['Chunks'] = [[page['Text']] for page in doc['Pages']]
                else:
                    doc['Chunks'] = []

            all_data.extend(json_data)

            # Move processed file 
            processed_filepath = os.path.join(processed_dir, filename)
            shutil.move(filepath, processed_filepath) 

    df = pd.DataFrame(all_data)

    # Data cleaning and transformation
    df['Doc name'] = df['Name']  # Assuming 'Name' is in the JSON
    df['Type'] = df['Type'].fillna('unknown') 

    # Extract full text from Chunks (if available)
    df['Full text'] = df['Chunks'].apply(lambda x: ' '.join([chunk for sublist in x for chunk in sublist]) if x else "") 

    # Generate summaries using a language model 
    model_name = "google/flan-t5-base"  # Replace with your preferred summarization model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    df['Summary'] = df['Full text'].apply(lambda x: generate_summary(x, model, tokenizer))

    df['processed_address'] = processed_filepath 
    df = df[['Doc name', 'Type', 'Author', 'Date', 'Summary', 'Chunks', 'Full text', 'processed_address']] 

    return df

def create_overlapping_chunks(pages, chunk_size, overlap):
    """
    Creates overlapping chunks from a list of pages.

    Args:
        pages: List of pages, where each page is a dictionary with 'Text' and 'Page number'.
        chunk_size: Size of each chunk in tokens.
        overlap: Size of the overlap between chunks.

    Returns:
        List of chunks, where each chunk is a list of page texts.
    """

    all_text = ' '.join([page['Text'] for page in pages])
    tokens = tokenizer.tokenize(all_text)
    num_chunks = max(1, (len(tokens) - overlap) // (chunk_size - overlap)) 
    chunks = []

    for i in range(num_chunks):
        start = i * (chunk_size - overlap)
        end = start + chunk_size
        chunk_tokens = tokens[start:end]
        chunk_text = tokenizer.decode(chunk_tokens)
        # Determine which pages belong to this chunk
        chunk_pages = []
        current_chunk_len = 0
        for page in pages:
            page_tokens = tokenizer.tokenize(page['Text'])
            if current_chunk_len + len(page_tokens) <= len(chunk_tokens):
                chunk_pages.append(page['Text'])
                current_chunk_len += len(page_tokens)
        chunks.append(chunk_pages)

    return chunks

def generate_summary(text, model, tokenizer):
    """
    Generates a summary of the given text using a language model.

    Args:
        text: The input text.
        model: The language model.
        tokenizer: The tokenizer for the language model.

    Returns:
        The generated summary.
    """
    inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pt")
    summary_ids = model.generate(**inputs)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def get_embeddings(text):
    """
    Generates embeddings for the given text using a pre-trained model.

    Args:
        text: The text to generate embeddings for.

    Returns:
        A list of floats representing the embeddings.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2') 
    embeddings = model.encode(text)
    return embeddings

def upload_to_pinecone(df, index_name, dimension):
    """
    Uploads the DataFrame to a Pinecone index.

    Args:
        df: The DataFrame containing the data.
        index_name: The name of the Pinecone index.
        dimension: The dimensionality of the embeddings.
    """
    # # itialize(PINECONE_API_KEY, environment=spec) 
    # index = Pinecone.Index(index_name) 

    for _, row in df.iterrows():
        if pd.isna(row['processed_address']): 
            doc_id = row['Doc name'] 
            for i, chunk in enumerate(row['Chunks']):
                chunk_id = f"{doc_id}_{i}" 
                chunk_text = ' '.join(chunk)
                chunk_vector = get_embeddings(chunk_text)
                index.upsert([(chunk_id, chunk_vector)]) 

# Get the final DataFrame
final_df = process_json_files()

# Upload the DataFrame to Pinecone
dimension = 768  # Dimensionality of your embeddings
upload_to_pinecone(final_df, index_name, dimension)

In [6]:
import os
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import shutil
from pinecone import Index  # Import Index for Pinecone operations

def process_json_files(data_dir=r'.\Family safe', processed_dir=r'C:\Users\aurel\OneDrive\Documents\Python\IronHack\Project\Family safe\processed', chunk_size=512, overlap=128):
    
    """
    Processes all JSON files in the specified directory, extracts data, and generates chunks and summaries.

    Args:
        data_dir: Directory containing JSON files.
        processed_dir: Directory where processed files will be moved.
        chunk_size: Size of each chunk in tokens.
        overlap: Size of the overlap between chunks.

    Returns:
        A DataFrame containing the processed data.
    """
    all_data = []
    for filename in os.listdir(data_dir):
        if filename.endswith(".json"):
            filepath = os.path.join(data_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                json_data = json.load(f)

            # Handle JSON as a dictionary or list
            if isinstance(json_data, dict):  # Single document
                doc = json_data
                if doc.get('Type') == 'scan' and 'Pages' in doc:
                    doc['Chunks'] = create_overlapping_chunks(doc['Pages'], chunk_size, overlap)
                elif 'Pages' in doc:
                    doc['Chunks'] = [[page.get('Extracted Text', '')] for page in doc['Pages'] if isinstance(page, dict)]
                else:
                    doc['Chunks'] = []
                all_data.append(doc)
            elif isinstance(json_data, list):  # List of documents
                for doc in json_data:
                    if isinstance(doc, dict):
                        if doc.get('Type') == 'scan' and 'Pages' in doc:
                            doc['Chunks'] = create_overlapping_chunks(doc['Pages'], chunk_size, overlap)
                        elif 'Pages' in doc:
                            doc['Chunks'] = [[page.get('Extracted Text', '')] for page in doc['Pages'] if isinstance(page, dict)]
                        else:
                            doc['Chunks'] = []
                        all_data.append(doc)
                    else:
                        raise ValueError(f"Unexpected doc structure in list: {doc}")
            else:
                raise ValueError(f"Unexpected JSON structure: {json_data}")

            # Move processed file
            processed_filepath = os.path.join(processed_dir, filename)
            shutil.move(filepath, processed_filepath)

    df = pd.DataFrame(all_data)

    # Data cleaning and transformation
    df['Doc name'] = df['Name']  # Extract document name
    df['Type'] = df['Type'].fillna('unknown')

    # Extract full text from Chunks
    df['Full text'] = df['Chunks'].apply(
        lambda x: ' '.join([chunk for sublist in x for chunk in sublist]) if x else "")

    # Generate summaries using a language model
    model_name = "google/flan-t5-base"  # Replace with your preferred summarization model
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    df['Summary'] = df['Full text'].apply(lambda x: generate_summary(x, model, tokenizer))

    df['processed_address'] = processed_filepath
    df = df[['Doc name', 'Type', 'Author', 'Date', 'Summary', 'Chunks', 'Full text', 'processed_address']]

    return df

def create_overlapping_chunks(pages, chunk_size, overlap):
    """
    Creates overlapping chunks from a list of pages.

    Args:
        pages: List of pages, where each page is a dictionary with 'Extracted Text'.
        chunk_size: Size of each chunk in tokens.
        overlap: Size of the overlap between chunks.

    Returns:
        List of chunks, where each chunk is a list of page texts.
    """
    all_text = ' '.join([page.get('Extracted Text', '') for page in pages if isinstance(page, dict)])
    tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
    token_ids = tokenizer.encode(all_text)  # Encode the text into token IDs
    num_chunks = max(1, (len(token_ids) - overlap) // (chunk_size - overlap))
    chunks = []

    for i in range(num_chunks):
        start = i * (chunk_size - overlap)
        end = start + chunk_size
        chunk_token_ids = token_ids[start:end]
        chunk_text = tokenizer.decode(chunk_token_ids)  # Decode back to text
        chunk_pages = []
        current_chunk_len = 0
        for page in pages:
            page_tokens = tokenizer.encode(page.get('Extracted Text', ''))
            if current_chunk_len + len(page_tokens) <= len(chunk_token_ids):
                chunk_pages.append(page.get('Extracted Text', ''))
                current_chunk_len += len(page_tokens)
        chunks.append(chunk_pages)

    return chunks

def generate_summary(text, model, tokenizer):
    """
    Generates a summary of the given text using a language model.

    Args:
        text: The input text.
        model: The language model.
        tokenizer: The tokenizer for the language model.

    Returns:
        The generated summary.
    """
    inputs = tokenizer(text, max_length=512, truncation=True, return_tensors="pt")
    summary_ids = model.generate(**inputs)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

def get_embeddings(text):
    """
    Generates embeddings for the given text using a pre-trained model.

    Args:
        text: The text to generate embeddings for.

    Returns:
        A list of floats representing the embeddings.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode(text)
    return embeddings

def upload_to_pinecone(df, index_name, dimension):
    """
    Uploads the DataFrame to a Pinecone index.

    Args:
        df: The DataFrame containing the data.
        index_name: The name of the Pinecone index.
        dimension: The dimensionality of the embeddings.
    """


    for _, row in df.iterrows():
        if pd.isna(row['processed_address']):
            doc_id = row['Doc name']
            for i, chunk in enumerate(row['Chunks']):
                chunk_id = f"{doc_id}_{i}"
                chunk_text = ' '.join(chunk)
                chunk_vector = get_embeddings(chunk_text)
                index.upsert([(chunk_id, chunk_vector)])

# Get the final DataFrame
final_df = process_json_files()

# Upload the DataFrame to Pinecone
dimension = 768  # Dimensionality of your embeddings
index_name = "memory-project"  # Replace with your Pinecone index name
upload_to_pinecone(final_df, index_name, dimension)


KeyError: 'Name'