In [1]:
import os
from openai import OpenAI
import pandas as pd
from neo4j import GraphDatabase
import logging

from neo4j_connection import Neo4jConnection
from openai_embedding_connection import OpenAIEmbeddingConnecton
from utils import parse_embeddings, parse_genres

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class DataLoader:
    """
    Loads book records from a CSV file into Neo4j, generating summary
    embeddings through OpenAI when the CSV does not already carry them.
    """

    def __init__(self, neo4j_connection, openai_connection, file_book_path, vector_dimensions, similarity_function):
        """
        Initializes DataLoader with connections to Neo4j, OpenAI, and the file path for CSV operations.

        Args:
            neo4j_connection: Object exposing check_index(), create_index(),
                load_book() and load_summary() (the methods called below).
            openai_connection: Object exposing get_embedding(text).
            file_book_path: Path of the CSV file that is read and, when
                embeddings are generated, overwritten.
            vector_dimensions: Vector index dimensionality; coerced with int(),
                so a numeric string (e.g. from an environment variable) works.
            similarity_function: Value forwarded unchanged to create_index().
        """
        self.neo4j = neo4j_connection
        self.openai = openai_connection
        self.file_book_path = file_book_path
        self.vector_dimensions = int(vector_dimensions)
        self.similarity_function = similarity_function

    def get_embeddings_from_openai(self, data):
        """
        Retrieves text embeddings from OpenAI based on the 'Summary' field in the data,
        and updates the CSV file with these embeddings.

        Args:
            data: DataFrame with a 'Summary' column. It is modified in place:
                an 'Embeddings' column is added.

        Returns:
            The new 'Embeddings' column; rows with a missing or empty summary
            hold None.
        """
        def _embed(summary):
            # Missing CSV cells arrive as NaN, which is truthy; check for it
            # explicitly so no request is made for a non-existent summary.
            if pd.isna(summary) or not summary:
                return None
            return self.openai.get_embedding(summary)

        data['Embeddings'] = data['Summary'].apply(_embed)
        # Persist the embeddings so later runs find the column in the CSV.
        data.to_csv(self.file_book_path, index=False)
        return data['Embeddings']

    def load_books_from_csv(self):
        """
        Loads books from a CSV file into the database, retrieving embeddings if necessary.

        A row that fails to load is logged and skipped so that one bad record
        does not abort the whole import; the closing log line reports whether
        any rows failed.

        Raises:
            Exception: Whatever pd.read_csv raised, re-raised after logging.
        """
        try:
            logging.info("Starting to load data from CSV.")
            data = pd.read_csv(self.file_book_path)
        except Exception as e:
            logging.error(f'Error reading csv file: {e}')
            raise

        if 'Embeddings' not in data.columns:
            logging.info("Vector data is missing in CSV, retrieving using OpenAI.")
            # Adds the 'Embeddings' column to `data` in place; the return
            # value is not needed here.
            self.get_embeddings_from_openai(data)

        if not self.neo4j.check_index():
            self.neo4j.create_index(self.vector_dimensions, self.similarity_function)

        failed = 0
        for index, row in data.iterrows():
            try:
                genres = parse_genres(row['Genre'])
                embeddings = parse_embeddings(row)
                book_data = {
                    'title': row['Title'],
                    'author': row['Author'],
                    'lang': row['Language'],
                    'rating': row['Ratings'],
                    'summary': row['Summary'],
                    'year': row['Publication Year'],
                }
                self.neo4j.load_book(genres=genres, **book_data)
                self.neo4j.load_summary(book_data['title'], embeddings)
            except Exception as e:
                failed += 1
                # row.get() keeps the handler from raising again when the
                # failure was a missing 'Title' column.
                logging.error(f"Failed to load data for '{row.get('Title')}' at index {index}: {e}")

        if failed:
            logging.warning(f"Finished loading with {failed} of {len(data)} rows failed.")
        else:
            logging.info("All data has been successfully loaded into the database.")

if __name__ == "__main__":
    # Script entry point: read connection settings from the environment,
    # load the books CSV into Neo4j, and always close the connection.

    # Bound before the try block so the finally clause can tell whether a
    # connection was actually opened; otherwise a failure during setup would
    # surface as a NameError that hides the real exception.
    neo4j_conn = None
    try:
        logging.info("Process started.")
        uri = os.getenv("NEO4J_URI")
        user = os.getenv("NEO4J_USERNAME")
        password = os.getenv("NEO4J_PASSWORD")
        file_book_path = "/app/books.csv"
        vector_dimensions = os.getenv("DIMENSIONS")
        similarity_function = os.getenv("SIMILARITY_FUNCTION")
        openai_key = os.getenv("OPENAI_API_KEY")
        embedding_model = os.getenv("EMBEDDING_MODEL")

        neo4j_conn = Neo4jConnection(uri, user, password)
        openAI_embeddings_conn = OpenAIEmbeddingConnecton(openai_key, embedding_model)
        loader = DataLoader(neo4j_conn, openAI_embeddings_conn, file_book_path, vector_dimensions, similarity_function)
        loader.load_books_from_csv()
    finally:
        if neo4j_conn is not None:
            neo4j_conn.close()
        logging.info("Process finished.")


2024-08-01 13:11:28,000 - INFO - Process started.
2024-08-01 13:11:28,035 - INFO - Starting to load data from CSV.
2024-08-01 13:11:28,610 - INFO - No vector index found.
2024-08-01 13:11:30,095 - INFO - Vector index created successfully.
2024-08-01 13:11:53,975 - INFO - All data has been successfully loaded into the database.
2024-08-01 13:11:53,980 - INFO - Neo4j connection closed.
2024-08-01 13:11:53,982 - INFO - Process finished.
