# genAI
Ajout d'une catégorie à des livres

**Ce notebook ne sert qu'à faire des expérimentations ; exécutez `ajout_category.py` pour lancer directement le script.**

## 0. Préparation de l'environnement

In [2]:
pip install -r requirements.txt

Collecting aiohappyeyeballs==2.4.2 (from -r requirements.txt (line 1))
  Using cached aiohappyeyeballs-2.4.2-py3-none-any.whl.metadata (6.0 kB)
Collecting aiohttp==3.10.6 (from -r requirements.txt (line 2))
  Using cached aiohttp-3.10.6-cp312-cp312-win_amd64.whl.metadata (7.8 kB)
Collecting aiosignal==1.3.1 (from -r requirements.txt (line 3))
  Using cached aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting annotated-types==0.7.0 (from -r requirements.txt (line 4))
  Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)
Collecting anyio==4.6.0 (from -r requirements.txt (line 5))
  Using cached anyio-4.6.0-py3-none-any.whl.metadata (4.6 kB)
Collecting asgiref==3.8.1 (from -r requirements.txt (line 6))
  Using cached asgiref-3.8.1-py3-none-any.whl.metadata (9.3 kB)
Collecting attrs==24.2.0 (from -r requirements.txt (line 8))
  Using cached attrs-24.2.0-py3-none-any.whl.metadata (11 kB)
Collecting backoff==2.2.1 (from -r requirements.txt (line 9))
  Using cached 

Reason for being yanked: Regression: https://github.com/aio-libs/aiohappyeyeballs/issues/100


In [14]:
# # If you prefer, you can install the requirements manually here:

# ! pip install tqdm
# ! pip install langchain
# ! pip install langchain-community
# ! pip install unstructured
# ! pip install unstructured[docx]
# ! pip install python-magic-bin
# ! pip install python-magic
# ! pip install langchain_openai
# ! pip install chromadb
# ! pip install pandas
# ! pip install openpyxl
# # analyse des résultats :
# ! pip install matplotlib seaborn
# ! pip install chromadb


Collecting chromadb
  Using cached chromadb-0.5.11-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Using cached build-1.2.2-py3-none-any.whl.metadata (6.2 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Using cached chroma_hnswlib-0.7.6.tar.gz (32 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting fastapi>=0.95.2 (from chromadb)
  Using cached fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Using cached uvicorn-0.31.0-py3-none-any.whl.metadata (6.6 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Using cached posthog-3.6.6-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting onnxruntime>=1.14.

In [2]:
# Import libraries for the project
from tqdm.autonotebook import tqdm

# Langchain

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever # MultiQueryRetriever
from langchain_core.prompts import PromptTemplate


import numpy as np # similarity calculations (cosine similarity)



# Azure Open AI
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings


# .env
import os
from dotenv import find_dotenv, load_dotenv


# load documents
from langchain_community.document_loaders import DirectoryLoader


# evaluation des résultats
import pandas as pd
from langchain.evaluation import load_evaluator
from langchain.evaluation import EvaluatorType
from langchain.evaluation import Criteria
from openpyxl import load_workbook
from datetime import datetime

# analyse des résultats
import matplotlib.pyplot as plt
import seaborn as sns



  from tqdm.autonotebook import tqdm


## 1. Préparation Langchain


In [3]:
# Load the variables declared in the .env file into os.environ, then read the
# Azure OpenAI settings. A missing variable raises KeyError right here.
load_dotenv()
# NOTE(review): OPENAI_API_VERSION is read but not used by any client in this
# cell; the API versions below are hard-coded. Confirm which one is intended.
OPENAI_API_VERSION = os.environ["AZURE_OPENAI_API_VERSION"]
AZURE_OPENAI_ENDPOINT = os.environ["AZURE_OPENAI_ENDPOINT"]
AZURE_OPENAI_API_KEY = os.environ["AZURE_OPENAI_API_KEY"]
# To trace requests in LangChain:
# LANGCHAIN_TRACING_V2 = 'true'
# LANGCHAIN_ENDPOINT= os.environ['LANGCHAIN_ENDPOINT']

# Embedding model
embeddings = AzureOpenAIEmbeddings(
    azure_deployment="gpt-embedding-ada-002",
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    openai_api_version="2023-12-01-preview",
)

# Chat models. GPT-3.5 and GPT-4o are both loaded so their results can be compared.
# The endpoint is passed explicitly, like for the embeddings, instead of relying
# on the client picking AZURE_OPENAI_ENDPOINT up from the environment.
llm_gpt35 = AzureChatOpenAI(
    deployment_name="gpt-35-t",
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version="2024-02-01",
    temperature=0,
    max_tokens=800,
    top_p=0,
    frequency_penalty=0,
    presence_penalty=0,
)

llm_gpt4o = AzureChatOpenAI(
    deployment_name="gpt4o",
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_key=AZURE_OPENAI_API_KEY,
    api_version="2024-02-01",
    temperature=0,
    max_tokens=800,
)

In [4]:
# Prompt used to classify a work. The instructions are written in French: they
# list the allowed "fiction/..." and "non-fiction/..." categories, allow the
# model to invent a new category with the same structure, and give one example.
# {oeuvre} is the only placeholder; no retrieved context is injected despite the
# "rag_" prefix of the prompt variable below.
template_1 = """
en utilisant tes connaissances, classe l'oeuvre dans une des catégories suivantes : 

fiction/SF
fiction/fantasy
fiction/fantastique

non-fiction/lifestyle
non-fiction/parenting
non-fiction/histoire
non-fiction/santé

si aucune des catégories ne convient, tu peux générer une autre, en gardant la même structure (en commençant par fiction ou non-fiction).
n'ajoute rien d'autre.

exemple : 
harry potter et la chambre des secrets = fiction/fantastique

question: {oeuvre}

catégorie:"""


# Build the PromptTemplate object from the raw template string.
rag_prompt_1 = PromptTemplate.from_template(template_1)




# Post-processing helper for lists of documents
def format_docs(docs):
    """Return the ``page_content`` of every document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

In [5]:
# Classification chain (only used for the results-analysis part):
# raw input -> {"oeuvre": input} -> prompt -> GPT-4o -> plain string.
chain_category = {"oeuvre": RunnablePassthrough()} | rag_prompt_1 | llm_gpt4o | StrOutputParser()

In [6]:
# Load the book list. A forward slash is used because "\l" is an invalid escape
# sequence in a normal string literal, and because it works on every OS.
df = pd.read_excel("export_xls/liste_livre.xlsx")

  df = pd.read_excel("export_xls\liste_livre.xlsx")


In [11]:
# Remove any previously generated 'categorie' column before recomputing it.
# errors='ignore' keeps the cell re-runnable when the column is not there.
df = df.drop(columns='categorie', errors='ignore')

In [17]:
# Text describing each work, in the form "<Titre> - <Auteurs>"
df['oeuvre'] = df['Titre'].astype(str) + ' - ' + df['Auteurs'].astype(str)


In [18]:
# Preview the first five rows, including the new 'oeuvre' column
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,Titre,Auteurs,highlights,pages,Série,Langue,total Nbr de pages lues,Date dernière ouverture,id long,format,page,Auteurs courts,is read more than 50 %,oeuvre
0,,43,A Princess of Mars,Edgar Rice Burroughs,0,557,,en,1,2024-05-27,43,ebook,,Edgar Rice,0,A Princess of Mars - Edgar Rice Burroughs
1,,61,Atomic Habits: An Easy & Proven Way to Build G...,James Clear,0,482,,en,7,2024-08-08,61,ebook,,James Clear,0,Atomic Habits: An Easy & Proven Way to Build G...
2,,10,Black Destroyer,A. E. Van Vogt,0,60,,en,1,2024-01-08,10,ebook,,Van Vogt,0,Black Destroyer - A. E. Van Vogt
3,,63,Blackwater 1 - La crue: L'épique saga de la fa...,Michael McDowell,0,248,Blackwater #1,fr,234,2024-08-18,63,ebook,,Michael McDowell,1,Blackwater 1 - La crue: L'épique saga de la fa...
4,,49,CN4 Jeux d'ombres,Glen Cook,0,542,Les Annales de la Compagnie Noire #4,fr,7,2024-06-11,49,ebook,,Glen Cook,0,CN4 Jeux d'ombres - Glen Cook


In [19]:
# Ask the LLM chain for the category of each work and store it in a new column.
def find_category(oeuvre):
    """Return the category produced by ``chain_category`` for one work.

    Args:
        oeuvre: Text describing the work; here, a value of the ``oeuvre``
            column ("<Titre> - <Auteurs>").

    Returns:
        The chain's string output with leading and trailing whitespace removed,
        so that stray spaces or newlines do not end up in the category label.
    """
    return chain_category.invoke(oeuvre).strip()

# One chain invocation per row.
df['categorie'] = df['oeuvre'].apply(find_category)

In [20]:
# Export the categorised book list to an Excel workbook
output_directory = "export_xls/"
output_file = output_directory + "liste_livre_category.xlsx"
df.to_excel(output_file, index=False)
print(f"Le fichier {output_file} a été créé avec succès.")

Le fichier export_xls/liste_livre_category.xlsx a été créé avec succès.


In [22]:
jupyter nbconvert --to script mon_notebook.ipynb


SyntaxError: invalid syntax (2883228692.py, line 1)