In [84]:
import json
import pandas as pd
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from processData.AnimeRequest import generateDataFramePersonagens, generateDataFrameAnimes, generateDataFrameEpisodios


In [None]:
# Collect the JSON data files found in ./dados/, grouped by filename prefix.
# os.listdir returns entries in arbitrary order, so the names are sorted to keep
# list positions (and everything indexed from these lists in later cells)
# reproducible across runs and machines.
files = sorted(os.listdir('./dados/'))

# Character files: names starting with "personagensAnimes" and ending in ".json".
listaDePersonagens = [
    file for file in files
    if file.endswith('.json') and file.startswith('personagensAnimes')
]

# Episode files: names starting with "episodios-" and ending in ".json".
animesEpisodios = [
    file for file in files
    if file.endswith('.json') and file.startswith('episodios-')
]

print(listaDePersonagens)
print(len(listaDePersonagens))
print(animesEpisodios)
print(len(animesEpisodios))


In [86]:
# Load the anime data. The path is relative to the working directory, like the
# other ./dados/ reads in this notebook, rather than an absolute path that only
# exists on one machine.
df = generateDataFrameAnimes('./dados/animes.json')

In [None]:
# Display the first element returned by generateDataFrameAnimes
# (presumably a pandas DataFrame, given the .columns / .info() calls that follow).
df[0]

In [None]:
# Print the column labels of the first anime frame.
print(df[0].columns)

In [None]:
# Print the summary produced by .info() for the first anime frame.
df[0].info()

In [None]:
# Build the character data: one generateDataFramePersonagens call per character
# file, with results kept in the same order as listaDePersonagens.
dataFramePersonagens = [
    generateDataFramePersonagens(f'./dados/{character_file}')
    for character_file in listaDePersonagens
]
    

In [None]:
# Print the .info() summary of the first frame loaded from the first character file.
dataFramePersonagens[0][0].info()

In [None]:
# Display the 'character.name' column of the first frame loaded from the first character file.
dataFramePersonagens[0][0]['character.name']

In [93]:
# Splitter used to break synopses into chunks (chunk_size=400, chunk_overlap=50).
# The separators are the newline, the CJK full stop and the period, so that
# splits fall at sentence boundaries.
sentence_separators = ["\n", "。", "."]
text_splitter = RecursiveCharacterTextSplitter(
    separators=sentence_separators,
    chunk_overlap=50,
    chunk_size=400,
)

In [None]:
# Split every anime synopsis into chunks, attaching the anime's metadata to each chunk.
chunksAnimes = []
print(len(df))
# Walk every entry of df instead of a hard-coded 213, so the loop neither raises
# IndexError when there are fewer entries nor silently drops any extra ones.
for i in range(len(df)):
    for _, row in df[i].iterrows():
        synopsis = row["synopsis"]
        # A missing synopsis may surface as None or NaN; only real strings can be split.
        if not isinstance(synopsis, str):
            continue
        # The per-row dump of the chunk list was dropped: it flooded the cell output.
        for chunk in text_splitter.split_text(synopsis):
            chunksAnimes.append({
                "text": chunk,
                "title": row["title"],
                "episodes": row["episodes"],
                "year": row["year"],
                "mal_id": row["mal_id"],
                'type': row['type'],
                'status': row['status'],
                'studios': row['studios'],
            })

In [None]:
# Build one entry per character-name chunk, keeping the character's role as metadata.
chunksPersonagens = []

# Iterate the loaded frames directly so the loop always matches what was actually
# loaded into dataFramePersonagens.
for character_frames in dataFramePersonagens:
    for item in character_frames:
        for _, row in item.iterrows():
            name = row["character.name"]
            # Skip rows without a usable name (None / NaN cannot be split).
            if not isinstance(name, str):
                continue
            # The original printed the whole 'character.name' column once per row,
            # which made the output grow quadratically; the debug prints were removed.
            for chunk in text_splitter.split_text(name):
                chunksPersonagens.append({
                    "text": chunk,
                    "role": row["role"],
                })

In [96]:
# Separate chunk texts from their metadata: FAISS.from_texts receives them as
# two parallel lists (texts[i] is described by metadatas[i]).
anime_metadata_keys = ("title", "episodes", "year", "mal_id", "type", "status", "studios")

texts = []
metadatas = []
for anime_chunk in chunksAnimes:
    texts.append(anime_chunk["text"])
    metadatas.append({key: anime_chunk[key] for key in anime_metadata_keys})

# Same split for the characters; their only metadata field is the role.
textsPersonagens = []
metadatasPersonagens = []
for character_chunk in chunksPersonagens:
    textsPersonagens.append(character_chunk["text"])
    metadatasPersonagens.append({'role': character_chunk['role']})

In [97]:
# Create the embedding model (multilingual); the same instance is reused for
# every index built in this notebook.
embeddingsAnimes = HuggingFaceEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
)

# Build the anime index from the synopsis chunks and their metadata.
vector_dbAnimes = FAISS.from_texts(
    texts,
    embeddingsAnimes,
    metadatas=metadatas
)

# Save the anime index to disk.
vector_dbAnimes.save_local("animes_faiss_index")

# Build the character index from the character-name chunks and their metadata.
vector_dbPersonagens = FAISS.from_texts(
    textsPersonagens,
    embeddingsAnimes,
    metadatas=metadatasPersonagens
)

# Save the character index to disk. This must be vector_dbPersonagens: saving
# vector_dbAnimes here would write a second copy of the anime index under the
# character index's name and the character index would never be persisted.
vector_dbPersonagens.save_local("personagens_faiss_index")

In [98]:
# Load the episode data: one generateDataFrameEpisodios call per episode file.
# dataFrameEpisodios and idsAnimes are filled in lockstep, so position k in both
# lists refers to the same file.
dataFrameEpisodios = []
idsAnimes = []
for episode_file in animesEpisodios:
    # The call returns a pair; the second item is stored as the anime id.
    # It is bound to `anime_id` rather than `id` so the builtin id() is not
    # shadowed for the rest of the kernel session.
    episodios, anime_id = generateDataFrameEpisodios(f'./dados/{episode_file}')
    dataFrameEpisodios.append(episodios)
    idsAnimes.append(anime_id)


In [None]:
# Print the .info() summary of the first frame loaded from the first episode file.
dataFrameEpisodios[0][0].info()


In [None]:
# Display the first entry of the 'data' column of that frame
# (presumably one episode record — the next cell reads keys such as 'title' from it).
dataFrameEpisodios[0][0]['data'][0]

In [None]:
# Build one record per episode. The synopsis is the text that gets embedded
# later; the remaining fields travel along as metadata.
chunksEpisodios = []

# idsAnimes and dataFrameEpisodios are filled in lockstep when the episode files
# are loaded, so zip pairs each anime id with the frames loaded from its file.
for anime_id, episode_frames in zip(idsAnimes, dataFrameEpisodios):
    # The id is stored as plain text. Running it through the text splitter, as
    # before, only ever produced a single chunk equal to this string.
    anime_id_text = str(anime_id)
    for item in episode_frames:
        for _, row in item.iterrows():
            # Skip rows that carry an 'error' field.
            if 'error' in row.keys():
                continue
            episode = row['data']  # assumed to be a dict-like episode record — TODO confirm
            synopsis = episode['synopsis']
            # Episodes without a synopsis are skipped: str(None) would otherwise
            # put the literal text "None" into the index as if it were content.
            if not isinstance(synopsis, str) or not synopsis.strip():
                continue
            chunksEpisodios.append({
                "animeId": anime_id_text,
                "numeroEpisodio": str(episode['mal_id']),
                'title': str(episode['title']),
                'duration': str(episode['duration']),
                'synopsis': synopsis,
                'filler': str(episode['filler']),
            })

In [None]:
# Spot-check one of the episode records built above (index 30 is an arbitrary sample).
chunksEpisodios[30]

In [121]:
# Extract the texts and metadata for FAISS: the synopsis is the text to embed,
# the remaining fields describe the episode.
textsEpisodios = [chunk["synopsis"] for chunk in chunksEpisodios]
metadatasEpisodios = [{
    # animeId links the episode back to its anime; without it a search hit
    # cannot be attributed to a series.
    "animeId": chunk['animeId'],
    "numeroEpisodio": chunk['numeroEpisodio'],
    'title': chunk['title'],
    'duration': chunk['duration'],
    'filler': chunk['filler'],} for chunk in chunksEpisodios]

In [122]:
# Build the FAISS index for the episodes: synopsis texts embedded with the same
# embedding model used for the other indexes, each entry carrying its metadata.
vector_dbEpisodios = FAISS.from_texts(
    texts=textsEpisodios,
    embedding=embeddingsAnimes,
    metadatas=metadatasEpisodios,
)

# Write the index to disk.
vector_dbEpisodios.save_local(folder_path="episodios_faiss_index")