In [None]:
import google.generativeai as genai
from google.generativeai.types import HarmCategory, HarmBlockThreshold
import pandas as pd
import json
import joblib
import os
from dotenv import load_dotenv

load_dotenv('../.env')  

# Path of the episode-lines dataset; kept in one place so the error message
# below can never drift from the file that is actually read.
DATA_PATH = '../data/simpson_show_df.csv'

try:
    data = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    print(f"Error: {DATA_PATH} not found. Please provide the correct path.")
    # Stop with a non-zero status so the failure is visible to callers;
    # SystemExit needs no import and does not depend on the site-provided
    # exit() helper.
    raise SystemExit(1)


class ChunkSummary():
    """Summarize a long list of text lines with a generative model.

    The lines are split into overlapping windows ("chunks"); each chunk is
    summarized on its own and the partial summaries are then merged into one
    final summary by a second model call.

    Attributes set by the constructor:
        text: list of lines to summarize (a single string is wrapped in a
            one-element list).
        window_size: number of lines per chunk.
        overlap_size: number of lines shared by two consecutive chunks.
        chunks: list of chunks, each a list of lines.
        model: the configured ``genai.GenerativeModel``.
        prompt_base: system instruction given to the model.

    Attribute set by ``summarize()``:
        chunk_summaries: raw model response text for every chunk.
    """

    def __init__(self, model_name, apikey, text, window_size, overlap_size):
        """Split ``text`` into chunks and configure the model.

        Args:
            model_name: name passed to ``genai.GenerativeModel``.
            apikey: API key passed to ``genai.configure``.
            text: a list of lines, or a single string.
            window_size: number of lines per chunk; must be positive.
            overlap_size: number of lines repeated between consecutive
                chunks; must satisfy ``0 <= overlap_size < window_size``.

        Raises:
            ValueError: if ``window_size`` / ``overlap_size`` are invalid.
        """
        self.text = text
        if isinstance(self.text, str):
            self.text = [self.text]
        self.window_size = window_size
        self.overlap_size = overlap_size
        # Build the chunks first so invalid sizes fail before any API setup
        self.chunks = self.__text_to_chunks()
        self.model = self.__create_model(apikey, model_name)


    def __create_model(self, apikey, model_name):
        """Configure the API key and return the generative model.

        Also stores the system instruction in ``self.prompt_base``.
        """
        genai.configure(api_key=apikey)
        self.prompt_base = f"""
        You are an editor assistant from the "The Simpsons" show.
        You will receive the #subtitles# from real episodes in the format:
        <location>, <character> said: <character line>
        
        You must create a summary of the #subtitles#, pointing out the most
        relevant information, jokes and key players in the story. Bare in mind
        that the summary must describe how the episode started, which key
        points are relevant along the story and its gran finale.
        The summary output must be written as a plain JSON with field 'summary'.
        """
        # Every listed harm category is set to BLOCK_NONE
        safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
        }
        generation_config = {
            'temperature': 0.2,
            'top_p': 0.8,
            'top_k': 20,
            'max_output_tokens': 1000
        }
        return genai.GenerativeModel(
            model_name,
            system_instruction=self.prompt_base,
            generation_config = generation_config,
            safety_settings=safety_settings
        )


    def __text_to_chunks(self):
        """Return ``self.text`` split into overlapping windows.

        Chunks start every ``window_size - overlap_size`` lines and hold up
        to ``window_size`` lines each; the last chunk may be shorter.

        Raises:
            ValueError: if ``window_size`` is not positive or
                ``overlap_size`` is outside ``[0, window_size)``.
        """
        n = self.window_size
        m = self.overlap_size
        if n <= 0:
            raise ValueError(f'window_size must be positive, got {n}')
        # An overlap >= window gives a zero/negative stride (range() error or
        # no chunks at all); a negative overlap would skip lines between
        # chunks. Reject both explicitly instead of failing obscurely.
        if not 0 <= m < n:
            raise ValueError(
                f'overlap_size must satisfy 0 <= overlap_size < window_size, '
                f'got overlap_size={m} and window_size={n}'
            )
        stride = n - m
        return [self.text[i:i+n] for i in range(0, len(self.text), stride)]


    def __create_chunk_prompt(self, chunk):
        """Build the user prompt that asks the model to summarize one chunk."""
        episode_lines = '\n'.join(chunk)
        prompt = f"""
        #subtitles#
        {episode_lines}
        ######
        Summarize it.
        """
        return prompt


    def __summarize_chunks(self):
        """Summarize every chunk and return the list of raw response texts."""
        chunk_summaries = []
        total = len(self.chunks)
        for i, chunk in enumerate(self.chunks):
            print(f'Summarizing chunk {i+1} from {total}')

            prompt = self.__create_chunk_prompt(chunk)
            response = self.model.generate_content(prompt)
            # Keep the model's answer for this chunk
            chunk_summaries.append(response.text)

        return chunk_summaries


    def summarize(self):
        """Summarize all chunks, then merge them into one final summary.

        Stores the per-chunk responses in ``self.chunk_summaries``.

        Returns:
            The raw text of the model's final response (the prompt asks for
            JSON with a 'summary' field; the text is returned unparsed).
        """
        print('Summarizing text')
        # Summarize each chunk individually
        self.chunk_summaries = self.__summarize_chunks()
        # Final prompt: bullet list of the partial summaries
        summaries = '- ' + '\n- '.join(self.chunk_summaries)
        prompt = f"""
        You are an editor working on The Simpsons show. You must summarize
        a show episode considering the other summaries from part of the episode.
        The partitioned summaries are listed below:
        {summaries}
        ######
        The summary must describe the details in the story, like jokes, and details
        on what happens in the end with the key characters.
        Write a final summary based on the partitioned summaries in JSON format with
        the field 'summary'
        """
        print('Final summarization')
        response = self.model.generate_content(prompt)

        return response.text


# Select the lines of one episode, ordered by the 'number' column.
episode_season = 5
episode_id = 92
X = (data[(data.episode_season == episode_season) &
          (data.episode_id == episode_id)].sort_values('number')
)

# Build one text line per row in the format the system prompt describes:
# "<location>, <character> said: <character line>"
X['line'] = (X['location_normalized_name'].fillna('') + ', ' + 
             X['character_normalized_name'].fillna('') + ' said: ' + 
             X['normalized_text'].fillna('')
)



summarizer = ChunkSummary(
    model_name = "gemini-1.5-flash",
    apikey = os.environ.get("GEMINI_KEY"),
    text = X['line'].tolist(),
    window_size = 100,  
    overlap_size = 25  
)

episode_summary = summarizer.summarize()

# The model may wrap its JSON in a Markdown fence or add text around it.
# First try the plain "```json" fence removal; if that does not parse, fall
# back to the outermost {...} span of the response before giving up.
stripped_summary = episode_summary.replace("```json\n",'').replace("\n```",'')
json_candidates = [
    stripped_summary,
    episode_summary[episode_summary.find('{'):episode_summary.rfind('}') + 1],
]
for candidate in json_candidates:
    try:
        final_summary = json.loads(candidate)
    except json.JSONDecodeError:
        continue
    break
else:
    # No candidate parsed: report the raw response and store a placeholder
    print("Error decoding JSON.  Raw response:\n", episode_summary)
    final_summary = {"summary": "Error decoding JSON summary."}



# Number of chunks
num_chunks = len(summarizer.chunks)
print(f"Number of chunks used: {num_chunks}")

# Evaluation: print every partial summary for manual review
print("\nChunk Summaries:")
for i, chunk_summary in enumerate(summarizer.chunk_summaries):
    print(f"\nChunk {i+1}:\n{chunk_summary}")
    print(f"--- Evaluate veracity and coherence of Chunk {i+1} above ---")


print("\nFinal Summary:")
print(final_summary)
print("--- Evaluate veracity and coherence of the Final Summary above ---")



# Results: persist the chunks, partial summaries and final summary
os.makedirs('../data/results/', exist_ok=True) 

joblib.dump({
    'chunks': summarizer.chunks,
    'chunk_summaries': summarizer.chunk_summaries,
    'final_summary': final_summary,                 
}, '../data/results/simpsons_episode_summary.joblib')

  data = pd.read_csv('../src/simpson_show_df.csv')


Summarizing text
Summarizing chunk 1 from 4
Summarizing chunk 2 from 4
Summarizing chunk 3 from 4
Summarizing chunk 4 from 4
Final summarization
Number of chunks used: 4

Chunk Summaries:

Chunk 1:
```json
{
  "summary": "The Simpsons' home is robbed by the Springfield Cat Burglar, who steals Lisa's saxophone, Bart's stamp collection, and Marge's necklace.  Homer is unconcerned, but the family discovers many of their neighbors have also been targeted.  Chief Wiggum's inept investigation leads to a city-wide panic.  Professor Frink demonstrates a high-tech security system that inadvertently causes chaos in the streets.  The burglar is eventually revealed to be Grampa Simpson, who was stealing to fund his gambling habit.  Despite the chaos and loss, Homer promises to get Lisa a replacement saxophone, and he is elected leader of the neighborhood watch, despite his questionable methods and lack of experience. The episode begins with the discovery of the robbery and ends with Homer taking c

['../data/results/simpsons_episode_summary.joblib']