In [None]:
%pip install openai
%pip install python-dotenv
%pip install pydub
%pip install ffmpeg-downloader
%ffdl install --add-path

In [None]:
from dotenv import load_dotenv
load_dotenv()

from pydub import AudioSegment
import os

# Optional overrides read from the environment / .env file: explicit locations
# of the ffmpeg and ffprobe executables.
ffmpeg_path = os.getenv("FFMPEG_PATH")
ffprobe_path = os.getenv("FFPROBE_PATH")


def _prepend_dir_to_path(binary_path: str) -> None:
    """Put the directory that contains *binary_path* at the front of PATH.

    Does nothing when *binary_path* has no directory component or when that
    directory is already listed in PATH.
    """
    directory = os.path.dirname(binary_path)
    if not directory:
        return
    current_path = os.environ.get("PATH", "")
    if directory in current_path.split(os.pathsep):
        return
    os.environ["PATH"] = f"{directory}{os.pathsep}{current_path}" if current_path else directory


if ffmpeg_path:
    print(f"Setting pydub.AudioSegment.ffmpeg to: {ffmpeg_path}")
    # `converter` is the attribute pydub documents for overriding the ffmpeg
    # binary. Assigning only the legacy `ffmpeg` alias on the class does not
    # update `converter`, so both are set here.
    AudioSegment.converter = ffmpeg_path
    AudioSegment.ffmpeg = ffmpeg_path

    # Resolve ffprobe: an explicit FFPROBE_PATH wins; otherwise look for an
    # "ffprobe" binary sitting next to ffmpeg.
    if ffprobe_path:
        resolved_ffprobe = ffprobe_path
        origin = "from FFPROBE_PATH"
    else:
        candidate = os.path.join(os.path.dirname(ffmpeg_path), "ffprobe")
        # On Windows the executable carries an ".exe" suffix.
        if os.name == "nt" and not os.path.exists(candidate) and os.path.exists(candidate + ".exe"):
            candidate += ".exe"
        resolved_ffprobe = candidate if os.path.exists(candidate) else None
        origin = "inferred alongside ffmpeg"

    if resolved_ffprobe:
        AudioSegment.ffprobe = resolved_ffprobe
        # NOTE(review): pydub appears to locate ffprobe by searching PATH rather
        # than by reading AudioSegment.ffprobe, so the binary's directory is
        # also put on PATH. This only helps when the file is actually named
        # "ffprobe" ("ffprobe.exe" on Windows) - confirm against the installed
        # pydub version.
        _prepend_dir_to_path(resolved_ffprobe)
        print(f"Setting pydub.AudioSegment.ffprobe to: {resolved_ffprobe} ({origin})")
    else:
        # Neither FFPROBE_PATH nor a sibling ffprobe binary is available.
        print("Info: FFPROBE_PATH not set and ffprobe not found alongside ffmpeg. pydub will search system PATH for ffprobe.")
else:
    print("FFMPEG_PATH not found in .env. pydub will try to find ffmpeg in the system PATH.")
    print("Info: pydub will also try to find ffprobe in the system PATH.")

def _iter_chunk_windows(total_ms: int, chunk_ms: int, step_ms: int):
    """Yield ``(start_ms, end_ms)`` windows that cover ``[0, total_ms)``.

    Consecutive windows start ``step_ms`` apart and are at most ``chunk_ms``
    long; the final window is clipped to ``total_ms``. Iteration stops after
    the window that reaches the end of the audio, or after the first window
    when ``step_ms`` is not positive (so a bad step can never loop forever).
    """
    start_ms = 0
    while start_ms < total_ms:
        end_ms = min(start_ms + chunk_ms, total_ms)
        yield start_ms, end_ms
        if end_ms >= total_ms or step_ms <= 0:
            return
        start_ms += step_ms


def split_audio_with_overlap(
    m4a_file_path: str,
    chunk_length_min: float = 2.0,
    overlap_sec: float = 5.0,
    output_dir: str = "audio_chunks"
) -> list[str]:
    """
    Splits an M4A audio file into smaller chunks with a specified overlap.

    Parameters are validated before the audio is decoded, so an invalid
    configuration fails fast and does not create the output directory.

    Args:
        m4a_file_path (str): Path to the input M4A audio file.
        chunk_length_min (float): Desired length of each chunk in minutes.
                                  Default is 2 minutes. Must be positive.
        overlap_sec (float): Desired overlap between chunks in seconds.
                             Default is 5 seconds. Must be non-negative and,
                             when the audio is longer than one chunk, smaller
                             than the chunk length.
        output_dir (str): Directory to save the output chunk files.
                          It will be created if it doesn't exist.

    Returns:
        list[str]: A list of absolute paths to the created chunk files, in
                   playback order. Returns an empty list if an error occurs
                   or no chunks are made. A chunk whose export fails is
                   reported and left out of the list; the remaining chunks
                   are still exported.
    """
    if not os.path.isfile(m4a_file_path):
        print(f"Error: Audio file not found or is not a file: {m4a_file_path}")
        return []

    # Validate the numeric parameters before doing any expensive work.
    try:
        chunk_length_ms = int(chunk_length_min * 60 * 1000)
        overlap_ms = int(overlap_sec * 1000)
    except (TypeError, ValueError, OverflowError) as e:
        # int() rejects NaN / infinity; multiplication rejects non-numbers.
        print(f"Error: Chunk length and overlap must be finite numbers: {e}")
        return []

    if chunk_length_ms <= 0:
        print("Error: Chunk length must be positive.")
        return []
    if overlap_ms < 0:
        print("Error: Overlap must be non-negative.")
        return []

    try:
        print(f"Loading audio file: {m4a_file_path}...")
        audio = AudioSegment.from_file(m4a_file_path, format="m4a")
        print("Audio file loaded successfully.")
    except Exception as e:
        # Decoding is delegated to pydub/ffmpeg; any failure there is reported
        # the same way, so the broad catch is deliberate.
        print(f"Error loading audio file '{m4a_file_path}': {e}")
        print("Please ensure FFmpeg is installed and accessible in your system's PATH.")
        print("You can download FFmpeg from https://ffmpeg.org/download.html")
        return []

    total_duration_ms = len(audio)
    if total_duration_ms == 0:
        print("Error: Audio file is empty.")
        return []

    # Distance between the starts of two consecutive chunks.
    step_ms = chunk_length_ms - overlap_ms
    if step_ms <= 0:
        if total_duration_ms > chunk_length_ms:
            # More than one chunk would be needed but the window cannot
            # advance. Fail instead of returning a truncated first chunk.
            print(f"Error: Overlap ({overlap_ms/1000}s) must be smaller than chunk length "
                  f"({chunk_length_ms/1000}s) when the audio is longer than one chunk.")
            return []
        print(f"Warning: Chunk length ({chunk_length_ms/1000}s) is not greater than overlap ({overlap_ms/1000}s). "
              "The audio fits in a single chunk, so the overlap is ignored.")

    # Resolve once so every returned path is absolute.
    abs_output_dir = os.path.abspath(output_dir)
    directory_existed = os.path.isdir(abs_output_dir)
    try:
        os.makedirs(abs_output_dir, exist_ok=True)
    except OSError as e:
        print(f"Error creating output directory '{output_dir}': {e}")
        return []
    if not directory_existed:
        print(f"Created output directory: {output_dir}")

    print(f"Total audio duration: {total_duration_ms / 1000:.2f} seconds.")
    print(f"Target chunk length: {chunk_length_ms / 1000:.2f} seconds.")
    print(f"Overlap: {overlap_ms / 1000:.2f} seconds.")

    min_chunk_ms = 100  # slices shorter than 0.1s are not worth exporting
    export_params = ["-strict", "experimental"]
    chunk_files: list[str] = []
    chunk_id = 0

    for start_ms, end_ms in _iter_chunk_windows(total_duration_ms, chunk_length_ms, step_ms):
        duration_ms = end_ms - start_ms
        print(f"Processing chunk {chunk_id:03d}: "
              f"Start: {start_ms/1000:.2f}s, End: {end_ms/1000:.2f}s, "
              f"Duration: {duration_ms/1000:.2f}s")

        if duration_ms < min_chunk_ms:
            print(f"  Skipping very short chunk {chunk_id:03d} (duration < 0.1s).")
            continue

        output_path = os.path.join(abs_output_dir, f"chunk_{chunk_id:03d}.m4a")
        try:
            print(f"  Exporting with codec='aac', bitrate='128k', params={export_params} to: {output_path}")
            audio[start_ms:end_ms].export(output_path,
                                          format="mp4",
                                          codec="aac",
                                          bitrate="128k",
                                          parameters=export_params)
        except Exception as e:
            # Best effort: report the failed chunk and keep exporting the rest.
            print(f"  Error exporting chunk {output_path}: {e}")
        else:
            chunk_files.append(output_path)
            print(f"  Exported: {output_path}")

        # The id advances even after a failed export so file names keep
        # matching the chunk's position in the audio.
        chunk_id += 1

        if end_ms == total_duration_ms:
            print("Reached end of audio.")

    if not chunk_files:
        print("No chunks were created. This might be due to a very short audio file or configuration issues.")
    else:
        print(f"\nSuccessfully created {len(chunk_files)} chunks in '{abs_output_dir}'.")
    return chunk_files

# --- Example Usage (you can run this in a separate cell or script) ---
# if __name__ == "__main__":
#     # Create a dummy M4A file for testing if you don't have one readily available
#     # This requires FFmpeg to be installed and working with pydub.
#     # print("Attempting to create a dummy M4A file for testing...")
#     # try:
#     #     # 4.5 minutes of silence
#     #     duration_ms = int(4.5 * 60 * 1000) 
#     #     silence = AudioSegment.silent(duration=duration_ms) 
#     #     dummy_file_path = "dummy_long_audio.m4a"
#     #     silence.export(dummy_file_path, format="m4a")
#     #     print(f"Dummy M4A file created at: {dummy_file_path}")
#     #     input_audio_file = dummy_file_path
#     # except Exception as e:
#     #     print(f"Could not create dummy M4A file: {e}")
#     #     print("Please ensure FFmpeg is installed and pydub can use it.")
#     #     print("Using a placeholder for input_audio_file. Replace it with your actual file.")
#     #     input_audio_file = "REPLACE_WITH_YOUR_LONG_AUDIO.m4a" # Placeholder
#
#     # --- Replace with the path to YOUR M4A file ---
#     input_audio_file = "REPLACE_WITH_YOUR_LONG_AUDIO.m4a" 
#     # Example: input_audio_file = "/path/to/my/long_recording.m4a"
#     # Example: input_audio_file = "C:/Users/YourName/Music/long_interview.m4a"
#
#     if input_audio_file == "REPLACE_WITH_YOUR_LONG_AUDIO.m4a" or not os.path.exists(input_audio_file):
#         print(f"\n--- PLEASE READ ---")
#         print(f"The example is currently set to use a placeholder file: '{input_audio_file}'.")
#         print(f"Please replace this with the actual path to your long M4A audio file to test the splitting function.")
#         print(f"If you uncommented the dummy file creation, ensure it was successful.")
#         print(f"-------------------")
#     else:
#         print(f"\nStarting audio splitting process for: {input_audio_file}")
#         # Split into 2-minute chunks with 5-second overlap
#         created_chunk_paths = split_audio_with_overlap(
#             input_audio_file,
#             chunk_length_min=2.0,
#             overlap_sec=5.0,
#             output_dir="my_split_audio_chunks" 
#         )
#
#         if created_chunk_paths:
#             print("\nList of created chunk files:")
#             for path in created_chunk_paths:
#                 print(path)
#             # Now you can iterate through created_chunk_paths and send each to your transcription function
#         else:
#             print("\nAudio splitting process completed, but no chunk files were generated.")
#
#     # To clean up the dummy file and directory if you created them:
#     # if os.path.exists("dummy_long_audio.m4a"):
#     #     os.remove("dummy_long_audio.m4a")
#     # if os.path.exists("my_split_audio_chunks"):
#     #     import shutil
#     #     shutil.rmtree("my_split_audio_chunks")



In [None]:
import openai
import os
import sys # Still useful for stderr in case of errors within the function
from dotenv import load_dotenv

load_dotenv()


transcriptions_list = []

def transcribe_audio_notebook(api_key: str, audio_file_path: str, model: str = "gpt-4o-mini-transcribe") -> str | None:
    """
    Transcribes an audio file using OpenAI's Whisper model.
    Suitable for use in a Jupyter Notebook.

    Args:
        api_key (str): The OpenAI API key.
        audio_file_path (str): The path to the audio file.
        model (str): The model to use for transcription (default: "gpt-4o-mini-transcribe").

    Returns:
        str: The transcribed text, or None if an error occurred.
    """
    if not api_key:
        print("Error: OpenAI API key is missing. Please provide a valid API key.", file=sys.stderr)
        return None

    try:
        client = openai.OpenAI(api_key=api_key)

        print(f"Opening audio file: {audio_file_path}")
        created_chunk_paths = split_audio_with_overlap(
             audio_file_path,
             chunk_length_min=2.0,
             overlap_sec=5.0,
             output_dir="my_split_audio_chunks" 
         )
        for chunk_f in created_chunk_paths:
            with open(chunk_f, "rb") as audio_file:
                print(f"Sending audio to OpenAI API using model: {model}...")
                transcription = client.audio.transcriptions.create(
                    model=model,
                    file=audio_file
                )
                print("\ntranscription.text")
                transcriptions_list.append(transcription.text)

        print("Transcription received from API.")
        return transcription.text
    except openai.APIConnectionError as e:
        print(f"OpenAI API Connection Error: {e}", file=sys.stderr)
    except openai.RateLimitError as e:
        print(f"OpenAI API Rate Limit Error: {e}", file=sys.stderr)
    except openai.AuthenticationError as e:
        print(f"OpenAI API Authentication Error: {e}", file=sys.stderr)
        print("Please check your API key.", file=sys.stderr)
    except openai.APIStatusError as e:
        print(f"OpenAI API Status Error (HTTP Status {e.status_code}): {e.message}", file=sys.stderr)
        if hasattr(e, 'response') and e.response and hasattr(e.response, 'content'):
            try:
                print(f"Response body: {e.response.content.decode()}", file=sys.stderr)
            except Exception:
                print(f"Response body (raw): {e.response.content}", file=sys.stderr)
    except FileNotFoundError:
        print(f"Error: Audio file not found at {audio_file_path}", file=sys.stderr)
    except Exception as e:
        print(f"An unexpected error occurred: {e}", file=sys.stderr)
    return None

# --- Example Usage in a Jupyter Notebook Cell ---

# 1. Set your OpenAI API key.
# Option A: read it from the environment (load_dotenv() above also loads a .env file).
my_api_key = os.getenv("OPENAI_API_KEY")

# Option B: paste your API key directly (less secure, especially if sharing the notebook)
# my_api_key = "sk-your_actual_api_key_here"

# Option C: prompt for the API key (more secure if typing it in)
# import getpass
# if not my_api_key:  # If not found in environment
#     my_api_key = getpass.getpass("Enter your OpenAI API Key: ")

if not my_api_key:
    print("API Key not found. Please set it via environment variable or directly in the script.", file=sys.stderr)
    # sys.exit(1) # In a notebook, you might just let it fail or handle differently
else:
    # 2. Specify the path to your audio file
    audio_path = "C:\\Users\\Hydra\\Downloads\\Voice 241228_104608.m4a"
    # e.g., audio_path = "/path/to/your/audio.wav"

    # 3. Choose the transcription model. Alternatives:
    #    gpt-4o-transcribe, gpt-4o-mini-transcribe, whisper-1
    model_to_use = "gpt-4o-mini-transcribe"

    print(f"\nAttempting to transcribe '{audio_path}' using model '{model_to_use}'...")

    # 4. Call the transcription function
    transcribed_text = transcribe_audio_notebook(my_api_key, audio_path, model=model_to_use)

    # 5. Report the outcome. The return value says whether the whole run
    #    succeeded; the shared list may still hold chunks from a partial run.
    if transcribed_text is not None:
        print("\nTranscription successful, please run next step")
    elif transcriptions_list:
        print(f"\nTranscription stopped early: only {len(transcriptions_list)} chunk(s) were transcribed. "
              "Please check the error messages above before running the next step.", file=sys.stderr)
    else:
        print("\nTranscription failed. Please check the error messages above.", file=sys.stderr)



In [None]:
import openai
import os
import sys # For stderr

# (Ensure you've already run !pip install openai in a previous cell if needed)

def summarize_text_openai(api_key: str, text_to_summarize: str, model: str = "gpt-3.5-turbo") -> str | None:
    """
    Summarizes a given text using OpenAI's Chat Completions API.

    Args:
        api_key (str): The OpenAI API key.
        text_to_summarize (str): The text content to be summarized.
        model (str): The OpenAI model to use for summarization (e.g., "gpt-3.5-turbo", "gpt-4").

    Returns:
        str: The summarized text, or None if an error occurred.
    """
    if not api_key:
        print("Error: OpenAI API key is missing. Please provide a valid API key.", file=sys.stderr)
        return None
    if not text_to_summarize.strip():
        print("Error: Text to summarize is empty.", file=sys.stderr)
        return None

    try:
        client = openai.OpenAI(api_key=api_key)
        
        # Constructing the prompt for summarization
        # You can customize the system message and user prompt for different summary styles or lengths.
        messages = [
            {"role": "system", "content": "You must summarise the provided transcription, trying to include all information given, keeping the summary in time order as much as possible. For instance if dealing with bee hive maintenance, try summarise by frame number in a hive etc. Each box contains 10 frames, some hives have honey super boxes as well as brood boxes. We will have tried to mention frame number when doing them, and mentioned what box we are looking at."},
            {"role": "user", "content": f"Please summarize the following transcript:\n\n{text_to_summarize}"}
        ]

        print(f"Sending text to OpenAI API for summarization using model: {model}...")
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=0.5,  # Lower temperature for more focused summaries
            max_tokens=10000   
        )
        
        summary = response.choices[0].message.content
        print("Summary received from API.")
        return summary.strip()

    except openai.APIConnectionError as e:
        print(f"OpenAI API Connection Error: {e}", file=sys.stderr)
    except openai.RateLimitError as e:
        print(f"OpenAI API Rate Limit Error: {e}", file=sys.stderr)
    except openai.AuthenticationError as e:
        print(f"OpenAI API Authentication Error: {e}", file=sys.stderr)
        print("Please check your API key.", file=sys.stderr)
    except openai.APIStatusError as e:
        print(f"OpenAI API Status Error (HTTP Status {e.status_code}): {e.message}", file=sys.stderr)
        if hasattr(e, 'response') and e.response and hasattr(e.response, 'content'):
            try:
                print(f"Response body: {e.response.content.decode()}", file=sys.stderr)
            except Exception:
                print(f"Response body (raw): {e.response.content}", file=sys.stderr)
    except Exception as e:
        print(f"An unexpected error occurred during summarization: {e}", file=sys.stderr)
    return None

# --- Example Usage in a Jupyter Notebook Cell ---

# Assuming 'transcriptions_list' is already populated from your previous transcription steps.
# For demonstration, let's create a sample transcriptions_list:
# transcriptions_list = [
#     "The first part of the meeting discussed quarterly earnings, which were above expectations.",
#     "Then, the team moved on to new product development, highlighting the upcoming X1 model.",
#     "Finally, there was a brief Q&A session addressing marketing strategies for the next fiscal year."
# ]
# Make sure transcriptions_list is defined and populated in a cell above this one.

# 1. Retrieve your OpenAI API Key (ensure this is set)
# Option A: From environment variable (if set before starting Jupyter)
my_api_key = os.getenv("OPENAI_API_KEY")

# Option B: Paste your API key directly (less secure)
# my_api_key = "sk-your_actual_api_key_here"

# Option C: Prompt for the API key (more secure if typing it in)
# import getpass
# if not my_api_key:
#     my_api_key = getpass.getpass("Enter your OpenAI API Key: ")

if not my_api_key:
    print("API Key not found. Please set it via environment variable or directly in the script.", file=sys.stderr)
else:
    # Look the list up by name so that running this cell before the
    # transcription cell prints the message below instead of raising NameError.
    available_transcripts = globals().get("transcriptions_list")

    if available_transcripts:
        # 2. Combine the list of transcriptions into a single string
        # Using a double newline to separate distinct transcription parts,
        # which can help the model understand them as separate utterances or segments.
        full_transcription_text = "\n\n".join(available_transcripts)

        print(f"\n--- Full Text to Summarize ({len(full_transcription_text.split())} words) ---")
        print(full_transcription_text)  # Prints the complete text, not a preview
        print("--- End of Full Text ---")

        # 3. Specify the model for summarization (optional, defaults to "gpt-3.5-turbo")
        # summarization_model = "gpt-4" # Or "gpt-4o" if you prefer and it's available
        summarization_model = "gpt-4.1"

        # 4. Call the summarization function
        summary_text = summarize_text_openai(my_api_key, full_transcription_text, model=summarization_model)

        # 5. Print the result
        if summary_text:
            print("\n--- Generated Summary ---")
            print(summary_text)
            print("--- End of Summary ---")
        else:
            print("\nSummarization failed. Please check the error messages above.", file=sys.stderr)
    else:
        print("\n'transcriptions_list' is not defined or is empty. Please ensure it contains text to summarize.", file=sys.stderr)

