In [30]:
import os
import glob
from openai import OpenAI
import logging

# Setup logging: timestamped INFO-level messages on the root logger.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Setup OpenAI API key and client; the key is read from the OPENAI_API_KEY
# environment variable (os.getenv returns None when it is unset).
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Define paths
# NOTE(review): the trailing dot in 'data/transcripts.' looks like a typo --
# confirm the directory is really named that way before changing it.
transcripts_dir = 'data/transcripts.'
organized_data_dir = "/Users/taha/Desktop/rag/data"
# exist_ok=True creates the directory only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(organized_data_dir, exist_ok=True)

# Define categories: the only values accepted as a transcript category.
# Each one is also used as the name of an output sub-directory.
categories = [
    "Geräte & Zubehör",
    "Hilfe bei Störungen",
    "Internet & Telefonie",
    "MagentaEINS",
    "Mobilfunk",
    "TV",
    "Vertrag & Rechnung",
    "Apps & Dienste",
    "Others",
]

# Characters a chat model commonly wraps around, or appends to, a short answer.
_CATEGORY_NOISE = " \t\r\n\"'`*.:;!“”„‘’"


def _match_category(raw_answer, valid_categories, fallback="Others"):
    """Map a raw model answer onto one of *valid_categories*.

    Surrounding whitespace, quotes and punctuation are ignored and the
    comparison is case-insensitive, so answers such as '"Mobilfunk".' still
    resolve to "Mobilfunk". Returns *fallback* when nothing matches.
    """
    cleaned = (raw_answer or "").strip(_CATEGORY_NOISE).casefold()
    for name in valid_categories:
        if name.casefold() == cleaned:
            return name
    return fallback


def _ask(system_prompt, user_prompt, **options):
    """Send one chat completion request and return the stripped reply text.

    Extra keyword arguments (e.g. max_tokens) are forwarded to the API call.
    Raises ValueError when the reply carries no text content.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        **options,
    )
    content = response.choices[0].message.content
    if content is None:
        raise ValueError("model returned no text content")
    return content.strip()


# Function to categorize transcript, summarize, and generate questions
def process_transcript(file_path):
    """Categorize, summarize and derive a question for one transcript file.

    The file at *file_path* is read as UTF-8 and sent to the chat model three
    times: once to pick a category from the module-level ``categories`` list,
    once for a German summary and once for a German question.

    Returns:
        A tuple ``(category, summary, question, transcript)``. On any failure
        (unreadable file, empty transcript, API error) the error is logged and
        ``("Unknown", None, None, None)`` is returned instead of raising.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            transcript = file.read()

        # An empty transcript gives the model nothing to work with; skip the
        # three API calls rather than store invented content.
        if not transcript.strip():
            logging.warning(f"Transcript {file_path} is empty; nothing to process.")
            return "Unknown", None, None, None

        # Request GPT-4 for category
        raw_category = _ask(
            "You are a helpful assistant.",
            f"Please categorize the following transcript into one of the categories: {', '.join(categories)}:\n\n{transcript}. Please provide only the chosen category name.",
            max_tokens=20,
        )
        # Ensure the category is valid; unrecognized answers become "Others".
        category = _match_category(raw_category, categories)

        # Request GPT-4 for summary
        summary = _ask(
            "You are a helpful telecom help assistant.",
            f"Please summarize the following transcript in a concise and clear manner in German. This summary will be used as webpage content:\n\n{transcript}",
        )

        # Request GPT-4 for generating a question
        question = _ask(
            "You are a helpful telecom help assistant.",
            f"Based on the following transcript, generate a relevant question in German:\n\n{transcript}",
            max_tokens=50,
        )

        return category, summary, question, transcript
    except Exception as e:
        # Best-effort batch job: log with traceback and signal failure to the
        # caller through the sentinel tuple instead of aborting the whole run.
        logging.exception(f"Error processing {file_path}: {e}")
        return "Unknown", None, None, None

# Process all transcript files: every *.txt in transcripts_dir is turned into
# a "Question / Answer" text file inside a per-category sub-directory of
# organized_data_dir.
for transcript_file in glob.glob(os.path.join(transcripts_dir, '*.txt')):
    filename = os.path.basename(transcript_file)
    title, _ = os.path.splitext(filename)

    category, summary, question, transcript = process_transcript(transcript_file)
    # process_transcript signals failure with empty/None results; skip those.
    if not (summary and question):
        logging.warning(f"Skipping {filename} due to errors.")
        continue

    # Create category directory if it doesn't exist
    category_dir = os.path.join(organized_data_dir, category)
    os.makedirs(category_dir, exist_ok=True)

    # Define output path with 'youtube_' prefix
    output_path = os.path.join(category_dir, f'youtube_{title}.txt')
    with open(output_path, 'w', encoding='utf-8') as combined_file:
        combined_file.write("Question:\n")
        combined_file.write(question + "\n\n")
        combined_file.write("Answer:\n")
        combined_file.write(summary + "\n\n")

        # To also append the raw transcript to the output file, enable these lines:
        #combined_file.write("Youtube transcript:\n")
        #combined_file.write(transcript)
    logging.info(f"Processed {filename} and saved to {output_path}")

# Report the directory the files were actually written to.
print(f"Processing complete. Combined files saved in their respective category folders under '{organized_data_dir}'.")

2024-09-01 16:39:33,368 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 16:39:48,641 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 16:39:52,419 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 16:39:52,423 - INFO - Processed Digitale Organisation mit Android-Geräten – Adressbuch, Kalender und Co. I Telekom Senioren-Akademie.txt and saved to /Users/taha/Desktop/rag/data/Geräte & Zubehör/youtube_Digitale Organisation mit Android-Geräten – Adressbuch, Kalender und Co. I Telekom Senioren-Akademie.txt
2024-09-01 16:39:53,073 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 16:40:04,699 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2024-09-01 16:40:05,925 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
20

Processing complete. Combined files saved in their respective category folders under 'website/organized_data'.
