Part 1 preprocessing


In [None]:
import zipfile  # For working with ZIP archives
import os  # For interacting with the operating system (file paths)
import re  # For regular expressions (text cleaning)
import shutil  # For high-level file operations (deleting directories)
from bs4 import BeautifulSoup  # For parsing HTML and XML


def preprocess_text(text):
    """
    Reduces a raw document to lowercase text without markup or punctuation.

    Args:
        text: The input text string (may contain HTML markup).

    Returns:
        The text extracted by Beautiful Soup, with every character that is
        neither a word character nor whitespace removed, in lowercase.
    """
    # Parse with the lxml backend and keep only the text nodes; each node is
    # stripped and the nodes are joined with a single space.
    visible_text = BeautifulSoup(text, "lxml").get_text(separator=" ", strip=True)

    # Punctuation is dropped first, lowercasing is the final step.
    without_punctuation = re.sub(r'[^\w\s]', '', visible_text)
    return without_punctuation.lower()


def load_and_preprocess_data(zip_file_path):
    """
    Loads and preprocesses every ``.txt`` file found in the ZIP archive.

    The archive is unpacked into a scratch ``temp_data`` directory in the
    current working directory. That directory is removed again before the
    function returns, including when processing fails part-way.

    Args:
        zip_file_path: The path to the ZIP file.

    Returns:
        A dictionary where keys are file types (the part of the filename
        before the first underscore) and values are lists of preprocessed
        text strings, in sorted filename order.
    """
    extract_root = 'temp_data'
    data = {}  # file type -> list of preprocessed texts

    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_root)

        # The archive is expected to wrap its files in a single folder. Pick
        # the first real directory (sorted, so the choice is deterministic,
        # and ignoring the macOS metadata folder) instead of blindly taking
        # os.listdir(...)[0], which may be a plain file or not exist at all.
        subfolders = sorted(
            entry for entry in os.listdir(extract_root)
            if entry != '__MACOSX' and os.path.isdir(os.path.join(extract_root, entry))
        )
        if subfolders:
            inner_folder = subfolders[0]
            source_dir = os.path.join(extract_root, inner_folder)
            print(f"Inner folder found: {inner_folder}")
        else:
            # Files were zipped without a wrapping folder (or the archive is empty).
            source_dir = extract_root
            print("No inner folder found; reading files from the archive root.")

        # os.listdir order is arbitrary; sort so the per-type lists are reproducible.
        for filename in sorted(os.listdir(source_dir)):
            if not filename.endswith(".txt"):
                continue

            # NOTE(review): only the text before the FIRST underscore is used,
            # so "product_metrics_3.txt" is filed under "product" -- confirm
            # that consumers of this dictionary expect those short keys.
            file_type = filename.split("_")[0]

            try:
                with open(os.path.join(source_dir, filename), 'r', encoding='utf-8') as file:
                    text = file.read()
            except UnicodeDecodeError:
                print(f"UnicodeDecodeError while reading {filename}. Skipping this file.")
                continue

            processed_text = preprocess_text(text)
            data.setdefault(file_type, []).append(processed_text)

            print(f"  Loaded file: {filename}")
            print(f"  Processed text (first 100 chars): {processed_text[:100]}...")
    finally:
        # Always remove the scratch directory, even if extraction or
        # preprocessing raised, so a failed run leaves nothing behind.
        if os.path.isdir(extract_root):
            try:
                shutil.rmtree(extract_root)
            except PermissionError:
                print("PermissionError: Could not delete 'temp_data' directory. Please close any open files and try again.")
            except OSError as e:
                print(f"Error deleting 'temp_data' directory: {e}")

    print(f"  Loaded data: {data}")  # Debug dump of everything that was loaded
    return data

Part 2 extraction

In [None]:
import time
import random
import openai
import os
from dotenv import load_dotenv

# Load environment variables (if not already loaded) via python-dotenv, then
# copy OPENAI_API_KEY from the environment onto the openai module.
# os.getenv returns None when the variable is not set.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

def extract_topics(text, delay_seconds):
    """
    Extracts topics from the given text using the ChatGPT API,
    guided by key themes and seed topics.

    Args:
        text: The text to analyse.
        delay_seconds: Seconds to sleep before the first API call, used to
            space out successive requests.

    Returns:
        A list of non-empty, stripped lines from the model's reply. An empty
        list is returned when every attempt hits the rate limit or when any
        other error occurs (the error is printed, not raised).
    """
    print(f"Extracting topics from text: {text[:50]}...")  # Print the first 50 characters of the input text

    # Key themes the extraction should focus on
    key_themes = [
        "user experience",
        "technical issues",
        "personalization",
        "accessibility",
        "peak hours",
        "customer support"
    ]

    # Seed topics that give the model domain context
    seed_topics = [
        "teen therapy",
        "group therapy",
        "session notes",
        "teletherapy audio",
        "crisis response",
        "user onboarding",
        "video quality",
        "appointment scheduling",
        "therapy companion app",
        "payment system",
        "cultural matching",
        "patient engagement",
        "therapist matching"
    ]

    # Construct the prompt for the ChatGPT API
    prompt = f"""
    Extract the key themes and topics discussed in the following text,
    focusing on aspects like {", ".join(key_themes)} and
    considering the context of {", ".join(seed_topics)}:

    Text: \"\"\"{text}\"\"\"
    """

    max_attempts = 5  # Total number of API calls made before giving up

    try:
        client = openai.OpenAI()  # Create an instance of the OpenAI client
        time.sleep(delay_seconds)  # Delay before the first call to avoid rate limits

        for attempt in range(max_attempts):
            try:
                response = client.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {
                            "role": "system",
                            "content": "You are a helpful assistant that extracts key topics from text."
                        },
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7,
                    max_tokens=150,
                    top_p=1.0,
                    frequency_penalty=0.0,
                    presence_penalty=0.0,
                )
            except openai.RateLimitError:
                if attempt == max_attempts - 1:
                    # Nothing is left to retry, so waiting out the longest
                    # backoff (4**4 s plus jitter) would only delay the failure.
                    break
                # Exponential backoff with random jitter
                wait_time = (4 ** attempt) + random.uniform(0, 5)
                print(f"Rate limit exceeded, waiting for {wait_time:.2f} seconds...")
                time.sleep(wait_time)
                continue

            # One topic per non-empty line of the reply
            raw_topics = response.choices[0].message.content.split('\n')
            topics = [topic.strip() for topic in raw_topics if topic.strip()]
            print(f"  Extracted topics from OpenAI: {topics}")
            return topics

        # Reached only when every attempt was rate limited
        print("Failed after multiple retries.")
        return []

    except Exception as e:
        # Best-effort contract: report the problem and return no topics
        print(f"Error in extract_topics: {e}")
        return []

Part 3 Sentiment Analysis and Data Point Extraction

In [None]:
import re  # Import the regular expression module
from nltk.sentiment import SentimentIntensityAnalyzer  # Import the sentiment analysis tool

# NLTK Downloads (you might need to uncomment these if you haven't downloaded them)
# nltk.download('vader_lexicon')  # Download the VADER lexicon for sentiment analysis

def analyze_sentiment(text):
    """
    Classifies the overall sentiment of a text with NLTK's VADER analyzer.

    Args:
        text: The input text string.

    Returns:
        "positive" when the compound score is at least 0.05, "negative" when
        it is at most -0.05, and "neutral" otherwise.
    """
    compound = SentimentIntensityAnalyzer().polarity_scores(text)['compound']

    # Guard clauses; the thresholds are the same as in the if/elif ladder.
    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"


def extract_data_points(text):
    """
    Extracts relevant data points (dates, percentages, numbers) from the text.

    Args:
        text: The input text string.

    Returns:
        A list of unique data-point strings in first-seen order: ISO dates
        (YYYY-MM-DD) first, then percentage values (without the ``%`` sign),
        then standalone numbers. An empty list is returned (and the error is
        printed) when ``text`` is not a string.
    """
    date_pattern = r"\b\d{4}-\d{2}-\d{2}\b"  # YYYY-MM-DD
    percentage_pattern = r"(\d+\.?\d*)%"  # Captures the value, not the '%'
    # Either a comma-grouped number (1,234,567) or a plain digit run of any
    # length (15000), with an optional decimal part. The lookbehinds keep the
    # match from starting mid-number or right after "<digit>-".
    number_pattern = r"(?<!\d-)(?<!\d)\b(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?\b(?!\d)"

    try:
        date_matches = list(re.finditer(date_pattern, text))
        dates = [match.group() for match in date_matches]
        percentages = re.findall(percentage_pattern, text)

        # A plain digit run would also match the year of a date, so skip any
        # number that starts inside a span already reported as a date.
        date_spans = [match.span() for match in date_matches]
        numbers = [
            match.group()
            for match in re.finditer(number_pattern, text)
            if not any(start <= match.start() < end for start, end in date_spans)
        ]
    except TypeError as e:
        # re raises TypeError for None and other non-string input
        print(f"Error in extract_data_points: {e}")
        return []

    # dict.fromkeys removes duplicates while keeping a deterministic order
    return list(dict.fromkeys(dates + percentages + numbers))

Part 4 generating story

In [None]:
import zipfile  # For working with zip files
import os  # For interacting with the operating system
import re  # For regular expressions
import shutil  # For high-level file operations
import nltk  # For natural language processing tasks
from nltk.sentiment import SentimentIntensityAnalyzer  # For sentiment analysis
import openai  # For using the OpenAI API
from dotenv import load_dotenv  # For loading environment variables

# Download necessary NLTK resources.
# NOTE(review): the code visible in this cell only uses the VADER sentiment
# analyzer; confirm whether the tokenizer, tagger, chunker and word-list
# downloads below are still needed.
nltk.download('vader_lexicon')  # Download the lexicon for sentiment analysis
nltk.download('punkt')  # Download the punkt sentence tokenizer
nltk.download('averaged_perceptron_tagger')  # Download the part-of-speech tagger
nltk.download('maxent_ne_chunker')  # Download the named entity recognition chunker
nltk.download('words')  # Download the words corpus

# Load environment variables (make sure you have a .env file with your OpenAI API key)
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")  # Set the API key for OpenAI; None if the variable is unset

def preprocess_text(text):
    """
    Normalises raw text for downstream analysis.

    Args:
        text: The raw text content to preprocess.

    Returns:
        The text with every character that is neither a word character nor
        whitespace removed, converted to lowercase.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    # No stemming or lemmatization is performed here.
    return non_word_or_space.sub('', text).lower()

def _split_type_and_index(filename):
    """Split 'product_metrics_3.txt' into ('product_metrics', 3); the index is 0 without a numeric suffix."""
    stem = os.path.splitext(filename)[0]
    match = re.fullmatch(r'(.+)_(\d+)', stem)
    if match:
        return match.group(1), int(match.group(2))
    return stem, 0


def load_and_preprocess_data(zip_file_path):
    """
    Loads and preprocesses data from the zip file.

    The archive is unpacked into a scratch ``temp_data`` directory in the
    current working directory, which is removed again before returning,
    including when processing fails part-way.

    Args:
        zip_file_path: Path to the zip file.

    Returns:
        A dictionary where keys are file types and values are lists of
        preprocessed text content for each file of that type. The file type
        is the filename without its extension and trailing ``_<number>``
        (``product_metrics_3.txt`` -> ``product_metrics``), which is the
        form generate_product_stories filters on. Within a type, texts are
        ordered by that number, so list position ``i`` corresponds to
        ``<type>_<i + 1>.txt`` when the files are numbered consecutively
        from 1.
    """
    extract_root = 'temp_data'
    data = {}  # file type -> list of preprocessed texts

    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_root)

        # Pick the first real directory (sorted for determinism, ignoring the
        # macOS metadata folder) rather than os.listdir(...)[0], which may be
        # a plain file or missing altogether.
        subfolders = sorted(
            entry for entry in os.listdir(extract_root)
            if entry != '__MACOSX' and os.path.isdir(os.path.join(extract_root, entry))
        )
        # Fall back to the extraction root when the files were zipped without
        # a wrapping folder (or the archive is empty).
        source_dir = os.path.join(extract_root, subfolders[0]) if subfolders else extract_root

        # os.listdir order is arbitrary; sort by (type, number) so that
        # "_2" comes before "_10" and results are reproducible.
        txt_files = [name for name in os.listdir(source_dir) if name.endswith(".txt")]
        txt_files.sort(key=lambda name: _split_type_and_index(name) + (name,))

        for filename in txt_files:
            # Splitting on the first underscore would file "product_metrics_3.txt"
            # under "product"; strip only the numeric suffix instead.
            file_type = _split_type_and_index(filename)[0]
            try:
                with open(os.path.join(source_dir, filename), 'r', encoding='utf-8') as file:
                    text = file.read()
            except UnicodeDecodeError:
                print(f"UnicodeDecodeError while reading {filename}. Skipping this file.")
                continue

            data.setdefault(file_type, []).append(preprocess_text(text))
    finally:
        # Always remove the scratch directory, even after an error.
        if os.path.isdir(extract_root):
            try:
                shutil.rmtree(extract_root)
            except PermissionError:
                print("PermissionError: Could not delete 'temp_data' directory. Please close any open files and try again.")
            except OSError as e:
                print(f"Error deleting 'temp_data' directory: {e}")

    return data

def extract_topics(text):
    """
    Extracts topics from the given text using the ChatGPT API,
    guided by key themes and seed topics.

    Args:
        text: The text to analyse.

    Returns:
        A list of non-empty, stripped lines from the model's reply, or an
        empty list if the request fails (the error is printed, not raised).
    """
    # Key themes from our analysis
    key_themes = [
        "user experience", "technical issues", "personalization",
        "accessibility", "peak hours", "customer support"
    ]

    # Seed topics (customize these based on the specific file being analyzed)
    seed_topics = [
        "teen therapy", "group therapy", "session notes", "teletherapy audio",
        "crisis response", "user onboarding", "video quality",
        "appointment scheduling", "therapy companion app", "payment system",
        "cultural matching", "patient engagement", "therapist matching"
    ]

    # Construct the prompt with key themes and seed topics
    prompt = f"""
    Extract the key themes and topics discussed in the following text, 
    focusing on aspects like {", ".join(key_themes)} and 
    considering the context of {", ".join(seed_topics)}:

    {text}
    """

    try:
        # Use the v1 client interface, as generate_summary_with_chatgpt does.
        # The legacy openai.ChatCompletion.create call was removed in
        # openai>=1.0 and raises there, which made this function always
        # return [].
        client = openai.OpenAI()
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant that extracts key topics from text."},
                {"role": "user", "content": prompt},
            ],
        )
        # v1 responses are objects, not dicts; content may be None
        content = response.choices[0].message.content or ""
        # One topic per non-empty line of the reply
        return [line.strip() for line in content.split('\n') if line.strip()]
    except Exception as e:
        # Best-effort contract: report the problem and return no topics
        print(f"Error in extract_topics: {e}")
        return []

def analyze_sentiment(text):
    """
    Labels the text as "positive", "negative" or "neutral".

    The label is derived from VADER's compound score: at least 0.05 is
    positive, at most -0.05 is negative, anything else is neutral.
    """
    scores = SentimentIntensityAnalyzer().polarity_scores(text)
    compound = scores['compound']
    return "positive" if compound >= 0.05 else "negative" if compound <= -0.05 else "neutral"

def extract_data_points(text):
    """
    Extracts relevant data points (percentages, numbers, dates) from the text.

    Args:
        text: The input text string.

    Returns:
        A list of unique data-point strings in first-seen order: percentage
        values (without the ``%`` sign), then numbers, then YYYY-MM-DD dates.
        Digits that belong to a date are reported only as part of that date.
        An empty list is returned (and the error is printed) when ``text``
        is not a string.
    """
    percentage_pattern = r"(\d+\.?\d*)%"  # Captures the value, not the '%'
    number_pattern = r"(\d+(?:,\d{3})*(?:\.\d+)?)"  # Integers, comma-grouped numbers, decimals
    date_pattern = r"(\d{4}-\d{2}-\d{2})"  # YYYY-MM-DD

    try:
        percentages = re.findall(percentage_pattern, text)

        date_matches = list(re.finditer(date_pattern, text))
        dates = [match.group() for match in date_matches]
        date_spans = [match.span() for match in date_matches]

        # The number pattern also matches the year, month and day of every
        # date; drop any number that overlaps a span reported as a date.
        numbers = [
            match.group()
            for match in re.finditer(number_pattern, text)
            if not any(match.start() < end and match.end() > start for start, end in date_spans)
        ]
    except TypeError as e:
        # re raises TypeError for None and other non-string input
        print(f"Error in extract_data_points: {e}")
        return []

    # Every percentage value is matched again as a number; dict.fromkeys
    # removes such duplicates while keeping the order stable.
    return list(dict.fromkeys(percentages + numbers + dates))

def generate_summary_with_chatgpt(topics, sentiment, data_points, references):
    """
    Asks the ChatGPT API for a concise product-story summary.

    Args:
        topics: Topic strings; joined with ", " into the prompt.
        sentiment: Sentiment label inserted into the prompt.
        data_points: Data-point strings; joined with ", " into the prompt.
        references: Reference strings; joined with ", " into the prompt.

    Returns:
        The stripped reply text, or None if anything goes wrong (the error
        is printed, not raised).
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that generates product story summaries.",
    }
    # Sampling settings passed to the API unchanged
    sampling_options = {
        "temperature": 0.7,
        "max_tokens": 250,  # Adjust as needed for summary length
        "top_p": 1.0,
        "frequency_penalty": 0.0,
        "presence_penalty": 0.0,
    }

    try:
        api_client = openai.OpenAI()

        prompt = f"""
        Generate a concise summary for a product story, based on the following information:

        Extracted Topics: {", ".join(topics)}

        Sentiment: {sentiment}

        Data Points: {", ".join(data_points)}

        References: {", ".join(references)}

        The summary should highlight key insights, user experiences, and relevant data points.
        Focus on clarity and conciseness.
        """

        completion = api_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[system_message, {"role": "user", "content": prompt}],
            **sampling_options,
        )
        return completion.choices[0].message.content.strip()

    except Exception as error:
        print(f"Error in generate_summary_with_chatgpt: {error}")
        return None

def generate_story(story_type, topics, sentiment, data_points, references, text):
    """
    Generates a product story based on the given information and detailed templates.

    Args:
        story_type: One of "Concern", "Win", "Insight" or "Opportunity".
        topics: List of topic strings extracted from the source text.
        sentiment: Sentiment label; passed to the summary prompt, and
            "negative" switches the fallback summary to "concerns regarding".
        data_points: List of data-point strings passed to the summary prompt.
        references: List of source file names cited in the story.
        text: The source text. Currently unused; kept so existing callers
            keep working.

    Returns:
        The story as a Markdown-style string, or the string
        "Error: Invalid story type." when ``story_type`` is unknown.
    """
    print(f"      Generating story of type: {story_type}")

    # Define templates for different story types
    templates = {
        "Concern": {
            "headline_template": "[Problem/Issue] is Negatively Impacting [User/Area]",
            "summary_template": "Analysis of {references} reveals concerns regarding {topics}. {data_points} indicate a negative impact on {user_group}.",
            "impact_template": "This issue has led to {impact}.",
            "user_perspective_template": "Users have expressed {user_perspectives} about this problem.",
            "recommendation_template": "To address this concern, it is recommended to {recommendations}.",
        },
        "Win": {
            "headline_template": "[Positive Outcome/Achievement] in [Area/Feature]",
            "summary_template": "{data_points} highlight the success of {feature} in {references}. User feedback indicates {positive_feedback}.",
            "contributing_factors_template": "This success can be attributed to {factors}.",
            "impact_template": "This positive outcome has resulted in {impact}.",
            "user_perspective_template": "Users have expressed {user_perspectives} about this feature.",
            "recommendation_template": "To maintain and further enhance this success, it is recommended to {recommendations}."
        },
        "Insight": {
            "headline_template": "Key Insight: [New Understanding/Discovery] in [Area/Feature]",
            "summary_template": "Analysis of {references} reveals a key insight: {insight}. This has implications for {area}.",
            "evidence_template": "This insight is supported by the following evidence: {evidence}.",
            "implication_template": "This insight suggests that {implication}.",
            "recommendation_template": "We recommend {recommendations} to leverage this insight."
        },
        "Opportunity": {
            "headline_template": "Opportunity to [Improve/Innovate] in [Area/Feature]",
            "summary_template": "{references} highlight an opportunity to {action} in {area}. This could lead to {benefits}.",
            "potential_template": "This opportunity has the potential to {potential}.",
            "recommendation_template": "To capitalize on this opportunity, it is recommended to {recommendations}."
        }
    }

    template = templates.get(story_type)  # Get the template for the specified story type
    if not template:
        return "Error: Invalid story type."  # Return an error message if the story type is invalid

    # Arguments follow the helper's (topics, sentiment, data_points, references)
    # signature. Passing `text` first made the prompt join the text's
    # individual characters as "topics" and shifted every other argument.
    summary = generate_summary_with_chatgpt(topics, sentiment, data_points, references)
    if summary is None:  # Summary generation failed; fall back to a template sentence
        summary = f"Analysis of {', '.join(references)} reveals {'concerns regarding' if sentiment == 'negative' else 'insights into'} {', '.join(topics)}."

    # Placeholder values (you might want to refine these based on your analysis)
    placeholder_values = {
        "Concern": {
            "impact": "increased frustration and decreased engagement among users",
            "user_perspectives": "'The app is slow and buggy,' and 'I'm having trouble finding the information I need'",
            "recommendations": "optimize the app's performance, improve navigation, and provide clearer instructions"
        },
        "Win": {
            "factors": "the user-friendly interface, personalized content, and effective communication tools",
            "impact": "increased user satisfaction, higher engagement rates, and improved clinical outcomes",
            "user_perspectives": "'The new onboarding process is much smoother and more intuitive,' and 'I feel like the app really understands my needs now'",
            "recommendations": "continue gathering user feedback, expand the platform's reach, and explore new features based on user needs"
        },
        "Insight": {
            "evidence": "Data from product_metrics_3.txt shows an 85% client retention rate and a 92% session completion rate for teen therapy.",
            "implication": "Teenagers are effectively engaging with the platform and finding value in the services offered.",
            "recommendations": "Continue to monitor these metrics and gather feedback to ensure ongoing satisfaction and engagement among teen users."
        },
        "Opportunity": {
            "potential": "significantly enhance the user experience, improve accessibility, and potentially increase user engagement and retention.",
            "recommendations": "conduct user research to gather more specific feedback, explore potential design solutions, and prioritize development based on user needs and impact."
        }
    }
    values = placeholder_values[story_type]

    def render(template_key, field):
        # A story type that lacks the template renders an empty line. Looking
        # the value up with .get() avoids the KeyError that "Insight" and
        # "Opportunity" hit on 'impact' and 'user_perspectives'.
        return template.get(template_key, '').format(**{field: values.get(field, '')})

    # NOTE(review): the bracketed tokens in the headline templates are not
    # str.format fields, so the headline is emitted verbatim; a real
    # substitution scheme still needs to be decided.
    headline = template['headline_template']

    # Generate the story using the template and extracted information
    story = f"""
    ## {headline}

    Summary: {summary}

    {render('impact_template', 'impact')}
    {render('user_perspective_template', 'user_perspectives')}
    {render('contributing_factors_template', 'factors')}
    {render('evidence_template', 'evidence')}
    {render('implication_template', 'implication')}
    {render('potential_template', 'potential')}
    {render('recommendation_template', 'recommendations')}

    **References:**
    {", ".join(references)}
    """
    return story

def generate_product_stories(preprocessed_data):
    """
    Builds one product story per text of every supported file type.

    Args:
        preprocessed_data: Dictionary mapping a file type to a list of texts.

    Returns:
        A list with the generated stories, in processing order.
    """
    supported_types = (
        "feedback_analysis",
        "interview_transcripts",
        "product_metrics",
        "product_intelligence_report",
        "user_journey",
    )
    # Basic story type classification; any other sentiment becomes "Insight"
    story_type_by_sentiment = {"negative": "Concern", "positive": "Win"}

    all_stories = []
    for file_type, files in preprocessed_data.items():
        if file_type not in supported_types:
            continue

        for position, text in enumerate(files, start=1):
            source_name = f"{file_type}_{position}.txt"
            print(f"Processing file: {source_name}")

            topics = extract_topics(text)
            sentiment = analyze_sentiment(text)
            data_points = extract_data_points(text)
            print(f"  Topics: {topics}")
            print(f"  Sentiment: {sentiment}")
            print(f"  Data points: {data_points}")

            story_type = story_type_by_sentiment.get(sentiment, "Insight")
            story = generate_story(
                story_type,
                topics,
                sentiment,
                data_points,
                [source_name],
                text,
            )
            print(f"  Generated story:\n{story}\n")

            all_stories.append(story)
    return all_stories

# Example usage: run the full pipeline on a local archive. This executes as
# soon as the cell runs and calls the OpenAI API for each processed file.
zip_file_path = r'C:\Users\Anshd\Downloads\data\take_home_data.zip'  # Replace with your actual zip file path
preprocessed_data = load_and_preprocess_data(zip_file_path)
stories = generate_product_stories(preprocessed_data)

# Print the generated stories, numbered from 1
for i, story in enumerate(stories):
    print(f"--- Story {i+1} ---\n{story}\n")

In [None]:
# testing part 3 with sample test

import re
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk

# NLTK Downloads (if not already downloaded)
# nltk.download('vader_lexicon')

def analyze_sentiment(text):
    """Returns "positive", "negative" or "neutral" based on VADER's compound score for the text."""
    analyzer = SentimentIntensityAnalyzer()
    score = analyzer.polarity_scores(text)['compound']

    # Checked in order; the first condition that holds decides the label.
    labelled_checks = (
        ("positive", score >= 0.05),
        ("negative", score <= -0.05),
    )
    for label, holds in labelled_checks:
        if holds:
            return label
    return "neutral"

def extract_data_points(text):
    """
    Extracts relevant data points (dates, percentages, numbers) from the text.

    Returns a list of unique strings in first-seen order: YYYY-MM-DD dates,
    then percentage values (without the ``%`` sign), then standalone numbers.
    Returns an empty list (and prints the error) when ``text`` is not a string.
    """
    date_pattern = r"\b\d{4}-\d{2}-\d{2}\b"
    percentage_pattern = r"(\d+\.?\d*)%"
    # Comma-grouped number or a plain digit run of any length, optionally
    # with decimals. Limiting the run to three digits dropped values such as
    # "15000" entirely.
    number_pattern = r"(?<!\d-)(?<!\d)\b(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?\b(?!\d)"

    try:
        date_matches = list(re.finditer(date_pattern, text))
        percentages = re.findall(percentage_pattern, text)

        # Skip numbers that start inside a date (the year would match otherwise).
        date_spans = [match.span() for match in date_matches]
        numbers = [
            match.group()
            for match in re.finditer(number_pattern, text)
            if not any(start <= match.start() < end for start, end in date_spans)
        ]
    except TypeError as e:
        # re raises TypeError for None and other non-string input
        print(f"Error in extract_data_points: {e}")
        return []

    dates = [match.group() for match in date_matches]
    # Order-preserving de-duplication keeps the output deterministic.
    return list(dict.fromkeys(dates + percentages + numbers))

# Example Usage for Testing: run both helpers on a small hand-written sample
# that contains one percentage, one ISO date and one comma-grouped number.
sample_text = """
This is a sample text with some data points. The user satisfaction rate is 95.5%.
The app was launched on 2023-05-15. There are 1,234 active users. The feedback is positive.
"""
sentiment = analyze_sentiment(sample_text)
data_points = extract_data_points(sample_text)

# Show the results for manual inspection
print(f"Sentiment for sample text: {sentiment}")
print(f"Data points extracted: {data_points}")

In [None]:
# code without summarisation
import zipfile
import os
import re
import shutil
import nltk
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import openai
import time
import random

# --- PART 1: Data Loading and Preprocessing ---

# Load environment variables via python-dotenv (the OpenAI key is read from them in Part 2)
load_dotenv()

# NLTK Downloads (you might need to uncomment these if you haven't downloaded them)
# nltk.download('vader_lexicon')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('maxent_ne_chunker')
# nltk.download('words')

def preprocess_text(text):
    """Returns the text without markup or punctuation, in lowercase."""
    # Let Beautiful Soup (lxml backend) pull out the text nodes, each
    # stripped and joined by a single space.
    parsed = BeautifulSoup(text, "lxml")
    plain = parsed.get_text(separator=" ", strip=True)

    # Keep only word characters and whitespace, then lowercase.
    return re.sub(r'[^\w\s]', '', plain).lower()

def load_and_preprocess_data(zip_file_path):
    """
    Loads and preprocesses data from the zip file.

    The archive is unpacked into a scratch ``temp_data`` directory, which is
    removed again before returning, including when processing fails.

    Args:
        zip_file_path: Path to the zip file.

    Returns:
        A dictionary mapping a file type (the part of the filename before
        the first underscore) to the list of preprocessed texts of that
        type, in sorted filename order.
    """
    extract_root = 'temp_data'
    data = {}

    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_root)

        # Choose the first real directory (sorted, skipping macOS metadata)
        # instead of os.listdir(...)[0], which can be a plain file or fail
        # with IndexError on an empty archive.
        subfolders = sorted(
            entry for entry in os.listdir(extract_root)
            if entry != '__MACOSX' and os.path.isdir(os.path.join(extract_root, entry))
        )
        if subfolders:
            inner_folder = subfolders[0]
            source_dir = os.path.join(extract_root, inner_folder)
            print(f"Inner folder found: {inner_folder}")
        else:
            # No wrapping folder: read whatever sits at the archive root.
            source_dir = extract_root
            print("No inner folder found; reading files from the archive root.")

        # Sorted so the per-type lists do not depend on directory order.
        for filename in sorted(os.listdir(source_dir)):
            if not filename.endswith(".txt"):
                continue

            # NOTE(review): "product_metrics_3.txt" ends up under the key
            # "product" here -- confirm that is what consumers expect.
            file_type = filename.split("_")[0]

            try:
                with open(os.path.join(source_dir, filename), 'r', encoding='utf-8') as file:
                    text = file.read()
            except UnicodeDecodeError:
                print(f"UnicodeDecodeError while reading {filename}. Skipping this file.")
                continue

            processed_text = preprocess_text(text)
            data.setdefault(file_type, []).append(processed_text)

            print(f"  Loaded file: {filename}")
            print(f"  Processed text (first 100 chars): {processed_text[:100]}...")
    finally:
        # Clean up even when extraction or preprocessing raised.
        if os.path.isdir(extract_root):
            try:
                shutil.rmtree(extract_root)
            except PermissionError:
                print("PermissionError: Could not delete 'temp_data' directory. Please close any open files and try again.")
            except OSError as e:
                print(f"Error deleting 'temp_data' directory: {e}")

    print(f"  Loaded data: {data}")
    return data

# --- PART 2: Topic Extraction ---

# Copy the API key from the environment onto the openai module (None if OPENAI_API_KEY is unset)
openai.api_key = os.getenv("OPENAI_API_KEY")

def extract_topics(text, delay_seconds=0):
    """
    Extracts topics from the given text using the ChatGPT API,
    guided by key themes and seed topics.

    Rate-limit errors are retried with exponential backoff (up to five
    attempts in total); no backoff sleep happens after the final attempt
    because no further request would follow it.

    Args:
        text: The text to analyse. Blank or empty text is not sent to the API.
        delay_seconds: Seconds to sleep before the first API call. Defaults
            to 0 so callers that do not need throttling can omit it.

    Returns:
        A list with the non-empty, stripped lines of the model's reply, or
        an empty list when the text is blank, the rate limit persists
        through every attempt, or any other error occurs.
    """
    if not text or not text.strip():
        print("No text supplied to extract_topics; skipping API call.")
        return []

    print(f"Extracting topics from text: {text[:50]}...")

    key_themes = [
        "user experience", "technical issues", "personalization",
        "accessibility", "peak hours", "customer support"
    ]

    seed_topics = [
        "teen therapy", "group therapy", "session notes", "teletherapy audio",
        "crisis response", "user onboarding", "video quality",
        "appointment scheduling", "therapy companion app", "payment system",
        "cultural matching", "patient engagement", "therapist matching"
    ]

    prompt = f"""
    Extract the key themes and topics discussed in the following text,
    focusing on aspects like {", ".join(key_themes)} and
    considering the context of {", ".join(seed_topics)}:

    Text: \"\"\"{text}\"\"\"
    """

    max_attempts = 5

    try:
        client = openai.OpenAI()
        time.sleep(delay_seconds)  # Delay before the first API call
        for attempt in range(max_attempts):
            try:
                response = client.chat.completions.create(
                    model="gpt-3.5-turbo",  # Or a cheaper model if available
                    messages=[
                        {"role": "system", "content": "You are a helpful assistant that extracts key topics from text."},
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7,
                    max_tokens=150,
                    top_p=1.0,
                    frequency_penalty=0.0,
                    presence_penalty=0.0,
                )
            except openai.RateLimitError:
                if attempt == max_attempts - 1:
                    break  # Out of retries: sleeping again would only waste time
                wait_time = (4 ** attempt) + random.uniform(0, 5)  # Aggressive backoff
                print(f"Rate limit exceeded, waiting for {wait_time:.2f} seconds...")
                time.sleep(wait_time)
                continue

            # The message content may be None; treat that as an empty reply.
            content = response.choices[0].message.content or ""
            topics = [line.strip() for line in content.split('\n') if line.strip()]
            print(f"  Extracted topics from OpenAI: {topics}")
            return topics

        print("Failed after multiple retries.")
        return []

    except Exception as e:
        # Deliberate best-effort boundary: any API/client failure yields no topics.
        print(f"Error in extract_topics: {e}")
        return []

# --- PART 3: Sentiment Analysis and Data Point Extraction ---

def analyze_sentiment(text):
    """Label *text* as "positive", "negative" or "neutral".

    The label is derived from the ``compound`` polarity score reported by
    ``SentimentIntensityAnalyzer``: scores of 0.05 and above are positive,
    scores of -0.05 and below are negative, anything in between is neutral.

    Args:
        text: The text to score.

    Returns:
        One of the strings "positive", "negative" or "neutral".
    """
    compound = SentimentIntensityAnalyzer().polarity_scores(text)['compound']

    if compound >= 0.05:
        return "positive"
    if compound <= -0.05:
        return "negative"
    return "neutral"

def extract_data_points(text):
    """Extracts relevant data points (dates, percentages, numbers) from the text.

    Args:
        text: The text to scan.

    Returns:
        A list of unique strings in a reproducible order: ISO dates
        (``YYYY-MM-DD``) first, then percentage values (without the ``%``
        sign), then remaining numbers, each group in order of first
        appearance. Returns an empty list when ``text`` is not a string.
    """
    date_pattern = r"\b\d{4}-\d{2}-\d{2}\b"
    percentage_pattern = r"(\d+\.?\d*)%"
    # Either a comma-grouped number (1,250) or a plain run of digits of any
    # length (1500), with an optional decimal part. The look-behinds keep
    # the trailing part of a hyphenated sequence from being reported.
    number_pattern = (
        r"(?<!\d-)(?<!\d)\b"
        r"(?:\d{1,3}(?:,\d{3})+|\d+)"
        r"(?:\.\d+)?\b(?!\d)"
    )

    try:
        dates = re.findall(date_pattern, text)
        percentages = re.findall(percentage_pattern, text)
        # Blank out complete dates first so their year is not also
        # reported as a stand-alone number.
        numbers = re.findall(number_pattern, re.sub(date_pattern, " ", text))
    except TypeError as e:
        # Raised by ``re`` when ``text`` is not a string (e.g. None).
        print(f"Error in extract_data_points: {e}")
        return []

    # dict.fromkeys removes duplicates while keeping first-seen order,
    # unlike set(), whose ordering can differ between runs.
    return list(dict.fromkeys(dates + percentages + numbers))

# --- PART 4: Story Generation ---

def generate_story(story_type, topics, sentiment, data_points, references, text):
    """
    Generates a product story based on the given information and detailed templates.

    Args:
        story_type: One of "Concern", "Win", "Insight" or "Opportunity".
        topics: List of topic strings; the first one is used as the main
            subject. May be empty, in which case a generic subject is used.
        sentiment: Sentiment label of the source text; used to describe the
            user feedback in "Win" stories.
        data_points: List of extracted data point strings. May be empty.
        references: List of source names cited in the story. May be empty.
        text: The source text (currently not used by the templates).

    Returns:
        The story as a Markdown string (headline, summary, the sections
        defined for the story type, recommendation and references), or the
        string "Error: Invalid story type." for an unknown ``story_type``.
    """
    print(f"      Generating story of type: {story_type}")

    # Templates per story type. Only the sections listed for a type are
    # rendered; every ``{placeholder}`` is filled from ``values`` below.
    templates = {
        "Concern": {
            "headline_template": "{problem} is Negatively Impacting {user}",
            "summary_template": "Analysis of {references} reveals concerns regarding {topics}. {data_points} indicate a negative impact on {user_group}.",
            "impact_template": "This issue has led to {impact}.",
            "user_perspective_template": "Users have expressed {user_perspectives} about this problem.",
            "recommendation_template": "To address this concern, it is recommended to {recommendations}.",
        },
        "Win": {
            "headline_template": "Positive Outcome in {area}",
            "summary_template": "{data_points} highlight the success of {feature} in {references}. User feedback indicates {positive_feedback}.",
            "contributing_factors_template": "This success can be attributed to {factors}.",
            "impact_template": "This positive outcome has resulted in {impact}.",
            "user_perspective_template": "Users have expressed {user_perspectives} about this feature.",
            "recommendation_template": "To maintain and further enhance this success, it is recommended to {recommendations}."
        },
        "Insight": {
            "headline_template": "Key Insight in {area}",
            "summary_template": "Analysis of {references} reveals a key insight: {insight}. This has implications for {area}.",
            "evidence_template": "This insight is supported by the following evidence: {evidence}.",
            "implication_template": "This insight suggests that {implication}.",
            "recommendation_template": "We recommend {recommendations} to leverage this insight."
        },
        "Opportunity": {
            "headline_template": "Opportunity to Improve {area}",
            "summary_template": "{references} highlight an opportunity to {action} in {area}. This could lead to {benefits}.",
            "potential_template": "This opportunity has the potential to {potential}.",
            "recommendation_template": "To capitalize on this opportunity, it is recommended to {recommendations}."
        }
    }

    template = templates.get(story_type)
    if not template:
        return "Error: Invalid story type."

    # Canned section content per story type. Each type only provides the
    # values its own templates use.
    canned_by_type = {
        "Concern": {
            "impact": "increased frustration and decreased engagement among users",
            "user_perspectives": "'The app is slow and buggy,' and 'I'm having trouble finding the information I need'",
            "recommendations": "optimize the app's performance, improve navigation, and provide clearer instructions",
        },
        "Win": {
            "factors": "the user-friendly interface, personalized content, and effective communication tools",
            "impact": "increased user satisfaction, higher engagement rates, and improved clinical outcomes",
            "user_perspectives": "'The new onboarding process is much smoother and more intuitive,' and 'I feel like the app really understands my needs now'",
            "recommendations": "continue gathering user feedback, expand the platform's reach, and explore new features based on user needs",
        },
        "Insight": {
            "evidence": "Data from product_metrics_3.txt shows an 85% client retention rate and a 92% session completion rate for teen therapy.",
            "implication": "Teenagers are effectively engaging with the platform and finding value in the services offered.",
            "recommendations": "Continue to monitor these metrics and gather feedback to ensure ongoing satisfaction and engagement among teen users.",
        },
        "Opportunity": {
            "potential": "significantly enhance the user experience, improve accessibility, and potentially increase user engagement and retention.",
            "recommendations": "conduct user research to gather more specific feedback, explore potential design solutions, and prioritize development based on user needs and impact.",
        },
    }

    # Tolerate empty inputs (e.g. topic extraction failed upstream).
    topic_list = [str(topic) for topic in topics] if topics else []
    primary_topic = topic_list[0] if topic_list else "the product experience"
    reference_list = ", ".join(str(ref) for ref in references) if references else ""
    data_point_list = ", ".join(str(point) for point in data_points) if data_points else ""

    values = {
        "problem": primary_topic,
        "user": "users",
        "area": primary_topic,
        "feature": primary_topic,
        "insight": primary_topic,
        "user_group": "users",
        "action": "improve",
        "benefits": "better user experience",
        "topics": ", ".join(topic_list) if topic_list else primary_topic,
        "references": reference_list or "the available sources",
        "data_points": data_point_list or "The available data",
        "positive_feedback": f"{sentiment} sentiment" if sentiment else "a favourable response",
    }
    # The templates supply their own sentence-ending period, so drop a
    # trailing one from the canned text to avoid "..".
    values.update(
        {name: content.rstrip(".") for name, content in canned_by_type[story_type].items()}
    )

    optional_sections = (
        "impact_template",
        "user_perspective_template",
        "contributing_factors_template",
        "evidence_template",
        "implication_template",
        "potential_template",
    )

    parts = [
        "## " + template["headline_template"].format(**values),
        template["summary_template"].format(**values),
    ]
    parts.extend(
        template[section].format(**values)
        for section in optional_sections
        if section in template
    )
    parts.append(template["recommendation_template"].format(**values))
    parts.append("**References:**\n" + reference_list)

    return "\n\n".join(parts) + "\n"

def _classify_story_type(sentiment, topics):
    """Choose the story type for one document from its sentiment and topics."""
    concern_topics = {
        "technical issues", "connectivity problems", "errors", "delays",
        "sync issues", "audio quality", "video quality", "payment delays",
        "accessibility issues", "limited availability", "mismatch",
        "long wait times",
    }
    win_topics = {
        "user satisfaction", "clinical improvement", "positive feedback",
        "high engagement", "effective communication",
        "successful implementation", "ease of use", "intuitive interface",
        "personalized experience", "high adoption rate",
    }
    insight_topics = {
        "user behavior", "new discovery", "insights", "trends",
        "peak hours", "user journey",
    }
    opportunity_topics = {
        "opportunity", "potential", "enhancement", "innovation",
        "future development",
    }

    def mentions(keywords):
        return any(topic in keywords for topic in topics)

    if sentiment == "negative" and mentions(concern_topics):
        return "Concern"
    if sentiment == "positive" and mentions(win_topics):
        return "Win"
    if sentiment == "neutral" and mentions(insight_topics):
        return "Insight"
    # Reached whenever the sentiment-specific rules did not match, so a
    # story type is always returned.
    if mentions(opportunity_topics):
        return "Opportunity"
    return "Insight"  # Default to "Insight" if no other rules match


def generate_product_stories(preprocessed_data, delay_seconds=0):
    """
    Generates product stories for the given preprocessed data.

    Args:
        preprocessed_data: Dictionary mapping a file type to a list of
            preprocessed text strings.
        delay_seconds: Seconds to wait before each topic-extraction API
            call; passed through to ``extract_topics``. Defaults to 0.

    Returns:
        A list with one generated story string per processed text. File
        types that are not among the supported ones are skipped.
    """
    supported_file_types = (
        "feedback_analysis", "interview_transcripts", "product_metrics",
        "product_intelligence_report", "user_journey",
    )

    all_stories = []
    for file_type, files in preprocessed_data.items():
        # Accept the full type name as well as its leading token, because
        # load_and_preprocess_data keys the data by the part of the file
        # name before the first underscore (e.g. "product").
        is_supported = any(
            name == file_type or name.startswith(f"{file_type}_")
            for name in supported_file_types
        )
        if not is_supported:
            continue

        for i, text in enumerate(files, start=1):
            source_name = f"{file_type}_{i}.txt"
            print(f"Processing file: {source_name}")
            topics = extract_topics(text, delay_seconds)
            sentiment = analyze_sentiment(text)
            data_points = extract_data_points(text)

            print(f"  Topics: {topics}")
            print(f"  Sentiment: {sentiment}")
            print(f"  Data points: {data_points}")

            story_type = _classify_story_type(sentiment, topics)

            story = generate_story(story_type, topics, sentiment, data_points, [source_name], text)
            print(f"  Generated story:\n{story}\n")

            all_stories.append(story)
    return all_stories

# --- INTEGRATION AND TESTING (Manual File Selection) ---

# Example Usage (Integrating all parts with manual file selection):
# Archive to analyse (machine-specific path; adjust for your environment).
zip_file_path = r'C:\Users\Anshd\Downloads\data\take_home_data.zip'
preprocessed_data = load_and_preprocess_data(zip_file_path)

# Pause, in seconds, applied before the topic-extraction API call.
delay_seconds = 60

# Which document to run through the pipeline: the file type key and the
# position within that type's list of texts.
file_type_to_process = "design"
file_index_to_process = 0

if file_type_to_process not in preprocessed_data:
    print(f"  File type '{file_type_to_process}' not found in preprocessed data.")
else:
    text_list = preprocessed_data[file_type_to_process]

    if file_index_to_process >= len(text_list):
        print(f"  No file found at index {file_index_to_process} for {file_type_to_process}")
    else:
        text = text_list[file_index_to_process]
        # Keep the test run cheap: only the first 500 characters are analysed.
        shortened_text = text[:500]

        # Step 1: topics via the API.
        topics = extract_topics(shortened_text, delay_seconds)
        print(f"  Topics extracted for {file_type_to_process}: {topics}")

        # Step 2: sentiment label and numeric/date data points.
        sentiment = analyze_sentiment(shortened_text)
        data_points = extract_data_points(shortened_text)

        print(f"  Sentiment for {file_type_to_process}: {sentiment}")
        print(f"  Data points extracted for {file_type_to_process}: {data_points}")

        # Step 3: render a story; the story type is fixed to "Concern" here.
        story = generate_story("Concern", topics, sentiment, data_points, [f"{file_type_to_process}_{file_index_to_process + 1}.txt"], shortened_text)
        print(f"  Generated story:\n{story}")