In [8]:
from dotenv import load_dotenv
import os
load_dotenv()
api_key=os.environ.get("OPENAI_API_KEY"),  

In [2]:
import os
import glob
import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize NLTK (Run once)
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

In [31]:

def ensure_output_directory(output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory at: {output_dir}")
    else:
        print(f"Output directory already exists at: {output_dir}")

def read_txt_files(directory):
    file_paths = glob.glob(os.path.join(directory, '*.txt'))
    return file_paths

def detect_content_type(text):
    # Define regex patterns for HTML and CSS
    html_pattern = re.compile(r'<\s*html>|<\s*head>|<\s*body>|<!DOCTYPE\s+html>', re.IGNORECASE)
    css_pattern = re.compile(r'[^/\*]\.[\w\-]+\s*\{[^}]+\}|#[\w\-]+\s*\{[^}]+\}|[\w\-]+\s*:\s*[^;]+;', re.IGNORECASE)
    
    if html_pattern.search(text):
        return 'html'
    elif css_pattern.search(text):
        return 'css'
    else:
        return 'text'

def remove_unwanted_characters(text):
    pattern = r'[*#-]'
    cleaned_text = re.sub(pattern, '', text)
    return cleaned_text
def clean_css(text):
    """
    Removes all CSS content, including rules, comments, and selectors, leaving an empty string.
    """
    # Remove CSS comments (/* ... */)
    cleaned_text = re.sub(r'/\*.*?\*/', '', text, flags=re.DOTALL)
    
    # Remove all CSS rules and selectors
    cleaned_text = re.sub(r'{[^}]*}', '', cleaned_text)  # Remove rules enclosed in braces
    cleaned_text = re.sub(r'[^{}]+\{[^}]*\}', '', cleaned_text)  # Remove full CSS blocks with selectors
    
    # Remove remaining whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    cleaned_text = cleaned_text.strip()
    return cleaned_text

def clean_html(text):
    """
    Removes all HTML content, including tags, inline styles, and scripts, leaving only plain text.
    """
    soup = BeautifulSoup(text, 'lxml')  # Use 'html.parser' if 'lxml' is unavailable
    
    for script_or_style in soup(['script', 'style']):
        script_or_style.decompose()
    
    # Get plain text
    text = soup.get_text(separator=' ')
    
    # Remove extra whitespace and newline characters
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    return text


def clean_text_with_detection(content):
    content_type = detect_content_type(content)
    
    if content_type == 'html':
        text = clean_html(content)
    elif content_type == 'css':
        text = clean_css(content)
    else:
        text = content  # Assume plain text
    
    # Remove unwanted characters
    text = remove_unwanted_characters(text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Trim whitespace
    text = text.strip()
    
    return text



def preprocess_and_save_file(input_path, output_path):
    try:
        with open(input_path, 'r', encoding='utf-8', errors='ignore') as file:
            content = file.read()
        
        cleaned = clean_text_with_detection(content)
        
        if cleaned:  # Ensure non-empty
            with open(output_path, 'w', encoding='utf-8') as out_file:
                out_file.write(cleaned)
            print(f"Processed and saved: {output_path}")
        else:
            print(f"Skipped empty content after cleaning: {input_path}")
    except Exception as e:
        print(f"Error processing {input_path}: {e}")



def preprocess_directory_save_individually(input_dir, output_dir):
    file_paths = read_txt_files(input_dir)
    total_files = len(file_paths)
    print(f"Found {total_files} '.txt' files to process.")
    
    for idx, input_path in enumerate(file_paths, start=1):
        filename = os.path.basename(input_path)
        output_path = os.path.join(output_dir, filename)
        preprocess_and_save_file(input_path, output_path)
        print(f"Progress: {idx}/{total_files} files processed.")




input_directory = 'C:/Users/keshav/Downloads/take_home_data'     # Replace with your actual input directory path
output_directory = 'Downloads\data_new'   # Replace with your desired output directory path
    
ensure_output_directory(output_directory)

preprocess_directory_save_individually(input_directory, output_directory)
    
   


Output directory already exists at: Downloads\data_new
Found 168 '.txt' files to process.
Processed and saved: Downloads\data_new\design_mockup_10.txt
Progress: 1/168 files processed.
Processed and saved: Downloads\data_new\design_mockup_100.txt
Progress: 2/168 files processed.
Processed and saved: Downloads\data_new\design_mockup_113.txt
Progress: 3/168 files processed.
Processed and saved: Downloads\data_new\design_mockup_126.txt
Progress: 4/168 files processed.
Processed and saved: Downloads\data_new\design_mockup_139.txt
Progress: 5/168 files processed.
Processed and saved: Downloads\data_new\design_mockup_152.txt
Progress: 6/168 files processed.
Processed and saved: Downloads\data_new\design_mockup_165.txt
Progress: 7/168 files processed.
Processed and saved: Downloads\data_new\design_mockup_22.txt
Progress: 8/168 files processed.
Processed and saved: Downloads\data_new\design_mockup_35.txt
Progress: 9/168 files processed.
Processed and saved: Downloads\data_new\design_mockup_48.t

In [7]:
import os
from pathlib import Path

# Define the path to the data directory
DATA_DIR = Path.home() / "Downloads" / "Downloads" / "data_new"

# Function to load text files
def load_files(data_dir):
    documents = {}
    
    if not data_dir.exists():
        print(f"The directory {data_dir} does not exist.")
        return documents
    
    if not data_dir.is_dir():
        print(f"The path {data_dir} is not a directory.")
        return documents
    
    # Iterate over all .txt files in the directory
    for file_path in data_dir.glob('*.txt'):
        if file_path.is_file():
            try:
                with file_path.open('r', encoding='utf-8') as f:
                    documents[file_path.name] = f.read()
            except Exception as e:
                print(f"Error reading {file_path}: {e}")
    
    return documents

# Load the documents
documents = load_files(DATA_DIR)
print(f"Loaded {len(documents)} documents.")



Loaded 142 documents.


In [14]:
client = OpenAI()
# Function to summarize text using GPT-4
def summarize_text_gpt4(text, max_length=150, min_length=50):
    max_chunk = 3000  # GPT-4 can handle more tokens, but split for safety
    text = text.replace("\n", " ")
    sentences = text.split('. ')
    current_chunk = ""
    chunks = []
    for sentence in sentences:
        if len(current_chunk.split()) + len(sentence.split()) <= max_chunk:
            current_chunk += sentence + ". "
        else:
            chunks.append(current_chunk)
            current_chunk = sentence + ". "
    if current_chunk:
        chunks.append(current_chunk)

    # Summarize each chunk and concatenate
    summary = ""
    for chunk in chunks:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant skilled in summarizing text."},
                {"role": "user", "content": f"Please summarize the following text:\n\n{chunk}"}
            ],
            max_tokens=300  # Adjust based on desired summary length
        )
        summary_chunk = response.choices[0].message.content
        summary += summary_chunk.strip() + " "
    return summary.strip()



summarized_documents = {}
for file_name, content in documents.items():
    print(f"Summarizing {file_name}...")
    summarized = summarize_text_gpt4(content)
    summarized_documents[file_name] = summarized

print("Summarization complete.")


Summarizing feature_specs_101.txt...
Summarizing feature_specs_11.txt...
Summarizing feature_specs_114.txt...
Summarizing feature_specs_127.txt...
Summarizing feature_specs_140.txt...
Summarizing feature_specs_153.txt...
Summarizing feature_specs_166.txt...
Summarizing feature_specs_23.txt...
Summarizing feature_specs_36.txt...
Summarizing feature_specs_49.txt...
Summarizing feature_specs_62.txt...
Summarizing feature_specs_75.txt...
Summarizing feature_specs_88.txt...
Summarizing feedback_analysis_1.txt...
Summarizing feedback_analysis_104.txt...
Summarizing feedback_analysis_117.txt...
Summarizing feedback_analysis_130.txt...
Summarizing feedback_analysis_14.txt...
Summarizing feedback_analysis_143.txt...
Summarizing feedback_analysis_156.txt...
Summarizing feedback_analysis_26.txt...
Summarizing feedback_analysis_39.txt...
Summarizing feedback_analysis_52.txt...
Summarizing feedback_analysis_65.txt...
Summarizing feedback_analysis_78.txt...
Summarizing feedback_analysis_91.txt...
Su

In [15]:
summarized_documents['feature_specs_101.txt']

'The document outlines the specifications for an appointment scheduling feature intended for mental healthcare providers. The feature needs to be compatible with various platforms, and should provide easy rescheduling and cancellation of appointments. The backend must have a scalable microservices architecture, low latency and fast response times. Accessibility and data security are emphasized. Integration with electronic health record systems, external calendar services, and notification services is a requirement. The features should prompt users to schedule follow-up sessions promptly, while ensuring all data handling processes comply with HIPAA, GDPR and state-specific regulations. The success criteria includes high user satisfaction, minimal scheduling conflicts, high feature adoption rate, and successful compliance audits. Overall goals are to enhance efficiency, improve patient engagement, and uphold security and compliance.'

In [26]:
# from bertopic import BERTopic

# # Initialize BERTopic with a specific embedding model (optional)
# # You can choose a different model based on your requirements
# topic_model = BERTopic(
#     embedding_model="all-MiniLM-L6-v2",
#     nr_topics="auto",  # Automatically determine the optimal number of topics
#     verbose=True
# )

# # Fit BERTopic on the summarized and preprocessed documents
# topics, probabilities = topic_model.fit_transform(df['Summary'])

# print(f"Number of topics found: {len(set(topics)) - (1 if -1 in topics else 0)}")
from bertopic import BERTopic

# Initialize BERTopic with a specific embedding model (optional)
topic_model = BERTopic(
    embedding_model="all-MiniLM-L6-v2",
    nr_topics="auto",  # Automatically determine the optimal number of topics
    verbose=True
)

# Fit BERTopic on the summarized and preprocessed documents
topics, probabilities = topic_model.fit_transform(summarized_documents.values())

print(f"Number of topics found: {len(set(topics)) - (1 if -1 in topics else 0)}")

# Get the topics with their top words
topic_info = topic_model.get_topic_info()
print(topic_info)

# Assign descriptive names to each topic based on top words
detailed_topics = topic_model.get_topics()

topic_names = {}
for topic_num, words in detailed_topics.items():
    if topic_num == -1:
        continue  # Skip outliers
    # Concatenate top words to form a descriptive name
    topic_name = ", ".join([word for word, _ in words[:5]])
    topic_names[topic_num] = topic_name

# Add topics to the DataFrame
df['Topic'] = topics
df['Theme'] = df['Topic'].map(topic_names)

print(df[['Document', 'Topic', 'Theme']])


2024-12-27 13:21:19,099 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 5/5 [00:00<00:00,  5.49it/s]
2024-12-27 13:21:33,189 - BERTopic - Embedding - Completed ✓
2024-12-27 13:21:33,189 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-27 13:21:33,339 - BERTopic - Dimensionality - Completed ✓
2024-12-27 13:21:33,339 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-27 13:21:33,349 - BERTopic - Cluster - Completed ✓
2024-12-27 13:21:33,349 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-27 13:21:33,388 - BERTopic - Representation - Completed ✓
2024-12-27 13:21:33,391 - BERTopic - Topic reduction - Reducing number of topics
2024-12-27 13:21:33,432 - BERTopic - Topic reduction - Reduced number of topics from 3 to 3


Number of topics found: 3
   Topic  Count                     Name  \
0      0     98         0_and_the_to_for   
1      1     26     1_the_to_and_therapy   
2      2     18  2_and_audio_the_quality   

                                      Representation  \
0  [and, the, to, for, user, of, with, in, system...   
1  [the, to, and, therapy, session, in, with, pat...   
2  [and, audio, the, quality, video, to, of, user...   

                                 Representative_Docs  
0  [This Product Requirements Document (PRD) deta...  
1  [The text describes three therapy sessions con...  
2  [The user journey map for video quality in onl...  
                  Document  Topic                            Theme
0    feature_specs_101.txt      0          and, the, to, for, user
1     feature_specs_11.txt      0          and, the, to, for, user
2    feature_specs_114.txt      0          and, the, to, for, user
3    feature_specs_127.txt      0          and, the, to, for, user
4    feature_spec

In [8]:
documents.items()

dict_items([('feature_specs_101.txt', 'appointment scheduling feature specification document\n\n 1. introduction\nthe appointment scheduling feature is designed to streamline the process of booking, managing, and tracking appointments for mental healthcare providers and their patients. this feature aims to enhance user experience, ensure compliance with healthcare regulations, and integrate seamlessly with existing systems.\n\n 2. technical requirements and constraints\n platform compatibility: \n   the feature must be compatible with web, ios, and android platforms.\n  \n user interface: \n   responsive design to accommodate various devices.\n   intuitive calendar view for both providers and patients.\n   features for rescheduling and canceling appointments easily.\n\n backend infrastructure: \n   utilize a scalable microservices architecture.\n   use restful apis for interaction between frontend and backend services.\n   persistent data storage in a secure, hipaacompliant cloud envir

In [18]:
import openai
from openai import OpenAI
import pandas as pd


# OpenAI API key
openai.api_key = "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"  # Replace with your OpenAI API key

client = OpenAI(api_key=openai.api_key)

# Function to call GPT-4 for summarization and theme generation
def generate_summary_and_theme(content):
    prompt = f"""
    You are an advanced summarization and theme generation assistant. For the provided content:
    1. Summarize the content into a concise representation of its main ideas.
    2. Generate a single theme based on the summary.

    The theme must:
    - Avoid the following vague words: user, session, data, patient, system, feedback, therapy, therapist, support, requirement, response, rating, segment, user segment, interviewer, dr.
    - Be meaningful and fewer than 3 words strictly.
    - Accurately reflect the essence of the content.

    Content:
    {content}

    Output format:
    Summary: {{Summarized_Content}}
    Theme: {{Concise_Theme}}
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are an advanced text processing assistant."},
            {"role": "user", "content": prompt}
        ]
    )
    return response.choices[0].message.content

input_directory = "Downloads/data_new"
output_file = "hshs.csv"

# List to store results
results = []

# Process each file in the directory
for file_name in os.listdir(input_directory):
    if file_name.endswith(".txt"):  # Process only text files
        file_path = os.path.join(input_directory, file_name)
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
            # Generate summary and theme
            result = generate_summary_and_theme(content)
            # Append the result with file name
            results.append({"file_name": file_name, "output": result})

# Convert results to a DataFrame
df = pd.DataFrame(results)

# Save the DataFrame to a CSV file
df.to_csv(output_file, index=False)

print(f"Output saved to {output_file}")

Output saved to hshs.csv


In [1]:
import pandas as pd
df =pd.read_csv("1.csv")
df.head()

Unnamed: 0,file_name,output,topic
0,feature_specs_101.txt,The content discusses a feature for schedul...,Secure Integration\n User Satisfaction
1,feature_specs_11.txt,The Teen Therapy Feature document outlines ...,Teen Mental Health\nSecure Data Access
2,feature_specs_114.txt,The Therapy Companion App is a highly inter...,Mood Tracking\n Data Security
3,feature_specs_127.txt,The content discusses the specifications fo...,Real-time Payments\n Security Measures
4,feature_specs_140.txt,The Cultural Matching feature is intended t...,Cultural Alignment\n Personalized Matches


In [2]:
import ollama

def generate_topics_with_llama(text):
    prompt = f"""
    You are a topic extraction assistant specializing in refining topics based on predefined guidance. 
    Now, read the text below and extract up to 2 new concise topics. Ensure each topic consists of fewer than 3 words and closely aligns with the existing themes. 
    Output only the topics and nothing else, keep it strictly to few words.
    Input Text:
    {text}
    """
    response = ollama.chat(model='llama3.1', messages=[{'role': 'user', 'content': prompt}])
    return response['message']['content']


In [3]:
df['topic'] = df['output'].apply(lambda x: generate_topics_with_llama(x))

# Display the updated DataFrame
print(df)


                 file_name                                             output  \
0    feature_specs_101.txt     The content discusses a feature for schedul...   
1     feature_specs_11.txt     The Teen Therapy Feature document outlines ...   
2    feature_specs_114.txt     The Therapy Companion App is a highly inter...   
3    feature_specs_127.txt     The content discusses the specifications fo...   
4    feature_specs_140.txt     The Cultural Matching feature is intended t...   
..                     ...                                                ...   
137    user_journey_37.txt     The note-taking process for therapists invo...   
138    user_journey_50.txt     The user progress through a teletherapy aud...   
139    user_journey_63.txt     The content describes a user's journey map ...   
140    user_journey_76.txt     The content outlines an optimized onboardin...   
141    user_journey_89.txt     The journey of a user in online mental heal...   

                           

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# Step 1: Vectorize topics using TF-IDF
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['topic'])

num_clusters = 5 # Example number of clusters; adjust based on your data
km = KMeans(n_clusters=num_clusters)
km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

# Step 4: Assigning common labels to similar topics
df['Cluster'] = clusters

# Assign a common label based on majority terms in each cluster
def label_cluster(cluster_id):
    # Extract all topics in this cluster
    topics_in_cluster = df[df['Cluster'] == cluster_id]['topic']
    # Join and vectorize the cluster's topics
    text = ' '.join(topics_in_cluster)
    tfidf = TfidfVectorizer(stop_words='english').fit_transform([text])
    # Get the feature names which represent words in topics
    words = tfidf_vectorizer.get_feature_names_out()
    max_tfidf_idx = tfidf.toarray()[0].argmax()
    return words[max_tfidf_idx]

# Apply the labeling function to each cluster
cluster_labels = {i: label_cluster(i) for i in range(num_clusters)}
df['Common Label'] = df['Cluster'].apply(lambda x: cluster_labels[x])




In [52]:
df.head(7)

Unnamed: 0,file_name,output,topic,Cluster,Common Label
0,feature_specs_101.txt,The content discusses a feature for schedul...,1. Mental Health Scheduling\n2. Secure Integra...,2,feedback
1,feature_specs_11.txt,The Teen Therapy Feature document outlines ...,Teen Mental Care\nSecure Access,2,feedback
2,feature_specs_114.txt,The Therapy Companion App is a highly inter...,AI Mental Health\nPersonalized Therapy,1,conversion
3,feature_specs_127.txt,The content discusses the specifications fo...,Multicurrency Support \nReal-time Security,0,communication
4,feature_specs_140.txt,The Cultural Matching feature is intended t...,Cultural Alignment\nMental Health Outcomes,2,feedback
5,feature_specs_153.txt,This document details the specifications fo...,Mood Tracker\nData Security,0,communication
6,feature_specs_166.txt,The text outlines the specifications of a t...,1. Patient Matching\n2. Data Security,0,communication


In [53]:
from openai import OpenAI
client = OpenAI()

# Directory containing the files
file_directory = "Downloads/data_new"

# Step 1: Filter files with Cluster 0
cluster_0_files = df[df['Cluster'] == 0]['file_name'].tolist()

cluster_0_files


['feature_specs_127.txt',
 'feature_specs_153.txt',
 'feature_specs_166.txt',
 'feature_specs_23.txt',
 'feature_specs_62.txt',
 'feature_specs_75.txt',
 'feature_specs_88.txt',
 'feedback_analysis_104.txt',
 'feedback_analysis_26.txt',
 'interview_transcripts_32.txt',
 'jira_tickets_29.txt',
 'product_metrics_54.txt',
 'product_requirements_111.txt',
 'product_requirements_33.txt',
 'product_requirements_59.txt',
 'product_requirements_8.txt',
 'user_journey_115.txt',
 'user_journey_141.txt',
 'user_journey_63.txt']

In [54]:
# Step 1: Filter rows with Cluster = 0
cluster_0_data = df[df['Cluster'] == 0]

# Step 2: Combine summaries and topics into a structured format
combined_text = " ".join(
    f"Topic: {row['topic']}\nSummary: {row['output']}"
    for _, row in cluster_0_data.iterrows()
)

# Step 3: Feed the combined text into the LLM
def generate_story_from_summary_and_topic(text):
    prompt = f"""
    You are a creative writing assistant and journalist. Your task is to write a cohesive, professional report based on the topics and summaries provided. The report should:
    - Be well-structured and follow a clear narrative arc.
    - Integrate relevant values, dates, metrics, or key insights seamlessly into the story.
    - Maintain a formal tone and professionalism throughout in 200-300 words.

    Incorporate key elements from the provided topics and summaries, ensuring clarity and logical progression. Where applicable, include actionable recommendations or insights. Follow the structure of the provided example below:

    Example:
    **Headline**: Video Quality Concerns Impact Rural Therapy Sessions

    **Introduction**:
    Recent analysis reveals significant challenges with video quality in rural areas, affecting therapy session effectiveness. According to product_metrics.txt, 40% of rural users experience video freezing during sessions, with peak issues occurring during high-traffic hours (2-4 PM EST).

    **Key Findings**:
    - **Impact on Therapeutic Care**: Patient feedback indicates serious clinical implications. session_transcripts.txt shows 28% of affected sessions required rescheduling, while progress_notes.txt reveals therapists report "significant disruption in therapeutic momentum" during critical moments.
    - **Patient Feedback**: Analysis of interview_transcripts.txt highlights patient frustration, with 65% mentioning connectivity as a primary concern.
    - **Technical Analysis**: Investigation of jira_tickets.txt indicates the root cause lies in bandwidth optimization. Current system requires 2.5Mbps stable connection, while user_journey.txt data shows rural users average 1.8Mbps during peak hours. feature_specs.txt suggests potential for adaptive bitrate implementation.

    **Recommendations**:
    1. Implement adaptive bitrate streaming (product_requirements.txt).
    2. Add offline mode for progress notes (feedback_analysis.txt).
    3. Develop automated session recovery protocol (design_mockup.txt).

    Based on this example, generate a report using the information below:

    {text}

    Provide the output in the following format:
    **Headline**: [Your Headline]
    **Introduction**: [Your Introduction]
    **Key Findings**:
    - [Finding 1]
    - [Finding 2]
    ...
    **Recommendations**:
    1. [Recommendation 1]
    2. [Recommendation 2]
    ...
    """

    response = ollama.chat(model='llama3.1', messages=[
            {"role": "system", "content": "You are a story-writing assistant."},
            {"role": "user", "content": prompt}
        ])
    return response['message']['content']

# Step 4: Generate the story
if combined_text.strip():
    story = generate_story_from_summary_and_topic(combined_text)
    print("Generated Story:\n", story)
else:
    print("No summaries or topics were found for Cluster 0.")


Generated Story:
 **Mental Health and Wellness: A Review of Current Systems and Recommendations for Improvement**

**Introduction**
The mental health and wellness industry is rapidly evolving, with a growing need for innovative solutions to address the complexities of human emotions and behaviors. This review aims to summarize key findings from various product requirements documents (PRDs), user journey maps, and system design proposals in the field of mental health and wellness.

**Key Findings**

- **Cultural Match**: Many systems prioritize cultural match, recognizing its significance in effective therapy outcomes.
- **Personalization**: Personalized mental healthcare approaches, such as AI empathetic capacity and culturally targeted information, are increasingly recognized for their effectiveness.
- **User Experience**: User-friendly interfaces, reliable performance, and scalability are critical components of successful mental health platforms.
- **Regulatory Compliance**: Ensuring

In [55]:
def classify_story(story):
    prompt = f"""
    You are a story analysis assistant. Read the following story and classify it into one of the four categories: 
    - INSIGHT: The story provides a deep understanding or realization.
    - OPPORTUNITY: The story highlights a chance for growth or a new possibility.
    - WIN: The story reflects a positive outcome or success.
    - CONCERN: The story raises a potential problem, issue, or danger.

    Story:
    {story}

    Based on the content, classify the story into one of the categories and explain your reasoning briefly:
    - Category: [INSIGHT/OPPORTUNITY/WIN/CONCERN]
    - Reason:
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You are a story analysis assistant."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=150,
        temperature=0.5
    )
    return response.choices[0].message.content
# Classify the generated story
classification = classify_story(story)

# Print the classification result
print("Classification Result:\n", classification)


Classification Result:
 - Category: OPPORTUNITY
- Reason: The story presents an evaluation of the current systems in the mental health and wellness industry and provides recommendations for improvement. These suggestions highlight opportunities for growth and new possibilities in the field.


In [56]:
references_section = "\n**References**:\n" + "\n".join(f"- {file}" for file in cluster_0_data['file_name'])
story_with_references = story + "\n" + references_section

# Output the updated story
print(story_with_references)

**Mental Health and Wellness: A Review of Current Systems and Recommendations for Improvement**

**Introduction**
The mental health and wellness industry is rapidly evolving, with a growing need for innovative solutions to address the complexities of human emotions and behaviors. This review aims to summarize key findings from various product requirements documents (PRDs), user journey maps, and system design proposals in the field of mental health and wellness.

**Key Findings**

- **Cultural Match**: Many systems prioritize cultural match, recognizing its significance in effective therapy outcomes.
- **Personalization**: Personalized mental healthcare approaches, such as AI empathetic capacity and culturally targeted information, are increasingly recognized for their effectiveness.
- **User Experience**: User-friendly interfaces, reliable performance, and scalability are critical components of successful mental health platforms.
- **Regulatory Compliance**: Ensuring compliance with h

In [51]:
cluster_1_data = df[df['Cluster'] == 1]

# Step 2: Combine summaries and topics into a structured format
combined = " ".join(
    f"Topic: {row['topic']}\nSummary: {row['output']}"
    for _, row in cluster_1_data.iterrows()
)


def generate_story_from_summary_and_topic(text):
    prompt = f"""
    You are a creative writing assistant and journalist. Your task is to write a cohesive, professional report based on the topics and summaries provided. The report should:
    - Be well-structured and follow a clear narrative arc.
    - Integrate relevant values, dates, metrics, or key insights seamlessly into the story.
    - Maintain a formal tone and professionalism throughout in 200-300 words.

    Incorporate key elements from the provided topics and summaries, ensuring clarity and logical progression. Where applicable, include actionable recommendations or insights. Follow the structure of the provided example below:

    Example:
    **Headline**: Video Quality Concerns Impact Rural Therapy Sessions

    **Introduction**:
    Recent analysis reveals significant challenges with video quality in rural areas, affecting therapy session effectiveness. According to product_metrics.txt, 40% of rural users experience video freezing during sessions, with peak issues occurring during high-traffic hours (2-4 PM EST).

    **Key Findings**:
    - **Impact on Therapeutic Care**: Patient feedback indicates serious clinical implications. session_transcripts.txt shows 28% of affected sessions required rescheduling, while progress_notes.txt reveals therapists report "significant disruption in therapeutic momentum" during critical moments.
    - **Patient Feedback**: Analysis of interview_transcripts.txt highlights patient frustration, with 65% mentioning connectivity as a primary concern.
    - **Technical Analysis**: Investigation of jira_tickets.txt indicates the root cause lies in bandwidth optimization. Current system requires 2.5Mbps stable connection, while user_journey.txt data shows rural users average 1.8Mbps during peak hours. feature_specs.txt suggests potential for adaptive bitrate implementation.

    **Recommendations**:
    1. Implement adaptive bitrate streaming (product_requirements.txt).
    2. Add offline mode for progress notes (feedback_analysis.txt).
    3. Develop automated session recovery protocol (design_mockup.txt).

    Based on this example, generate a report using the information below:

    {text}

    Provide the output in the following format:
    **Headline**: [Your Headline]
    **Introduction**: [Your Introduction]
    **Key Findings**:
    - [Finding 1]
    - [Finding 2]
    ...
    **Recommendations**:
    1. [Recommendation 1]
    2. [Recommendation 2]
    ...
    """

    response = ollama.chat(model='llama3.1', messages=[
            {"role": "system", "content": "You are a story-writing assistant."},
            {"role": "user", "content": prompt}
        ])
    return response['message']['content']

# Step 4: Generate the story
if combined_text.strip():
    story = generate_story_from_summary_and_topic(combined)
    print("Generated Story:\n", story)
else:
    print("No summaries or topics were found for Cluster 0.")    


Generated Story:
 **Enhancing the User Experience: Key Insights and Recommendations**

**Introduction**: The following report summarizes key findings from various user journey maps and thematic analyses of mental health support platforms, therapist matching services, and teletherapy audio platforms. These insights highlight areas for improvement to enhance the user experience and provide recommendations for system enhancements.

**Key Findings**:

- **Peak Hour Challenges**: Many users experience anxiety during peak hours due to system slowdowns, note-saving delays, syncing failures, and long-term access concerns.
- **Lack of Personalization**: Users often express difficulty in finding the right therapist through SonderMind's ML-based matching process, indicating a need for improved personalization and user-friendliness in the matchmaking stage.
- **Mental Health Support Options**: Scalable therapy options, including group therapy, are essential for reducing costs and improving efficac

In [52]:
classification = classify_story(story)

# Print the classification result
print("Classification Result:\n", classification)

Classification Result:
 - Category: OPPORTUNITY
- Reason: The story is identifying areas for potential improvement in mental health support platforms, presenting a chance for growth and new possibilities. The recommendations provided in the story are opportunities for these platforms to enhance their user experience and efficacy.


In [54]:
cluster_2_data = df[df['Cluster'] == 2]

# Step 2: Combine summaries and topics into a structured format
combined = " ".join(
    f"Topic: {row['topic']}\nSummary: {row['output']}"
    for _, row in cluster_2_data.iterrows()
)
# cluster_2_data

In [55]:
# Step 4: Generate the story
if combined.strip():
    story = generate_story_from_summary_and_topic(combined)
    print("Generated Story:\n", story)
else:
    print("No summaries or topics were found for Cluster 0.")    

Generated Story:
 **Enhancing Teletherapy Experience: Addressing Rural Access Barriers and Video Quality Issues**

**Introduction**: Sondermind, an online mental healthcare platform, aims to provide accessible and high-quality teletherapy services to its users. However, several challenges persist, including video quality issues, rural access barriers, and difficulties in scheduling appointments.

**Key Findings**:

- **Video Quality Issues**: Users experience fluctuating emotions due to concerns about video quality and internet stability, especially in rural areas.
- **Rural Access Barriers**: Rural residents face unique challenges, including video freezing and slight lags during peak hours or in low lighting.
- **Scheduling Challenges**: Users encounter various obstacles while scheduling appointments, such as provider availability, remembering login credentials, syncing schedules, committing to a time, missing confirmations, and overlooked reminders.

**Recommendations**:

1.  **Inves

In [56]:
classification = classify_story(story)

# Print the classification result
print("Classification Result:\n", classification)

Classification Result:
 - Category: OPPORTUNITY
- Reason: The story presents a set of challenges that Sondermind is facing with its teletherapy services, particularly for rural users. However, it also outlines a series of recommendations for overcoming these issues, thus highlighting new possibilities for growth and improvement in their services. This makes the story fit best under the category of 'Opportunity'.


In [57]:
cluster_3_data = df[df['Cluster'] == 3]

combined = " ".join(
    f"Topic: {row['topic']}\nSummary: {row['output']}"
    for _, row in cluster_3_data.iterrows()
)
cluster_3_data

Unnamed: 0,file_name,output,topic,Cluster,Common Label
91,progress_notes_122.txt,"The patient, facing heightened anxiety due ...",• Cognitive Behavio\n• Mindfulness-Based,3,aid
94,progress_notes_161.txt,"Over three therapy sessions, an anxious fem...",Social Therapy Techniques \nAnxiety Relief Met...,3,aid
99,progress_notes_6.txt,In three sessions spanning October 2 to Oct...,Cognitive Restructuring\nSocial Confidence Bui...,3,aid
103,session_transcripts_108.txt,The transcripts cover three different thera...,• Cognitive therapy\n• Therapy techniques,3,aid
104,session_transcripts_121.txt,"Across three educational therapy sessions, ...",Cognitive Reframing\nDistress Tolerance,3,aid
112,session_transcripts_56.txt,These dialogue snippets from three therapy ...,Cognitive Behavioral \nDialectical Behavior,3,aid


In [58]:
if combined.strip():
    story = generate_story_from_summary_and_topic(combined)
    print("Generated Story:\n", story)
else:
    print("No summaries or topics were found for Cluster 0.")    

Generated Story:
 **Headline**: Effective Therapy Techniques for Anxiety Relief: A Comparative Analysis

**Introduction**: This report provides a comprehensive analysis of various therapy techniques used to address anxiety disorders, including Cognitive Behavioral Therapy (CBT), Mindfulness-Based Stress Reduction (MBSR), and Dialectical Behavior Therapy (DBT). Through a review of therapy sessions and patient feedback, this study highlights the effectiveness of these techniques in alleviating symptoms of anxiety.

**Key Findings**:
- **Cognitive Restructuring**: Patients who underwent CBT reported significant improvement in managing anxiety-provoking situations, with an average reduction of 30% in anxiety levels.
- **Mindfulness-Based Stress Reduction (MBSR)**: MBSR techniques such as thought record monitoring and cognitive restructuring skills showed a notable decrease in patient's anxiety levels, with 25% reporting improved sleep quality and concentration at work.
- **Dialectical Beha

In [59]:
classification = classify_story(story)

# Print the classification result
print("Classification Result:\n", classification)

Classification Result:
 - Category: INSIGHT
- Reason: The story provides a deep understanding or realization about various effective therapy techniques for anxiety relief. It presents a comparative analysis and offers recommendations for future implementation, which shows that it is providing insights rather than highlighting an opportunity, a win, or a concern.


In [64]:
references_section = "\n**References**:\n" + "\n".join(f"- {file}" for file in cluster_3_data['file_name'])
story_with_references = story + "\n" + references_section

# Output the updated story
print(story_with_references)

**Headline**: Effective Therapy Techniques for Anxiety Relief: A Comparative Analysis

**Introduction**: This report provides a comprehensive analysis of various therapy techniques used to address anxiety disorders, including Cognitive Behavioral Therapy (CBT), Mindfulness-Based Stress Reduction (MBSR), and Dialectical Behavior Therapy (DBT). Through a review of therapy sessions and patient feedback, this study highlights the effectiveness of these techniques in alleviating symptoms of anxiety.

**Key Findings**:
- **Cognitive Restructuring**: Patients who underwent CBT reported significant improvement in managing anxiety-provoking situations, with an average reduction of 30% in anxiety levels.
- **Mindfulness-Based Stress Reduction (MBSR)**: MBSR techniques such as thought record monitoring and cognitive restructuring skills showed a notable decrease in patient's anxiety levels, with 25% reporting improved sleep quality and concentration at work.
- **Dialectical Behavior Therapy (DBT)

In [57]:
df.to_csv('final_result.csv',index=False)