In [23]:
import os
import pandas as pd
import re
import json
from dotenv import load_dotenv 

In [76]:
# Define the paths

load_dotenv()  # Load environment variables from .env file

# Get the DATASET_PATH stored in .env file (or already present in the environment)
DATASET_PATH = os.getenv('DATASET_PATH')
if DATASET_PATH is None:
    # Without this check os.path.join(None, ...) below fails with an unhelpful TypeError
    raise RuntimeError(
        "DATASET_PATH is not set. Define it in the .env file or in the environment "
        "before running this notebook."
    )

# Define file names
source_data_file = 'counsel_chat_data_ORIGINAL.csv'
source_data = os.path.join(DATASET_PATH, 'source_data', source_data_file)  # Raw input CSV
output_file = os.path.join(DATASET_PATH, 'counsel_chat_data.json')  # Cleaned JSON output

In [77]:
# Read the raw CSV, discard the columns that are not carried into the output,
# and rename the remaining headers according to the mapping below.
columns_to_drop = ['questionLink', 'therapistURL', 'upvotes', 'views']
renamed_columns = {
    'questionID': 'question_id',
    'questionTitle': 'question_title',
    'questionText': 'question_full',
    'answerText': 'answer',
    'therapistInfo': 'source',
    'topic': 'topic',
}

df = (
    pd.read_csv(source_data)
    .drop(columns=columns_to_drop)
    .rename(columns=renamed_columns)
)

# question_id keeps its raw value here; no 'cc_' prefix is applied at this stage.

In [78]:
# Replace text based on given replacements. Note: identified by inspection

replacements = {
    'â€™': '’',          # Fix incorrectly encoded apostrophe
    '\u2019': '’',       # NOTE(review): '’' is U+2019 itself, so this entry is a no-op - confirm whether an ASCII apostrophe was intended
    '\u201c': '"',       # Replace Unicode left double quote with regular double quote
    '\u201d': '"',       # Replace Unicode right double quote with regular double quote
    'fiancÃ©': 'fiancé', # Fix incorrectly encoded word
    'StrÃ¶m': 'Ström',   # Fix incorrectly encoded name
    'Â': ' '              # Replace encoding artifact with a single space
}


def _apply_replacements(value):
    """Return ``value`` as text with every key of ``replacements`` substituted.

    Missing cells (NaN / None) are returned as an empty string; passing them
    through ``str()`` would otherwise store the literal text 'nan' / 'None'.
    Any other non-string value is converted with ``str()``.
    """
    if pd.isna(value):
        return ''
    text = str(value)
    # Apply every replacement in dictionary order within a single pass over the cell
    for bad, good in replacements.items():
        text = text.replace(bad, good)
    return text


# Apply the replacements explicitly across all text columns
for column in df.columns:
    if df[column].dtype == 'object':  # Only process text columns
        df[column] = df[column].apply(_apply_replacements)

In [79]:
# Whitespace-like characters (tab, newline, vertical tab, form feed, carriage return,
# non-breaking space) sitting directly between two non-whitespace characters.
# Deleting such a run would glue the neighbouring words together.
_WORD_SEPARATOR_PATTERN = re.compile(r'(?<=\S)[\t\n\r\x0b\x0c\xa0]+(?=\S)')

# ASCII control chars, zero-width space, non-breaking space
_CONTROL_CHARS_PATTERN = re.compile(r'[\x00-\x1F\x7F\u200b\xa0]')


# Function to identify and remove control characters in a single text entry
def remove_control_characters(text):
    """Strip control characters from a single text entry.

    Args:
        text: The value to clean. It is converted with ``str()`` first, so
            non-string input is accepted.

    Returns:
        str: The text without ASCII control characters (0x00-0x1F, 0x7F),
        zero-width spaces (U+200B) or non-breaking spaces (U+00A0).
        A run of tab/newline/NBSP-style characters that is the only thing
        separating two non-whitespace characters is replaced by one regular
        space, so "one.\\nTwo" becomes "one. Two" rather than "one.Two".
        Everywhere else (next to an existing space, or at the start/end of
        the text) the characters are simply removed.
    """
    as_text = str(text)  # Convert text to string just in case

    # First keep word boundaries intact, then drop every remaining control character
    separated = _WORD_SEPARATOR_PATTERN.sub(' ', as_text)
    return _CONTROL_CHARS_PATTERN.sub('', separated)

# Run the control-character cleanup over every object-dtype (text) column,
# replacing each column with its cleaned version.
for column in df.columns:
    is_text_column = df[column].dtype == 'object'
    if not is_text_column:
        continue
    df[column] = df[column].apply(remove_control_characters)


In [80]:
# Build one JSON record per question, with all of its answers nested inside
final_json = []

# Group the dataframe by 'question_id' (one group per question)
grouped = df.groupby('question_id')

for question_id, group in grouped:
    # Question-level fields are read from the first row of the group
    question_data = group.iloc[0]

    question_info = {
        "question_id": "cc_" + str(question_id),  # The 'cc_' prefix is added here, at export time
        "topic": question_data['topic'],
        "question_title": question_data['question_title'],
        "question_full": question_data['question_full'],
        # One entry per row of the group: the answer text and the therapist information.
        # Iterating the two columns directly avoids building a Series per row (iterrows).
        "answers": [
            {"answer": answer, "source": source}
            for answer, source in zip(group['answer'], group['source'])
        ],
    }

    final_json.append(question_info)

# Save the final JSON structure to a file.
# The encoding is explicit so the result does not depend on the platform default.
with open(output_file, 'w', encoding='utf-8') as json_file:
    json.dump(final_json, json_file, indent=4)

print(f"Cleaned data saved to {output_file}")

Cleaned data saved to C:\\Users\\matth\\My_Projects\\2024_Zoom_Camp_RAG_Project\\data\counsel_chat_data_CLEAN.json


In [81]:
# Spot-check: print the answer stored at position 4 after cleaning
print(df['answer'].iat[4])

I just want to acknowledge you for the courage to take the step to get support. It can be overwhelming to have so many things going on, and it might be hard to figure out where to start. I truly believe that one of the biggest advantages to working with a therapist is that a therapist can help you prioritize and work with those issues that need to be addressed first and foremost. A therapist will help you with the flow of dealing with different aspects that come up. Some issues may even be related to one another. For examples, some clients with depression may also feel anxiety about their depression. Also, rest assured, many clients go to therapy for multiple issues. We are complex beings. I encourage you to reach out to a therapist and talk about this concern. You may find out that even upon the first meeting you will feel hopeful that you will be able to create a plan with a therapist to address your issues. I don't believe you have too many issues, I think starting sooner rather tha