In [1]:
%pip install pandas
import json
import csv
import os
import random
import string
import pandas as pd
import numpy as np

Note: you may need to restart the kernel to use updated packages.


## Converting raw files to CSV

In [2]:
# Root directory holding the raw input files; the os.walk loop at the end of
# this cell scans it recursively for .txt and .json files to convert.
input_dir = 'transcript'

def process_txt_file(input_file, output_file):
    """Convert a line-delimited JSON score log into a CSV file.

    Every non-blank line of ``input_file`` is parsed as a JSON object that
    must provide a ``time`` value and a ``scores`` sequence. One CSV row is
    written per line: the time followed by each score, all truncated to
    ``int``. The header names seven emotion columns; the number of scores on
    a line is not validated against it.

    Args:
        input_file: Path of the text file to read (one JSON object per line).
        output_file: Path of the CSV file to create or overwrite.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed object lacks ``time`` or ``scores``.
    """
    header = ['time'] + [f'emotion_{n}' for n in range(1, 8)]

    with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
        csv_writer = csv.writer(outfile)
        csv_writer.writerow(header)

        for line in infile:
            line = line.strip()
            # Blank or whitespace-only lines carry no record; feeding them to
            # json.loads would abort the whole conversion.
            if not line:
                continue
            data = json.loads(line)
            csv_writer.writerow([int(data['time'])] + [int(score) for score in data['scores']])

# Flatten (possibly nested) transcript entries into CSV rows
def process_content(csv_writer, content):
    """Write one CSV row per dict found in ``content``.

    Lists are descended into recursively; dicts produce a row of
    ``[role, text, time, user_id]``; any other value is ignored.

    The text comes from the dict's ``content`` entry: a string is used as-is,
    while for a list the ``text`` of the last ``{'type': 'text'}`` part is
    used. Missing values fall back to ``''`` (or ``0`` for the time).
    """
    if isinstance(content, list):
        for child in content:
            process_content(csv_writer, child)
        return

    if not isinstance(content, dict):
        return

    role = content.get('role', '')
    timestamp = int(content.get('time', '0'))
    user_id = content.get('user_id', '')

    payload = content['content'] if 'content' in content else None
    text = ''
    if isinstance(payload, str):
        text = payload
    elif isinstance(payload, list):
        text_parts = [
            part.get('text', '')
            for part in payload
            if isinstance(part, dict) and part.get('type') == 'text'
        ]
        if text_parts:
            # When several text parts exist, only the final one is kept.
            text = text_parts[-1]

    csv_writer.writerow([role, text, timestamp, user_id])

# Convert a .json transcript into a CSV of role/content/time/user_id rows
def process_json_file(input_file, output_file):
    """Load ``input_file`` as JSON and write its entries to ``output_file``.

    The CSV starts with the header ``role, content, time, user_id``; each
    top-level item of the loaded JSON is then handed to ``process_content``,
    which emits the data rows.
    """
    with open(input_file, 'r') as source, open(output_file, 'w', newline='') as target:
        writer = csv.writer(target)
        writer.writerow(['role', 'content', 'time', 'user_id'])

        for entry in json.load(source):
            process_content(writer, entry)

# Walk through all directories and files, converting every raw .txt / .json
# file into a 'processed_<name>.csv' placed next to it.
# Maps each raw-file suffix to the function that converts it.
converters = {'.txt': process_txt_file, '.json': process_json_file}

for root, dirs, files in os.walk(input_dir):
    for filename in files:
        for suffix, convert in converters.items():
            if filename.endswith(suffix):
                input_file = os.path.join(root, filename)
                # Swap only the trailing extension: str.replace would also
                # rewrite a '.txt' / '.json' occurring earlier in the name.
                output_file = os.path.join(root, 'processed_' + filename[:-len(suffix)] + '.csv')
                convert(input_file, output_file)
                break


## Calculating average emotion scores

In [35]:
# Load a CSV file as a list of per-row dictionaries
def read_csv_file(file_path):
    """Return every data row of ``file_path`` as a dict keyed by the header row."""
    with open(file_path, 'r', newline='') as csvfile:
        return list(csv.DictReader(csvfile))

# Write a header row followed by data rows to a CSV file
def write_csv_file(file_path, header, data):
    """Create (or overwrite) ``file_path`` with ``header`` and then each row of ``data``."""
    with open(file_path, 'w', newline='') as sink:
        out = csv.writer(sink)
        out.writerow(header)
        out.writerows(data)

# Build a random identifier from ASCII letters and digits
def generate_random_id(length):
    """Return a string of ``length`` characters drawn (with replacement) from letters and digits."""
    alphabet = string.ascii_letters + string.digits
    picked = random.choices(alphabet, k=length)
    return ''.join(picked)

# Average the seven emotion scores recorded around a message's timestamp
def calculate_average_scores(raw_scores, target_time, role, time_window):
    """Return the per-emotion mean of the score rows that fall inside a window.

    The window depends on ``role``:
      * ``'user'``      -> ``[target_time - time_window, target_time + time_window]``
      * ``'assistant'`` -> ``[target_time, target_time + time_window]``
      * anything else   -> no row is selected

    ``raw_scores`` is an iterable of mappings with a ``time`` entry and
    ``emotion_1`` .. ``emotion_7`` entries, each convertible with ``int``.
    The result is a list of seven values rounded to two decimals, or seven
    zeros when no row matched.
    """
    emotion_keys = [f'emotion_{n}' for n in range(1, 8)]
    totals = [0] * len(emotion_keys)
    matched = 0
    upper = target_time + time_window

    for entry in raw_scores:
        stamp = int(entry['time'])

        if role == 'user':
            lower = target_time - time_window
        elif role == 'assistant':
            lower = target_time
        else:
            continue

        if not (lower <= stamp <= upper):
            continue

        for pos, key in enumerate(emotion_keys):
            totals[pos] += int(entry[key])
        matched += 1

    if matched == 0:
        return [0] * len(emotion_keys)
    return [round(total / matched, 2) for total in totals]

# Conversation IDs already handed out by process_folder since this cell was run.
# Kept at module level so uniqueness holds across folders, not just within one call.
_ISSUED_CONV_IDS = set()

# Function to process each folder and its files
def process_folder(root, used_ids=None):
    """Score one transcript folder and write ``scored_<timestamp>.csv`` into it.

    The folder is searched for two files whose names start with
    ``processed_Emili_`` and end with ``.csv`` (names ending in
    ``_condensed.csv`` are ignored): the one starting with
    ``processed_Emili_raw_`` holds the raw emotion scores, the other holds the
    conversation rows. If either is missing a message is printed and nothing
    is written.

    Every conversation row with role ``user``, ``assistant`` or ``system`` is
    copied to the output together with a conversation ID shared by the whole
    file and seven emotion values: averaged raw scores for user (window 5) and
    assistant (window 10) rows, zeros for system rows. Rows with any other
    role are dropped.

    Args:
        root: Directory to process.
        used_ids: Optional set of conversation IDs that must not be reused;
            the newly generated ID is added to it. Defaults to a registry
            shared by all calls, so IDs stay unique across folders.
    """
    print(f"Processing folder: {root}")
    if used_ids is None:
        # `is None` rather than truthiness: an empty caller-supplied set must be honoured.
        used_ids = _ISSUED_CONV_IDS

    user_data_csv = None
    raw_scores_csv = None

    for filename in os.listdir(root):
        if not (filename.startswith('processed_Emili_') and filename.endswith('.csv')):
            continue
        if filename.endswith('_condensed.csv'):
            continue
        if filename.startswith('processed_Emili_raw_'):
            raw_scores_csv = os.path.join(root, filename)
        else:
            user_data_csv = os.path.join(root, filename)

    if not (user_data_csv and raw_scores_csv):
        print("Required CSV files not found in this folder.")
        return

    # Third underscore-separated token of e.g. 'processed_Emili_20240705_133046.csv'.
    timestamp = os.path.basename(user_data_csv).split('_')[2]
    output_csv = os.path.join(root, f'scored_{timestamp}.csv')
    print(f"Found user data CSV: {user_data_csv}")
    print(f"Found raw scores CSV: {raw_scores_csv}")
    user_data = read_csv_file(user_data_csv)
    raw_scores_data = read_csv_file(raw_scores_csv)

    output_header = ['Conv_id','time', 'role','user_id','content','Anger', 'Disgust', 'Fear', 'Happy', 'Sad', 'Surprise', 'Neutral']
    output_data = []

    # Generate a conv_id not issued before.
    conv_id = generate_random_id(7)
    while conv_id in used_ids:
        conv_id = generate_random_id(7)
    used_ids.add(conv_id)

    # Averaging window passed to calculate_average_scores for each scored role.
    time_windows = {'user': 5, 'assistant': 10}

    for user_row in user_data:
        role = user_row['role']
        if role in time_windows:
            avg_scores = calculate_average_scores(
                raw_scores_data, int(user_row['time']), role, time_windows[role]
            )
        elif role == 'system':
            # System rows get zeros instead of averaged scores.
            avg_scores = [0] * 7
        else:
            continue
        output_data.append(
            [conv_id, user_row['time'], role, user_row['user_id'], user_row['content']] + avg_scores
        )

    write_csv_file(output_csv, output_header, output_data)
    print(f"Processed data saved to {output_csv}")

# Visit the root directory and every directory beneath it, letting
# process_folder decide whether each one holds the files it needs.
input_dir = 'transcript'
for root, dirs, files in os.walk(input_dir):
    process_folder(root)


Processing folder: transcript
Required CSV files not found in this folder.
Processing folder: transcript/20240705_133046
Found user data CSV: transcript/20240705_133046/processed_Emili_20240705_133046.csv
Found raw scores CSV: transcript/20240705_133046/processed_Emili_raw_20240705_133046.csv
Processed data saved to transcript/20240705_133046/scored_20240705.csv
Processing folder: transcript/20240705_134402
Found user data CSV: transcript/20240705_134402/processed_Emili_20240705_134402.csv
Found raw scores CSV: transcript/20240705_134402/processed_Emili_raw_20240705_134402.csv
Processed data saved to transcript/20240705_134402/scored_20240705.csv
Processing folder: transcript/20240702_154302
Found user data CSV: transcript/20240702_154302/processed_Emili_20240702_154302.csv
Found raw scores CSV: transcript/20240702_154302/processed_Emili_raw_20240702_154302.csv
Processed data saved to transcript/20240702_154302/scored_20240702.csv
Processing folder: transcript/20240705_124936
Found use

## Calculating the temporal difference in emotion score and the flag

In [39]:
def process_scored_files_in_directory(directory):
    """Add a total emotion score and an improvement flag to each scored CSV.

    For every ``scored_*.csv`` file directly inside ``directory`` this:

    1. computes ``tot_emo_score`` as a weighted sum of the seven emotion columns;
    2. sets ``flag`` on each ``assistant`` row to ``True`` when its score is at
       least that of the closest preceding ``user`` row and ``False`` otherwise
       (rows of other roles, and assistant rows with no earlier user row, keep
       a missing value);
    3. writes the result to ``flagged_<timestamp>.csv`` in the same directory.

    Args:
        directory: Path of the directory whose scored files are processed.
    """
    for filename in os.listdir(directory):
        if not (filename.startswith('scored_') and filename.endswith('.csv')):
            continue

        file_path = os.path.join(directory, filename)
        print(f"Processing file: {file_path}")

        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)

        # Compute tot_emo_score
        df['tot_emo_score'] = df['Happy'] * 5 + df['Neutral'] * 1 - df['Sad'] * 2 + df['Surprise'] * 1 - df['Anger'] * 2 - df['Fear'] * 2 - df['Disgust'] * 5

        # One pass over the rows, remembering the most recent user score, in
        # place of a backward scan per assistant row.
        flags = [np.nan] * len(df)
        seen_user = False
        last_user_score = None
        for pos, (role, score) in enumerate(zip(df['role'], df['tot_emo_score'])):
            if role == 'user':
                seen_user = True
                last_user_score = score
            elif role == 'assistant' and seen_user:
                flags[pos] = bool(score - last_user_score >= 0)

        # object dtype lets the column hold True / False alongside missing values.
        df['flag'] = pd.Series(flags, index=df.index, dtype='object')

        # Extract file timestamp from filename
        timestamp = filename.split('_')[1].split('.')[0]  # Adjust this based on your filename pattern

        # Save the modified DataFrame to a new CSV file
        output_filename = f'flagged_{timestamp}.csv'
        output_path = os.path.join(directory, output_filename)
        df.to_csv(output_path, index=False)

        print(f"Processed data saved to {output_path}")

# Root directory whose sub-directories contain the scored_*.csv files
main_directory = 'transcript'

# os.walk yields every directory level; each entry of `dirs` is a
# sub-directory of `root`, so all nested directories get processed.
# main_directory itself is never passed to the function, only its descendants.
for root, dirs, files in os.walk(main_directory):
    for directory in dirs:
        directory_path = os.path.join(root, directory)
        process_scored_files_in_directory(directory_path)

print("Processing complete for all directories.")


Processing file: transcript/20240705_133046/scored_20240705.csv
Processed data saved to transcript/20240705_133046/flagged_20240705.csv
Processing file: transcript/20240705_134402/scored_20240705.csv
Processed data saved to transcript/20240705_134402/flagged_20240705.csv
Processing file: transcript/20240702_154302/scored_20240702.csv
Processed data saved to transcript/20240702_154302/flagged_20240702.csv
Processing file: transcript/20240705_124936/scored_20240705.csv
Processed data saved to transcript/20240705_124936/flagged_20240705.csv
Processing complete for all directories.


## CSV to JSONL

In [None]:
def process_csv_file(csv_file_path, messages):
    """Append the chat messages found in one flagged CSV to ``messages``.

    Rows are read in order. Line breaks inside ``content`` are replaced by
    spaces and the text is stripped. User messages are prefixed with
    ``"user_id: <id>. "``. Consecutive ``system`` rows are merged into a
    single system message (joined by spaces) that is emitted just before the
    next non-system row, or at the end of the file. Assistant messages also
    get a ``weight`` of 1 when the row's ``flag`` reads "true" (any case) and
    0 otherwise.

    Args:
        csv_file_path: CSV with ``role``, ``content``, ``user_id`` and ``flag`` columns.
        messages: List that is extended in place with message dicts.
    """
    with open(csv_file_path, 'r') as handle:
        pending_system = []

        for record in csv.DictReader(handle):
            role = record["role"]
            prefix = f"user_id: {record['user_id']}. " if role == "user" else ''
            # Collapse line breaks so each message stays on a single line.
            text = prefix + record["content"].replace('\n', ' ').replace('\r', ' ').strip()

            if role == "system":
                pending_system.append(text)
                continue

            if pending_system:
                messages.append({"role": "system", "content": " ".join(pending_system).strip()})
                pending_system = []

            entry = {"role": role, "content": text}
            if role == "assistant":
                entry["weight"] = 1 if record["flag"].lower() == "true" else 0
            messages.append(entry)

        # Flush system rows that were not followed by any other message.
        if pending_system:
            messages.append({"role": "system", "content": " ".join(pending_system).strip()})


def csv_to_jsonl(input_dir, jsonl_file_path):
    """Collect messages from every flagged CSV under ``input_dir`` into a JSONL file.

    Files are picked up recursively when their name starts with ``flagged_``
    and ends with ``.csv`` but not with ``_condensed.csv``. The messages of
    all files are gathered into one list and written as a single line holding
    one JSON object of the form ``{"messages": [...]}``.

    NOTE(review): every conversation ends up in the same record; confirm that
    one record per file is not what the consumer of this file expects.
    """
    collected = []

    for folder, _subdirs, names in os.walk(input_dir):
        for name in names:
            is_flagged_csv = name.startswith('flagged_') and name.endswith('.csv')
            if not is_flagged_csv or name.endswith('_condensed.csv'):
                continue
            process_csv_file(os.path.join(folder, name), collected)

    with open(jsonl_file_path, 'w') as sink:
        record = json.dumps({"messages": collected})
        sink.write(record + '\n')


# Entry point: build the JSONL dataset from the flagged CSV files.
# NOTE(review): the other cells read and write under 'transcript', whereas this
# one scans 'test_script' — confirm which directory is intended before running.
if __name__ == "__main__":
    input_dir = 'test_script'  # Replace with the path to your directory
    jsonl_file_path = 'Dataset_1.jsonl'  # Replace with the desired output JSONL file path
    csv_to_jsonl(input_dir, jsonl_file_path)
