## Membership Extraction

In [None]:
import os
import json
import json5
import pandas as pd
import re

company = "finance_corporation"
# company = "tech_company"
# company = "medical_institution"
folder_path = '/data2/visitor/ASE25/Chimera-Dataset/Finance-Company/generated_members'
# folder_path = '/data2/visitor/ASE25/Chimera-Dataset/Tech-Company/generated_members'
# folder_path = '/data2/visitor/ASE25/Chimera-Dataset/Medical-Institution/generated_members'

output_csv = f'{company}.csv'

data_list = []

for filename in os.listdir(folder_path):
    if filename.endswith('.jsonc'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r', encoding='utf-8') as f:
            try:
                content = json5.load(f)
                data = {
                    "name": content.get("name"),
                    "id": content.get("id"),
                    "role": content.get("role"),
                    "ip": content.get("ip"),
                    "email": content.get("email"),
                    "container_id": content.get("container_id"),
                    "mbti": content.get("mbti"),
                    "interests": content.get("interests"),
                    "personality": content.get("personality")
                }
                data_list.append(data)
            except Exception as e:
                print(f"Error loading {filename}: {e}")

# 用pandas生成DataFrame
df = pd.DataFrame(data_list)

# save with column names
df.to_csv(output_csv, index=True, header=True)

## Email Data

In [45]:
import os 
import pandas as pd
import json
import ast

# read four dirs email.csv
weeks = range(1, 5)
week_dates = [5, 5, 5, 5]
week_pointer = 0

scenario = "Tech-Company"
company_type = "tech"

# Reading
root_dir = f'/data2/visitor/ASE25/Chimera-Dataset/'

# concat all the email.csv
email_csvs = pd.DataFrame()

for week in weeks:
    weekly_dir = os.path.join(root_dir, scenario, f'week{week}-gemini-{company_type}')
    weekly_email_log_path = os.path.join(weekly_dir, "execution_logs", 'email.csv')
    
    df = pd.read_csv(weekly_email_log_path, encoding='utf-8')

    # Concatenate all DataFrames
    email_csvs = pd.concat([email_csvs, df], ignore_index=True)

# update a new column "sim_timestamp" with date and the time. The date start from starting_date and add 1 day when the next cell(time) is smaller than the previous cell
starting_date = "2025-05-01"
# email_csvs['sim_timestamp'] = pd.to_datetime(starting_date) + pd.to_timedelta(email_csvs.index, unit='d')

#### settle the sim timestamp with proper date ####
date_counter = 1
for i in range(len(email_csvs)):
    sim_time = pd.to_datetime(email_csvs.loc[i, 'sim_timestamp'], format='%H:%M:%S')
    
    if i == 0:
        sim_date = starting_date
    else:
        prev_time = pd.to_datetime(email_csvs.loc[i-1, 'sim_timestamp'], format='%H:%M:%S')
        if (sim_time < prev_time) and ((prev_time - sim_time) > pd.Timedelta(hours=6)):
            if date_counter == week_dates[week_pointer]:
                # print("next week")
                add_date = 8 - week_dates[week_pointer]
                week_pointer += 1
                date_counter = 1
            else:
                date_counter += 1
                add_date = 1
            sim_date = (pd.to_datetime(sim_date) + pd.Timedelta(days=add_date)).strftime('%Y-%m-%d')
            # print(f"Date changed to {sim_date} at index {i}")
    email_csvs.loc[i, 'full_sim_timestamp'] = f"{sim_date} {email_csvs.loc[i, 'sim_timestamp']}"

In [41]:
email_csvs.head(5)

Unnamed: 0,email_from,real_timestamp,sim_timestamp,name,email_to,email_cc,subject,content,full_sim_timestamp
0,qtest-1,2025-05-15 15:04:55,08:00:10,Ethan Carter,LDES-1,"['SDES-1', 'SDES-2']",GDD Review and Clarification Needed,"Team,\n\nI've started my review of the current...",2025-05-01 08:00:10
1,sdes-1,2025-05-15 15:06:15,08:22:13,Rajan Patel,ldes-1,['prod-1'],"Action Items: Combat, Economy, and Progression...","Team,\n\nFollowing up on our initial alignment...",2025-05-01 08:22:13
2,prod-1,2025-05-15 15:06:20,08:24:26,Olivia Rodriguez,sdes-1,[],"Re: Action Items: Combat, Economy, and Progres...","Hi Rajan,\n\nThanks for the detailed breakdown...",2025-05-01 08:24:26
3,sdes-1,2025-05-15 15:06:25,08:25:09,Rajan Patel,prod-1,[],"Re: Action Items: Combat, Economy, and Progres...","Hi Olivia,\n\nThanks for your feedback. I unde...",2025-05-01 08:25:09
4,ldes-1,2025-05-15 15:06:42,08:28:52,Naomi Walker,sdes-1,[],"Re: Action Items: Combat, Economy, and Progres...","Rajan,\n\nThanks for the clear breakdown. Just...",2025-05-01 08:28:52


In [46]:
company_type = "tech_company"


# Add the missing columns

### 1. add id list with content from email_from insert next to the index
email_csvs.insert(0, 'id', email_csvs['email_from'])

id_role_map = {}
id_list = []
profile_list = []

profile_output_dir = f"experiment_output/gemini_{company_type}/generated_members"
for file in os.listdir(profile_output_dir):
    if file.endswith(".jsonc"):
        member_profile_path = os.path.join(profile_output_dir, file)
        with open(member_profile_path, 'r') as f:
            member_profile = json.load(f)
        id_role_map[member_profile['id']] = member_profile['role'] # add id-role map
        id_list.append(member_profile['id'])
        profile_list.append(member_profile) # add profile

id_email_map = {profile['id']: profile['email'] for profile in profile_list}
id_pc_map = {profile['id']: profile['container_id'] for profile in profile_list}

### 2. replace email_from with the email
email_csvs['email_from'] = email_csvs['email_from'].map(id_email_map)

### 3. replace email_to with the email
email_csvs['email_to'] = email_csvs['email_to'].map(id_email_map)
# change the column name to "to"
email_csvs.rename(columns={'email_to': 'to'}, inplace=True)

### 4. replace email_cc list
for i in range(len(email_csvs)):
    if email_csvs.loc[i, 'email_cc'] == '[]':
        email_cc_out = "NaN"
    else:
        email_cc_list = ast.literal_eval(email_csvs.loc[i, 'email_cc'])
        email_cc_out_list = []
        for email_des in email_cc_list:
            # go through map id_email_map to find the email
            # to lower case
            email_des = email_des.lower()
            email_cc_out_list.append(id_email_map.get(email_des, email_des))
            email_cc_out = ', '.join(email_cc_out_list)
    email_csvs.loc[i, 'email_cc'] = email_cc_out

### 5. add new column named "size" to calculate the word count of the email_content
content_idx = email_csvs.columns.get_loc('content')
email_csvs.insert(content_idx+1, 'size', email_csvs['content'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0))

### 6. add new column named "attachment" with value of "0"
email_csvs.insert(email_csvs.columns.get_loc('size') + 1, 'attachments', 0)

### 7. add new column named "pc" with profile content of "container_id"
email_csvs.insert(email_csvs.columns.get_loc('name') + 1, 'pc', email_csvs['id'].map(id_pc_map))

### 8. add new column named "bcc"
email_csvs.insert(email_csvs.columns.get_loc('email_cc') + 1, 'bcc', 'NaN')

In [43]:
email_csvs.head(5)

Unnamed: 0,id,email_from,real_timestamp,sim_timestamp,name,pc,to,email_cc,bcc,subject,content,size,attachments,full_sim_timestamp
0,qtest-1,qa-234987@corp.com,2025-05-15 15:04:55,08:00:10,Ethan Carter,7b8a9c0d1e2f,,"sysdes-238759@corp.com, sysdes-981273@corp.com",,GDD Review and Clarification Needed,"Team,\n\nI've started my review of the current...",176,0,2025-05-01 08:00:10
1,sdes-1,sysdes-238759@corp.com,2025-05-15 15:06:15,08:22:13,Rajan Patel,9b5c6d7e82a3,gamedesign-195738@corp.com,produce-543210@corp.com,,"Action Items: Combat, Economy, and Progression...","Team,\n\nFollowing up on our initial alignment...",233,0,2025-05-01 08:22:13
2,prod-1,produce-543210@corp.com,2025-05-15 15:06:20,08:24:26,Olivia Rodriguez,c3d2e1f0a9b8,sysdes-238759@corp.com,,,"Re: Action Items: Combat, Economy, and Progres...","Hi Rajan,\n\nThanks for the detailed breakdown...",177,0,2025-05-01 08:24:26
3,sdes-1,sysdes-238759@corp.com,2025-05-15 15:06:25,08:25:09,Rajan Patel,9b5c6d7e82a3,produce-543210@corp.com,,,"Re: Action Items: Combat, Economy, and Progres...","Hi Olivia,\n\nThanks for your feedback. I unde...",270,0,2025-05-01 08:25:09
4,ldes-1,gamedesign-195738@corp.com,2025-05-15 15:06:42,08:28:52,Naomi Walker,f89a4b3c21d0,sysdes-238759@corp.com,,,"Re: Action Items: Combat, Economy, and Progres...","Rajan,\n\nThanks for the clear breakdown. Just...",186,0,2025-05-01 08:28:52


### Format Fitting

In [47]:
output_dir = f"/data2/visitor/ASE25/Chimera/Final-Output/{scenario}/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

sim_idx = email_csvs.columns.get_loc("sim_timestamp")
full_sim = email_csvs.pop("full_sim_timestamp")
email_csvs.insert(sim_idx + 1, "full_sim_timestamp", full_sim)

# rename the column "name" to "user"
email_csvs.rename(columns={'name': 'user'}, inplace=True)
# rename the column "full_sim_timestamp" to "time"
email_csvs.rename(columns={'full_sim_timestamp': 'date'}, inplace=True)
# rename the column "email_cc" to "cc"
email_csvs.rename(columns={'email_cc': 'cc'}, inplace=True)
email_csvs.rename(columns={'email_from': 'from'}, inplace=True)

# remove the column "real_timestamp" and "sim_timestamp"
email_csvs.drop(columns=['real_timestamp', 'sim_timestamp'], inplace=True)

# sorting by time
email_csvs = email_csvs.sort_values(by='date', ascending=True).reset_index(drop=True)

# save the email_csvs to csv
email_csvs.to_csv(f'{output_dir}/{company_type}-email.csv', index=True, encoding='utf-8')

In [48]:
email_csvs.head(5)

Unnamed: 0,id,from,date,user,pc,to,cc,bcc,subject,content,size,attachments
0,qtest-1,qa-234987@corp.com,2025-05-01 08:00:10,Ethan Carter,7b8a9c0d1e2f,,"sysdes-238759@corp.com, sysdes-981273@corp.com",,GDD Review and Clarification Needed,"Team,\n\nI've started my review of the current...",176,0
1,sdes-1,sysdes-238759@corp.com,2025-05-01 08:22:13,Rajan Patel,9b5c6d7e82a3,gamedesign-195738@corp.com,produce-543210@corp.com,,"Action Items: Combat, Economy, and Progression...","Team,\n\nFollowing up on our initial alignment...",233,0
2,prod-1,produce-543210@corp.com,2025-05-01 08:24:26,Olivia Rodriguez,c3d2e1f0a9b8,sysdes-238759@corp.com,,,"Re: Action Items: Combat, Economy, and Progres...","Hi Rajan,\n\nThanks for the detailed breakdown...",177,0
3,sdes-1,sysdes-238759@corp.com,2025-05-01 08:25:09,Rajan Patel,9b5c6d7e82a3,produce-543210@corp.com,,,"Re: Action Items: Combat, Economy, and Progres...","Hi Olivia,\n\nThanks for your feedback. I unde...",270,0
4,ldes-1,gamedesign-195738@corp.com,2025-05-01 08:28:52,Naomi Walker,f89a4b3c21d0,sysdes-238759@corp.com,,,"Re: Action Items: Combat, Economy, and Progres...","Rajan,\n\nThanks for the clear breakdown. Just...",186,0


## Logon Data

In [108]:
import os 
import random
import pandas as pd

# read four dirs logon.csv
weeks = range(1, 5)
week_dates = [5, 5, 5, 5]
week_pointer = 0

scenario = "Tech-Company"
company_type = "tech"

# Reading
root_dir = f'/data2/visitor/ASE25/Chimera-Dataset/'

# concat all the logon.csv
logon_csvs = pd.DataFrame()

#### load all data ####
for week in weeks:
    weekly_dir = os.path.join(root_dir, scenario, f'week{week}-gemini-{company_type}')
    weekly_logon_log_path = os.path.join(weekly_dir, "execution_logs", 'logon.csv')

    df = pd.read_csv(weekly_logon_log_path, encoding='utf-8')
    print(f"Processing {weekly_logon_log_path} with {len(df)} rows")

    # Concatenate all DataFrames
    logon_csvs = pd.concat([logon_csvs, df], ignore_index=True)

print(f"Total rows in logon_csvs: {len(logon_csvs)}")

#### fix if instant logon and logout ####
for i in range(len(logon_csvs)):
    if i > 0:
        sim_time = logon_csvs.loc[i, 'sim_timestamp']
        sim_id = logon_csvs.loc[i, 'id']
        prev_id = logon_csvs.loc[i-1, 'id']
        if sim_id == prev_id:
            prev_sim_time = logon_csvs.loc[i-1, 'sim_timestamp']
            if sim_time == prev_sim_time:
                # random add near 5 minutes to sim_id
                new_time = pd.to_datetime(logon_csvs.loc[i, 'sim_timestamp'], format='%H:%M:%S') + pd.Timedelta(seconds=random.randint(30, 200))
                logon_csvs.loc[i, 'sim_timestamp'] = new_time.strftime('%H:%M:%S')

starting_date = "2025-05-01"

#### settle the sim timestamp with proper date ####
date_counter = 1
for i in range(len(logon_csvs)):
    sim_time = pd.to_datetime(logon_csvs.loc[i, 'sim_timestamp'], format='%H:%M:%S')
    
    if i == 0:
        sim_date = starting_date
    else:
        prev_time = pd.to_datetime(logon_csvs.loc[i-1, 'sim_timestamp'], format='%H:%M:%S')
        if (sim_time < prev_time) and ((prev_time - sim_time) > pd.Timedelta(hours=6)):
            if date_counter == week_dates[week_pointer]:
                # print("next week")
                add_date = 8 - week_dates[week_pointer]
                week_pointer += 1
                date_counter = 1
            else:
                date_counter += 1
                add_date = 1
            sim_date = (pd.to_datetime(sim_date) + pd.Timedelta(days=add_date)).strftime('%Y-%m-%d')
            # print(f"Date changed to {sim_date} at index {i}")
    logon_csvs.loc[i, 'full_sim_timestamp'] = f"{sim_date} {logon_csvs.loc[i, 'sim_timestamp']}"


Processing /data2/visitor/ASE25/Chimera-Dataset/Tech-Company/week1-gemini-tech/execution_logs/logon.csv with 2855 rows
Processing /data2/visitor/ASE25/Chimera-Dataset/Tech-Company/week2-gemini-tech/execution_logs/logon.csv with 3059 rows
Processing /data2/visitor/ASE25/Chimera-Dataset/Tech-Company/week3-gemini-tech/execution_logs/logon.csv with 2805 rows
Processing /data2/visitor/ASE25/Chimera-Dataset/Tech-Company/week4-gemini-tech/execution_logs/logon.csv with 3538 rows
Total rows in logon_csvs: 12257


### Format Fitting

In [None]:
output_dir = f"/data2/visitor/ASE25/Chimera/Final-Output/{scenario}/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

sim_idx = logon_csvs.columns.get_loc("sim_timestamp")
full_sim = logon_csvs.pop("full_sim_timestamp")
logon_csvs.insert(sim_idx + 1, "full_sim_timestamp", full_sim)

# rename the column "name" to "user"
logon_csvs.rename(columns={'name': 'user'}, inplace=True)
# rename the column "full_sim_timestamp" to "time"
logon_csvs.rename(columns={'full_sim_timestamp': 'date'}, inplace=True)
# rename the column "container_id" to "pc"
logon_csvs.rename(columns={'container_id': 'pc'}, inplace=True)
# rename the column "status" to "activity"
logon_csvs.rename(columns={'status': 'activity'}, inplace=True)

# remove the column "real_timestamp" and "sim_timestamp"
logon_csvs.drop(columns=['real_timestamp', 'sim_timestamp'], inplace=True) 

# sorting by time
logon_csvs = logon_csvs.sort_values(by='date', ascending=True).reset_index(drop=True)

# save the email_csvs to csv
logon_csvs.to_csv(f'{output_dir}/{company_type}-logon.csv', index=False, encoding='utf-8')
# logon_csvs.to_csv(f'{output_dir}/{company_type}-logon-full.csv', index=False, encoding='utf-8')

In [61]:
logon_csvs.head(20)

Unnamed: 0,id,date,user,pc,activity
0,tdev-1,2025-05-01 06:51:26,Ingrid Müller,e61b3c5d78f9,login
1,tdev-1,2025-05-01 06:52:35,Ingrid Müller,e61b3c5d78f9,logout
2,sdes-1,2025-05-01 07:33:59,Rajan Patel,9b5c6d7e82a3,login
3,sdes-2,2025-05-01 07:50:18,Anika Schmidt,c12d3e4f56a7,login
4,uiux-1,2025-05-01 07:51:57,David Chen,9f8e7d6c5b4a,login
5,cart-1,2025-05-01 07:53:22,Leon Moreau,a2b3c4d5e6f7,login
6,sdes-2,2025-05-01 07:53:22,Anika Schmidt,c12d3e4f56a7,logout
7,tart-1,2025-05-01 07:53:31,Ayana Nakamura,5e7f8a9b0c1d,login
8,tart-1,2025-05-01 07:54:28,Ayana Nakamura,5e7f8a9b0c1d,logout
9,cart-1,2025-05-01 07:56:19,Leon Moreau,a2b3c4d5e6f7,logout


# Find all keywords in agent actions

In [None]:
import os 
import pandas as pd
import json
import ast
from tqdm import tqdm

# read four dirs email.csv
weeks = range(1, 5)
week_dates = [5, 5, 5, 5]
week_pointer = 0

scenario = "Tech-Company"
company_type_member = "tech_company"
company_type = "tech"


# Reading
root_dir = f'/data2/visitor/ASE25/Chimera-Dataset/'

id_role_map = {}
id_list = []
profile_list = []

#######################
profile_output_dir = f"experiment_output/gemini_{company_type_member}/generated_members"
for file in os.listdir(profile_output_dir):
    if file.endswith(".jsonc"):
        member_profile_path = os.path.join(profile_output_dir, file)
        with open(member_profile_path, 'r') as f:
            member_profile = json.load(f)
        id_role_map[member_profile['id']] = member_profile['role'] # add id-role map
        id_list.append(member_profile['id'])
        profile_list.append(member_profile) # add profile
#######################

week = 1
member_id = 'cdev-1'

function_set = set()

for week in weeks:
    weekly_dir = os.path.join(root_dir, scenario, f'week{week}-gemini-{company_type}')
    execution_log_path = os.path.join(weekly_dir, "execution_logs")
    
    for member_id in id_list:    
        user_log_dir = os.path.join(execution_log_path, member_id)

        # for every file inside the user_log_dir, read the file and extract the content and do not read the file with "solution"
        log_files = [f for f in os.listdir(user_log_dir) if f.endswith('.log') and 'solution' not in f]
        for member_action_log in tqdm(log_files, desc=f"Processing logs for {member_id}"):
            member_action_log_path = os.path.join(user_log_dir, member_action_log)
            # with open(member_action_log_path, 'r') as f:
            # if "cdev-1_week_1_Friday_executio_task_50" in member_action_log:
            with open(member_action_log_path, 'r') as f:
                content = f.readlines()
                for line in content:
                    if "'tool_calls': [{'id': 'null', 'type': 'function', 'function': {'name':" in line:
                        # find the word quoted in '' after 'tool_calls': [{'id': 'null', 'type': 'function', 'function': {'name':
                        match = re.search(r"'name': '([^']+)'", line)
                        if match:
                            function_name = match.group(1)
                            function_set.add(function_name)

print(function_set)
                    # extract the function name
                    # print(line)
                # if "start_url" in line:
                #     # extract all the url from the line, search https://
                #     urls = re.findall(r'https?://[^\s]+', line)
                #     # print(urls)
                #     # extract the time
                #     time = line.split(',')[0].strip()
                #     print(f"Time: {time}")


# 'tool_calls': [{'id': 'null', 'type':
# {'shell_exec', 'browse_url', 'search_duckduckgo', 'file_find_in_content', 'write_to_file', 'search_google', 'file_find_by_name'}


                                                                               

{'shell_exec', 'browse_url', 'search_duckduckgo', 'file_find_in_content', 'write_to_file', 'search_google', 'file_find_by_name'}




# HTTPS


In [120]:
def clean_url(url):
    return url.rstrip('",\' >}\]\n\r\t')

In [126]:
import os 
import pandas as pd
import json
import ast
from tqdm import tqdm

# read four dirs email.csv
weeks = range(1, 5)
week_dates = [5, 5, 5, 5]
week_pointer = 0

scenario = "Tech-Company"
company_type_member = "tech_company"
company_type = "tech"

output_dir = f"/data2/visitor/ASE25/Chimera/Final-Output/{scenario}/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

output_csv = f'{output_dir}/{company_type}-http-raw.csv'

# Reading
root_dir = f'/data2/visitor/ASE25/Chimera-Dataset/'

id_role_map = {}
id_list = []
profile_list = []

#######################
profile_output_dir = f"experiment_output/gemini_{company_type_member}/generated_members"
for file in os.listdir(profile_output_dir):
    if file.endswith(".jsonc"):
        member_profile_path = os.path.join(profile_output_dir, file)
        with open(member_profile_path, 'r') as f:
            member_profile = json.load(f)
        id_role_map[member_profile['id']] = member_profile['role'] # add id-role map
        id_list.append(member_profile['id'])
        profile_list.append(member_profile) # add profile
#######################

data_rows = []
last_url = None

for week in weeks:
    weekly_dir = os.path.join(root_dir, scenario, f'week{week}-gemini-{company_type}')
    execution_log_path = os.path.join(weekly_dir, "execution_logs")
    
    for member_id in tqdm(id_list, desc=f"Processing member logs for week {week}", leave=False):    
        user_log_dir = os.path.join(execution_log_path, member_id)

        # for every file inside the user_log_dir, read the file and extract the content and do not read the file with "solution"
        log_files = [f for f in os.listdir(user_log_dir) if f.endswith('.log') and 'solution' not in f]
        for member_action_log in tqdm(log_files, desc=f"Processing logs for {member_id}", leave=False):
            member_action_log_path = os.path.join(user_log_dir, member_action_log)

            with open(member_action_log_path, 'r') as f:
                content = f.readlines()
            for line in content:
                if "'tool_calls': [{'id': 'null', 'type': 'function', 'function': {'name':" in line:
                    match = re.search(r"'name': '([^']+)'", line)
                    if match:
                        function_name = match.group(1)

                        # extract the time
                        time = line.split(',')[0].strip()
                        
                        # clean_line = line.encode('utf-8').decode('unicode_escape')
                        # get the urls for accessing the function
                        urls = re.findall(r'https?://[^\s]+', line)
                        # deduplicate urls
                        urls = list(set(urls))
                        # print(urls)
                        # assert False
                        for url in urls:
                            # if url contains '}, remove i
                            url = url.split("\'}")[0]
                            url = url.split("\')")[0]
                            url = url.split("]")[0]
                            url = clean_url(url)
                            
                            row = {
                                'real_timestamp': time,
                                'id': member_id,
                                'url': url
                            }
                        # if row url is not equal to the previous row url, append the row
                        if url != last_url:
                            data_rows.append(row)
                            last_url = url

df = pd.DataFrame(data_rows)
# sort by real_timestamp before saving
df['real_timestamp'] = pd.to_datetime(df['real_timestamp'], format='%Y-%m-%d %H:%M:%S')
df = df.sort_values(by='real_timestamp', ascending=True).reset_index(drop=True)

# check whether previous url is same as current url
dupe_mask = (df['url'] == df['url'].shift(1)) & (df['id'] == df['id'].shift(1))
df = df[~dupe_mask].reset_index(drop=True)

df.to_csv(output_csv, index=False, encoding='utf-8')

                                                                                  

In [155]:
import os 
import pandas as pd
import json
import ast
import random
import requests
from tqdm import tqdm

# read four dirs email.csv
weeks = range(1, 5)
week_dates = [5, 5, 5, 5]
week_pointer = 0

scenario = "Tech-Company"
company_type_member = "tech_company"
company_type = "tech"

output_dir = f"/data2/visitor/ASE25/Chimera/Final-Output/{scenario}/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

raw_csv = f'{output_dir}/{company_type}-http-raw.csv'

# reading the raw csv
raw_df = pd.read_csv(raw_csv, encoding='utf-8')

In [156]:
raw_df.head(5)

Unnamed: 0,real_timestamp,id,url
0,2025-05-14 19:29:45,sdes-1,https://www.researchgate.net/publication/38196...
1,2025-05-14 19:29:53,sdes-1,https://www.sciencedirect.com/journal/games-an...
2,2025-05-14 19:32:28,sdes-1,https://www.economist.com/
3,2025-05-14 19:32:33,sdes-1,https://www.msn.com/en-us/money/markets/the-ec...
4,2025-05-14 19:32:53,uiux-1,https://architizer.com/blog/inspiration/collec...


In [None]:
# 1. match up with the actual time
logon_csv = f'{output_dir}/{company_type}-logon-full.csv'
logon_df = pd.read_csv(logon_csv, encoding='utf-8')

logon_df['real_timestamp'] = pd.to_datetime(logon_df['real_timestamp'])
logon_df['date'] = pd.to_datetime(logon_df['date'])
raw_df['real_timestamp'] = pd.to_datetime(raw_df['real_timestamp'])

http_dates = []

raw_df = raw_df.sort_values('real_timestamp')
logon_df = logon_df.sort_values('real_timestamp')

result = pd.merge_asof(
    raw_df,
    logon_df,
    on='real_timestamp',
    by='id',
    direction='nearest'
)

# 2. mutate the date time by up to 1 minute
def mutate_time(dt):
    # Randomly add or subtract up to 1 minute
    delta = pd.Timedelta(seconds=random.randint(-30, 30))
    return dt + delta
result['date'] = result['date'].apply(mutate_time)

# 3. remove sim_timestamp, real_timestamp, activity
result.drop(columns=['sim_timestamp', 'real_timestamp', 'activity'], inplace=True)

# 4. sort by date
result = result.sort_values(by='date', ascending=True).reset_index(drop=True)

# 5. content column
def fetch_content(url):
    try:
        resp = requests.get(url, timeout=10)
        resp.raise_for_status()
        return resp.text[:5000]  # Limit content to first 1000 characters
    except Exception as e:
        return f"Error: {e}"
tqdm.pandas()
df['content'] = df['url'].progress_apply(fetch_content)

# 6. save to csv
output_csv = f'{output_dir}/{company_type}-http.csv'
result.to_csv(output_csv, index=False, encoding='utf-8')

 28%|██▊       | 2669/9421 [38:59<1:20:38,  1.40it/s] 

# Files

In [None]:
import os 
import pandas as pd
import json
import ast
import re
import json
from tqdm import tqdm
import lorem

# read four dirs email.csv
weeks = range(1, 5)

scenario = "Tech-Company"
company_type_member = "tech_company"
company_type = "tech"

output_dir = f"/data2/visitor/ASE25/Chimera/Final-Output/{scenario}/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

output_csv = f'{output_dir}/{company_type}-file-raw.csv'

# Reading
root_dir = f'/data2/visitor/ASE25/Chimera-Dataset/'

id_role_map = {}
id_list = []
profile_list = []

#######################
profile_output_dir = f"experiment_output/gemini_{company_type_member}/generated_members"
for file in os.listdir(profile_output_dir):
    if file.endswith(".jsonc"):
        member_profile_path = os.path.join(profile_output_dir, file)
        with open(member_profile_path, 'r') as f:
            member_profile = json.load(f)
        id_role_map[member_profile['id']] = member_profile['role'] # add id-role map
        id_list.append(member_profile['id'])
        profile_list.append(member_profile) # add profile
#######################

data_rows = []
last_url = None

for week in weeks:
    weekly_dir = os.path.join(root_dir, scenario, f'week{week}-gemini-{company_type}')
    execution_log_path = os.path.join(weekly_dir, "execution_logs")
    
    for member_id in tqdm(id_list, desc=f"Processing member logs for week {week}", leave=False):    
        user_log_dir = os.path.join(execution_log_path, member_id)

        # for every file inside the user_log_dir, read the file and extract the content and do not read the file with "solution"
        log_files = [f for f in os.listdir(user_log_dir) if f.endswith('.log') and 'solution' not in f]
        for member_action_log in tqdm(log_files, desc=f"Processing logs for {member_id}", leave=False):
            member_action_log_path = os.path.join(user_log_dir, member_action_log)

            with open(member_action_log_path, 'r') as f:
                content = f.readlines()
            for line in content:
                if "'tool_calls': [{'id': 'null', 'type': 'function', 'function': {'name':" in line:
                    match = re.search(r"'name': '([^']+)'", line)

                    time = line.split(',')[0].strip()
                    if match:
                        function_name = match.group(1)
                        # 'tool_calls': [{'id': 'null', 'type':
                        # {'shell_exec', 'browse_url', 'search_duckduckgo', 'file_find_in_content', 'write_to_file', 'search_google', 'file_find_by_name'}
                        if function_name == 'shell_exec':
                            match = re.search(r'"command"\s*:\s*"([^"]+)"', line)
                            if match:
                                command_data = match.group(1)
                                # pass

                        elif function_name == 'file_find_in_content':
                            match = re.search(r"'arguments':\s*'\{\"file\":\s*\"([^\"]+)\"", line)
                            if match:
                                file_read = match.group(1)
                                row = {
                                    'real_timestamp': time,
                                    'id': member_id,
                                    'filename': file_read,
                                    'type': 'read',
                                    'content': lorem.sentence()
                                }
                            data_rows.append(row)
                        
                        elif function_name == 'file_find_by_name':
                            matches = re.findall(r"'arguments':\s*'\{\"glob\":\s*\"([^\"]+)\"", line)
                            for match in matches:
                                file_name = match
                                row = {
                                    'real_timestamp': time,
                                    'id': member_id,
                                    'filename': file_name,
                                    'type': 'read',
                                    'content': lorem.sentence()
                                }
                                data_rows.append(row)
                        
                        elif function_name == 'write_to_file':
                            content_match = re.search(r"'arguments':\s*'\{\"content\":\s*\"([^\"]+)\"", line)
                            filename_match = re.search(r'"filename"\s*:\s*"([^"]+)"', line)

                            row = {
                                'real_timestamp': time,
                                'id': member_id,
                                'filename': filename_match.group(1),
                                'type': 'write',
                                'content': content_match.group(1) if content_match else lorem.sentence()
                            }
                            data_rows.append(row)
                
                if "Content successfully written to file" in line:
                    match = re.search(r"'content': 'Content successfully written to file: ([^']+)'", line)
                    if match:
                        file_written = match.group(1)
                        row = {
                            'real_timestamp': time,
                            'id': member_id,
                            'filename': file_written,
                            'type': 'write',
                            'content': lorem.sentence()
                        }
                        data_rows.append(row)

df = pd.DataFrame(data_rows)

df.to_csv(output_csv, index=False, encoding='utf-8')

### Postprocess

In [48]:
import os 
import pandas as pd
import json
import ast
import re
import json
from tqdm import tqdm
import lorem
import random

# read four dirs email.csv
weeks = range(1, 5)

scenario = "Tech-Company"
company_type_member = "tech_company"
company_type = "tech"

output_dir = f"/data2/visitor/ASE25/Chimera/Final-Output/{scenario}/"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

output_csv = f'{output_dir}/{company_type}-file-raw.csv'

# Reading
root_dir = f'/data2/visitor/ASE25/Chimera-Dataset/'

raw_csv = f'{output_dir}/{company_type}-file-raw.csv'

# reading the raw csv
raw_df = pd.read_csv(raw_csv, encoding='utf-8')

In [49]:
# sort based on real_timestamp
# raw_df['real_timestamp'] = pd.to_datetime(raw_df['real_timestamp'])
# raw_df = raw_df.sort_values('real_timestamp').reset_index(drop=True)
raw_df.head(5)

Unnamed: 0,real_timestamp,id,filename,type,content
0,2025-05-15 18:37:47,cart-1,character_color_rendering_backup.txt,write,Conversation history and task plan regarding c...
1,2025-05-15 18:37:47,cart-1,/data/Chimera/demo/execution_logs/cart-1...,write,Quiquia tempora amet dolorem.
2,2025-05-15 18:37:51,cart-1,character_color_rendering_backup.txt,write,Conversation history and task plan regarding c...
3,2025-05-15 18:37:51,cart-1,/data/Chimera/demo/execution_logs/cart-1...,write,Quisquam non non adipisci aliquam etincidunt v...
4,2025-05-15 18:37:58,cart-1,character_color_rendering_backup.txt,write,Conversation history and task plan regarding c...


In [None]:
# 1. match up with the actual time
logon_csv = f'{output_dir}/{company_type}-logon-full.csv'
logon_df = pd.read_csv(logon_csv, encoding='utf-8')

logon_df['real_timestamp'] = pd.to_datetime(logon_df['real_timestamp'])
logon_df['date'] = pd.to_datetime(logon_df['date'])
raw_df['real_timestamp'] = pd.to_datetime(raw_df['real_timestamp'])

raw_df = raw_df.sort_values('real_timestamp')
logon_df = logon_df.sort_values('real_timestamp')

result = pd.merge_asof(
    raw_df,
    logon_df,
    on='real_timestamp',
    by='id',
    direction='nearest'
)

# # 2. mutate the date time by up to 1 minute
def mutate_time(dt):
    # Randomly add or subtract up to 1 minute
    delta = pd.Timedelta(seconds=random.randint(-10, 10))
    return dt + delta
result['date'] = result['date'].apply(mutate_time)

# # 3. remove sim_timestamp, real_timestamp, activity
result.drop(columns=['sim_timestamp', 'real_timestamp', 'activity'], inplace=True)

# # 4. sort by date
result = result.sort_values(by='date', ascending=True).reset_index(drop=True)

In [52]:
result.head(5)
# 5. save to csv
output_csv = f'{output_dir}/{company_type}-file.csv'
result.to_csv(output_csv, index=False, encoding='utf-8')