In [18]:
import re
import json
import pandas as pd
import numpy as np
from openai import OpenAI
from tqdm import tqdm

In [19]:
df = pd.read_csv("../anonymized_texts.csv")
custext_df = pd.read_csv("../anonymized_texts_custext.csv")

In [20]:
# Filter the dataframe
df = df[(df['loc_and_org'] == True) & (df['dp_type'] == 'metric')]
custext_df = custext_df[(custext_df['loc_and_org'] == True) & (custext_df['dp_type'] == 'metric')]

In [21]:
# Creating the new 'Custome_ID' column
# custext_df['Custome_ID'] = ['request-{}'.format(i) for i in range(1, len(custext_df) + 1)]
# custext_df = custext_df.rename(columns={'Custome_ID': 'Custom_ID'})
# custext_df.to_csv("../anonymized_texts_custext.csv", index=False)

In [22]:
print(df.shape)
print(custext_df.shape)

(25167, 11)
(4445, 11)


In [23]:
custext_df.head()

Unnamed: 0,loc_and_org,epsilon,num_cluster,K,dp_type,anonymized_text,Grammar,Common Sense,Coherence,Cohesiveness,Custom_ID
0,True,0.1,1,,metric,PROCEDURE\n\nThe case originated in an applica...,,,,,request-1
1,True,0.1,1,,metric,PROCEDURE\n\nThe case originated in an applica...,,,,,request-2
2,True,0.1,1,,metric,PROCEDURE\n\nThe case originated in an applica...,,,,,request-3
3,True,0.1,1,,metric,PROCEDURE\n\nThe case originated in an applica...,,,,,request-4
4,True,0.1,1,,metric,PROCEDURE\n\nThe case originated in an applica...,,,,,request-5


In [24]:
def get_number_of_unique_combinations(any_df):
    unique_combinations = any_df.drop_duplicates(subset=['loc_and_org', 'epsilon', 'num_cluster', 'K', 'dp_type'])

    # Get the number of unique combinations
    return unique_combinations.shape[0]

# Function to filter and sample 45 rows from each group
def filter_and_sample_rows(group):
    # Filter rows where the length of "anonymized_text" is less than 5000 characters
    filtered_group = group[group['anonymized_text'].str.len() < 5000]
    # Sample 45 rows from the filtered group
    return filtered_group.sample(n=45, replace=False)

In [25]:
# Group by the specified columns and apply the filter and sampling function
sampled_df = df.groupby(['loc_and_org', 'epsilon', 'num_cluster', 'K', 'dp_type']).apply(filter_and_sample_rows).reset_index(drop=True)

# Assert check to ensure the number of unique combinations is the same
assert get_number_of_unique_combinations(df) == get_number_of_unique_combinations(sampled_df), "The number of unique combinations does not match!"

  sampled_df = df.groupby(['loc_and_org', 'epsilon', 'num_cluster', 'K', 'dp_type']).apply(filter_and_sample_rows).reset_index(drop=True)


In [26]:
# Group by the specified columns and apply the filter and sampling function
sampled_df_custext = custext_df.groupby(['epsilon', 'num_cluster']).apply(filter_and_sample_rows).reset_index(drop=True)

# Assert check to ensure the number of unique combinations is the same
assert get_number_of_unique_combinations(custext_df) == get_number_of_unique_combinations(sampled_df_custext), "The number of unique combinations does not match!"

  sampled_df_custext = custext_df.groupby(['epsilon', 'num_cluster']).apply(filter_and_sample_rows).reset_index(drop=True)


In [27]:
print(custext_df.shape[0])
print(sampled_df_custext.shape[0])

4445
1575


In [28]:
filtered_df = sampled_df_custext[sampled_df_custext['Grammar'].isnull() & sampled_df_custext['Common Sense'].isnull() & sampled_df_custext['Coherence'].isnull() & sampled_df_custext['Cohesiveness'].isnull()]
# random_df = filtered_df.sample(n=1000, random_state=42)
random_df = filtered_df.copy()
print(random_df.shape[0])

1575


In [29]:
client = OpenAI(organization="org-5FDaIDe2hzj7FGqPPiL6V4Jk")

In [58]:
system_message = (
    "Could you please evaluate the following passage for its grammar, common sense, "
    "coherence, and cohesiveness? Score it on a scale from 1 to 5, where 1 is the lowest "
    "(poor quality) and 5 is the highest (excellent quality). "
    "You should score based on these criteria:\n"
    "grammar: Are the sentences structured correctly?\n"
    "common sense: Does the content make logical sense in the real world?\n"
    "coherence: Do the ideas flow logically from one sentence to another?\n"
    "cohesiveness: Do all parts of the text come together in a unified whole?\n"
    "Please ONLY respond in JSON format with the only four keys 'grammar', 'common sense', "
    "'coherence', and 'cohesiveness', each with a score attached to them."
)

In [59]:
# Prepare to write to a JSONL file
with open('requests_custext.jsonl', 'w') as file:
    for index, row in tqdm(random_df.iterrows(), total=random_df.shape[0]):
        # Get custom_id
        custom_id = row['Custom_ID']

        # Clean and process comment
        comment = str(row['anonymized_text']).strip()
        comment = re.sub(r'\s+', ' ', comment)

        user_message = f"passage: {comment}"

        # Construct JSON object
        json_object = {
            "custom_id": custom_id,
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": "gpt-4o",
                "messages": [
                    {"role": "system", "content": system_message},
                    {"role": "user", "content": user_message}
                ],
                "max_tokens": 500
            }
        }

        # Write JSON object to file
        json.dump(json_object, file)
        file.write('\n')

100%|██████████| 1575/1575 [00:00<00:00, 6271.38it/s]


In [60]:
batch_input_file = client.files.create(
  file=open("requests_custext.jsonl", "rb"),
  purpose="batch"
)

batch_input_file_id = batch_input_file.id

response = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "nightly CluSanT evaluating job"
    }
)

print(response)

Batch(id='batch_wXpX3fHS3L6TK6gwsDr8uCC5', completion_window='24h', created_at=1717986586, endpoint='/v1/chat/completions', input_file_id='file-eOCJgJuOf5ecWpcF65gPvXCJ', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1718072986, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'nightly CluSanT evaluating job'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))


In [30]:
print(client.batches.retrieve("batch_wXpX3fHS3L6TK6gwsDr8uCC5"))

Batch(id='batch_wXpX3fHS3L6TK6gwsDr8uCC5', completion_window='24h', created_at=1717986586, endpoint='/v1/chat/completions', input_file_id='file-eOCJgJuOf5ecWpcF65gPvXCJ', object='batch', status='in_progress', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1718072986, failed_at=None, finalizing_at=None, in_progress_at=1718035500, metadata={'description': 'nightly CluSanT evaluating job'}, output_file_id=None, request_counts=BatchRequestCounts(completed=1573, failed=0, total=1575))


In [44]:
content = client.files.content("file-S5K0gUZ692qKg8co22Nv8c6F")
with open("requests_output_custext.jsonl", "ab") as file:
    file.write(content.content)

In [46]:
custext_df = pd.read_csv("../anonymized_texts_custext.csv")

# Read the JSONL file
data = []
with open('requests_output_custext.jsonl', 'r') as file:
    for line in file:
        data.append(json.loads(line))

# Function to clean and parse the content from the markdown code block
def parse_content(content):
    try:
        # Strip the triple backticks and any extra whitespace or newlines
        clean_content = content.replace("```json\n", "").replace("\n```", "").strip()
        parsed_content = json.loads(clean_content)
        # Convert the keys of the dictionary to lowercase
        return {k.lower(): v for k, v in parsed_content.items()}
    except json.JSONDecodeError:
        # Return None or an empty dictionary if JSON decoding fails
        return {}

# Iterate through the loaded JSON data and update the DataFrame
for item in data:
    custom_id = item['custom_id']
    content = parse_content(item['response']['body']['choices'][0]['message']['content'])
    if custom_id in custext_df['Custom_ID'].values:
        # Convert the content dictionary back to a JSON string to store in the DataFrame
        custext_df.loc[custext_df['Custom_ID'] == custom_id, 'Grammar'] = json.dumps(content['grammar']) if 'grammar' in content else None
        custext_df.loc[custext_df['Custom_ID'] == custom_id, 'Common Sense'] = json.dumps(content['common sense']) if 'common sense' in content else None
        custext_df.loc[custext_df['Custom_ID'] == custom_id, 'Coherence'] = json.dumps(content['coherence']) if 'coherence' in content else None
        custext_df.loc[custext_df['Custom_ID'] == custom_id, 'Cohesiveness'] = json.dumps(content['cohesiveness']) if 'cohesiveness' in content else None
    else:
        print(f"Custom_ID {custom_id} not found in DataFrame.")

print(custext_df['Grammar'].notna().sum())


  df.loc[df['Custom_ID'] == custom_id, 'Grammar'] = json.dumps(content['grammar']) if 'grammar' in content else None
  df.loc[df['Custom_ID'] == custom_id, 'Common Sense'] = json.dumps(content['common sense']) if 'common sense' in content else None
  df.loc[df['Custom_ID'] == custom_id, 'Coherence'] = json.dumps(content['coherence']) if 'coherence' in content else None
  df.loc[df['Custom_ID'] == custom_id, 'Cohesiveness'] = json.dumps(content['cohesiveness']) if 'cohesiveness' in content else None


9954


In [47]:
custext_df.to_csv("../anonymized_texts.csv", index=False)

## Generate Results

In [59]:
df = pd.read_csv("../anonymized_texts.csv")
filtered_df = df[(df['loc_and_org'] == True) & (df['dp_type'] == 'metric')]
filtered_df = filtered_df[filtered_df['Grammar'].notna() | filtered_df['Common Sense'].notna() | filtered_df['Coherence'].notna() | filtered_df['Cohesiveness'].notna()]

In [60]:
filtered_df.columns

Index(['loc_and_org', 'epsilon', 'num_cluster', 'K', 'dp_type',
       'anonymized_text', 'Custom_ID', 'Grammar', 'Common Sense', 'Coherence',
       'Cohesiveness'],
      dtype='object')

In [61]:
filtered_df.shape[0]

9237

In [62]:
# Group by the specified columns and count the number of records in each group
grouped_df = filtered_df.groupby(['loc_and_org', 'epsilon', 'num_cluster', 'K', 'dp_type']).size().reset_index(name='counts')

# Iterate over the grouped dataframe and print each combination and the number of records
for index, row in grouped_df.iterrows():
    combination = (row['loc_and_org'], row['epsilon'], row['num_cluster'], row['K'], row['dp_type'])
    count = row['counts']
    print(f"Combination: {combination}, Number of records: {count}")

# Check if all combinations have the same number of records
if grouped_df['counts'].nunique() == 1:
    print("All combinations have the same number of records.")
else:
    print("Different combinations have different numbers of records.")

Combination: (True, 0.1, 1, 1, 'metric'), Number of records: 48
Combination: (True, 0.1, 1, 8, 'metric'), Number of records: 45
Combination: (True, 0.1, 1, 16, 'metric'), Number of records: 43
Combination: (True, 0.1, 1, 32, 'metric'), Number of records: 43
Combination: (True, 0.1, 1, 64, 'metric'), Number of records: 44
Combination: (True, 0.1, 40, 1, 'metric'), Number of records: 45
Combination: (True, 0.1, 40, 8, 'metric'), Number of records: 47
Combination: (True, 0.1, 40, 16, 'metric'), Number of records: 48
Combination: (True, 0.1, 40, 32, 'metric'), Number of records: 46
Combination: (True, 0.1, 40, 64, 'metric'), Number of records: 45
Combination: (True, 0.1, 40, 128, 'metric'), Number of records: 48
Combination: (True, 0.1, 180, 1, 'metric'), Number of records: 48
Combination: (True, 0.1, 180, 8, 'metric'), Number of records: 46
Combination: (True, 0.1, 180, 16, 'metric'), Number of records: 50
Combination: (True, 0.1, 180, 32, 'metric'), Number of records: 45
Combination: (Tr

In [63]:
# Replace the None values with NaN for proper handling
filtered_df[['Grammar', 'Common Sense', 'Coherence', 'Cohesiveness']] = filtered_df[['Grammar', 'Common Sense', 'Coherence', 'Cohesiveness']].apply(pd.to_numeric, errors='coerce')

# Group by the specified columns and calculate the average for the specified columns
grouped_averages = filtered_df.groupby(['loc_and_org', 'epsilon', 'num_cluster', 'K', 'dp_type']).agg({
    'Grammar': 'mean',
    'Common Sense': 'mean',
    'Coherence': 'mean',
    'Cohesiveness': 'mean'
}).reset_index()

# Iterate over the grouped dataframe and print each combination and the average values
for index, row in grouped_averages.iterrows():
    combination = (row['loc_and_org'], row['epsilon'], row['num_cluster'], row['K'], row['dp_type'])
    grammar_avg = row['Grammar']
    common_sense_avg = row['Common Sense']
    coherence_avg = row['Coherence']
    cohesiveness_avg = row['Cohesiveness']
    print(f"Combination: {combination}, Averages -> Grammar: {grammar_avg}, Common Sense: {common_sense_avg}, Coherence: {coherence_avg}, Cohesiveness: {cohesiveness_avg}")

Combination: (True, 0.1, 1, 1, 'metric'), Averages -> Grammar: 3.1875, Common Sense: 2.382978723404255, Coherence: 2.5625, Cohesiveness: 2.5
Combination: (True, 0.1, 1, 8, 'metric'), Averages -> Grammar: 3.3555555555555556, Common Sense: 2.4545454545454546, Coherence: 2.6444444444444444, Cohesiveness: 2.5555555555555554
Combination: (True, 0.1, 1, 16, 'metric'), Averages -> Grammar: 3.302325581395349, Common Sense: 2.4651162790697674, Coherence: 2.5348837209302326, Cohesiveness: 2.4186046511627906
Combination: (True, 0.1, 1, 32, 'metric'), Averages -> Grammar: 3.3255813953488373, Common Sense: 2.441860465116279, Coherence: 2.6744186046511627, Cohesiveness: 2.5813953488372094
Combination: (True, 0.1, 1, 64, 'metric'), Averages -> Grammar: 3.227272727272727, Common Sense: 2.4545454545454546, Coherence: 2.5454545454545454, Cohesiveness: 2.5681818181818183
Combination: (True, 0.1, 40, 1, 'metric'), Averages -> Grammar: 3.4, Common Sense: 2.5348837209302326, Coherence: 2.6222222222222222, C

In [64]:
# Save the results to a CSV file
grouped_averages.to_csv('grouped_averages.csv', index=False)

In [65]:
# Replace the None values with NaN for proper handling
filtered_df[['Grammar', 'Common Sense', 'Coherence', 'Cohesiveness']] = filtered_df[['Grammar', 'Common Sense', 'Coherence', 'Cohesiveness']].apply(pd.to_numeric, errors='coerce')

# Group by the specified columns and calculate the average for the specified columns
grouped_averages = filtered_df.groupby(['loc_and_org', 'epsilon', 'num_cluster', 'K', 'dp_type']).agg({
    'Grammar': 'mean',
    'Common Sense': 'mean',
    'Coherence': 'mean',
    'Cohesiveness': 'mean'
}).reset_index()

# Pivot the data to get the desired format
pivot_df = grouped_averages.pivot_table(
    index=['loc_and_org', 'K'],
    columns=['epsilon', 'num_cluster'],
    values=['Grammar', 'Common Sense', 'Coherence', 'Cohesiveness']
)

# Flatten the multi-level columns
pivot_df.columns = ['_'.join(map(str, col)).strip() for col in pivot_df.columns.values]

# Reset the index to flatten the dataframe
pivot_df = pivot_df.reset_index()

# Save the resulting dataframe to a CSV file
pivot_df.to_csv('formatted_grouped_averages.csv', index=False)

print("Results have been saved to 'formatted_grouped_averages.csv'.")

Results have been saved to 'formatted_grouped_averages.csv'.
