In [14]:
import pandas as pd
import re

# File paths
input_file_path = '/data/arselane/cncsharedtask/subtask2/data/generated_examples (2).txt'
output_file_path = '/data/arselane/cncsharedtask/subtask2/data/augmented_train_subtask2.csv'

# Read the text file
with open(input_file_path, 'r') as file:
    lines = file.readlines()

# Initialize lists to store the extracted data
domains = []
causal_text_pairs = []

# Variables to hold current domain and causal pairs
current_domain = None
current_pairs = []

# Process each line in the file
for line in lines:
    line = line.strip()
    
    if line.startswith('</s>'):
        # If we reach a new domain section, save the previous data if any
        if current_domain and current_pairs:
            domains.append(current_domain)
            causal_text_pairs.append(' '.join(current_pairs))
            current_pairs = []
    elif line.startswith('in the domain of'):
        # Extract the domain
        current_domain = line.replace('in the domain of', '').strip()
    elif line:
        # Collect the causal text pairs
        current_pairs.append(line)

# Add the last collected domain and pairs if any
if current_domain and current_pairs:
    domains.append(current_domain)
    causal_text_pairs.append(' '.join(current_pairs))

# Create a DataFrame from the extracted data
df = pd.DataFrame({
    'num_rs': domains,
    'causal_text_w_pairs': causal_text_pairs
})

# Save the DataFrame to a new CSV file with swapped columns
df.to_csv(output_file_path, index=False, columns=['causal_text_w_pairs', 'num_rs'])

print(f"Processed data has been saved to {output_file_path}")

Processed data has been saved to /data/arselane/cncsharedtask/subtask2/data/augmented_train_subtask2.csv


In [15]:
import pandas as pd

# Reading the given CSV file
df = pd.read_csv("/data/arselane/cncsharedtask/subtask2/data/augmented_train_subtask2.csv")

# Adding a new column 'sent_id' starting from 10000 and incrementing by 1
df['sent_id'] = range(10000, 10000 + len(df))

# Adding a new column 'corpus' with constant value 'cnc'
df['corpus'] = 'cnc'

# Adding a new column 'eg_id' with constant value 0
df['eg_id'] = 0

# Adding a new column 'doc_id' with values 'train_1000_10000' where 'train_' is constant, and the other two values are incremented by 1
df['doc_id'] = [f'train_{i}_{j}' for i, j in zip(range(1000, 1000 + len(df)), range(10000, 10000 + len(df)))]

# Adding a new column 'index' with values based on variables i, j, k
df['index'] = [f'cnc_train_{i}_{j}_{k}_10' for i, j, k in zip(range(100, 100 + len(df)), range(1000, 1000 + len(df)), range(10000, 10000 + len(df)))]

# Reordering columns
df = df[['corpus', 'doc_id', 'sent_id', 'eg_id', 'index', 'causal_text_w_pairs', 'num_rs']]

# Store the DataFrame into  CSV file with the added 'sent_id' column
df.to_csv('/data/arselane/cncsharedtask/subtask2/data/augmented_train_subtask2.csv', index=None)

In [16]:
import pandas as pd
import re

# Reading the given CSV file
df = pd.read_csv("/data/arselane/cncsharedtask/subtask2/data/augmented_train_subtask2.csv")

# Adding a new column 'text' with the same data as 'causal_text_w_pairs' but without <arg1>, <arg0>, <sig0> tags
df['text'] = df['causal_text_w_pairs'].apply(lambda x: re.sub(r'<(/?ARG[01]|/?SIG0)>', '', x))

# Remove square brackets from the 'text' column sentences
df['text'] = df['text'].apply(lambda x: re.sub(r'[\[\]]', '', x))
# Reordering columns
df = df[['corpus', 'doc_id', 'sent_id', 'eg_id', 'index', 'text', 'causal_text_w_pairs', 'num_rs']]

# Store the DataFrame into a CSV file with the added columns
df.to_csv('/data/arselane/cncsharedtask/subtask2/data/augmented_train_subtask2.csv', index=None)

In [18]:
# Add the 'has_causality' column
df['has_causality'] = df['causal_text_w_pairs'].apply(lambda x: '1' if x else '0')

In [19]:
df2 = pd.read_csv('/data/arselane/cncsharedtask/subtask2/data/CNC_chatgpt_output.csv')

In [21]:
df3 = pd.concat([df2, df], ignore_index=True)

# Save the combined DataFrame to a new CSV file
output_csv_path = df3.to_csv('/data/arselane/cncsharedtask/subtask2/data/train_subtask2_augmented_roberta.csv', index=False)

print(f"Processed data has been saved to {output_csv_path}")

Processed data has been saved to None
