In [None]:
import pandas as pd
import re

# File paths
input_file_path = '/content/generated_examples (2).txt'
output_file_path = '/content/augmented_train_subtask2.csv'

# Read the text file. The encoding is spelled out so the file is decoded the same
# way everywhere instead of falling back to the platform's locale default.
with open(input_file_path, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Parallel lists: one domain and one joined text per flushed section
domains = []
text_pairs = []

# State of the section currently being read
current_domain = None
current_pairs = []

# Process each line in the file
for line in lines:
    line = line.strip()

    if line.startswith('</s>'):
        # '</s>' closes a section: store what was collected, but only when a
        # domain is known and at least one text line was seen.
        # NOTE(review): current_domain is kept after the flush, so a following
        # section without its own 'in the domain of' line reuses it, and lines
        # collected before any domain is known stay in current_pairs - confirm
        # this matches the input format.
        if current_domain and current_pairs:
            domains.append(current_domain)
            text_pairs.append(' '.join(current_pairs))
            current_pairs = []
    elif line.startswith('in the domain of'):
        # Drop the 'in the domain of' phrase; what remains is the domain
        current_domain = line.replace('in the domain of', '').strip()
    elif line:
        # Any other non-empty line is part of the section's text
        current_pairs.append(line)

# Flush whatever is still pending after the last line
if current_domain and current_pairs:
    domains.append(current_domain)
    text_pairs.append(' '.join(current_pairs))

# Create a DataFrame from the extracted data.
# NOTE(review): the domain strings are stored under the column name 'num_rs' -
# confirm that this name is intended.
df = pd.DataFrame({
    'num_rs': domains,
    'text_w_pairs': text_pairs
})

# Write the CSV with 'text_w_pairs' first and 'num_rs' second, without the index
df.to_csv(output_file_path, index=False, columns=['text_w_pairs', 'num_rs'])

print(f"Processed data has been saved to {output_file_path}")

Processed data has been saved to /content/augmented_train_subtask2.csv


In [None]:
import pandas as pd

# Read the CSV file that this cell rewrites in place
df = pd.read_csv("/content/augmented_train_subtask2.csv")

# Three counters advance together, one step per row, starting at 100, 1000 and 10000
row_count = len(df)
index_numbers = range(100, 100 + row_count)
doc_numbers = range(1000, 1000 + row_count)
sent_numbers = range(10000, 10000 + row_count)

# Add (or overwrite) the id columns, then keep the columns in the required order:
#   sent_id - 10000, 10001, ...
#   corpus  - constant 'cnc'
#   eg_id   - constant 0
#   doc_id  - 'train_1000_10000', 'train_1001_10001', ...
#   index   - 'cnc_train_100_1000_10000_10', 'cnc_train_101_1001_10001_10', ...
df = df.assign(
    sent_id=sent_numbers,
    corpus='cnc',
    eg_id=0,
    doc_id=[f'train_{i}_{j}' for i, j in zip(doc_numbers, sent_numbers)],
    index=[f'cnc_train_{i}_{j}_{k}_10' for i, j, k in zip(index_numbers, doc_numbers, sent_numbers)],
)[['corpus', 'doc_id', 'sent_id', 'eg_id', 'index', 'text_w_pairs', 'num_rs']]

# Write the result back to the same CSV file, without the DataFrame index
df.to_csv('/content/augmented_train_subtask2.csv', index=None)

In [None]:
import pandas as pd
import re

# Read the CSV file that this cell rewrites in place
df = pd.read_csv("/content/augmented_train_subtask2.csv")

# Build the 'text' column from 'text_w_pairs': the inner substitution removes the
# opening/closing ARG0, ARG1 and SIG0 tags, the outer one removes square brackets
df['text'] = df['text_w_pairs'].apply(
    lambda tagged: re.sub(r'[\[\]]', '', re.sub(r'<(/?ARG[01]|/?SIG0)>', '', tagged))
)

# Keep the columns in the required order, with 'text' placed before 'text_w_pairs'
column_order = ['corpus', 'doc_id', 'sent_id', 'eg_id', 'index', 'text', 'text_w_pairs', 'num_rs']
df = df[column_order]

# Write the result back to the same CSV file, without the DataFrame index
df.to_csv('/content/augmented_train_subtask2.csv', index=None)

In [None]:
# Overwrite 'num_rs' with a string flag: '0' when the 'text_w_pairs' value is falsy
# (for example an empty string), '1' otherwise
df['num_rs'] = df['text_w_pairs'].apply(lambda tagged: '0' if not tagged else '1')

In [None]:
# Load the training CSV file into df2
df2 = pd.read_csv(filepath_or_buffer='/content/train_subtask2.csv')

In [None]:
# Stack the rows of df below the rows of df2 and renumber the index from 0
df3 = pd.concat([df2, df], ignore_index=True)

# Save the combined DataFrame to a new CSV file.
# DataFrame.to_csv returns None when it is given a path, so the destination is
# kept in its own variable; otherwise the message below would report "None".
output_csv_path = '/content/train_subtask2_augmented.csv'
df3.to_csv(output_csv_path, index=False)

print(f"Processed data has been saved to {output_csv_path}")

Processed data has been saved to None


In [None]:
import pandas as pd
import re

# Load the CSV file to understand its structure
csv_file_path = '/content/train_subtask2.csv'
csv_data = pd.read_csv(csv_file_path)

# Load the whole text file into one string.
# The encoding is spelled out so the file is decoded the same way everywhere
# instead of falling back to the platform's locale default.
text_file_path = '/content/generated_examples (2).txt'

with open(text_file_path, 'r', encoding='utf-8') as file:
    text_data = file.read()

# Define a function to parse the text file and extract data
def parse_text_data(text):
    """Extract one row per ARG0/SIG0/ARG1 triple from '</s>'-delimited text.

    The text is cut at every '</s>'. A section is used only if it is non-empty
    after stripping and contains 'in the domain of <name>'; the rest of that line
    is the domain. Used sections are numbered 1, 2, ... and the number is appended
    to the lower-cased, underscore-joined domain to form the doc id.

    Inside a used section, every '<ARG0>..</ARG0> <SIG0>..</SIG0> <ARG1>..</ARG1>'
    sequence (tags in exactly this order, separated only by whitespace) yields a row.

    Args:
        text: Full contents of the input file as a single string.

    Returns:
        A list of rows, each a list
        [domain, doc_id, sent_id, eg_id, text_w_pairs, seq_label, pair_label,
        context, num_sents]. sent_id counts the triples of a section from 1 and
        eg_id is sent_id - 1. text_w_pairs is the triple re-assembled with the
        inner text stripped and single spaces between the tags. seq_label and
        pair_label are the placeholder 1, context is None and num_sents is 1.
    """
    domain_pattern = re.compile(r'in the domain of (.+)')
    event_pattern = re.compile(
        r'<ARG0>(.*?)</ARG0>\s*<SIG0>(.*?)</SIG0>\s*<ARG1>(.*?)</ARG1>', re.DOTALL
    )

    rows = []
    doc_number = 0

    for chunk in text.split('</s>'):
        chunk = chunk.strip()
        found = domain_pattern.search(chunk) if chunk else None
        if found is None:
            # Empty section or no domain line: nothing to extract
            continue

        # Only sections that carry a domain consume a document number
        doc_number += 1
        domain = found.group(1).strip()
        doc_id = f"{domain.replace(' ', '_').lower()}_{doc_number}"

        for eg_id, parts in enumerate(event_pattern.findall(chunk)):
            cause, signal, effect = (part.strip() for part in parts)
            tagged = f"<ARG0>{cause}</ARG0> <SIG0>{signal}</SIG0> <ARG1>{effect}</ARG1>"
            # Labels are placeholders (1) because the input carries no labels
            rows.append([domain, doc_id, eg_id + 1, eg_id, tagged, 1, 1, None, 1])

    return rows

# Turn the raw text into a list of rows
parsed_data = parse_text_data(text_data)

# Destination of the CSV file
output_csv_path = '/content/augmented_train_subtask2.csv'

# Column names, in the same order as the values inside each parsed row
columns = [
    'corpus', 'doc_id', 'sent_id', 'eg_id', 'text_w_pairs',
    'seq_label', 'pair_label', 'context', 'num_sents',
]

# Build the DataFrame and write it out without the index
df = pd.DataFrame(parsed_data, columns=columns)
df.to_csv(output_csv_path, index=False)

# Show the destination path as the cell result
output_csv_path

'/content/augmented_train_subtask2.csv'

In [None]:
import pandas as pd

# Read the CSV file that this cell rewrites in place
df = pd.read_csv("/content/augmented_train_subtask2.csv")

# Three counters advance together, one step per row, starting at 100, 1000 and 10000
row_count = len(df)
index_numbers = range(100, 100 + row_count)
doc_numbers = range(1000, 1000 + row_count)
sent_numbers = range(10000, 10000 + row_count)

# Overwrite the id columns, then keep the columns in the required order:
#   sent_id - 10000, 10001, ...
#   corpus  - constant 'cnc'
#   eg_id   - constant 0
#   doc_id  - 'train_1000_10000', 'train_1001_10001', ...
#   index   - 'cnc_train_100_1000_10000_10', 'cnc_train_101_1001_10001_10', ...
df = df.assign(
    sent_id=sent_numbers,
    corpus='cnc',
    eg_id=0,
    doc_id=[f'train_{i}_{j}' for i, j in zip(doc_numbers, sent_numbers)],
    index=[f'cnc_train_{i}_{j}_{k}_10' for i, j, k in zip(index_numbers, doc_numbers, sent_numbers)],
)[['corpus', 'doc_id', 'sent_id', 'eg_id','index', 'text_w_pairs', 'seq_label', 'pair_label', 'context', 'num_sents']]

# Write the result back to the same CSV file, without the DataFrame index
df.to_csv('/content/augmented_train_subtask2.csv', index=None)

In [None]:
import pandas as pd
import re

# Read the CSV file that this cell rewrites in place
df = pd.read_csv("/content/augmented_train_subtask2.csv")

# Build the 'text' column from 'text_w_pairs': the inner substitution removes the
# opening/closing ARG0, ARG1 and SIG0 tags, the outer one removes square brackets
df['text'] = df['text_w_pairs'].apply(
    lambda tagged: re.sub(r'[\[\]]', '', re.sub(r'<(/?ARG[01]|/?SIG0)>', '', tagged))
)

# Keep the columns in the required order, with 'text' placed before 'text_w_pairs'
column_order = ['corpus', 'doc_id', 'sent_id', 'eg_id', 'index', 'text', 'text_w_pairs', 'seq_label', 'pair_label', 'context', 'num_sents']
df = df[column_order]

# Write the result back to the same CSV file, without the DataFrame index
df.to_csv('/content/augmented_train_subtask2.csv', index=None)

In [None]:
# Load the training CSV file into df2
df2 = pd.read_csv(filepath_or_buffer='/content/train_subtask2.csv')

In [None]:
# Stack the rows of df below the rows of df2 and renumber the index from 0
df3 = pd.concat([df2, df], ignore_index=True)

# Save the combined DataFrame to a new CSV file.
# DataFrame.to_csv returns None when it is given a path, so the destination is
# kept in its own variable; otherwise the message below would report "None".
output_csv_path = '/content/combined_data.csv'
df3.to_csv(output_csv_path, index=False)

print(f"Processed data has been saved to {output_csv_path}")

Processed data has been saved to None
