In [1]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip3 install autoawq

Collecting autoawq
  Downloading autoawq-0.2.5-cp310-cp310-manylinux2014_x86_64.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.3/84.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate (from autoawq)
  Downloading accelerate-0.30.0-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.4/302.4 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets (from autoawq)
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting zstandard (from autoawq)
  Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m25.7 MB/s[

In [3]:
# Clone the AutoAWQ repository into ./AutoAWQ.
# NOTE(review): the package is already installed with pip in an earlier cell — confirm the checkout is still needed.
!git clone https://github.com/casper-hansen/AutoAWQ

Cloning into 'AutoAWQ'...
remote: Enumerating objects: 3065, done.[K
remote: Counting objects: 100% (1578/1578), done.[K
remote: Compressing objects: 100% (725/725), done.[K
remote: Total 3065 (delta 1146), reused 1054 (delta 833), pack-reused 1487[K
Receiving objects: 100% (3065/3065), 7.52 MiB | 20.82 MiB/s, done.
Resolving deltas: 100% (1905/1905), done.


In [4]:
# NOTE(review): `!cd` runs in a throwaway subshell, so this line does not change the
# notebook's working directory (`%cd AutoAWQ` would). Later cells read files under
# /content, so confirm which behaviour is intended before switching.
!cd AutoAWQ

In [5]:
def extract_causal_text(lines):
    """
    Collect the line that follows every "LLM output:" marker.

    Parameters:
    - lines (list[str]): Lines of generated text, in order.

    Returns:
    - list[str]: For each line containing "LLM output:", the line right after it.
      A marker on the very last line has nothing after it and is skipped instead
      of raising IndexError.
    """
    causal_text = []
    i = 0
    while i < len(lines):
        if "LLM output:" in lines[i] and i + 1 < len(lines):
            causal_text.append(lines[i + 1])
            # The captured line is consumed, so it is never inspected as a marker itself.
            i += 2
        else:
            i += 1
    return causal_text

In [6]:
def conv_s2(sentence, cause, effect, signal):
    """
    Render a worked example (sentence plus its cause, effect and signal) as prompt text.

    Parameters:
    - sentence (str): The full example sentence.
    - cause (str): Text shown on the "cause" line.
    - effect (str): Text shown on the "effect" line.
    - signal (str): Text shown on the "signal" line.

    Returns:
    - str: Five newline-terminated lines: a fixed heading, the sentence, then the
      cause, effect and signal, each behind a fixed label.
    """
    rendered_lines = [
        "Here is an example of the tags used in the appropriate scheme:",
        f"{sentence}",
        f"Where the cause in the scheme is: {cause}",
        f"The effect in the scheme is: {effect}",
        f"The signal in the scheme is: {signal}",
    ]
    return "".join(rendered + "\n" for rendered in rendered_lines)

In [7]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Load the tokenizer and the AWQ-quantized model.
model_name_or_path = "TheBloke/zephyr-7B-beta-AWQ"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True,
                                          trust_remote_code=False, safetensors=True)

# Domains to generate examples for; one generation call per domain.
domains = ['economic events', 'social events', 'political unrest', 'elections', 'natural disasters',
           'transport accidents', 'crimes', 'industrial accidents', 'international relations',
           'home policy', 'MUC-like events', 'KBP-like events']

# Request template
request = "generate me 5 causal examples"

# Definition of tags
definition = """Below are the definitions for the tags in the sentence:
  Cause: The reason for an event happening, to be enclosed between <ARG0> and </ARG0>.
  Effect: The event that occurs due to the cause, to be enclosed between <ARG1> and </ARG1>.
  Signal: Words that transition the cause to the effect, to be enclosed between <SIG0> and </SIG0>.

  Please generate causal sentences within this domain."""

# Example based on an entry of the original training dataset
example_sentence = "A decision by the United Nations to impose economic sanctions on a country led to a decrease in the country's economic activity and standard of living"
example_cause = "<ARG0> A decision by the United Nations to impose economic sanctions on a country."
example_signal = "<SIG0> Led to."
example_effect = "<ARG1> A decrease in the country's economic activity and standard of living."

# Generated examples are written to this text file, one blank-line-separated entry per domain.
file_path = "generated_text.txt"

with open(file_path, 'w') as file:
    for d in domains:
        # conv_s2 takes (sentence, cause, effect, signal); the effect and signal were
        # previously passed in swapped positions, which mislabelled them in the prompt.
        prompt_template = f'''
        </s>
        {request} in the domain of {d}

        {definition}

        {conv_s2(example_sentence, example_cause, example_effect, example_signal)}
        '''

        # Tokenize prompt
        token_input = tokenizer(
            prompt_template,
            return_tensors='pt'
        ).input_ids.cuda()

        # Generate causal examples
        generation_output = model.generate(
            token_input,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            top_k=40,
            max_new_tokens=1024
        )

        # The returned sequence starts with the prompt tokens, so slice them off and
        # decode only the continuation. Removing the prompt with str.replace on the
        # decoded text does not work: the decoded text carries special tokens, and once
        # `request`/`definition` were replaced the full template could no longer match.
        prompt_length = token_input.shape[1]
        new_tokens = generation_output[0][prompt_length:]
        text_output = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

        # Write to file
        file.write(f"{text_output}\n\n")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/828 [00:00<?, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

all_results.json:   0%|          | 0.00/728 [00:00<?, ?B/s]

eval_results.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

train_results.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

quant_config.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/32.8k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

Replacing layers...: 100%|██████████| 32/32 [00:08<00:00,  3.98it/s]
Fusing layers...: 100%|██████████| 32/32 [00:01<00:00, 23.14it/s]


In [12]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Load the tokenizer and the AWQ-quantized model.
# NOTE(review): an earlier cell loads the same checkpoint; reloading holds a second
# copy on the GPU until the old one is collected — confirm the reload is wanted.
model_name_or_path = "TheBloke/zephyr-7B-beta-AWQ"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True,
                                          trust_remote_code=False, safetensors=True)

# Domains to generate examples for; one generation call per domain.
domains = ['economic events', 'social events', 'political unrest', 'elections', 'natural disasters',
           'transport accidents', 'crimes', 'industrial accidents', 'international relations',
           'home policy', 'MUC-like events', 'KBP-like events']

# Request template
request = "generate me 5 causal examples"

# Definition of tags
definition = """Below are the definitions for the tags in the sentence:
  Cause: The reason for an event happening, to be enclosed between <ARG0> and </ARG0>.
  Effect: The event that occurs due to the cause, to be enclosed between <ARG1> and </ARG1>.
  Signal: Words that transition the cause to the effect, to be enclosed between <SIG0> and </SIG0>.

  Please generate causal sentences within this domain."""

# Example based on an entry of the original training dataset
example_sentence = "A decision by the United Nations to impose economic sanctions on a country led to a decrease in the country's economic activity and standard of living"
example_cause = "<ARG0> A decision by the United Nations to impose economic sanctions on a country."
example_signal = "<SIG0> Led to."
example_effect = "<ARG1> A decrease in the country's economic activity and standard of living."


def format_generated_text(generated_text):
    """
    Wrap generated text between the example's cause/signal and the example's effect.

    Parameters:
    - generated_text (str): Text to place in the middle.

    Returns:
    - str: "<example_cause> <example_signal> <generated_text> <example_effect>".

    NOTE(review): this surrounds every model output with the fixed example pieces —
    confirm that is the intended file format.
    """
    return f"{example_cause} {example_signal} {generated_text} {example_effect}"


# Formatted outputs are written to this text file, one blank-line-separated entry per domain.
file_path = "generated_text2.txt"

with open(file_path, 'w') as file:
    for d in domains:
        # conv_s2 takes (sentence, cause, effect, signal); the effect and signal were
        # previously passed in swapped positions, which mislabelled them in the prompt.
        prompt_template = f'''
        </s>
        {request} in the domain of {d}

        {definition}

        {conv_s2(example_sentence, example_cause, example_effect, example_signal)}
        '''

        # Tokenize prompt
        token_input = tokenizer(
            prompt_template,
            return_tensors='pt'
        ).input_ids.cuda()

        # Generate causal examples
        generation_output = model.generate(
            token_input,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            top_k=40,
            max_new_tokens=1024
        )

        # The returned sequence starts with the prompt tokens; slice them off so only
        # the continuation is decoded. The former str.replace-based prompt removal
        # could not match the decoded text reliably.
        prompt_length = token_input.shape[1]
        new_tokens = generation_output[0][prompt_length:]
        generated_text = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
        formatted_text = format_generated_text(generated_text).strip()

        # Write to file
        file.write(f"{formatted_text}\n\n")

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 32/32 [00:08<00:00,  3.80it/s]
Fusing layers...: 100%|██████████| 32/32 [00:00<00:00, 70.49it/s]


In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Load the tokenizer and the AWQ-quantized model.
model_name_or_path = "TheBloke/zephyr-7B-beta-AWQ"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True,
                                          trust_remote_code=False, safetensors=True)

# Domains to generate examples for; one generation call per domain.
domains = ['economic events', 'social events', 'political unrest', 'elections', 'natural disasters',
           'transport accidents', 'crimes', 'industrial accidents', 'international relations',
           'home policy', 'MUC-like events', 'KBP-like events']

# Define request and definition (closing sentence now matches the wording used in the other cells).
request = "generate me 5 causal examples"
definition = """Below are the definitions for the tags in the sentence:
  Cause: The reason for an event happening, to be enclosed between <ARG0> and </ARG0>.
  Effect: The event that occurs due to the cause, to be enclosed between <ARG1> and </ARG1>.
  Signal: Words that transition the cause to the effect, to be enclosed between <SIG0> and </SIG0>.

  Please generate causal sentences within this domain."""

# Each domain name is written on its own line, followed by the decoded model output.
file_path = "generated_examples.txt"
# NOTE(review): these two counters are never read or updated below — the loop makes
# exactly one generation call per domain. Kept in case another cell relies on them.
target_outputs = 500
generated_count = 0
with open(file_path, 'w') as file:
    for d in domains:
        prompt_template = f'''</s>

        {request} in the domain of {d} {definition}

        '''

        # Tokenize prompt
        token_input = tokenizer(
            prompt_template,
            return_tensors='pt'
        ).input_ids.cuda()

        # Generate causal examples
        generation_output = model.generate(
            token_input,
            do_sample=True,
            temperature=0.7,
            top_p=0.95,
            top_k=40,
            max_new_tokens=512
        )

        # Decode the full returned sequence (prompt included). The previous chain of
        # .replace() calls swapped each tag for itself and has been removed as a no-op.
        text_output = tokenizer.decode(generation_output[0])

        # Write to file
        file.write(d + "\n")
        file.write(text_output + "\n")

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

Replacing layers...: 100%|██████████| 32/32 [00:09<00:00,  3.40it/s]
Fusing layers...: 100%|██████████| 32/32 [00:00<00:00, 90.63it/s]


In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Load the tokenizer and the AWQ-quantized model.
model_name_or_path = "TheBloke/zephyr-7B-beta-AWQ"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=False)
model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True,
                                          trust_remote_code=False, safetensors=True)

# Define domains
domains = ['economic events', 'social events', 'political unrest']

# Request template. Defined here so the cell no longer depends on a value left
# behind by an earlier cell; same text as used in the other generation cells.
request = "generate me 5 causal examples"

# Definition of tags
definition = """Below are the definitions for the tags in the sentence:
  Cause: The reason for an event happening, to be enclosed between <ARG0> and </ARG0>.
  Effect: The event that occurs due to the cause, to be enclosed between <ARG1> and </ARG1>.
  Signal: Words that transition the cause to the effect, to be enclosed between <SIG0> and </SIG0>.

  Please generate causal sentences within this domain."""

# Generate augmented content for each domain
for d in domains:
    # Prompt template
    prompt_template = f'''
    </s>
    {request} in the domain of {d}

    {definition}
    '''

    # Tokenize prompt
    token_input = tokenizer(
        prompt_template,
        return_tensors='pt'
    ).input_ids.cuda()

    # Generate causal examples
    generation_output = model.generate(
        token_input,
        do_sample=True,
        temperature=0.7,
        top_p=0.95,
        top_k=40,
        max_new_tokens=1024
    )

    # Decode the full returned sequence (prompt included).
    generated_text = tokenizer.decode(generation_output[0])

    # Print the augmented content
    print(f"Domain: {d}\n{generated_text}\n")

Load the text file and concatenate the examples into a CSV file.


In [10]:
import csv

def append_to_csv_file(filename, data):
    """
    Append text lines to a CSV file, one single-column row per non-blank line.

    Parameters:
    - filename (str): The name of the CSV file; it is opened in append mode.
    - data (list): Strings to append. Each one is stripped of surrounding
      whitespace; entries that are empty after stripping are skipped so blank
      separator lines do not turn into empty rows.

    Errors are reported on stdout rather than raised, as before.
    """
    try:
        # 'a' keeps existing rows; newline='' lets the csv module control line endings.
        with open(filename, 'a', newline='') as csvfile:
            csv_writer = csv.writer(csvfile)

            for line in data:
                stripped = line.strip()
                if stripped:
                    csv_writer.writerow([stripped])

        print(f'Data appended to {filename} successfully.')
    except Exception as e:
        print(f'Error: {e}')

# Input: the generated examples text file. Output: the CSV that receives its lines.
text_file_path = "/content/generated_text.txt"
csv_file_path = "/content/generated_examples.csv"

# Pull the text file into memory as a list of lines (line endings kept).
with open(text_file_path, 'r') as file:
    text_content = list(file)

# Hand every line to the CSV appender.
append_to_csv_file(csv_file_path, text_content)

Data appended to /content/generated_examples.csv successfully.


In [10]:
import pandas as pd

file_path = "/content/generated_text.txt"
# Load the generated text, one list entry per line of the file.
with open(file_path, 'r') as file:
    generated_text = file.readlines()

# Markers deleted from the 'text' column: sequence delimiters and the causal tags.
MARKERS_TO_REMOVE = ("<s>", "</s>", "<ARG0>", "</ARG0>", "<ARG1>", "</ARG1>", "<SIG0>", "</SIG0>")
OUTPUT_COLUMNS = ['corpus', 'doc_id', 'sent_id', 'eg_id', 'index', 'text', 'causal_text', 'num_rs']


def remove_markers(line):
    """Return `line` with every marker in MARKERS_TO_REMOVE deleted and outer whitespace stripped."""
    for marker in MARKERS_TO_REMOVE:
        line = line.replace(marker, "")
    return line.strip()


# Build one record per line that still has content after cleaning. Blank separator
# lines and marker-only lines used to become rows with an empty 'text'; they are
# skipped now, and 'causal_text' no longer carries the trailing newline.
records = []
for raw_line in generated_text:
    cleaned_line = remove_markers(raw_line)
    if not cleaned_line:
        continue
    position = len(records)
    records.append({
        'corpus': "Generated",
        'doc_id': "1",             # all lines are treated as one document
        'sent_id': position + 1,   # 1-based position among the kept lines
        'eg_id': "1",              # all lines are treated as one example group
        'index': position,
        'text': cleaned_line,      # markers removed
        'causal_text': raw_line.strip(),  # original line, tags kept
        'num_rs': 0,
    })

# Passing the column list keeps the header stable even when no line survives.
df = pd.DataFrame(records, columns=OUTPUT_COLUMNS)

# Save the DataFrame to a CSV file
df.to_csv('generated_data_cleaned.csv', index=False)

In [None]:
import csv
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def extract_causal_relations(sentence):
    """
    Placeholder for the causal-relation extractor.

    The argument is ignored and a fresh empty list is returned on every call;
    a real extraction method still has to be plugged in here.
    """
    no_relations = list()
    return no_relations

def write_to_csv(data, output_filename):
    """
    Write row dictionaries to a CSV file with a fixed header.

    Parameters:
    - data: Iterable of dicts keyed by the column names listed below.
    - output_filename (str): Destination path; an existing file is overwritten.
    """
    columns = ['corpus', 'doc_id', 'sent_id', 'eg_id', 'index', 'text', 'causal_text', 'num_rs']
    with open(output_filename, 'w', newline='') as csvfile:
        dict_writer = csv.DictWriter(csvfile, fieldnames=columns)
        dict_writer.writeheader()
        dict_writer.writerows(data)

# Read the whole causal text file into one string.
with open('/content/domain.txt', 'r') as file:
    causal_text = file.read()

# Split the text into sentences with NLTK.
sentences = sent_tokenize(causal_text)

# One CSV row per (sentence, extracted relation). Both sent_id and eg_id count
# from 1; eg_id restarts for every sentence.
csv_data = [
    {
        'corpus': 'Your Corpus Name',
        'doc_id': 'Your Document Name',
        'sent_id': sent_id,
        'eg_id': eg_id,
        'index': f'{sent_id}_{eg_id}',
        'text': sentence,
        'causal_text': causal_text_w_pairs,
        'num_rs': len(causal_text_w_pairs),
    }
    for sent_id, sentence in enumerate(sentences, start=1)
    for eg_id, causal_text_w_pairs in enumerate(extract_causal_relations(sentence), start=1)
]

# Write data to CSV file
write_to_csv(csv_data, '/content/train_subtask2_grouped.csv')

print("CSV file saved successfully.")

In [None]:
import csv
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def extract_causal_relations(sentence):
    """
    Stub extractor: takes a sentence and reports no causal relations.

    Redefines the function of the same name from the previous cell with the same
    behaviour; the argument is unused until a real method is implemented.
    """
    return list()

def write_to_csv(data, output_filename):
    """
    Dump row dictionaries to `output_filename` as CSV, header first.

    Parameters:
    - data: Iterable of dicts keyed by the column names listed below.
    - output_filename (str): Destination path; an existing file is overwritten.
    """
    header = ['corpus', 'doc_id', 'sent_id', 'eg_id', 'index', 'text', 'causal_text', 'num_rs']
    with open(output_filename, 'w', newline='') as csvfile:
        out = csv.DictWriter(csvfile, fieldnames=header)
        out.writeheader()
        for record in data:
            out.writerow(record)

# Read the causal text file in one go.
with open('/content/domain.txt', 'r') as file:
    causal_text = file.read()

# Sentence-split the text with NLTK.
sentences = sent_tokenize(causal_text)

# Rows destined for the CSV, filled sentence by sentence.
csv_data = []

for sent_id, sentence in enumerate(sentences, start=1):
    relations = extract_causal_relations(sentence)
    # eg_id numbers the relations of the current sentence from 1.
    csv_data.extend(
        {
            'corpus': 'Your Corpus Name',
            'doc_id': 'Your Document Name',
            'sent_id': sent_id,
            'eg_id': eg_id,
            'index': f'{sent_id}_{eg_id}',
            'text': sentence,
            'causal_text': causal_text_w_pairs,
            'num_rs': len(causal_text_w_pairs),
        }
        for eg_id, causal_text_w_pairs in enumerate(relations, start=1)
    )

# Write data to CSV file
write_to_csv(csv_data, '/content/train_subtask2_grouped.csv')

print("CSV file saved successfully.")

In [None]:
import csv

def extract_causal_examples(text):
    """
    Extract one cause/signal/effect example from each "</s>"-delimited segment.

    A segment starts at an "<ARG0>" tag and runs to the next "</s>" (or to the end
    of the text when no "</s>" follows). Within a segment:
      - cause  = text after "<ARG0>" up to "<SIG0>"
      - signal = text between "<SIG0>" and "</SIG0>"
      - effect = text after "</SIG0>" up to "</ARG1>" (or the segment end)
    Any leftover tag such as "</ARG0>" or "<ARG1>" is removed from the three parts.
    Segments without a complete "<SIG0>...</SIG0>" pair are skipped.

    Parameters:
    - text (str): Raw generated text.

    Returns:
    - list[dict]: One dict per extracted example with keys 'text' (the three parts
      joined by spaces), 'causal_text' ([cause, signal, effect]) and 'num_rs' (3).
    """
    tag_tokens = ("<ARG0>", "</ARG0>", "<ARG1>", "</ARG1>", "<SIG0>", "</SIG0>")

    def _clean(fragment):
        # Drop stray tag markers that fall inside a part, then trim whitespace.
        for token in tag_tokens:
            fragment = fragment.replace(token, "")
        return fragment.strip()

    examples = []
    cursor = 0
    while True:
        start = text.find("<ARG0>", cursor)
        if start == -1:
            break

        end = text.find("</s>", start)
        if end == -1:
            # No terminator: take the rest of the text and stop after this segment.
            # (Previously -1 was used as an index, which rewound the scan and looped forever.)
            segment = text[start:]
            cursor = len(text)
        else:
            segment = text[start:end]
            cursor = end + len("</s>")

        sig_open = segment.find("<SIG0>")
        if sig_open == -1:
            continue
        sig_close = segment.find("</SIG0>", sig_open + len("<SIG0>"))
        if sig_close == -1:
            continue
        effect_close = segment.find("</ARG1>", sig_close)
        if effect_close == -1:
            effect_close = len(segment)

        cause = _clean(segment[len("<ARG0>"):sig_open])
        signal = _clean(segment[sig_open + len("<SIG0>"):sig_close])
        effect = _clean(segment[sig_close + len("</SIG0>"):effect_close])

        examples.append({
            'text': cause + ' ' + signal + ' ' + effect,
            'causal_text': [cause, signal, effect],
            'num_rs': 3
        })

    return examples

def write_to_csv(data, output_filename):
    """
    Save row dictionaries as a CSV file whose first line is the fixed column header.

    Parameters:
    - data: Iterable of dicts keyed by the column names listed below.
    - output_filename (str): Destination path; an existing file is overwritten.
    """
    field_order = ['corpus', 'doc_id', 'sent_id', 'eg_id', 'index', 'text', 'causal_text', 'num_rs']
    with open(output_filename, 'w', newline='') as csvfile:
        sink = csv.DictWriter(csvfile, fieldnames=field_order)
        sink.writeheader()
        sink.writerows(data)

# Load the raw generated text.
with open('/content/domain.txt', 'r') as file:
    text = file.read()

# Parse it into cause/signal/effect examples.
examples = extract_causal_examples(text)

# One CSV row per example, numbered from 1. Every example gets its own document id
# and is recorded as sentence 1 of that document.
csv_data = [
    {
        'corpus': 'Your Corpus Name',
        'doc_id': f'Document {i}',
        'sent_id': 1,
        'eg_id': i,
        'index': f'{i}_1',
        'text': example['text'],
        'causal_text': example['causal_text'],
        'num_rs': example['num_rs'],
    }
    for i, example in enumerate(examples, start=1)
]

# Write data to CSV file
write_to_csv(csv_data, '/content/train_subtask2_grouped.csv')

print("CSV file saved successfully.")

In [None]:
import csv

# Load the text file
with open('domain.txt', 'r') as file:
    examples = file.readlines()

# Concatenate the examples into a single string
concatenated_examples = ' '.join(examples)

# Split the concatenated examples into one chunk per generation run
split_examples = concatenated_examples.split('*** Running model.generate:')

# Drop whatever precedes the first run marker
split_examples = split_examples[1:]

# NOTE(review): `domains` is not defined in this cell; it is whatever the most
# recently executed generation cell left behind — confirm it is the intended list.
with open('examples.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Header row: column labels. (This used to write the `domains` list and a
    # leftover `text_output` value into the first row.)
    writer.writerow(['domain', 'generated_output'])

    for domain in domains:
        # Find the example corresponding to the current domain
        for example in split_examples:
            if f"in the domain of {domain}" in example:
                # Everything after the chunk's first line is the generated output;
                # partition() yields '' instead of raising when there is no newline.
                _, _, generated_output = example.partition('\n')
                generated_output = generated_output.strip()

                # Write domain name and generated output to CSV file
                writer.writerow([domain, generated_output])
                break  # Stop searching for this domain once found

Prompt the model using the examples in your CSV. Try injecting 2, 3, and 5 examples at a time, and let's observe the difference through manual annotation of the answers.


In [None]:
from transformers import pipeline
import pandas as pd

# Load the examples CSV into a DataFrame; generate_outputs() below reads its 'causal_text' column.
examples_df = pd.read_csv('/content/train_subtask2_grouped.csv')

def generate_prompt(examples, num_examples):
    """
    Build a prompt from the first `num_examples` entries of `examples`.

    Parameters:
    - examples (list[str]): Candidate example strings, in order.
    - num_examples (int): How many leading examples to include (slice semantics).

    Returns:
    - str: The selected examples separated by a blank line.
    """
    blank_line = "\n\n"
    leading_examples = examples[:num_examples]
    return blank_line.join(leading_examples)

def generate_outputs(examples_df, num_examples):
    """
    Print a prompt built from the first `num_examples` causal texts and the model's output for it.

    Parameters:
    - examples_df: DataFrame with a 'causal_text' column.
    - num_examples (int): Number of examples injected into the prompt.

    Uses the module-level `generator` callable and prints its first result's
    'generated_text'.
    """
    causal_texts = examples_df['causal_text'].tolist()
    prompt = generate_prompt(causal_texts, num_examples)
    print("Prompt:\n", prompt)
    print("\n*** Generating output with", num_examples, "examples:")
    first_result = generator(prompt)[0]
    print("Generated output:\n", first_result['generated_text'])
    print("--------------------------------------------------\n")

# Load the text generation pipeline.
# NOTE(review): no `model` argument is passed, so the pipeline uses the library's
# default checkpoint for this task, not the Zephyr model loaded in earlier cells —
# confirm that is intended.
generator = pipeline("text-generation")

# Generate outputs for 2 examples; the 3- and 5-example runs are currently commented out.
generate_outputs(examples_df, 2)
#generate_outputs(examples_df, 3)
#generate_outputs(examples_df, 5)

In [None]:
import pandas as pd

# Load the examples CSV into the module-level `df` that inject_and_observe() reads.
df = pd.read_csv('/content/train_subtask2_grouped.csv')

def inject_and_observe(num_examples):
    """
    Print the first `num_examples` rows of the module-level `df` for manual review.

    For each row the function prints an "Example N" heading (row index + 1), the
    'text' value, the 'causal_text' value and a blank line. Nothing is returned;
    the manual annotation itself happens outside this function.
    """
    injected_examples = df.head(num_examples)

    print(f"\nInjected Examples ({num_examples} examples):")
    for index, row in injected_examples.iterrows():
        # Three labelled lines followed by an empty line as a separator.
        print(
            f"Example {index + 1}:",
            f"Text: {row['text']}",
            f"Causal Text: {row['causal_text']}",
            "",
            sep="\n",
        )

# Inject and observe 2, 3 and then 5 examples
for example_count in (2, 3, 5):
    inject_and_observe(example_count)

In [None]:
# Concatenation of the previous dataset and the new dataset
import pandas as pd

# Previous dataset (stored on Drive); its shape is printed for reference.
previous_df = pd.read_csv('/content/drive/MyDrive/Semester Project/Data/train_subtask2_grouped.csv')
print(previous_df.shape)

# Newly generated dataset.
new_df = pd.read_csv('/content/train_subtask2_grouped.csv')

# Stack both frames with a fresh 0..n-1 index, save the result and print its shape.
frames = [previous_df, new_df]
concatenated_df = pd.concat(frames, ignore_index=True)
concatenated_df.to_csv('concatenated_dataset.csv', index=False)
print(concatenated_df.shape)

Checking manually if there's a causal relation in the text:


In [None]:




import pandas as pd

def manual_check_causal_relation(df):
    """
    Interactively label every row of `df` as containing a causal relation or not.

    For each row the 'text' value is printed and the user is asked for Yes/No
    (case-insensitive, surrounding whitespace ignored); the question is repeated
    until a valid answer is given. The lower-cased answer is stored in the row's
    'causal_relation' cell.

    Parameters:
    - df: DataFrame with a 'text' column; modified in place.

    Returns:
    - The same DataFrame object, with 'causal_relation' filled in.
    """
    question = "Does this example contain a causal relation? (Yes/No): "
    for index, row in df.iterrows():
        print(f"\nExample {index + 1}:")
        print(f"Text: {row['text']}")
        while True:
            answer = input(question).strip().lower()
            if answer in ('yes', 'no'):
                break
            print("Invalid input. Please enter 'Yes' or 'No'.")
        df.at[index, 'causal_relation'] = answer

    return df

# Load the dataset and start every row with an empty manual-annotation value.
df = pd.read_csv('/content/concatenated_dataset.csv').assign(causal_relation="")

# Ask for a Yes/No label on each row.
df = manual_check_causal_relation(df)

# Persist the annotated rows.
df.to_csv('annotated_dataset.csv', index=False)

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Columns used as classifier input and as prediction target.
TEXT_COLUMN = 'text'
TARGET_COLUMN = 'causal_text_w_pairs'

# Load the dataset.
# NOTE(review): this reads the concatenated dataset, not 'annotated_dataset.csv',
# and the target is TARGET_COLUMN rather than the manual 'causal_relation' labels —
# confirm both are intended.
df = pd.read_csv('/content/concatenated_dataset.csv')

# Check for NaN values in the original DataFrame
print("NaN values in original DataFrame:", df.isnull().values.any())

# Drop only rows that lack the text or the target. Dropping on every column removed
# any row with a gap in an unrelated column, which can empty a frame built by
# concatenating files whose column sets differ.
df = df.dropna(subset=[TEXT_COLUMN, TARGET_COLUMN])

# Stop with an exception instead of exit(), which would shut the notebook kernel down.
if len(df) == 0:
    raise ValueError("Dataset is empty after dropping NaN values. Adjust the data or preprocessing steps.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df[TEXT_COLUMN], df[TARGET_COLUMN], test_size=0.2, random_state=42
)

# Vectorize the text data using TF-IDF; the vocabulary is fitted on the training split only.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train a RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train_vec, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test_vec)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Apply the classifier to the entire dataset (training rows included)
X_vec = vectorizer.transform(df[TEXT_COLUMN])
df['predicted_causal_relation'] = classifier.predict(X_vec)

# Save the updated dataset
df.to_csv('predicted_dataset.csv', index=False)