In [4]:
import json
from emoji import demojize
from nltk.tokenize import TweetTokenizer
import random
from collections import defaultdict
import sys
import os
sys.path.append("../src")
import prompt_utils
import random
from sklearn.model_selection import train_test_split
random.seed(42)

def normalizeToken(token):
    """Map a single tweet token to its normalized form.

    Rules, checked in this order:
      * the typographic apostrophe "’" becomes "'" and the ellipsis "…"
        becomes "...";
      * tokens starting with "@" become "@USER";
      * tokens starting with "http" or "www" (case-insensitive) become "[url]";
      * any other one-character token is passed through ``demojize``
        (emoji -> textual alias);
      * everything else is returned unchanged.

    Args:
        token: One token string.

    Returns:
        The normalized token string.
    """
    # "’" and "…" are single characters, so they must be handled before the
    # generic one-character branch; otherwise these rules are never reached.
    if token == "’":
        return "'"
    if token == "…":
        return "..."

    if token.startswith("@"):
        return "@USER"

    lowercased_token = token.lower()
    if lowercased_token.startswith(("http", "www")):
        return "[url]"

    if len(token) == 1:
        return demojize(token)

    return token
    
def normalizeTweet(tweet):
    """Tokenize a tweet, normalize every token and return one clean string.

    The typographic apostrophe is replaced by "'" before tokenizing with
    ``TweetTokenizer``; each token goes through ``normalizeToken``; a few
    spaced-out sequences ("n 't", "p . m .", ...) are then glued back
    together and runs of whitespace are collapsed to single spaces.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalized tweet as a single space-separated string.
    """
    tokenizer = TweetTokenizer()
    pieces = [normalizeToken(tok) for tok in tokenizer.tokenize(tweet.replace("’", "'"))]
    text = " ".join(pieces)

    # Applied strictly in this order: "n 't" must be rejoined before the
    # "ca n't"/"ai n't" rules can match, and the longer "x . m ." patterns
    # must run before their shorter "x . m" counterparts.
    rewrite_rules = (
        ("n 't", "n't"),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("p . m .", "pm"),
        ("p . m", "pm"),
        ("a . m .", "am"),
        ("a . m", "am"),
    )
    for needle, replacement in rewrite_rules:
        text = text.replace(needle, replacement)

    return " ".join(text.split())

def normalizeTweet_v02(tweet):
    """Second variant of the tweet normalizer.

    Replaces the typographic apostrophe, tokenizes with ``TweetTokenizer``,
    normalizes each token via ``normalizeToken``, rewrites a fixed list of
    spaced-out sequences and finally collapses whitespace.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalized tweet as a single space-separated string.
    """
    raw_tokens = TweetTokenizer().tokenize(tweet.replace("’", "'"))
    normTweet = " ".join(map(normalizeToken, raw_tokens))

    # Substitutions run top to bottom; the order is significant.
    substitutions = [
        ("n 't", "n't"),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("p . m .", "pm"),
        ("p . m", "pm"),
        ("a . m .", "am"),
        ("a . m", "am"),
        # Repeated pass kept from the original; by this point every
        # "p . m ." has already been rewritten, so it changes nothing.
        ("p . m .", "pm"),
    ]
    for old, new in substitutions:
        normTweet = normTweet.replace(old, new)

    words = normTweet.split()
    return " ".join(words)

def reformat_json_v01(input_file, output_file, dataset_type="train"):
    """Convert one split of a labeled-tweet JSON file to instruction records.

    Reads ``input_file`` (a JSON object keyed by split name), and for every
    item in ``data[dataset_type]`` emits a dict with a fixed multilabel
    "instruction", the normalized tweet text as "input" and the stringified
    annotations as "output". The resulting list is written to ``output_file``
    as indented JSON.

    Args:
        input_file: Path of the source JSON file.
        output_file: Path of the JSON file to write.
        dataset_type: Key of the split to convert (default "train").

    Raises:
        KeyError: If the split or an item's "text"/"annotations" key is missing.
    """
    # The label list previously read "..., Education', ..." with the opening
    # quote missing; every label is now quoted consistently.
    instruction = "Assign multilabel labels to the following tweet. Choose out of the following list of labels where 'Others' is only assigned if no other label fits: ['War/Terror', 'Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Religion', 'Science/Technology', 'Others']"

    # Explicit UTF-8: tweets contain non-ASCII text and the platform default
    # encoding is locale dependent.
    with open(input_file, 'r', encoding="utf-8") as f:
        data = json.load(f)

    formatted_data = [
        {
            "instruction": instruction,
            "input": normalizeTweet(item["text"]),
            "output": str(item["annotations"]),
        }
        for item in data[dataset_type]
    ]

    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump(formatted_data, f, indent=4)

def reformat_json_v01_retest(input_file, output_file, dataset_type="train"):
    """Convert one split to chat-style prompt records ("### Human/Assistant").

    For every item in ``data[dataset_type]`` the normalized tweet is embedded
    into a single prompt string stored under "instruction"; the stringified
    annotations are stored under "output". No separate "input" field is
    written. The list is saved to ``output_file`` as indented JSON.

    Args:
        input_file: Path of the source JSON file (object keyed by split name).
        output_file: Path of the JSON file to write.
        dataset_type: Key of the split to convert (default "train").

    Raises:
        KeyError: If the split or an item's "text"/"annotations" key is missing.
    """
    instruction = "Out of the following list of topics, choose topics that best fit the tweet in your opinion. Output in format ['topic 1', 'topic 2', ...].\n\nTopics:\n['War/Terror', 'Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Religion', 'Science/Technology']"

    # Explicit UTF-8: tweets contain non-ASCII text and the platform default
    # encoding is locale dependent.
    with open(input_file, 'r', encoding="utf-8") as f:
        data = json.load(f)

    formatted_data = []
    for item in data[dataset_type]:
        tweet_text = normalizeTweet(item["text"])
        formatted_data.append({
            "instruction": f"### Human:\n{instruction}\n\nTweet: {tweet_text}\n\n### Assistant:\n",
            "output": str(item["annotations"]),
        })

    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump(formatted_data, f, indent=4)

def reformat_json_v02(input_file, output_file, dataset_type="train"):
    """Convert one split to instruction records using the rule-based prompt.

    For every item in ``data[dataset_type]`` emits a dict with the fixed
    rule-list "instruction", the normalized tweet text as "input" and the
    stringified annotations as "output", then writes the list to
    ``output_file`` as indented JSON.

    Args:
        input_file: Path of the source JSON file (object keyed by split name).
        output_file: Path of the JSON file to write.
        dataset_type: Key of the split to convert (default "train").

    Raises:
        KeyError: If the split or an item's "text"/"annotations" key is missing.
    """
    instruction = "Out of the following list of topics, choose topics that best fit the text in your opinion. Here are the rules:\n- Use clear indicators for labeling.\n- If unclear, output 'Others'. \n- No speculation.\n- Don't mix content from different operations.\n- Refer to Oxford definitions for topics.\n\nTopics: 'War/Terror', 'Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Religion', 'Science/Technology'"

    # Explicit UTF-8: tweets contain non-ASCII text and the platform default
    # encoding is locale dependent.
    with open(input_file, 'r', encoding="utf-8") as f:
        data = json.load(f)

    formatted_data = [
        {
            "instruction": instruction,
            "input": normalizeTweet(item["text"]),
            "output": str(item["annotations"]),
        }
        for item in data[dataset_type]
    ]

    with open(output_file, 'w', encoding="utf-8") as f:
        json.dump(formatted_data, f, indent=4)

def reformat_json_binary_v01(label, input_file, output_dir="../data/labeled_data"):
    """Build balanced binary (1/0) train/valid instruction files for one label.

    The "train" and "valid" splits of ``input_file`` are pooled. Each item
    becomes a record whose "output" is "1" if ``label in item["annotations"]``
    and "0" otherwise. When there are more negatives than positives, the
    negatives are down-sampled to the number of positives. The pooled records
    are then split 80/20 with ``train_test_split`` and written to
    ``lora_binary_train_<tag>_v01.json`` and ``lora_binary_valid_<tag>_v01.json``
    inside ``output_dir``, where ``<tag>`` is the part of ``label`` before the
    first "/". Summary counts are printed along the way.

    Args:
        label: Label to classify against (also used in the prompt text).
        input_file: Path of the source JSON file with "train" and "valid" keys.
        output_dir: Directory for the two output files; defaults to the
            location that was previously hard-coded.

    Raises:
        KeyError: If "train"/"valid" or an item's "text"/"annotations" is missing.
    """
    file_tag = label.split('/')[0]
    output_file_train = f"{output_dir}/lora_binary_train_{file_tag}_v01.json"
    output_file_valid = f"{output_dir}/lora_binary_valid_{file_tag}_v01.json"

    # Explicit UTF-8: tweets contain non-ASCII text and the platform default
    # encoding is locale dependent.
    with open(input_file, 'r', encoding="utf-8") as f:
        data = json.load(f)

    positive_samples = []
    negative_samples = []

    # Pool train and validation items; a fresh split is produced below.
    for item in data["train"] + data["valid"]:
        # NOTE(review): presumably "annotations" is a list of label strings;
        # if it were a plain string this would be a substring match — confirm.
        is_positive = label in item["annotations"]
        formatted_item = {
            "instruction": f"Classify the input based on if it's about {label}. Use 1 or 0 as output.",
            "input": normalizeTweet(item["text"]),
            "output": str(int(is_positive))
        }
        if is_positive:
            positive_samples.append(formatted_item)
        else:
            negative_samples.append(formatted_item)

    # Balance the classes. random.sample() accepts no `random_state` keyword
    # (passing it raises TypeError), so a dedicated seeded generator is used
    # to keep the draw reproducible without touching the global RNG state.
    if len(negative_samples) > len(positive_samples):
        negative_samples = random.Random(42).sample(negative_samples, len(positive_samples))

    formatted_data = positive_samples + negative_samples

    print("---------------------")
    print("Label:", label)
    print("Positive samples:", len(positive_samples))
    print("Negative samples:", len(negative_samples))
    print("Total samples:", len(formatted_data))

    # Split the data into train and validation sets (80/20 ratio)
    train_data, valid_data = train_test_split(formatted_data, test_size=0.2, random_state=42)

    print("Train samples:", len(train_data))
    print("Valid samples:", len(valid_data))
    print("---------------------")

    with open(output_file_train, 'w', encoding="utf-8") as f:
        json.dump(train_data, f, indent=4)

    with open(output_file_valid, 'w', encoding="utf-8") as f:
        json.dump(valid_data, f, indent=4)

In [5]:
# Example usage (binary): write one train/valid file pair per label.
input_file = "../data/labeled_data/generic_test_0.json"

for class_ in prompt_utils.ALL_LABELS:
    label = class_  # notebook-level `label` stays bound, as before
    reformat_json_binary_v01(label, input_file)

---------------------
Label: War/Terror
Positive samples: 939
Negative samples: 939
Total samples: 1878
Train samples: 1502
Valid samples: 376
---------------------
---------------------
Label: Conspiracy Theory
Positive samples: 254
Negative samples: 254
Total samples: 508
Train samples: 406
Valid samples: 102
---------------------
---------------------
Label: Education
Positive samples: 63
Negative samples: 63
Total samples: 126
Train samples: 100
Valid samples: 26
---------------------
---------------------
Label: Election Campaign
Positive samples: 133
Negative samples: 133
Total samples: 266
Train samples: 212
Valid samples: 54
---------------------
---------------------
Label: Environment
Positive samples: 58
Negative samples: 58
Total samples: 116
Train samples: 92
Valid samples: 24
---------------------
---------------------
Label: Government/Public
Positive samples: 1248
Negative samples: 1248
Total samples: 2496
Train samples: 1996
Valid samples: 500
---------------------
---

# Print the number of 1s and 0s in the binary files

In [4]:
def count_labels(file):
    """Count how many records in a JSON list file have output '1' vs '0'.

    Args:
        file: Path to a JSON file holding a list of dicts with an "output" key.

    Returns:
        A dict ``{'1': <count>, '0': <count>}``.

    Raises:
        KeyError: If a record's "output" is anything other than '1' or '0'.
    """
    with open(file, 'r') as handle:
        records = json.load(handle)

    tally = dict.fromkeys(('1', '0'), 0)
    for record in records:
        outcome = record["output"]
        tally[outcome] = tally[outcome] + 1

    return tally

# Check the distribution of labels in the output files
# NOTE(review): output_file_train, output_file_test and output_file_valid are
# not assigned at notebook level in any visible cell — confirm they are
# defined before this cell runs on a fresh kernel.
print(count_labels(output_file_train))
print(count_labels(output_file_test))
print(count_labels(output_file_valid))

{'1': 852, '0': 852}
{'1': 271, '0': 271}
{'1': 205, '0': 205}
