In [6]:
import json
import random
import re
from collections import defaultdict

from emoji import demojize
from nltk.tokenize import TweetTokenizer

def normalizeToken(token):
    """Map a single token to its normalized form.

    Rules, checked in order:
      * a token starting with "@" becomes "@USER";
      * a token starting with "http" or "www" (case-insensitive) becomes
        "[url]";
      * the typographic apostrophe "’" becomes "'" and the ellipsis
        character "…" becomes "...";
      * any other single-character token is passed through ``demojize``;
      * everything else is returned unchanged.

    Args:
        token: One token string.

    Returns:
        The normalized token string.
    """
    if token.startswith("@"):
        return "@USER"
    if token.lower().startswith(("http", "www")):
        return "[url]"
    # Both punctuation marks are exactly one character long, so they have to
    # be handled before the single-character branch below; placed after it,
    # these two checks could never be reached.
    if token == "’":
        return "'"
    if token == "…":
        return "..."
    if len(token) == 1:
        return demojize(token)
    return token
    
# Matches the space-separated sequence "a . m" / "p . m", optionally followed
# by " .". The look-behind rejects a preceding letter and the look-ahead
# rejects a following word character, so text such as "stop . more" or
# "help . me" is left alone while "5 p . m ." and "5p . m ." still collapse.
_SPLIT_MERIDIEM = re.compile(r"(?<![^\W\d_])([ap]) \. m(?: \.)?(?!\w)")


def normalizeTweet(tweet):
    """Tokenize a tweet and return it as one normalized, space-separated string.

    Steps:
      1. Replace the typographic apostrophe "’" with "'" and tokenize the
         text with ``TweetTokenizer``.
      2. Normalize every token with ``normalizeToken`` and join the tokens
         with single spaces.
      3. Re-join split contractions ("n 't" -> "n't", then "ca n't" ->
         "can't" and "ai n't" -> "ain't").
      4. Collapse "a . m" / "p . m" (with or without a trailing " .") into
         "am" / "pm".
      5. Collapse any run of whitespace into a single space.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalized tweet text.
    """
    tokens = TweetTokenizer().tokenize(tweet.replace("’", "'"))
    normTweet = " ".join(normalizeToken(token) for token in tokens)

    normTweet = (
        normTweet.replace("n 't", "n't")
            .replace("ca n't", "can't")
            .replace("ai n't", "ain't")
    )
    # A plain substring replace of "p . m" / "a . m" also fired inside
    # unrelated text ("stop . more" became "stopmore"); the pattern only
    # matches when the letter and the "m" are not part of longer words.
    normTweet = _SPLIT_MERIDIEM.sub(r"\1m", normTweet)
    return " ".join(normTweet.split())


def normalizeTweet_v02(tweet):
    """Normalize a tweet; kept as an alias of ``normalizeTweet``.

    This function used to be a copy of ``normalizeTweet`` followed by one
    more replace of "p . m ." that could never match, because that substring
    had already been replaced earlier in the same function. It now delegates
    so the two names cannot drift apart.

    Args:
        tweet: Raw tweet text.

    Returns:
        The normalized tweet text, identical to ``normalizeTweet(tweet)``.
    """
    return normalizeTweet(tweet)

def reformat_json_v01(input_file, output_file, dataset_type="train"):
    """Convert one split of the labeled data into instruction/input/output records.

    Every item of the selected split becomes a dict with the fixed
    multilabel instruction, the tweet text normalized by ``normalizeTweet``
    as "input", and ``str(item["annotations"])`` as "output". The list of
    records is written to ``output_file`` as JSON with an indent of 4.

    Args:
        input_file: Path of a JSON file whose top level maps split names to
            lists of items; each item is read for "text" and "annotations".
        output_file: Path of the JSON file to write.
        dataset_type: Key of the split to convert (default "train").
    """
    # The label list used to read `Education'` with the opening quote
    # missing, which left the list in the prompt malformed. Every label is
    # now quoted the same way.
    instruction = "Assign multilabel labels to the following tweet. Choose out of the following list of labels where 'Others' is only assigned if no other label fits: ['War/Terror', 'Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Religion', 'Science/Technology', 'Others']"

    with open(input_file, 'r') as f:
        data = json.load(f)

    formatted_data = [
        {
            "instruction": instruction,
            "input": normalizeTweet(item["text"]),
            "output": str(item["annotations"]),
        }
        for item in data[dataset_type]
    ]

    with open(output_file, 'w') as f:
        json.dump(formatted_data, f, indent=4)

def reformat_json_v01_retest(input_file, output_file, dataset_type="train"):
    """Write one split as prompt/answer records in "### Human / ### Assistant" form.

    Each item of the selected split yields a dict with two keys:
    "instruction" holds the full prompt (topic instruction, the tweet text
    normalized by ``normalizeTweet``, and the assistant header) and "output"
    holds ``str(item["annotations"])``. The records are dumped to
    ``output_file`` as JSON with an indent of 4.

    Args:
        input_file: Path of a JSON file whose top level maps split names to
            lists of items; each item is read for "text" and "annotations".
        output_file: Path of the JSON file to write.
        dataset_type: Key of the split to convert (default "train").
    """
    # The instruction text is the same for every item, so it is built once.
    topic_instruction = "Out of the following list of topics, choose topics that best fit the tweet in your opinion. Output in format ['topic 1', 'topic 2', ...].\n\nTopics:\n['War/Terror', 'Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Religion', 'Science/Technology']"

    def to_record(entry):
        # Turn one dataset item into its prompt/answer pair.
        normalized = normalizeTweet(entry["text"])
        return {
            "instruction": f"### Human:\n{topic_instruction}\n\nTweet: {normalized}\n\n### Assistant:\n",
            "output": str(entry["annotations"]),
        }

    with open(input_file, 'r') as source:
        splits = json.load(source)

    records = list(map(to_record, splits[dataset_type]))

    with open(output_file, 'w') as sink:
        json.dump(records, sink, indent=4)

def reformat_json_v02(input_file, output_file, dataset_type="train"):
    """Convert one split into instruction/input/output records using the v02 prompt.

    Each item of the selected split yields a dict with the rule-based topic
    instruction, the tweet text normalized by ``normalizeTweet`` as "input",
    and ``str(item["annotations"])`` as "output". The records are dumped to
    ``output_file`` as JSON with an indent of 4.

    Args:
        input_file: Path of a JSON file whose top level maps split names to
            lists of items; each item is read for "text" and "annotations".
        output_file: Path of the JSON file to write.
        dataset_type: Key of the split to convert (default "train").
    """
    rules_prompt = "Out of the following list of topics, choose topics that best fit the text in your opinion. Here are the rules:\n- Use clear indicators for labeling.\n- If unclear, output 'Others'. \n- No speculation.\n- Don't mix content from different operations.\n- Refer to Oxford definitions for topics.\n\nTopics: 'War/Terror', 'Conspiracy Theory', 'Education', 'Election Campaign', 'Environment', 'Government/Public', 'Health', 'Immigration/Integration', 'Justice/Crime', 'Labor/Employment', 'Macroeconomics/Economic Regulation', 'Media/Journalism', 'Religion', 'Science/Technology'"

    with open(input_file, 'r') as reader:
        loaded = json.load(reader)

    rows = []
    for sample in loaded[dataset_type]:
        cleaned_text = normalizeTweet(sample["text"])
        answer = str(sample["annotations"])
        rows.append({"instruction": rules_prompt, "input": cleaned_text, "output": answer})

    with open(output_file, 'w') as writer:
        json.dump(rows, writer, indent=4)

def reformat_json_binary_v01(label, input_file, output_file, dataset_type="train", seed=None):
    """Build a binary (label vs. not label) dataset from one split.

    Every item of the selected split becomes a record whose "output" is "1"
    when ``label in item["annotations"]`` and "0" otherwise. If there are
    more negative than positive records, the negatives are randomly
    down-sampled to the number of positives; if there are fewer, all of them
    are kept. The result (all positives followed by the kept negatives) is
    written to ``output_file`` as JSON with an indent of 4.

    Args:
        label: Label to classify against; also interpolated into the
            instruction text.
        input_file: Path of a JSON file whose top level maps split names to
            lists of items; each item is read for "text" and "annotations".
        output_file: Path of the JSON file to write.
        dataset_type: Key of the split to convert (default "train").
        seed: Optional seed for the negative down-sampling. With the default
            ``None`` the module-level ``random`` generator is used, exactly
            as before; any other value makes the selection repeatable
            without touching the module-level generator's state.
    """
    with open(input_file, 'r') as f:
        data = json.load(f)

    instruction = f"Classify the input based on if it's about {label}. Use 1 (True) or 0 (False) as output."

    positive_samples = []
    negative_samples = []

    for item in data[dataset_type]:
        # Evaluate the membership test once and reuse it for both the
        # "output" value and the choice of bucket.
        is_positive = label in item["annotations"]
        formatted_item = {
            "instruction": instruction,
            "input": normalizeTweet(item["text"]),
            "output": str(int(is_positive))
        }
        if is_positive:
            positive_samples.append(formatted_item)
        else:
            negative_samples.append(formatted_item)

    # Choose the same amount of negative samples as there are positive samples
    if len(negative_samples) > len(positive_samples):
        sampler = random if seed is None else random.Random(seed)
        negative_samples = sampler.sample(negative_samples, len(positive_samples))

    # Merge positive and negative samples
    formatted_data = positive_samples + negative_samples

    with open(output_file, 'w') as f:
        json.dump(formatted_data, f, indent=4)

In [7]:
# Example usage binary:
label = "Election Campaign"
input_file = "../data/labeled_data/generic_test_0.json"
# Only the part of the label before the first "/" goes into the file names,
# so a label containing "/" cannot introduce a path separator.
label_slug = label.split('/')[0]
output_file_train = f"../data/labeled_data/lora_binary_train_{label_slug}_v01.json"
output_file_test = f"../data/labeled_data/lora_binary_test_{label_slug}_v01.json"
output_file_valid = f"../data/labeled_data/lora_binary_valid_{label_slug}_v01.json"

# reformat_json_binary_v01 down-samples negatives with random.sample; seeding
# the module-level generator first makes a re-run of this cell pick the same
# negatives for all three splits.
random.seed(42)

reformat_json_binary_v01(label, input_file, output_file_train, dataset_type="train")
reformat_json_binary_v01(label, input_file, output_file_test, dataset_type="test")
reformat_json_binary_v01(label, input_file, output_file_valid, dataset_type="valid")

In [8]:
# Example usage multilabel:
# The "test" in "generic_test_0" refers to the test of a previous report, not
# to the actual test set; the file contains the train, test and validation
# sets.
input_file = "../data/labeled_data/generic_test_0.json"
output_file_train = "../data/labeled_data/lora_no_context_multilabel_v01_train.json"
output_file_test = "../data/labeled_data/lora_no_context_multilabel_v01_test.json"
output_file_valid = "../data/labeled_data/lora_no_context_multilabel_v01_valid.json"

# One output file per split; the split name is passed positionally.
reformat_json_v01(input_file, output_file_train, "train")
reformat_json_v01(input_file, output_file_test, "test")
reformat_json_v01(input_file, output_file_valid, "valid")

# Print the number of 1s and 0s in the binary files

In [7]:
def count_labels(file):
    """Count how many records in a JSON file have "output" equal to '1' and to '0'.

    Args:
        file: Path of a JSON file holding a list of records, each with an
            "output" entry.

    Returns:
        A dict with the keys '1' and '0' (in that order) mapped to their
        counts.

    Raises:
        KeyError: If a record's "output" is neither '1' nor '0'.
    """
    with open(file, 'r') as handle:
        records = json.load(handle)

    tally = dict.fromkeys(('1', '0'), 0)
    for record in records:
        outcome = record["output"]
        # Plain indexing on purpose: an unexpected value raises KeyError.
        tally[outcome] = tally[outcome] + 1

    return tally

# Check the distribution of labels in the binary output files.
# The paths are rebuilt from `label` instead of being read from
# output_file_train / output_file_test / output_file_valid: the multilabel
# example cell reassigns those three names to files written by
# reformat_json_v01, whose "output" values are str(item["annotations"]) and
# would make count_labels raise KeyError when the notebook is run top to
# bottom.
for split in ("train", "test", "valid"):
    print(count_labels(f"../data/labeled_data/lora_binary_{split}_{label.split('/')[0]}_v01.json"))

{'1': 754, '0': 754}
{'1': 255, '0': 255}
{'1': 185, '0': 185}
