In [20]:
import pandas as pd
import numpy as np
import torch



def read_file(fname: str, correct_labels=False) -> pd.DataFrame:
    """Read a header-less, tab-separated file into a DataFrame.

    The columns are named ``text``, ``labels`` and ``role``, in that order.

    Args:
        fname: Path of the tab-separated file to read.
        correct_labels: If True, replace the ``labels`` column with booleans:
            ``False`` where the label equals "Acceptable speech" and ``True``
            for every other value.

    Returns:
        The parsed DataFrame containing all three columns (``role`` is kept).
    """
    df = pd.read_table(fname, sep="\t", header=None, names=["text", "labels", "role"])
    if correct_labels:
        # Assign the whole column in one step. Chained indexing such as
        # ``df.labels[mask] = value`` triggers SettingWithCopyWarning and, with
        # pandas Copy-on-Write, leaves the frame unchanged.
        df["labels"] = df["labels"] != "Acceptable speech"
    return df



# For every language/segment pair, combine the "lgbt" and "migrants" files
# into a single shuffled "merged" file next to them.
for language in ("hr", "sl", "en"):
    for segment in ("train", "test"):
        # Input and output paths for this language/segment pair.
        first_fname = f"../data/lgbt-{language}.{segment}.tsv"
        second_fname = f"../data/migrants-{language}.{segment}.tsv"
        final_fname = f"../data/merged-{language}.{segment}.tsv"

        first = read_file(first_fname)
        second = read_file(second_fname)

        # Stack both frames, then shuffle all rows; the fixed random_state
        # makes the row order identical on every run.
        joined = pd.concat([first, second])
        shuffled = joined.sample(frac=1, random_state=42)

        # Written tab-separated, with neither a header row nor the index.
        shuffled.to_csv(final_fname, sep="\t", header=False, index=False)

In [18]:
!ls ../data/merged*

../data/merged-en.test.tsv   ../data/merged-hr.train.tsv
../data/merged-en.train.tsv  ../data/merged-sl.test.tsv
../data/merged-hr.test.tsv   ../data/merged-sl.train.tsv


In [21]:
!head -n 5 ../data/merged-en.test.tsv

That still means that about 10% of society is receiving 51% of provision. Fine if we actually had a comprehensive care system but in effect it will lead to people being discriminated against based on gender\sexuality.	Acceptable speech	No target
You idiot	Other offensive	Commenter
Thank you for helping changing the term https://www.youtube.com/watch?v=dKfdPw1bR-A	Acceptable speech	No target
You heard it here folks , as houris has spoken , their intentions are clear , to overthrow European governments and inhabitants , with their own misinterpreted views , we need to act now , the fret is real , Europe is been invaded , close each border , don't feed them ,give them water and tell them to move along and go back where they came and gets refund from the people smugglers !	Background offensive	Topic
Good signs for people awaiting for divine punishment of the devilish american satanic regime	Background offensive	Topic
