In [None]:
import os
import pandas as pd

def extract_propaganda_span(sentence):
    """
    Extracts the propaganda span from a sentence.

    The span is the text between the first '<BOS>' marker and the first
    '<EOS>' marker that follows it. Whitespace surrounding the span is
    removed, so the markers may or may not be separated from the span by
    spaces.

    @param sentence: An input sentence, normally containing '<BOS>' and
                     '<EOS>' markers. If '<BOS>' is missing the span starts
                     at the beginning of the sentence; if '<EOS>' is missing
                     it runs to the end of the sentence.
    @return: The propaganda span between the markers, without surrounding
             whitespace.
    """
    bos_marker = '<BOS>'
    eos_marker = '<EOS>'

    # Locate the bare markers instead of assuming a space next to them, so
    # inputs such as '<BOS>text<EOS>' are not truncated.
    bos_pos = sentence.find(bos_marker)
    start_index = bos_pos + len(bos_marker) if bos_pos != -1 else 0

    # Only an <EOS> that comes after the start of the span can close it.
    eos_pos = sentence.find(eos_marker, start_index)
    end_index = eos_pos if eos_pos != -1 else len(sentence)

    return sentence[start_index:end_index].strip()

def create_folders_from_tsv(tsv_file):
    """
    Separates a tsv into a preprocessed folder structure with one sub-folder
    per class and one txt file per row, for usage in later dataset fns.

    The output root is the tsv path without its extension. Each row is
    written to '<root>/<label>/<row index>.txt' and contains the propaganda
    span extracted from the row's 'tagged_in_context' value.

    @param tsv_file: Path to a tab-separated file that has a 'label' column
                     and a 'tagged_in_context' column.
    @return: None. Folders and files are created on disk and a confirmation
             message is printed.
    """
    # Read TSV into a pd DF and reduce every sentence to its tagged span
    df = pd.read_csv(tsv_file, sep='\t')
    spans = df['tagged_in_context'].apply(extract_propaganda_span)

    # Create folder to store subfolders
    output_folder = os.path.splitext(tsv_file)[0]
    os.makedirs(output_folder, exist_ok=True)

    # One pass over the data: group the spans by their label. sort=False
    # keeps labels in order of first appearance; rows whose label is missing
    # are skipped by groupby's default handling of NaN keys.
    for label, label_spans in spans.groupby(df['label'], sort=False):
        label_folder = os.path.join(output_folder, str(label))
        os.makedirs(label_folder, exist_ok=True)

        # The DataFrame index names the file, so names are unique per row
        for index, span in label_spans.items():
            file_path = os.path.join(label_folder, f"{index}.txt")

            # Explicit encoding so the output does not depend on the
            # platform's default locale
            with open(file_path, 'w', encoding='utf-8') as file:
                file.write(span)

    print(f"Folder structure created successfully in '{output_folder}'")


In [None]:
# Build the per-label folder structure for each TSV file, in this order.
for split_tsv in (
    "/content/drive/MyDrive/ANLE_Coursework/Data/propaganda_val.tsv",
    "/content/drive/MyDrive/ANLE_Coursework/Data/propaganda_train.tsv",
):
    create_folders_from_tsv(split_tsv)