First task: My detector code wants three .jsonl files for test, train and val. I have one CSV that needs to be split accordingly and converted to JSONL. This is the abstract data.

In [1]:
import csv
import json
import random

In [7]:


# Convert each abstract CSV split into a JSONL file for the detector.
# Every output record keeps the CSV's other columns (whitespace-stripped),
# exposes the abstract under the key 'text', and carries the flipped label
# (1 - original) as a string.
ABSTRACT_CSV_DIR = "/Users/dustinhayes/Desktop/DEEP LEARNING FINAL PROJECT"
ABSTRACT_SPLITS = ("test", "validation", "train")

paths_to_convert = [ABSTRACT_CSV_DIR + "/" + split + ".csv" for split in ABSTRACT_SPLITS]
paths_to_gain = ["detector/TestData/" + split + ".jsonl" for split in ABSTRACT_SPLITS]

for start_path, end_path in zip(paths_to_convert, paths_to_gain):
    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain embedded newlines are parsed as a single field.
    with open(start_path, mode='r', encoding='utf-8', newline='') as csv_file, \
         open(end_path, mode='w', encoding='utf-8') as jsonl_file:

        for row in csv.DictReader(csv_file):
            # Invert the 0/1 label; presumably the CSV uses the opposite
            # convention from the other data sources -- TODO confirm.
            flipped_label = str(1 - int(row['label'].strip()))

            # Replace the 'abstract' key with 'text' to be consistent with
            # other data sources.
            row['text'] = row.pop('abstract')

            # Strip whitespace from every non-label field; the label is
            # appended last so it is always the final key of the record.
            cleaned_row = {key: value.strip() for key, value in row.items() if key != 'label'}
            cleaned_row['label'] = flipped_label

            jsonl_file.write(json.dumps(cleaned_row) + '\n')


Second task: I have downloaded the original GPT-2 output detector data and need to process it so that the format matches the abstract data.

We are coming from: A 'webtext' set of 3 JSONL files (train, test, valid) which contains human-written content, and a set of GPT-2 generated 'large-762' files which contain machine-written examples. These JSONL files have the fields: ['id', 'text', 'length', 'ended']


We need to go to: Three JSONL files, which contain ['text', 'label'], where label is 1 = HUMAN, 0 = ROBOT. I think I would like to keep 'length' hanging around.

In [3]:
# Build a small, balanced GPT-2 detector dataset. For each split, take the
# first RECORDS_TO_FETCH // 2 machine-written records and the first
# RECORDS_TO_FETCH // 2 human-written records, relabel them, shuffle the
# combined list and write it out as JSONL.
GPT2_PROJECT_DIR = '/Users/dustinhayes/Desktop/DEEP LEARNING FINAL PROJECT'
path_to_files_to_convert = 'gpt-2-output-dataset/Deep Learning Final Project Data Pile'
data_pile_dir = GPT2_PROJECT_DIR + '/' + path_to_files_to_convert
detector_output_dir = GPT2_PROJECT_DIR + '/gpt-2-output-dataset/detector/OriginalGPTDataLarge'

# The three lists are index-aligned: test, train, validation.
machine_written_files_to_convert = [data_pile_dir + '/xl-1542M-k40.' + split + '.jsonl'
                                    for split in ('test', 'train', 'valid')]
human_written_files_to_convert = [data_pile_dir + '/webtext.' + split + '.jsonl'
                                  for split in ('test', 'train', 'valid')]
paths_to_gain = [detector_output_dir + '/' + split + '.jsonl'
                 for split in ('test', 'train', 'validation')]

# I am going to attempt to make a very short version to confirm that my GPT-2 detector is getting loaded correctly
# Let's say 1000 examples, 500 machine 500 human

# Remember, HUMAN = 1, ROBOT = 0
RECORDS_TO_FETCH = 1000
# Fixed seed so that re-running the cell produces the same shuffled files.
SHUFFLE_SEED = 0


def _load_labelled_records(path, label, limit):
    """Read at most `limit` records from a JSONL file and relabel them.

    Args:
        path: Path of a JSONL file whose records have 'text' and 'length'.
        label: Label string stored on every returned record ('0' or '1').
        limit: Maximum number of lines to read from the start of the file.

    Returns:
        A list of dicts with the keys 'text', 'label' and 'length'.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a line is not valid JSON.
        KeyError: If a record lacks 'text' or 'length'.
    """
    records = []
    with open(path, mode='r', encoding='utf-8') as source_f:
        # zip with range() stops after `limit` lines without reading further.
        for _, line in zip(range(limit), source_f):
            record = json.loads(line)
            records.append({
                # Collapse paragraph breaks; I am worried they will mess with
                # the paraphraser.
                # NOTE(review): only "\n\n" is replaced, single "\n" stays.
                'text': record['text'].replace('\n\n', ' '),
                'label': label,
                'length': record['length'],
            })
    return records


# A dedicated generator keeps the shuffle reproducible without touching the
# global `random` state.
shuffle_rng = random.Random(SHUFFLE_SEED)
records_per_class = RECORDS_TO_FETCH // 2

for machine_file, human_file, new_path in zip(machine_written_files_to_convert,
                                              human_written_files_to_convert,
                                              paths_to_gain):
    record_list = (
        _load_labelled_records(machine_file, '0', records_per_class)    # ROBOT
        + _load_labelled_records(human_file, '1', records_per_class)    # HUMAN
    )
    shuffle_rng.shuffle(record_list)

    # Open the output only after both inputs were read successfully, so a
    # bad input line cannot leave a truncated output file behind.
    with open(new_path, mode='w', encoding='utf-8') as output_f:
        for record in record_list:
            output_f.write(json.dumps(record) + '\n')





I want a small version of the abstract dataset.

In [3]:


# Write a small sample of the abstract dataset: the first `sample_size` rows
# of each CSV split, converted to the same JSONL layout as the full data
# (other columns stripped, 'abstract' exposed as 'text', flipped label last).
ABSTRACT_CSV_DIR = "/Users/dustinhayes/Desktop/DEEP LEARNING FINAL PROJECT"
ABSTRACT_SPLITS = ("test", "validation", "train")

# Paths to the original datasets
paths_to_convert = [ABSTRACT_CSV_DIR + "/" + split + ".csv" for split in ABSTRACT_SPLITS]

# Paths where the smaller datasets will be saved
# NOTE(review): these are the same paths the full-dataset conversion cell
# writes to, so running this cell replaces the full files with the samples.
paths_to_gain = ["detector/TestData/" + split + ".jsonl" for split in ABSTRACT_SPLITS]

# Number of samples per dataset type
sample_size = 100

for start_path, end_path in zip(paths_to_convert, paths_to_gain):
    # newline='' hands line-ending handling to the csv module, so quoted
    # fields that contain embedded newlines are parsed as a single field.
    with open(start_path, mode='r', encoding='utf-8', newline='') as csv_file, \
         open(end_path, mode='w', encoding='utf-8') as jsonl_file:

        reader = csv.DictReader(csv_file)

        # zip with range() caps the loop at `sample_size` rows per split.
        for _, row in zip(range(sample_size), reader):
            # Invert the 0/1 label; presumably the CSV uses the opposite
            # convention from the other data sources -- TODO confirm.
            flipped_label = str(1 - int(row['label'].strip()))

            # Expose the abstract under the key 'text'.
            row['text'] = row.pop('abstract')

            # Strip whitespace from every non-label field; the label is
            # appended last so it is always the final key of the record.
            cleaned_row = {key: value.strip() for key, value in row.items() if key != 'label'}
            cleaned_row['label'] = flipped_label

            # Write to JSONL file
            jsonl_file.write(json.dumps(cleaned_row) + '\n')
