<a href="https://colab.research.google.com/github/Amanda9805/Detecting-Machine-Generated-Texts/blob/development/data_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

COS760 Detecting Machine-Generated Text in African
Languages

Phase 1 -  Data Collection and Preprocessing

In [None]:
!pip install --upgrade datasets fsspec

Collecting fsspec
  Using cached fsspec-2025.5.1-py3-none-any.whl.metadata (11 kB)


In [None]:
# Colab-only helpers. `drive` is used on the next line and `files` by the
# upload cell further down; `auth` is not used in the cells visible here.
from google.colab import drive, auth, files
# Mount Google Drive at /content/drive; the balanced dataset is later written
# under /content/drive/MyDrive. force_remount=True presumably re-mounts even
# when a mount already exists - confirm against the Colab docs.
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [None]:
import os
import pandas as pd
import numpy as np
import torch
import nltk
import random
from nltk.tokenize import sent_tokenize
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter



In [None]:
# Download the NLTK 'punkt' resource (sent_tokenize is imported above and
# presumably relies on it).
nltk.download('punkt')

# Load the isiZulu ("zul") configuration of the Vuk'uzenzele monolingual dataset
zulu_dataset = load_dataset("dsfsi/vukuzenzele-monolingual", "zul")
train_rows = zulu_dataset['train']

# Summarise what was loaded: split names, then the size of each split
print(f"Available splits: {zulu_dataset.keys()}")
print(f"Number of examples in train: {len(train_rows)}")

print(f"Dataset: {zulu_dataset}")
for split, split_rows in zulu_dataset.items():
    print(f"Number of examples in {split}: {len(split_rows)}")

# Preview the first rows of the training split as a DataFrame
print("\nSample data:")
zuldf = pd.DataFrame(train_rows)
zuldf.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Available splits: dict_keys(['train', 'test', 'eval'])
Number of examples in train: 129
Dataset: DatasetDict({
    train: Dataset({
        features: ['title', 'author', 'text', 'edition', 'language_code', '__index_level_0__'],
        num_rows: 129
    })
    test: Dataset({
        features: ['title', 'author', 'text', 'edition', 'language_code', '__index_level_0__'],
        num_rows: 28
    })
    eval: Dataset({
        features: ['title', 'author', 'text', 'edition', 'language_code', '__index_level_0__'],
        num_rows: 28
    })
})
Number of examples in train: 129
Number of examples in test: 28
Number of examples in eval: 28

Sample data:


Unnamed: 0,title,author,text,edition,language_code,__index_level_0__
0,Zivikele kwi-COVID-19 ngaphambi kobusika \n,Allison Cooper\n,Ngokusho kukaNgqongqoshe Wezempilo uDkt. Joe P...,2022-05-ed2,zul,155
1,Kwethulwe ngokusemthethweni uhlelo lweWoza Ma...,Dale Hes \n,Abafundi BakaMa tikuletsheni bakulonyaka kumel...,2020-09-ed2,zul,85
2,Uhlelo lokudla esikoleni luqinisekisa ukuthi a...,Vukuzenzele Unnamed\n,Lolu hlelo luhlose ukuthuthukisa ikhono labant...,2022-04-ed2,zul,150
3,Izisu ezigcwele ziholela emiphumeleni engcono\n,More Matshediso\n,hulumeni eNingizimu Afrika yonkana abakwazi uk...,2018-08-ed1,zul,26
4,Ukwakha kabusha impilo nendlela yokuphila emuv...,Vukuzenzele Unnamed\n,Kamuva nje bengivakashele eThekwini KwaZulu-Na...,2022-06-ed1,zul,158


In [None]:
def _is_missing(value):
    """Return True when ``value`` is None or an empty / whitespace-only string.

    A plain truthiness test (``not value``) also flags legitimate falsy values
    such as the integer 0, which made ``__index_level_0__`` report one
    "missing" value for the row whose index is 0.
    """
    if value is None:
        return True
    return isinstance(value, str) and not value.strip()


columns = list(zulu_dataset['train'].features.keys())
print(f"Column names: {columns}")

# Count missing values per column on the training split
print("\nChecking for missing values on training data:")
for column in columns:
    missing_count = sum(1 for item in zulu_dataset['train'] if _is_missing(item[column]))
    print(f"  {column}: {missing_count} missing values")

Column names: ['title', 'author', 'text', 'edition', 'language_code', '__index_level_0__']

Checking for missing values on training data:
  title: 0 missing values
  author: 0 missing values
  text: 3 missing values
  edition: 0 missing values
  language_code: 0 missing values
  __index_level_0__: 1 missing values


In [None]:
import re
# Compiled once when the cell runs instead of on every call.
# `http\S+` already matches every `https...` token, so no separate https branch
# is needed, and MULTILINE is irrelevant because no ^/$ anchors are used.
_URL_PATTERN = re.compile(r'http\S+|www\S+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')


def preprocess_text(text):
    """Normalise raw text to lowercase ASCII letters separated by single spaces.

    Steps, in order: strip URL-like tokens (``http...`` / ``www...``), delete
    every character that is not an ASCII letter or whitespace, lowercase, and
    collapse runs of whitespace.

    Note that non-letters are deleted rather than replaced by a space, so
    tokens joined by punctuation or digits are fused (``KwaZulu-Natal`` becomes
    ``kwazulunatal``).

    Args:
        text: The raw text. Anything that is not a non-empty ``str``
            (``None``, ``NaN``, numbers, lists, ...) is treated as no text.

    Returns:
        The cleaned string, or ``""`` when there is nothing to clean.
    """
    # Guard against non-string values: re.sub would raise TypeError on them.
    if not isinstance(text, str) or not text:
        return ""

    without_urls = _URL_PATTERN.sub('', text)
    letters_only = _NON_LETTER_PATTERN.sub('', without_urls)

    # split()/join collapses any whitespace run (including newlines) to one space
    return ' '.join(letters_only.lower().split())

1 Processed text for train: ngokusho kukangqongqoshe wezempilo udkt joe phaahla mhla zingama kumbasa iningizimu afrika yaqopha ukukhula kwezigameko ezintsha zecovid ngama uma kuqhathaniswa nezinsuku eziyisikhombisa ezedlule okuholwa kakhulu ngamazinga aphezulu okusuleleka ezifundazweni zasegauteng kwazulunatali kanye nasentshonalanga kapa icovid isalokhu iyingozi futhi ngeke sikwazi ukuyekela zonke izindlela zokuyivikela kusho ungqongqoshe uphaahla siyaphinda futhi sinxusa bonke labo abangakagomi ukuthi beze bezogoma sisalokhu singaphansi kwama wabantu abadala asebethole okungenani umjovo owodwa kanti lokho akukuhle neze sinxusa intsha esikhulile ukuthi isondele ukuqina kwamasotsha omzimba ngokwemvelo kuyafadalala ngokuhamba kwesikhathi futhi akufani nokugoma ngeke ukubekele isikhathi sinxusa bonke labo abafanelekile ukuthola umjovo wokuvuselela ukuthi basebenzise leli thuba kungakafiki ubusika kwengeza yena umnyango wezempilo entshonalanga kapa uthi imininingwane yalapha kuleli lizwe 

Unnamed: 0,text,title,author,source,language,label
0,ngokusho kukangqongqoshe wezempilo udkt joe ph...,Zivikele kwi-COVID-19 ngaphambi kobusika \n,Allison Cooper\n,vukuzenzele,zul,0
1,abafundi bakama tikuletsheni bakulonyaka kumel...,Kwethulwe ngokusemthethweni uhlelo lweWoza Ma...,Dale Hes \n,vukuzenzele,zul,0
2,lolu hlelo luhlose ukuthuthukisa ikhono labant...,Uhlelo lokudla esikoleni luqinisekisa ukuthi a...,Vukuzenzele Unnamed\n,vukuzenzele,zul,0
3,hulumeni eningizimu afrika yonkana abakwazi uk...,Izisu ezigcwele ziholela emiphumeleni engcono\n,More Matshediso\n,vukuzenzele,zul,0
4,kamuva nje bengivakashele ethekwini kwazulunat...,Ukwakha kabusha impilo nendlela yokuphila emuv...,Vukuzenzele Unnamed\n,vukuzenzele,zul,0


In [None]:
# Extract and preprocess human-written text from every split of the dataset.
PREVIEW_CHARS = 80  # characters of each processed text echoed while iterating

zulu_texts = []
# Running count of non-empty texts seen. This was previously named `next`,
# which overwrote the built-in next() for the rest of the kernel session.
processed_count = 0
for split in zulu_dataset:
    for item in zulu_dataset[split]:
        raw_text = item.get('text')
        if not raw_text:
            continue

        processed_text = preprocess_text(raw_text)
        processed_count += 1
        # Print a short preview only; echoing whole articles floods the output.
        print(f"{processed_count} Processed text for {split}: {processed_text[:PREVIEW_CHARS]}")

        # Filter out very short texts
        if len(processed_text) >= 50:
            zulu_texts.append({
                'text': processed_text,
                'title': item.get('title', ''),  # Include title if available
                'author': item.get('author', ''),
                'source': 'vukuzenzele',
                'language': 'zul',
                'label': 0  # 0 indicates human-written text
            })

print(f"Total processed texts: {len(zulu_texts)}")
zulu_df = pd.DataFrame(zulu_texts)
zulu_df.head()

Dealing with Machine generated text
-----------------------------------
This section preprocesses the machine-generated texts in the same way as the human-written ones, so that they can serve as the second label (machine-generated) for the classification task.

In [None]:
import json

def load_jsonl_data(file_path):
    """Read a JSON Lines file and return its parsed records as a list.

    Blank lines are ignored. A line that is not valid JSON is reported with a
    printed message and skipped. If the file does not exist, a message is
    printed and an empty list is returned.
    """
    records = []
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                if not stripped:
                    # Skip empty lines
                    continue
                try:
                    parsed = json.loads(stripped)
                except json.JSONDecodeError as e:
                    print(f"Error parsing JSON line: {e}")
                else:
                    records.append(parsed)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return []

    return records

def preprocess_jsonl_data(jsonl_data):
    """Convert parsed JSONL records into machine-generated text records.

    Each usable item is cleaned with ``preprocess_text`` and emitted with the
    same keys as the human-written records (``text``, ``title``, ``author``,
    ``source``, ``language``, ``label``) so both lists can be concatenated.

    Args:
        jsonl_data: Iterable of parsed JSON values, one per JSONL line.

    Returns:
        A list of dicts with ``label`` set to 1 (machine-generated). Items that
        are not JSON objects, have no non-empty string ``text``, or whose
        cleaned text is shorter than 50 characters are skipped.
    """
    processed_texts = []

    for item in jsonl_data:
        # A JSONL line may decode to a list, string or number; only objects have fields.
        if not isinstance(item, dict):
            continue

        raw_text = item.get('text')
        if not isinstance(raw_text, str) or not raw_text:
            continue

        processed_text = preprocess_text(raw_text)

        # Filter out very short texts (same threshold as zulu_texts)
        if len(processed_text) >= 50:
            processed_texts.append({
                'text': processed_text,
                'title': item.get('title', ''),
                'author': item.get('author', ''),
                'source': 'machine_generated',
                # Same code as the human-written records ('zul'); the previous
                # 'zu' made the language column differ between the two classes.
                'language': 'zul',
                'label': 1  # 1 indicates machine-generated text
            })

    return processed_texts

In [None]:
# Manually upload the machine-generated JSONL file through the Colab file picker
uploaded = files.upload()

# `uploaded` is used as a mapping keyed by filename. Fail with a clear message
# instead of a bare IndexError when it holds no file (presumably the case when
# the upload dialog is cancelled).
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select the machine-generated JSONL file.")

# Only the first uploaded file is used
jsonl_file_name = list(uploaded.keys())[0]

jsonl_data = load_jsonl_data(jsonl_file_name)

print(f"Loaded {len(jsonl_data)} items from JSONL file")

In [None]:
# Clean the uploaded machine-generated records
machine_generated_texts = preprocess_jsonl_data(jsonl_data)
print(f"Processed {len(machine_generated_texts)} machine-generated texts")

# Concatenate human-written and machine-generated records.
# Original author's caveat: a difference of more than 70% between the two
# class sizes will be a problem for us.
all_texts = [*zulu_texts, *machine_generated_texts]

print(f"Total texts after combining: {len(all_texts)}")
print(f"Human-written texts: {len(zulu_texts)}")
print(f"Machine-generated texts: {len(machine_generated_texts)}")

# One DataFrame over both classes, with the character length of each text
combined_df = pd.DataFrame(all_texts)
combined_df['text_length'] = combined_df['text'].str.len()

# Per-label summary statistics of text length
print("\nText length statistics:")
length_stats = combined_df.groupby('label')['text_length'].describe()
print(length_stats)

# Preview of the machine-generated records only
processed_text_df = pd.DataFrame(machine_generated_texts)
processed_text_df.head()

In [None]:
# Debugging aid: compare text-length distributions of the two classes
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

human_lengths = combined_df[combined_df['label'] == 0]['text_length']
machine_lengths = combined_df[combined_df['label'] == 1]['text_length']

# Left panel: overlaid histograms, one per class
hist_ax.hist(human_lengths, alpha=0.7, label='Human-written', bins=30, color='blue')
hist_ax.hist(machine_lengths, alpha=0.7, label='Machine-generated', bins=30, color='red')
hist_ax.set_xlabel('Text Length (characters)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Text Length Distribution')
hist_ax.legend()

# Right panel: box plot of text length per label
sns.boxplot(data=combined_df, x='label', y='text_length', ax=box_ax)
box_ax.set_xlabel('Label (0=Human, 1=Machine)')
box_ax.set_ylabel('Text Length (characters)')
box_ax.set_title('Text Length by Source')

fig.tight_layout()
plt.show()

In [None]:
def create_balanced_dataset(df, target_size_per_class=None, random_state=42):
    """Down-sample ``df`` so every value of ``label`` has the same number of rows.

    This is an alternative to using the combined (unbalanced) frame directly.

    Args:
        df: DataFrame with a ``label`` column.
        target_size_per_class: Optional cap on rows per class. The size of the
            smallest class is used when this is ``None`` (or any other falsy
            value, e.g. 0) or when it exceeds the smallest class.
        random_state: Seed passed to the sampler; the default of 42 reproduces
            the previously hard-coded behaviour.

    Returns:
        A new DataFrame with a fresh 0..n-1 index. An empty input yields an
        empty frame instead of failing inside the sampler.
    """
    # With no rows there is no smallest class (its size would be NaN).
    if df.empty:
        return df.reset_index(drop=True)

    min_class_size = df['label'].value_counts().min()

    if target_size_per_class:
        sample_size = min(target_size_per_class, min_class_size)
    else:
        sample_size = min_class_size

    balanced_df = df.groupby('label').sample(n=sample_size, random_state=random_state)
    return balanced_df.reset_index(drop=True)

# Down-sample so both classes contribute the same number of rows
balanced_df = create_balanced_dataset(combined_df)
balanced_label_counts = balanced_df['label'].value_counts()
print(f"\nBalanced dataset created with {len(balanced_df)} samples")
print(f"Label distribution in balanced dataset: \n{balanced_label_counts}")

# Write the balanced dataset to the mounted Google Drive as UTF-8 CSV (no index column)
balanced_output_path = '/content/drive/MyDrive/balanced_zulu_texts.csv'
balanced_df.to_csv(balanced_output_path, index=False, encoding='utf-8')
print(f"Balanced dataset saved to: {balanced_output_path}")