In [2]:
#install packege for language detection
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 10.2/981.5 kB ? eta -:--:--
     ---------------------------------------- 10.2/981.5 kB ? eta -:--:--
     - ----------------------------------- 30.7/981.5 kB 217.9 kB/s eta 0:00:05
     - ----------------------------------- 30.7/981.5 kB 217.9 kB/s eta 0:00:05
     -- ---------------------------------- 61.4/981.5 kB 297.7 kB/s eta 0:00:04
     ---- ------------------------------- 112.6/981.5 kB 467.6 kB/s eta 0:00:02
     ---- ------------------------------- 122.9/981.5 kB 481.4 kB/s eta 0:00:02
     -------- --------------------------- 225.3/981.5 kB 689.2 kB/s eta 0:00:02
     ----------- ------------------------ 307.2/981.5 kB 827.2 kB/s eta 0:00:01
     ------------- ---------------------- 368.6/981.5 kB 919.0 kB/s eta 0:00:

In [10]:
#import libraries
import pandas as pd
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
import re

In [22]:
#load raw dataset
df = pd.read_csv('C:/Users/Muditha/OneDrive - University of Eastern Finland/Documents/UEF/Thesis/AI-writing-detector/data/rawdataset.csv')
print("Total records before cleaning:", len(df))

#drop missing abstracts
df = df.dropna(subset=['Abstract'])
print("After dropping missing abstracts:", len(df))

#detect and keep abstracts only written in English language
def is_english(text):
    try:
        return detect(text) == 'en'
    except LangDetectException:
            return False

df['is_english'] = df['Abstract'].apply(is_english)
df = df[df['is_english']]
print("After filtering English Abstracts:", len(df))

#Clean abstract text(remove XML/HTML tags, remove non-ASCII, normalize whitespace)
def clean_text(text):
    text = re.sub(r'<[^>]+>', '', text) #remove XML/HTML tags
    text = text.encode("ascii","ignore").decode() #remove non-ASCII
    text = re.sub(r'\s+', ' ', text).strip() #normalize whitespace
    return text

df['Abstract'] = df['Abstract'].apply(clean_text)

#remove noice/garbage abstracts
def is_clean_abstract(text):
    if not text or len(text.split()) < 5:
        return False
    if re.match(r'^[^a-zA-Z0-9]+$', text.strip().split()[0]):
        return False
    total_chars = len(text)
    punct_chars = sum(1 for c in text if c in r"""!@#$%^&*()_+[]{};:'"\|,<.>/?`~-=—…""")
    if punct_chars / total_chars > 0.1:
        return False
    if len(re.findall(r'(,,|;;|\.{2,}|\-{2,})', text)) > 3:
        return False
    return True

df = df[df['Abstract'].apply(is_clean_abstract)]
print("After removing garbage/irregular abstracts:", len(df))

#drop rows where year is missing or non-numeric
df = df[pd.to_numeric(df['Publication Year'], errors='coerce').notnull()]
df['Publication Year'] = df['Publication Year'].astype(int)

#filtering valid year range(1812-2025)
df = df[df['Publication Year'].between(1812,2025,inclusive = 'both')]
print("After filtering by valid year range:",len(df))

#drop duploicate abstracts 
df = df.drop_duplicates(subset = ['Abstract'])
print("After removing duplicate abstracts:", len(df))

#keep only relevant coloumns 
df =  df[['Lens ID', 'Title', 'Abstract', 'Publication Year']].reset_index(drop=True)

  df = pd.read_csv('C:/Users/Muditha/OneDrive - University of Eastern Finland/Documents/UEF/Thesis/AI-writing-detector/data/rawdataset.csv')


Total records before cleaning: 38123
After dropping missing abstracts: 29809
After filtering English Abstracts: 28168
After removing garbage/irregular abstracts: 27884
After filtering by valid year range: 27649
After removing duplicate abstracts: 27048


In [24]:
#save cleaned dataset
df.to_csv('C:/Users/Muditha/OneDrive - University of Eastern Finland/Documents/UEF/Thesis/AI-writing-detector/data/cleaned_dataset.csv', index=False)
print("Cleaned file saved! Final total records:", len(df))

Cleaned file saved! Final total records: 27048
