In [1]:
#import all the necessary packages to preprocess the data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

#import all the necessary string and text related libraries
import re
import string
import textstat


#import all the nlp related libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize, word_tokenize

#import datasets module to load the dataset from hugging face website
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data = load_dataset("artem9k/ai-text-detection-pile")

In [3]:
#get the train data from the dataset and convert to dataframe
train = data["train"].to_pandas() 
train.head()

Unnamed: 0,source,id,text
0,human,0,12 Years a Slave: An Analysis of the Film Essa...
1,human,1,20+ Social Media Post Ideas to Radically Simpl...
2,human,2,2022 Russian Invasion of Ukraine in Global Med...
3,human,3,533 U.S. 27 (2001) Kyllo v. United States: The...
4,human,4,A Charles Schwab Corporation Case Essay\n\nCha...


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1392522 entries, 0 to 1392521
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   source  1392522 non-null  object
 1   id      1392522 non-null  int64 
 2   text    1392522 non-null  object
dtypes: int64(1), object(2)
memory usage: 31.9+ MB


In [5]:
# Find the label names to convert them to 1 and 0
train['source'].unique()

array(['human', 'ai'], dtype=object)

In [6]:
#replace human with 0 and ai with 1
train['source'].replace(['human','ai'],[0.0,1.0],inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['source'].replace(['human','ai'],[0.0,1.0],inplace = True)
  train['source'].replace(['human','ai'],[0.0,1.0],inplace = True)


In [7]:
train.head()

Unnamed: 0,source,id,text
0,0.0,0,12 Years a Slave: An Analysis of the Film Essa...
1,0.0,1,20+ Social Media Post Ideas to Radically Simpl...
2,0.0,2,2022 Russian Invasion of Ukraine in Global Med...
3,0.0,3,533 U.S. 27 (2001) Kyllo v. United States: The...
4,0.0,4,A Charles Schwab Corporation Case Essay\n\nCha...


In [8]:
# drop the id column since it might not provide any meaning in the model training
train.drop('id',axis = 1, inplace = True)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1392522 entries, 0 to 1392521
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   source  1392522 non-null  float64
 1   text    1392522 non-null  object 
dtypes: float64(1), object(1)
memory usage: 21.2+ MB


In [9]:
#rename source column to generated
train.rename(columns = {'source':'generated'},inplace= True)
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1392522 entries, 0 to 1392521
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   generated  1392522 non-null  float64
 1   text       1392522 non-null  object 
dtypes: float64(1), object(1)
memory usage: 21.2+ MB


In [10]:
# get the value count to check if there is any unbalance in the classes
class_counts = train['generated'].value_counts()

In [11]:
class_counts

generated
0.0    1028146
1.0     364376
Name: count, dtype: int64

In [12]:
#Balance the dataset since the human generated data is huge

class_1 = train[train['generated'] == 1.0].sample(200000, random_state=42)
class_0 = train[train['generated'] == 0.0].sample(200000, random_state=42)

train = pd.concat([class_0, class_1])

# Shuffle the data
train = train.sample(frac=1, random_state=42).reset_index(drop=True)

In [13]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400000 entries, 0 to 399999
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   generated  400000 non-null  float64
 1   text       400000 non-null  object 
dtypes: float64(1), object(1)
memory usage: 6.1+ MB


In [14]:
# Function to find the average length of the sentence
def average_sentence_length(text):
    sentence_endings = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s"
    sentences = re.split(sentence_endings, text)
    lengths = [len(words.split()) for words in sentences]
    avg = sum(lengths)/len(sentences)
    return avg

#Apply the function to the 'text' column
train['avg_len_sentences'] = train['text'].apply(average_sentence_length)
train.head()

Unnamed: 0,generated,text,avg_len_sentences
0,0.0,of nomadic violence and instability after the ...,23.857143
1,0.0,Google Company’s Personal Development Plan Ess...,17.027778
2,0.0,"n ``Come on now, are you guys still trying to ...",21.375
3,0.0,"I loved him. I loved him fast, I loved him har...",7.111111
4,1.0,\n\n\n There are a number of important factors...,22.428571


In [15]:
# Function to find the word count in each row
def word_count(text):
    return len(text.split())
    
#Apply the function to the 'text' column
train['words_count'] = train['text'].apply(word_count)
train.head()

Unnamed: 0,generated,text,avg_len_sentences,words_count
0,0.0,of nomadic violence and instability after the ...,23.857143,167
1,0.0,Google Company’s Personal Development Plan Ess...,17.027778,613
2,0.0,"n ``Come on now, are you guys still trying to ...",21.375,171
3,0.0,"I loved him. I loved him fast, I loved him har...",7.111111,192
4,1.0,\n\n\n There are a number of important factors...,22.428571,157


In [16]:
# Function to get the punctuation count in each text
def punctuation_count(text):
    punctuation_pattern = r"[.,!?;:'\"()\[\]{}]"
    matches = re.findall(punctuation_pattern, text)
    return len(matches)

#Apply the function to the 'text' column
train['punctuations_count'] = train['text'].apply(punctuation_count)
train.head()

Unnamed: 0,generated,text,avg_len_sentences,words_count,punctuations_count
0,0.0,of nomadic violence and instability after the ...,23.857143,167,22
1,0.0,Google Company’s Personal Development Plan Ess...,17.027778,613,97
2,0.0,"n ``Come on now, are you guys still trying to ...",21.375,171,38
3,0.0,"I loved him. I loved him fast, I loved him har...",7.111111,192,54
4,1.0,\n\n\n There are a number of important factors...,22.428571,157,24


In [17]:
#get the readability score for each text
def readability_score(text):
    return textstat.flesch_reading_ease(text)

#Apply the function to the 'text' column
train['readability_score'] = train['text'].apply(readability_score)

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aishu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
stop_words = set(stopwords.words('english'))

# Function to find the stop words ratio to the original number of words
def stop_words_ratio(text):
    stop_count = 0
    new = text.split()
    for i in new:
        if(i in stop_words):
            stop_count+=1
    return stop_count/len(new) if len(new)>0 else 0

#Apply the function to the 'text' column
train['stop_word_ratio'] = train['text'].apply(stop_words_ratio)

In [20]:
train.head()

Unnamed: 0,generated,text,avg_len_sentences,words_count,punctuations_count,readability_score,stop_word_ratio
0,0.0,of nomadic violence and instability after the ...,23.857143,167,22,50.36,0.389222
1,0.0,Google Company’s Personal Development Plan Ess...,17.027778,613,97,44.24,0.373573
2,0.0,"n ``Come on now, are you guys still trying to ...",21.375,171,38,82.34,0.409357
3,0.0,"I loved him. I loved him fast, I loved him har...",7.111111,192,54,88.63,0.3125
4,1.0,\n\n\n There are a number of important factors...,22.428571,157,24,28.88,0.382166


In [21]:
# Function to convert the text to lower case
def lower_case(text):
    return text.lower()

# Apply the function to the 'text' column
train['text'] = train['text'].apply(lower_case)
train.head()

Unnamed: 0,generated,text,avg_len_sentences,words_count,punctuations_count,readability_score,stop_word_ratio
0,0.0,of nomadic violence and instability after the ...,23.857143,167,22,50.36,0.389222
1,0.0,google company’s personal development plan ess...,17.027778,613,97,44.24,0.373573
2,0.0,"n ``come on now, are you guys still trying to ...",21.375,171,38,82.34,0.409357
3,0.0,"i loved him. i loved him fast, i loved him har...",7.111111,192,54,88.63,0.3125
4,1.0,\n\n\n there are a number of important factors...,22.428571,157,24,28.88,0.382166


In [22]:
# Function to remove stop words
def remove_stopwords(text):
    for i in stop_words:
        text.replace(i,'')
    return text

# Apply the function to the 'text' column
train['text'] = train['text'].apply(remove_stopwords)

In [23]:
# Function to remove tags such as new line
def remove_tags(text):
    tags = ['\n', '\'']
    for tag in tags:
        text = text.replace(tag, '')
    return text

#Apply the function to the 'text' column
train['text'] = train['text'].apply(remove_tags)
train.head()

Unnamed: 0,generated,text,avg_len_sentences,words_count,punctuations_count,readability_score,stop_word_ratio
0,0.0,of nomadic violence and instability after the ...,23.857143,167,22,50.36,0.389222
1,0.0,google company’s personal development plan ess...,17.027778,613,97,44.24,0.373573
2,0.0,"n ``come on now, are you guys still trying to ...",21.375,171,38,82.34,0.409357
3,0.0,"i loved him. i loved him fast, i loved him har...",7.111111,192,54,88.63,0.3125
4,1.0,there are a number of important factors that ...,22.428571,157,24,28.88,0.382166


In [24]:
# Function to remove punctuations from the text
def remove_punc(text):
    new_text = [x for x in text if x not in string.punctuation]
    new_text = ''.join(new_text)
    return new_text

#Apply the function to the 'text' column
train['text']=train['text'].apply(remove_punc)
train.head()

Unnamed: 0,generated,text,avg_len_sentences,words_count,punctuations_count,readability_score,stop_word_ratio
0,0.0,of nomadic violence and instability after the ...,23.857143,167,22,50.36,0.389222
1,0.0,google company’s personal development plan ess...,17.027778,613,97,44.24,0.373573
2,0.0,n come on now are you guys still trying to act...,21.375,171,38,82.34,0.409357
3,0.0,i loved him i loved him fast i loved him hard ...,7.111111,192,54,88.63,0.3125
4,1.0,there are a number of important factors that ...,22.428571,157,24,28.88,0.382166


In [25]:
#Save the Processed Data
train.to_csv('DataSets/Processed_Data_small.csv',index = False,sep = ',',encoding='utf-8', quoting=1)