In [1]:
import pandas as pd
import numpy as np
import string
import enchant
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfTransformer


In [2]:
# Fetch the NLTK data packages that back the imports above: 'wordnet' for
# WordNetLemmatizer and 'stopwords' for nltk.corpus.stopwords.
nltk.download('wordnet')
nltk.download('stopwords')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\amosg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\amosg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Read the raw dataset from a CSV file in the working directory.
wiki_intro_csv = 'GPT-wiki-intro.csv'
data_1 = pd.read_csv(wiki_intro_csv)


In [3]:
# Overwrite 'generated_text' with "<generated_intro> <generated_text>" for every row.
# NOTE(review): the column is rewritten in place, so running this cell twice prepends
# the intro twice. Restart the kernel and run top to bottom.
# NOTE(review): if 'generated_intro' already contains the text held in 'generated_text',
# this duplicates it. TODO confirm against the dataset documentation.
data_1['generated_text'] = data_1['generated_intro'].str.cat(data_1['generated_text'], sep=' ')

In [4]:
# Preview the first rows to check the combined 'generated_text' column.
data_1.head()

Unnamed: 0,id,url,title,wiki_intro,generated_intro,title_len,wiki_intro_len,generated_intro_len,prompt,generated_text,prompt_tokens,generated_text_tokens
0,63064638,https://en.wikipedia.org/wiki/Sexhow%20railway...,Sexhow railway station,Sexhow railway station was a railway station b...,Sexhow railway station was a railway station l...,3,174,78,200 word wikipedia style introduction on 'Sexh...,Sexhow railway station was a railway station l...,25,88
1,279621,https://en.wikipedia.org/wiki/Eti%C3%A4inen,Etiäinen,"In Finnish folklore, all places and things, an...","In Finnish folklore, all places and things, an...",1,187,80,200 word wikipedia style introduction on 'Etiä...,"In Finnish folklore, all places and things, an...",26,101


In [5]:
# Keep only the two text columns and their length columns; every other column is discarded.
unused_columns = [
    'url', 'id', 'title_len', 'prompt', 'prompt_tokens',
    'title', 'generated_text_tokens', 'generated_intro',
]
data_1 = data_1.drop(columns=unused_columns)

In [6]:
# Show the first two rows to verify which columns remain after the drop.
data_1.head(2)

Unnamed: 0,wiki_intro,wiki_intro_len,generated_intro_len,generated_text
0,Sexhow railway station was a railway station b...,174,78,Sexhow railway station was a railway station l...
1,"In Finnish folklore, all places and things, an...",187,80,"In Finnish folklore, all places and things, an..."


In [7]:
# Take an independent (deep) copy so the Wikipedia side can be processed
# separately from the AI-generated side.
data_2 = data_1.copy(deep=True)

In [8]:
# Confirm the copy shows the same columns and rows as data_1.
data_2.head(2)

Unnamed: 0,wiki_intro,wiki_intro_len,generated_intro_len,generated_text
0,Sexhow railway station was a railway station b...,174,78,Sexhow railway station was a railway station l...
1,"In Finnish folklore, all places and things, an...",187,80,"In Finnish folklore, all places and things, an..."


# Dealing with the First Dataset: AI-Generated Text

In [9]:
# Bind a second name to data_1. Nothing is copied or renamed here: both names
# refer to the same DataFrame object until AI_generated is reassigned.
AI_generated = data_1

In [10]:
# Build the AI-generated frame by removing the Wikipedia text and its length.
AI_generated = data_1.drop(['wiki_intro', 'wiki_intro_len'], axis='columns')

In [11]:
# Label every AI-generated row with class 0 in a column named 'output'.
# Assigning the final column name directly keeps the cell idempotent: re-running it
# overwrites 'output' instead of adding a second column with the same name.
AI_generated = AI_generated.assign(output=0)

In [12]:
# Check the AI-generated frame: remaining columns plus the new 'output' label.
AI_generated.head(2)

Unnamed: 0,generated_intro_len,generated_text,output
0,78,Sexhow railway station was a railway station l...,0
1,80,"In Finnish folklore, all places and things, an...",0


# Dealing with the Second Dataset: Wikipedia Text


In [13]:
# Bind a second name to data_2. Nothing is copied or renamed here: both names
# refer to the same DataFrame object until wiki_generated is reassigned.
wiki_generated = data_2

In [14]:
# Build the Wikipedia frame by removing the generated text and its length.
wiki_generated = data_2.drop(['generated_text', 'generated_intro_len'], axis='columns')

In [15]:
# Label every Wikipedia row with class 1 in a column named 'output'.
# Assigning the final column name directly keeps the cell idempotent: re-running it
# overwrites 'output' instead of adding a second column with the same name.
wiki_generated = wiki_generated.assign(output=1)

In [16]:
# Check the Wikipedia frame: remaining columns plus the new 'output' label.
wiki_generated.head(2)

Unnamed: 0,wiki_intro,wiki_intro_len,output
0,Sexhow railway station was a railway station b...,174,1
1,"In Finnish folklore, all places and things, an...",187,1


# Renaming the Columns to Make the Datasets Uniform

In [17]:
# Give the Wikipedia frame the shared schema: 'wiki_intro' -> 'Text', 'wiki_intro_len' -> 'len'.
wiki_column_names = {'wiki_intro': 'Text', 'wiki_intro_len': 'len'}
wiki_generated = wiki_generated.rename(columns=wiki_column_names)


In [18]:
# Inspect the last two rows to confirm the new column names.
wiki_generated.tail(2)

Unnamed: 0,Text,len,output
149998,Vossius Gymnasium is a public gymnasium in Ams...,168,1
149999,"Simone Stratigo (, Symeon Filippos Stratigos; ...",153,1


In [19]:

# Give the AI-generated frame the shared schema:
# 'generated_text' -> 'Text', 'generated_intro_len' -> 'len'.
ai_column_names = {'generated_text': 'Text', 'generated_intro_len': 'len'}
AI_generated = AI_generated.rename(columns=ai_column_names)


In [20]:
# Inspect the last two rows to confirm the new column names.
AI_generated.tail(2)

Unnamed: 0,len,Text,output
149998,108,Vossius Gymnasium is a public gymnasium in the...,0
149999,132,"Simone Stratigo (, Symeon Filippos Stratigos; ...",0


# Combining Both Datasets

In [21]:
# Stack the AI-generated rows on top of the Wikipedia rows. Columns are matched by
# name, and the combined frame gets a fresh 0..n-1 index.
Data = pd.concat((AI_generated, wiki_generated), ignore_index=True)


In [22]:
# The last rows come from wiki_generated (output == 1), which was appended second.
Data.tail(2)

Unnamed: 0,len,Text,output
299998,168,Vossius Gymnasium is a public gymnasium in Ams...,1
299999,153,"Simone Stratigo (, Symeon Filippos Stratigos; ...",1


In [23]:
# Count missing values per column; a non-zero count would point to mismatched
# column names between the two concatenated frames or to missing source data.
Data.isnull().sum()

len       0
Text      0
output    0
dtype: int64

In [24]:
# Shuffle the rows so the two classes are interleaved instead of stacked in two
# contiguous halves. The fixed random_state makes the row order, and every preview
# or result derived from it, reproducible across kernel restarts.
data = Data.sample(frac=1, random_state=0)

In [25]:
# Peek at the shuffled frame; the index still shows each row's pre-shuffle position.
data.head(3)

Unnamed: 0,len,Text,output
106327,95,The American Hairless Terrier is a breed of te...,0
91423,91,Cuminestown is a village in the Formartine Val...,0
262999,184,MeeK (birth name Stephane-Franck Pascal; born ...,1


# Feature Extraction

In [30]:
 data['text_length'] = data['Text'].apply(len)
# Extracting TF-IDF features from the text
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_features = vectorizer.fit_transform(data['Text']).toarray()
# Combining text length and TF-IDF features
features = np.hstack((data[['text_length']].values, tfidf_features))
labels = data['label']

# Training the Model

In [156]:
# Split the data into training and testing sets (80% / 20%, fixed seed).
# Three aligned inputs go in (text, target, length), so six outputs come back in
# train/test pairs: X_* hold the raw text, y_* the 'output' target, len_* the 'len' column.
X_train, X_test, y_train, y_test, len_train, len_test = train_test_split(
    data['Text'], data['output'], data['len'], test_size=0.2, random_state=0)

In [9]:
# Turn the text into TF-IDF vectors. The vocabulary (English stop words removed,
# capped at 10000 terms) is learned from the training split only and then applied
# unchanged to the test split.
tfidf_options = {'stop_words': 'english', 'max_features': 10000}
vectorizer = TfidfVectorizer(**tfidf_options)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [10]:
# Booster settings and number of rounds.
# NOTE(review): neither `param` nor `epochs` is referenced by any other visible cell.
# NOTE(review): 'num_class' is 3 although the 'output' target only takes the values
# 0 and 1. Confirm whether a multi-class objective is intended.
param = dict(max_depth=4, eta=0.3, objective='multi:softmax', num_class=3)
epochs = 5

In [None]:
# Train a gradient-boosted tree classifier on the TF-IDF training matrix.
# The target has two classes (0 and 1), so the binary 'logloss' metric is configured
# instead of the multi-class 'mlogloss'.
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train_tfidf, y_train)

In [None]:
# Score the held-out documents. The model was fitted on TF-IDF vectors, so it must
# be given the transformed test matrix, not the raw text in X_test.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy: ", accuracy)
print("Confusion Matrix: \n", cm)
print("Classification Report: \n", report)
