In [166]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

In [167]:
# First, get the dataset
dataset = pd.read_csv('datasets/daigt_external_dataset.csv')

In [168]:
dataset.shape

(2421, 4)

In [169]:
dataset.columns

Index(['id', 'text', 'instructions', 'source_text'], dtype='object')

In [170]:
# Text is human generated, source_text is A.I generated, lets change the column names
dataset.rename(columns={'id': 'id', 'text': 'human_text', 'instructions': 'prompt', 'source_text': 'A.I Text'}, inplace=True)

In [171]:
dataset.columns

Index(['id', 'human_text', 'prompt', 'A.I Text'], dtype='object')

In [172]:
dataset.head()

Unnamed: 0,id,human_text,prompt,A.I Text
0,6060D28C05B6,Some schools in United States ofter classes fr...,\nTask: Write a persuasive essay on whether or...,\nWhen considering the pros and cons of attend...
1,60623DB5DE7A,"Four-day work week, a remarkable idea to conse...",\nTask: Research the advantages and disadvanta...,\nOne of the primary arguments for implementin...
2,607A39D981DE,Students and their families should consider an...,\nTask: \n\n1. Talk to your parents before tak...,\nBefore making any decisions about getting in...
3,60ACDFA1609E,Agree you will never grow if something beyond ...,\nTask: Write an essay discussing the benefits...,"\nRalph Waldo Emerson once said, ""Go confident..."
4,60AE13D3F07B,I think our character traits are formed by inf...,\nTask: Research and discuss how character tra...,\nHuman character traits are shaped by a wide ...


In [173]:
# Clean the data
dataset['human_text'] = dataset['human_text'].str.replace('\n', '')
dataset['human_text'] = dataset['human_text'].str.lower()

dataset['prompt'] = dataset['prompt'].str.replace('\nTask:', '')
dataset['prompt'] = dataset['prompt'].str.replace('\n', '')
dataset['prompt'] = dataset['prompt'].str.lower()

dataset['A.I Text'] = dataset['A.I Text'].str.replace('\n', '')
dataset['A.I Text'] = dataset['A.I Text'].str.lower()

In [174]:
dataset.head()

Unnamed: 0,id,human_text,prompt,A.I Text
0,6060D28C05B6,some schools in united states ofter classes fr...,write a persuasive essay on whether or not cl...,when considering the pros and cons of attendin...
1,60623DB5DE7A,"four-day work week, a remarkable idea to conse...",research the advantages and disadvantages of ...,one of the primary arguments for implementing ...
2,607A39D981DE,students and their families should consider an...,1. talk to your parents before taking any dec...,before making any decisions about getting invo...
3,60ACDFA1609E,agree you will never grow if something beyond ...,write an essay discussing the benefits of pus...,"ralph waldo emerson once said, ""go confidently..."
4,60AE13D3F07B,i think our character traits are formed by inf...,research and discuss how character traits are...,human character traits are shaped by a wide ra...


In [175]:
# We will rearange the dataframe to the following structure: 
# id | text | prompt | is_ai 

df_human = pd.concat([dataset['id'], dataset['prompt'], dataset['human_text'], pd.Series(0, index=dataset.index, name='is_ai')], axis=1)
df_human.rename(columns={'human_text': 'text'}, inplace=True)
df_human.head()

Unnamed: 0,id,prompt,text,is_ai
0,6060D28C05B6,write a persuasive essay on whether or not cl...,some schools in united states ofter classes fr...,0
1,60623DB5DE7A,research the advantages and disadvantages of ...,"four-day work week, a remarkable idea to conse...",0
2,607A39D981DE,1. talk to your parents before taking any dec...,students and their families should consider an...,0
3,60ACDFA1609E,write an essay discussing the benefits of pus...,agree you will never grow if something beyond ...,0
4,60AE13D3F07B,research and discuss how character traits are...,i think our character traits are formed by inf...,0


In [176]:
df_ai = pd.concat([dataset['id'], dataset['prompt'], dataset['A.I Text'], pd.Series(1, index=dataset.index, name='is_ai')], axis=1)
df_ai.rename(columns={'A.I Text': 'text'}, inplace=True)
df_ai.head()

Unnamed: 0,id,prompt,text,is_ai
0,6060D28C05B6,write a persuasive essay on whether or not cl...,when considering the pros and cons of attendin...,1
1,60623DB5DE7A,research the advantages and disadvantages of ...,one of the primary arguments for implementing ...,1
2,607A39D981DE,1. talk to your parents before taking any dec...,before making any decisions about getting invo...,1
3,60ACDFA1609E,write an essay discussing the benefits of pus...,"ralph waldo emerson once said, ""go confidently...",1
4,60AE13D3F07B,research and discuss how character traits are...,human character traits are shaped by a wide ra...,1


In [177]:
df = pd.concat([df_human, df_ai], ignore_index=True)
df

Unnamed: 0,id,prompt,text,is_ai
0,6060D28C05B6,write a persuasive essay on whether or not cl...,some schools in united states ofter classes fr...,0
1,60623DB5DE7A,research the advantages and disadvantages of ...,"four-day work week, a remarkable idea to conse...",0
2,607A39D981DE,1. talk to your parents before taking any dec...,students and their families should consider an...,0
3,60ACDFA1609E,write an essay discussing the benefits of pus...,agree you will never grow if something beyond ...,0
4,60AE13D3F07B,research and discuss how character traits are...,i think our character traits are formed by inf...,0
...,...,...,...,...
4837,F5FF5E9E553C,research different kinds of medical professio...,becoming a surgeon requires a great deal of de...,1
4838,F60545D8271E,write an essay discussing why schools need an...,schools should offer an after school homework ...,1
4839,F610B3CBF3DF,write an essay about how having a few mistake...,it’s human nature to be afraid to make mistake...,1
4840,F610C7BCD9EC,write an essay exploring the pros and cons of...,one of the main debates of 2020 for many stude...,1


In [178]:
shuffled_data = df.sample(frac=1).reset_index(drop=True)

In [179]:
shuffled_data.head()

Unnamed: 0,id,prompt,text,is_ai
0,EEEF81C2E133,consider the reasons why the author of this e...,the author of this essay can be a suitable emp...,1
1,E966121134B7,write an essay that explores the rights of st...,"the life of the some student's, of wants super...",0
2,CD81F6CCE819,research ways to build self-confidence and se...,i believe that self-esteem comes from achievem...,0
3,EB1500B8EA75,research the pros and cons of online schoolin...,online schooling has become increasingly popul...,1
4,DC88C1130FD9,write an essay discussing how failure is impo...,"when it comes to success, failing can often be...",1


In [180]:
# Split the data 
X = shuffled_data['text']
y = shuffled_data['is_ai']

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [181]:
vectoriser = TfidfVectorizer(max_features=5000)

In [182]:
X_train = vectoriser.fit_transform(X_train)
X_test = vectoriser.fit_transform(X_test)

In [183]:
clf = LogisticRegression()
clf.fit(X_train, y_train)

In [184]:
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.5111478117258464


In [185]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       592
           1       0.51      1.00      0.68       619

    accuracy                           0.51      1211
   macro avg       0.26      0.50      0.34      1211
weighted avg       0.26      0.51      0.35      1211



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
