In [65]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn
import xgboost as xgb

In [66]:
# First, get the dataset
dataset = pd.read_csv('datasets/daigt_external_dataset.csv')

In [67]:
dataset.shape

(2421, 4)

In [68]:
dataset.columns

Index(['id', 'text', 'instructions', 'source_text'], dtype='object')

In [69]:
# Text is human generated, source_text is A.I generated, lets change the column names
dataset.rename(columns={'id': 'id', 'text': 'human_text', 'instructions': 'prompt', 'source_text': 'A.I Text'}, inplace=True)

In [70]:
dataset.columns

Index(['id', 'human_text', 'prompt', 'A.I Text'], dtype='object')

In [71]:
dataset.head()

Unnamed: 0,id,human_text,prompt,A.I Text
0,6060D28C05B6,Some schools in United States ofter classes fr...,\nTask: Write a persuasive essay on whether or...,\nWhen considering the pros and cons of attend...
1,60623DB5DE7A,"Four-day work week, a remarkable idea to conse...",\nTask: Research the advantages and disadvanta...,\nOne of the primary arguments for implementin...
2,607A39D981DE,Students and their families should consider an...,\nTask: \n\n1. Talk to your parents before tak...,\nBefore making any decisions about getting in...
3,60ACDFA1609E,Agree you will never grow if something beyond ...,\nTask: Write an essay discussing the benefits...,"\nRalph Waldo Emerson once said, ""Go confident..."
4,60AE13D3F07B,I think our character traits are formed by inf...,\nTask: Research and discuss how character tra...,\nHuman character traits are shaped by a wide ...


In [72]:
# Strip the \n for the text in each of the columns
dataset['human_text'] = dataset['human_text'].str.replace('\n', '')
dataset['prompt'] = dataset['prompt'].str.replace('\nTask:', '')
dataset['prompt'] = dataset['prompt'].str.replace('\n:', '')
dataset['A.I Text'] = dataset['A.I Text'].str.replace('\n', '')

In [73]:
dataset.head()

Unnamed: 0,id,human_text,prompt,A.I Text
0,6060D28C05B6,Some schools in United States ofter classes fr...,Write a persuasive essay on whether or not cl...,When considering the pros and cons of attendin...
1,60623DB5DE7A,"Four-day work week, a remarkable idea to conse...",Research the advantages and disadvantages of ...,One of the primary arguments for implementing ...
2,607A39D981DE,Students and their families should consider an...,\n\n1. Talk to your parents before taking any...,Before making any decisions about getting invo...
3,60ACDFA1609E,Agree you will never grow if something beyond ...,Write an essay discussing the benefits of pus...,"Ralph Waldo Emerson once said, ""Go confidently..."
4,60AE13D3F07B,I think our character traits are formed by inf...,Research and discuss how character traits are...,Human character traits are shaped by a wide ra...


In [74]:
# We will rearange the dataframe to the following structure: 
# id | text | prompt | is_ai 

df_human = pd.concat([dataset['id'], dataset['prompt'], dataset['human_text'], pd.Series(0, index=dataset.index, name='is_ai')], axis=1)
df_human.rename(columns={'human_text': 'text'}, inplace=True)
df_human.head()

Unnamed: 0,id,prompt,text,is_ai
0,6060D28C05B6,Write a persuasive essay on whether or not cl...,Some schools in United States ofter classes fr...,0
1,60623DB5DE7A,Research the advantages and disadvantages of ...,"Four-day work week, a remarkable idea to conse...",0
2,607A39D981DE,\n\n1. Talk to your parents before taking any...,Students and their families should consider an...,0
3,60ACDFA1609E,Write an essay discussing the benefits of pus...,Agree you will never grow if something beyond ...,0
4,60AE13D3F07B,Research and discuss how character traits are...,I think our character traits are formed by inf...,0


In [54]:
df_ai = pd.concat([dataset['id'], dataset['prompt'], dataset['A.I Text'], pd.Series(1, index=dataset.index, name='is_ai')], axis=1)
df_ai.rename(columns={'A.I Text': 'text'}, inplace=True)
df_ai.head()

Unnamed: 0,id,prompt,text,is_ai
0,6060D28C05B6,Task: Write a persuasive essay on whether or n...,When considering the pros and cons of attendin...,1
1,60623DB5DE7A,Task: Research the advantages and disadvantage...,One of the primary arguments for implementing ...,1
2,607A39D981DE,Task: 1. Talk to your parents before taking an...,Before making any decisions about getting invo...,1
3,60ACDFA1609E,Task: Write an essay discussing the benefits o...,"Ralph Waldo Emerson once said, ""Go confidently...",1
4,60AE13D3F07B,Task: Research and discuss how character trait...,Human character traits are shaped by a wide ra...,1


In [57]:
df = pd.concat([df_human, df_ai], ignore_index=True)
df

Unnamed: 0,id,prompt,text,is_ai
0,6060D28C05B6,Task: Write a persuasive essay on whether or n...,Some schools in United States ofter classes fr...,0
1,60623DB5DE7A,Task: Research the advantages and disadvantage...,"Four-day work week, a remarkable idea to conse...",0
2,607A39D981DE,Task: 1. Talk to your parents before taking an...,Students and their families should consider an...,0
3,60ACDFA1609E,Task: Write an essay discussing the benefits o...,Agree you will never grow if something beyond ...,0
4,60AE13D3F07B,Task: Research and discuss how character trait...,I think our character traits are formed by inf...,0
...,...,...,...,...
4837,F5FF5E9E553C,Task: Research different kinds of medical prof...,Becoming a surgeon requires a great deal of de...,1
4838,F60545D8271E,Task: Write an essay discussing why schools ne...,Schools should offer an after school homework ...,1
4839,F610B3CBF3DF,Task: Write an essay about how having a few mi...,It’s human nature to be afraid to make mistake...,1
4840,F610C7BCD9EC,Task: Write an essay exploring the pros and co...,One of the main debates of 2020 for many stude...,1


In [61]:
shuffled_data = df.sample(frac=1).reset_index(drop=True)

In [62]:
shuffled_data.head()

Unnamed: 0,id,prompt,text,is_ai
0,81B000BCC5B3,Task:Write an essay about the importance of en...,Enthusiasm is one of the most important qualit...,1
1,A1DC92E4E3C8,Task: Research the positive effects of limited...,"In the United States, limited human contact du...",1
2,EFF1803E4B81,Task: Write an essay explaining why failure is...,Why are people scared to fail? Is failing real...,0
3,E36B406AFE70,Task: 1. Research how being inactive can serve...,Although it may seem like being active helps u...,0
4,B936994E140D,Task: Research the benefits of online classes ...,I agree that students would benefit better on ...,0
