In [16]:
import gzip
import shutil
import time

import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext
import numpy as np

import transformers
from transformers import DistilBertForSequenceClassification, AdamW

import os

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'

In [3]:
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

NUM_EPOCHS = 3

DEVICE = torch.device("cuda:0")

In [56]:
df = pd.read_csv('HumanAI_Merged.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df["human"] = df["human"].astype(int)

In [57]:
AI_human = df.loc[df['human'] == False][:30000]
Human_human = filtered_data = df.loc[df['human'] == True][:30000]
AI_content = df.loc[AI_human.index]['content']
human_content = df.loc[Human_human.index]['content']

all_test = pd.concat([AI_human, Human_human])
all_test = all_test.sample(frac=1, random_state=42)

In [22]:
no_Diff = pd.read_csv('dataset_noDifficultWords.csv', encoding='latin-1')
no_Diff.head()

Unnamed: 0,id,url,title,wiki_intro,generated_intro,title_len,wiki_intro_len,generated_intro_len,prompt,generated_text,prompt_tokens,generated_text_tokens
0,63064638,https://en.wikipedia.org/wiki/Sexhow%20railway...,Sexhow railway station,Sexhow way station was a way station built to ...,Sexhow way station was a way station located i...,3,174,78,200 word wikipedia style introduction on 'Sexh...,"located in the town of Sexhow, on the Cumbria...",25,88
1,279621,https://en.wikipedia.org/wiki/Eti%C3%A4inen,EtiÃ¤inen,"In Finnish folklore, all ps and things, and al...","In Finnish folklore, all ps and things, anie o...",1,187,80,200 word wikipedia style introduction on 'EtiÃ...,"animate or inanimate, have a spirit or ""etiÃ¤...",26,101
2,287229,https://en.wikipedia.org/wiki/Inverse%20functi...,Inverse function theorem,"In atics, specifically diffeial calculus, the ...","In atics, specifically diffeial calculus, the ...",3,170,59,200 word wikipedia style introduction on 'Inve...,function theorem states that for every real-v...,26,65
3,26712375,https://en.wikipedia.org/wiki/Stepping%20on%20...,Stepping on Roses,is a Jaese shÅjo manga series ten and ilrated...,is a Jaese shÅjo manga series ten and ilrated...,3,335,121,200 word wikipedia style introduction on 'Step...,and illustrated by Maki Fujii. The series fol...,26,150
4,38894426,https://en.wikipedia.org/wiki/Rob%20Bradley,Rob Bradley,"Robert Milner ""Rob"" Bradley, Jr. (born Au 24, ...","Robert Milner ""Rob"" Bradley, Jr. (born Au 29, ...",2,170,136,200 word wikipedia style introduction on 'Rob ...,"29, 1973) is an American former professional ...",28,162


In [32]:
AI = pd.DataFrame(no_Diff['generated_intro'])
Human = pd.DataFrame(no_Diff['wiki_intro'])

In [36]:
print(AI.shape)
print(Human.shape)

(150000, 1)
(150000, 1)


In [37]:
AI.rename(columns={"generated_intro": "content"}, inplace = True)
AI['human'] = [0]*150000
AI.head()

Unnamed: 0,content,human
0,Sexhow way station was a way station located i...,0
1,"In Finnish folklore, all ps and things, anie o...",0
2,"In atics, specifically diffeial calculus, the ...",0
3,is a Jaese shÅjo manga series ten and ilrated...,0
4,"Robert Milner ""Rob"" Bradley, Jr. (born Au 29, ...",0


In [39]:
Human.rename(columns={"wiki_intro": "content"}, inplace = True)
Human['human'] = [1]*150000
Human.head()

Unnamed: 0,content,human
0,Sexhow way station was a way station built to ...,1
1,"In Finnish folklore, all ps and things, and al...",1
2,"In atics, specifically diffeial calculus, the ...",1
3,is a Jaese shÅjo manga series ten and ilrated...,1
4,"Robert Milner ""Rob"" Bradley, Jr. (born Au 24, ...",1


In [40]:
merged = pd.concat([AI, Human])
merged = merged.sample(frac = 1)
merged.head()

Unnamed: 0,content,human
90121,Nandi Thim (15th and 16th centuries CE) was a ...,1
94996,"Oswald George Nelson (March 20, 1906 â Septe...",0
21049,French emig from the years 1789 to 1815 refers...,1
148281,Ernst RenÃ© Anselm Nyberg (born 13 February 19...,1
15484,"Ajeet Sohi (born March 8, 1964) is a Canadian...",1


In [49]:
import csv
ai_1 = pd.read_csv('ai_text_summaries_0_24000.csv')
ai_2 = pd.read_csv('ai_text_summaries_24000_55000.csv')
human_1 = pd.read_csv('human_text_summaries_0_20000.csv')
human_2 = pd.read_csv('human_text_summaries_20000_40000.csv')
data_updated_ai = pd.concat([ai_1, ai_2[:6000]])
data_updated_ai.rename(columns={"AI_Summaries": "content"}, inplace = True)
data_updated_ai['human'] = [0]*30000

data_updated_human = pd.concat([human_1, human_2[:10000]])
data_updated_human.rename(columns={"H_Summaries": "content"}, inplace = True)
data_updated_human['human'] = [1]*30000

merged = pd.concat([data_updated_ai, data_updated_human])
merged = merged.sample(frac = 1)
merged.head()

Unnamed: 0,Summary No.,content,human
18491,18492,Jamugurihat is a town and a town area committe...,1
21001,21002,"The Bad Pass Trail, also known as the Haystack...",0
5033,5034,"Bruno Schön was a pastoral psychiatrist, homeo...",1
13934,13935,The Goudarzi dynasty descends from the 15th-ce...,0
8645,28646,The Stone-Darracott House is a historic house ...,1


In [8]:
merged.head()

Unnamed: 0,content,human
6478,Biggar is a surname of Scottish origin. People...,1
2635,"Basilica of Our Lady of Perpetual Help, Mary ...",1
8954,Ukraine is situated in Eastern Europe and bord...,0
1959,The West Palatinate Way is a German scenic roa...,0
15488,Georgia Valerie Toffolo (born 23 October 1994)...,0


In [6]:
merged.drop(['Summary No.'], axis=1, inplace = True)

In [50]:
merged.shape

(60000, 3)

In [44]:
texts = merged['content'].values
labels = merged['human'].values
test_texts = texts
test_labels = labels

In [35]:
print("shape of texts --",texts.shape)
print("shape of labels --",labels.shape)

shape of texts -- (60000,)
shape of labels -- (60000,)


In [42]:
class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [12]:
from transformers import Trainer, TrainingArguments

In [13]:
def compute_accuracy(model, data_loader, device):

    with torch.no_grad():

        correct_pred, num_examples = 0, 0

        for batch_idx, batch in enumerate(data_loader):

            ### Prepare data
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss, logits = outputs['loss'], outputs['logits']

            _, predicted_labels = torch.max(logits, 1)

            num_examples += labels.size(0)

            correct_pred += (predicted_labels == labels).sum()
    return correct_pred.float()/num_examples * 100

In [14]:
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
model.load_state_dict(torch.load('pytorch_model.bin'))

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

<All keys matched successfully>

In [58]:
test_encodings = tokenizer(list(all_test['content']), truncation=True, padding=True)
test_labels = all_test['human'].to_numpy()
test_dataset = IMDbDataset(test_encodings, test_labels)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

# model.to(DEVICE)
# test_accuracy = compute_accuracy(model, test_loader, DEVICE)
# print(f'Test accuracy: {test_accuracy:.2f}%')


In [46]:
all_test.shape

(60000, 2)

In [55]:
AI_human = merged.loc[all_test['human'] == False][:30000]
Human_human = filtered_data = all_test.loc[merged['human'] == True][:30000]
AI_content = merged.loc[AI_human.index]['content']
human_content = merged.loc[Human_human.index]['content']

all_test = pd.concat([AI_human, Human_human])
all_test = all_test.sample(frac=1, random_state=42)

  Human_human = filtered_data = all_test.loc[merged['human'] == True][:30000]


ValueError: cannot reindex on an axis with duplicate labels

In [59]:
from sklearn.metrics import precision_recall_fscore_support

predictions = []
true_labels = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)

        outputs = model(input_ids, attention_mask=attention_mask)
        _, predicted_labels = torch.max(outputs.logits, dim=1)

        predictions.extend(predicted_labels.cpu().numpy().tolist())
        true_labels.extend(labels.cpu().numpy().tolist())

# compute precision, recall, and f1_score
precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')

print('Precision:', precision)
print('Recall:', recall)
print('F1 score:', f1_score)

Precision: 0.9997619128601068
Recall: 0.9798
F1 score: 0.9896803084123162
