In [1]:
import gzip
import shutil
import time

import pandas as pd
import requests
import torch
import torch.nn.functional as F
import torchtext
import numpy as np

import transformers
from transformers import DistilBertForSequenceClassification, AdamW

import os

2023-04-24 19:55:27.421026: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-24 19:55:27.470219: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Limit the CUDA devices visible to this process to indices 0 and 1.
os.environ.update({'CUDA_VISIBLE_DEVICES': '0,1'})

In [3]:
# Reproducibility settings: deterministic cuDNN kernels plus fixed seeds.
torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
# pandas falls back to NumPy's global RNG when no random_state is given,
# so seed it as well to keep any such shuffles repeatable.
np.random.seed(RANDOM_SEED)

NUM_EPOCHS = 3

# Prefer the first GPU, but fall back to CPU so the notebook still runs
# on machines without CUDA.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
import csv

# Raw exports: two shards of AI-generated summaries and two of human-written ones.
ai_1 = pd.read_csv('ai_text_summaries_0_24000.csv')
ai_2 = pd.read_csv('ai_text_summaries_24000_55000.csv')
human_1 = pd.read_csv('human_text_summaries_0_20000.csv')
human_2 = pd.read_csv('human_text_summaries_20000_40000.csv')

# Label convention: human == 0 for AI-generated text, human == 1 for human-written text.
# A scalar is broadcast to every row, so the label column no longer depends on a
# hard-coded row count.
data_updated_ai = (
    pd.concat([ai_1, ai_2[:6000]])
    .rename(columns={"AI_Summaries": "content"})
    .assign(human=0)
)
data_updated_human = (
    pd.concat([human_1, human_2[:10000]])
    .rename(columns={"H_Summaries": "content"})
    .assign(human=1)
)

# The slices above are chosen so both classes contribute the same number of rows;
# fail loudly if the input files ever change size.
assert len(data_updated_ai) == len(data_updated_human), (
    f"class imbalance: {len(data_updated_ai)} AI rows vs "
    f"{len(data_updated_human)} human rows"
)

merged = pd.concat([data_updated_ai, data_updated_human])
# Seeded shuffle so the row order (and everything derived from it) is reproducible.
merged = merged.sample(frac=1, random_state=RANDOM_SEED)
merged.head()

Unnamed: 0,Summary No.,content,human
6478,26479,Biggar is a surname of Scottish origin. People...,1
2635,2636,"Basilica of Our Lady of Perpetual Help, Mary ...",1
8954,8955,Ukraine is situated in Eastern Europe and bord...,0
1959,1960,The West Palatinate Way is a German scenic roa...,0
15488,15489,Georgia Valerie Toffolo (born 23 October 1994)...,0


In [8]:
# Preview the first five rows of the shuffled frame.
# NOTE(review): the recorded output has no 'Summary No.' column, so this cell was
# last executed after the drop cell that appears below it.
merged.head(n=5)

Unnamed: 0,content,human
6478,Biggar is a surname of Scottish origin. People...,1
2635,"Basilica of Our Lady of Perpetual Help, Mary ...",1
8954,Ukraine is situated in Eastern Europe and bord...,0
1959,The West Palatinate Way is a German scenic roa...,0
15488,Georgia Valerie Toffolo (born 23 October 1994)...,0


In [6]:
# Remove the bookkeeping column. Re-binding (instead of inplace=True) together with
# errors='ignore' makes this cell safe to re-run: a second execution is a no-op
# rather than a KeyError.
merged = merged.drop(columns=['Summary No.'], errors='ignore')

In [7]:
# Display (rows, columns) of the frame as a sanity check.
n_rows, n_cols = merged.shape
(n_rows, n_cols)

(60000, 2)

In [44]:
# Pull the text and label columns out as arrays; the test_* names are aliases
# of the very same arrays (no split is performed here).
texts, labels = (merged[column].values for column in ('content', 'human'))
test_texts, test_labels = texts, labels

In [35]:
# Report the shape of each extracted array.
for _caption, _array in (("shape of texts --", texts), ("shape of labels --", labels)):
    print(_caption, _array.shape)

shape of texts -- (60000,)
shape of labels -- (60000,)


In [11]:
class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing per-field encodings with labels.

    ``encodings`` must expose ``.items()`` yielding ``(field, values)`` pairs
    where ``values[idx]`` is the encoding of example ``idx``; ``labels`` must
    support ``len()`` and integer indexing.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a ``'labels'`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [12]:
from transformers import Trainer, TrainingArguments

In [13]:
def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy of ``model`` over ``data_loader``, in percent.

    Args:
        model: A ``torch.nn.Module`` invoked as
            ``model(input_ids, attention_mask=...)`` whose output supports
            ``['logits']`` lookup.
        data_loader: Iterable of batches; each batch is a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A scalar float tensor equal to ``correct / total * 100``.

    Raises:
        ValueError: If ``data_loader`` yields no examples.
    """
    # Dropout and similar layers must be disabled while scoring. Remember the
    # caller's mode so it can be restored afterwards.
    was_training = model.training
    model.eval()

    correct_pred = torch.zeros((), dtype=torch.long, device=device)
    num_examples = 0
    try:
        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                # Labels are not passed to the model: only the logits are
                # needed, so computing a loss would be wasted work.
                logits = model(input_ids, attention_mask=attention_mask)['logits']
                predicted_labels = logits.argmax(dim=1)

                num_examples += labels.size(0)
                correct_pred += (predicted_labels == labels).sum()
    finally:
        model.train(was_training)

    if num_examples == 0:
        raise ValueError("data_loader yielded no examples; accuracy is undefined")
    return correct_pred.float() / num_examples * 100

In [14]:
# Build the architecture from the base checkpoint, then overwrite its weights
# with the ones stored in the local fine-tuned checkpoint file.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
model.to(DEVICE)
# map_location remaps the stored tensors onto DEVICE, so a checkpoint saved on a
# different device (e.g. another GPU index) still loads here.
# NOTE(security): torch.load unpickles the file; only load checkpoints from a trusted source.
state_dict = torch.load('pytorch_model.bin', map_location=DEVICE)
model.load_state_dict(state_dict)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

<All keys matched successfully>

In [15]:
# `tokenizer` was not defined by any earlier cell, so this cell failed on a fresh
# kernel. NOTE(review): assumes the checkpoint was fine-tuned with the stock
# 'distilbert-base-uncased' tokenizer — TODO confirm against the training notebook.
tokenizer = transformers.DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize every row of the merged frame and wrap it for batched evaluation.
test_encodings = tokenizer(list(merged['content']), truncation=True, padding=True)
test_labels = merged['human'].to_numpy()
test_dataset = IMDbDataset(test_encodings, test_labels)
# shuffle=False: order is irrelevant for accuracy, and a fixed order keeps runs comparable.
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

model.to(DEVICE)
test_accuracy = compute_accuracy(model, test_loader, DEVICE)
print(f'Test accuracy: {test_accuracy:.2f}%')


Test accuracy: 51.65%
