In [None]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [None]:
# Pick the compute device: use the GPU when CUDA is available, otherwise the CPU.
from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Load the labelled essay sample into a DataFrame.
# NOTE(review): absolute path under /content/drive — presumably requires Google
# Drive to be mounted in Colab before this cell runs; confirm for other environments.
df = pd.read_csv("/content/drive/MyDrive/Project/AI_Human_sample.csv")

In [None]:
# Preview the first rows to check the `text` and `generated` columns.
df.head()

Unnamed: 0,text,generated
0,"As an eighth grade student, I believe that sch...",1.0
1,Smog is proof that pollution is a problem. Car...,0.0
2,Studying abroad is an excellent way to gain ne...,1.0
3,"And people are debating over whether the"" Face...",0.0
4,The author didst support this claim very well....,0.0


In [None]:
# Show the full essay stored under index label 4 of the `text` column.
df['text'][4]

'The author didst support this claim very well. The author talked a lot about the dangers IQ Venus rather the good that comes from Venus. I say this because as I am reading I see how the author mentions many of times how Venus has extreme temperatures and how to one has went to Venus because of these extreme conditions.\n\nThe author says IQ paragraph 2, "Numerous factors contribute to Venuses reputation as a challenging planet for humans to study, despite its proximity to us." Which gives the idea that it is a very dangerous place. You may begin to question "Why would AQY one wait to go to Venus" or you may say to yourself "This is not AQY where that I would wait to go." The author also states IQ paragraph 3," Even more challenging are the clouds of highly corrosive sulfuric acid IQ Venuses atmosphere." This is even more if a danger sign. No one would wait to be anywhere where you could potentially suffocate. We also know from this paragraph that Venus temperatures average over 800 de

In [None]:
# (rows, columns) of the loaded sample.
df.shape

(500, 2)

In [None]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512            # sequences are padded/truncated to this many tokens
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 5
LEARNING_RATE = 1e-05
# The tokenizer has to come from the same checkpoint as the encoder loaded in
# DistillBERTClass ('distilbert-base-uncased'). The cased and uncased checkpoints
# use different vocabularies, so mixing them feeds the encoder token ids that
# point at unrelated embedding rows and the classifier cannot learn.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

In [None]:
class Triage(Dataset):
    """Torch ``Dataset`` that tokenizes the ``text`` column of a DataFrame on demand.

    Args:
        dataframe: frame with a ``text`` column (the essay) and a ``generated``
            column (the numeric label).
        tokenizer: callable tokenizer returning a mapping with ``input_ids`` and
            ``attention_mask`` (e.g. a Hugging Face tokenizer).
        max_len: every sequence is padded or truncated to this many tokens.

    Each item is a dict with:
        ``ids``     - token ids (long tensor of length ``max_len``)
        ``mask``    - attention mask aligned with ``ids`` (long tensor)
        ``targets`` - the row's ``generated`` label as an integer (long tensor)
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # Positional lookup: works whatever the frame's index labels are, and an
        # out-of-range position raises IndexError as the sequence protocol expects.
        raw_text = self.data['text'].iloc[index]
        label = self.data['generated'].iloc[index]

        # Collapse runs of whitespace (including newlines) into single spaces.
        text = " ".join(str(raw_text).split())

        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        # token_type_ids are not requested because nothing downstream reads them.
        inputs = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            # int() makes the float label (1.0 / 0.0) an exact integer and fails
            # loudly on a missing (NaN) label instead of storing a garbage value.
            'targets': torch.tensor(int(label), dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [None]:
# Split the frame 80/20 into train and test rows, then wrap both in Triage datasets.

train_size = 0.8
# Fixed random_state keeps the sampled training rows the same on every run.
sampled_rows = df.sample(frac=train_size, random_state=200)
# Everything that was not sampled becomes the test set; both frames get a fresh
# 0..n-1 index.
test_dataset = df.drop(index=sampled_rows.index).reset_index(drop=True)
train_dataset = sampled_rows.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(dataframe=train_dataset, tokenizer=tokenizer, max_len=MAX_LEN)
testing_set = Triage(dataframe=test_dataset, tokenizer=tokenizer, max_len=MAX_LEN)

FULL Dataset: (500, 2)
TRAIN Dataset: (400, 2)
TEST Dataset: (100, 2)


In [None]:
# Loader settings: both loaders shuffle and load batches in the main process
# (num_workers=0); only the batch size differs.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(
    training_set,
    batch_size=train_params['batch_size'],
    shuffle=train_params['shuffle'],
    num_workers=train_params['num_workers'],
)
testing_loader = DataLoader(
    testing_set,
    batch_size=test_params['batch_size'],
    shuffle=test_params['shuffle'],
    num_workers=test_params['num_workers'],
)

## Creating the Neural Network for Fine Tuning

In [None]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class DistillBERTClass(torch.nn.Module):
    """DistilBERT encoder with a small single-output classification head.

    The head takes the encoder's hidden state at sequence position 0 and applies
    Linear(768, 768) -> ReLU -> Dropout(0.3) -> Linear(768, 1). The value returned
    is the raw output of the last linear layer; no sigmoid is applied here.
    """

    def __init__(self):
        super().__init__()
        # Attribute names are kept as-is: they are the keys of any saved state dict.
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        """Return one raw score per input row for the given ids and mask."""
        encoder_out = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        # First element of the encoder output, sliced at sequence position 0.
        first_token = encoder_out[0][:, 0]
        hidden = torch.relu(self.pre_classifier(first_token))
        return self.classifier(self.dropout(hidden))

In [None]:
# Build the classifier and move its parameters to the selected device.
# `model.to(device)` is the last expression of the cell, so the module repr is displayed.
model = DistillBERTClass()
model.to(device)

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [None]:
# Creating the loss function and optimizer
# BCEWithLogitsLoss expects raw scores (it applies the sigmoid itself), which matches
# DistillBERTClass.forward returning the un-squashed output of its last linear layer.
loss_function = torch.nn.BCEWithLogitsLoss()
# Adam updates every parameter of the model (encoder and head) at LEARNING_RATE.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

## Fine Tuning the Model

In [None]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    """Run one optimisation pass over ``training_loader`` and print epoch metrics.

    Uses the module-level ``model``, ``training_loader``, ``loss_function``,
    ``optimizer`` and ``device``.

    Args:
        epoch: epoch number, used only in the printed report.

    Returns:
        None. Mean batch loss and accuracy (in percent) are printed.
    """
    running_loss = 0
    correct = 0
    steps = 0
    seen = 0
    model.train()
    for batch in training_loader:
        ids = batch['ids'].to(device, dtype=torch.long)
        mask = batch['mask'].to(device, dtype=torch.long)
        targets = batch['targets'].to(device, dtype=torch.float)

        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()

        logits = model(ids, mask)
        # The model output has a trailing dimension of size 1; drop it to match targets.
        loss = loss_function(logits.squeeze(1), targets)
        running_loss += loss.item()

        # Sigmoid turns the scores into probabilities; above 0.5 counts as class 1.
        predictions = (torch.sigmoid(logits) > 0.5).float()
        correct += (predictions.squeeze(1) == targets).sum().item()

        steps += 1
        seen += targets.size(0)

        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(correct*100)/seen}')
    epoch_loss = running_loss/steps
    epoch_accu = (correct*100)/seen
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

In [None]:
# Run EPOCHS full passes over the training loader; train() prints its own metrics.
for epoch in range(EPOCHS):
    train(epoch)

In [None]:
def valid(model, testing_loader):
    """Evaluate ``model`` on ``testing_loader`` without updating any weights.

    Uses the module-level ``loss_function`` and ``device``.

    Args:
        model: module called as ``model(ids, mask)`` that returns one raw score
            per row with a trailing dimension of size 1.
        testing_loader: iterable of batches with ``ids``, ``mask`` and ``targets``.

    Returns:
        Accuracy over all evaluated examples, in percent. Mean batch loss and
        accuracy are also printed.
    """
    model.eval()
    running_loss = 0
    steps = 0
    seen = 0
    correct = 0
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask)
            # Drop the trailing size-1 dimension so scores line up with targets.
            running_loss += loss_function(logits.squeeze(1), targets).item()

            # Sigmoid turns the scores into probabilities; above 0.5 counts as class 1.
            predictions = (torch.sigmoid(logits) > 0.5).float()
            correct += (predictions.squeeze(1) == targets).sum().item()

            steps += 1
            seen += targets.size(0)

    epoch_loss = running_loss/steps
    epoch_accu = (correct*100)/seen
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_accu

In [None]:
# Evaluate the fine-tuned model on the held-out test loader and report the accuracy
# (valid() returns it as a percentage).
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
Validation Loss Epoch: 0.6852735233306885
Validation Accuracy Epoch: 51.0
Accuracy on test data = 51.00%


## Saving the Trained Model Artifacts for inference

In [None]:
# Saving the files for re-use

# Output paths, relative to the current working directory.
output_model_file = './pytorch_distilbert_news.bin'
output_vocab_file = './vocab_distilbert_news.bin'

# torch.save on the module object serialises the whole model, not only its weights.
model_to_save = model
torch.save(model_to_save, output_model_file)
# Write the tokenizer's vocabulary to the path above.
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

All files saved


In [None]:
# Load the model
# The file holds a fully pickled module, so it cannot be read with the
# weights-only loader that newer PyTorch versions use by default.
# SECURITY: unpickling executes code from the file; only load files you created.
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only runtime.
new_model = torch.load(output_model_file, map_location=device, weights_only=False)

# Load the tokenizer's vocabulary
new_tokenizer = DistilBertTokenizer.from_pretrained(output_vocab_file)

# Ensure the reloaded model (not the original `model`) is in evaluation mode
new_model.eval()

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'DistilBertTokenizer'.


DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

## Alternatively, Saving the Model's State Dict

In [None]:
# Save the model's state dictionary
# Only the parameter/buffer tensors are written; DistillBERTClass must be
# instantiated again before these weights can be loaded.
torch.save(model.state_dict(), 'model_state_dict.pth')

### Loading the Model's State Dict

In [None]:
# Re-create the architecture, then overwrite its weights with the saved ones.
new_model = DistillBERTClass()
new_model.to(device)

# Load the saved model weights into the initialized model.
# map_location remaps the stored tensors to the current device, so a checkpoint
# written on a GPU can still be restored on a CPU-only runtime.
new_model.load_state_dict(torch.load("model_state_dict.pth", map_location=device))

# Make sure to set the model to evaluation mode after loading
new_model.eval()

# Inference below reuses the module-level `tokenizer` that was used for training.

def predict(sentence):
    """Classify a single piece of text with ``new_model``.

    Uses the module-level ``tokenizer`` and ``new_model``.

    Args:
        sentence: the text to classify.

    Returns:
        int: 1 if the sigmoid of the model's score is above 0.5, otherwise 0.
    """
    # Run on whichever device the model's weights live on; leaving the inputs on
    # the CPU raises a device-mismatch error when the model is on a GPU.
    model_device = next(new_model.parameters()).device

    # Tokenize the input sentence
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)
    # Pass only the two tensors forward() accepts, so any extra tokenizer output
    # cannot turn into an unexpected keyword argument.
    input_ids = inputs['input_ids'].to(model_device)
    attention_mask = inputs['attention_mask'].to(model_device)

    # Forward pass through the model
    with torch.no_grad():
        outputs = new_model(input_ids=input_ids, attention_mask=attention_mask)
    print(f"outputs: {outputs}")
    # Get the predicted probabilities
    probabilities = torch.sigmoid(outputs)

    # Convert probabilities to binary predictions
    predictions = (probabilities > 0.5).int().squeeze()

    return predictions.item()

# Example usage
# predict() returns 0 or 1 on the same scale as the `generated` column.
# NOTE(review): the sample sentence reads like a sentiment-analysis example rather
# than an essay from this dataset — presumably left over from a template; confirm.
sentence = "This is a positive review"
prediction = predict(sentence)
print("Prediction:", prediction)

outputs: tensor([[0.1353]])
Prediction: 1


In [None]:
# Example usage
# This is the essay shown earlier via df['text'][4]; in the df.head() preview that
# row's `generated` value is 0.0, so the expected prediction here is 0.
sentence = """The author didst support this claim very well. The author talked a lot about the dangers IQ Venus rather the good that comes from Venus. I say this because as I am reading I see how the author mentions many of times how Venus has extreme temperatures and how to one has went to Venus because of these extreme conditions.

The author says IQ paragraph 2, "Numerous factors contribute to Venuses reputation as a challenging planet for humans to study, despite its proximity to us." Which gives the idea that it is a very dangerous place. You may begin to question "Why would AQY one wait to go to Venus" or you may say to yourself "This is not AQY where that I would wait to go." The author also states IQ paragraph 3," Even more challenging are the clouds of highly corrosive sulfuric acid IQ Venuses atmosphere." This is even more if a danger sign. No one would wait to be anywhere where you could potentially suffocate. We also know from this paragraph that Venus temperatures average over 800 degrees Fahrenheit and the atmospheric pressure is 90 times greater that what we experience of our own planet. The author states IQ paragraph 6, "Therefore, scientists seeking to conduct a thorough mission to understand Venus would need to get up close and personal despite the risks." The author gives us tons and tons of reasons why it is not worth going to Venus.

From what I have read today I will never wait to go to Venus. The author gave plenty of good reasons of why we should not go to Venus. It is not a place for human as we have already read. So I feel as if the author did not do a good job supporting his claim of why Venus is a worthy pursuit despite the dangers it may present."""

prediction = predict(sentence)
print("Prediction:", prediction)

outputs: tensor([[0.1641]])
Prediction: 1
