In [1]:
from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)
from datasets import load_dataset, DatasetDict, Dataset
from transformers import BertForSequenceClassification

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

import pandas as pd
import numpy as np
from tensorflow.keras.layers import TextVectorization, Embedding, Bidirectional, LSTM, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras import Model, Input
from keras.callbacks import EarlyStopping,ModelCheckpoint
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import re
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from transformers import TextClassificationPipeline
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from accelerate import Accelerator
from torch.utils.data.dataloader import DataLoader
from peft import LoraConfig, TaskType

In [3]:
dataset = load_dataset('csv', data_files={'train': 'train.csv', 'test': 'test.csv'})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2529
    })
})

In [4]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [5]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2529
    })
})

In [5]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

In [6]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [7]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)

In [9]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
peft_config = LoraConfig(target_modules='all-linear', task_type=TaskType.SEQ_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.1)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,274,130 || all params: 68,247,588 || trainable%: 1.8669


In [10]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [11]:
from transformers import get_scheduler

num_epochs = 2
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [12]:
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
print(f"Using device: {device}")
model.to(device)

#torch.backends.mps.is_available()

Using device: mps


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [13]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 250/250 [10:01<00:00,  1.93s/it]

In [14]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.99}

In [15]:
test=pd.read_csv('test_essays.csv')
test.head()

Unnamed: 0,id,prompt_id,text
0,0000aaaa,2,Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.
2,2222cccc,4,CCC ddd eee.


In [16]:
test=pd.read_csv('sample_submission.csv')
test.head()

Unnamed: 0,id,generated
0,0000aaaa,0.1
1,1111bbbb,0.9
2,2222cccc,0.4


In [17]:
import torch.nn.functional as F

model.eval()
metric = evaluate.load("accuracy")
all_predictions = []
all_probabilities = []

for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    
    logits = outputs.logits
    probabilities = F.softmax(logits, dim=-1)
    predictions = torch.argmax(logits, dim=-1)
    
    # Add batch to metric
    metric.add_batch(predictions=predictions, references=batch["labels"])

    # Store predictions and probabilities
    all_predictions.extend(predictions.cpu().numpy())
    all_probabilities.extend(probabilities.cpu().numpy())

# Compute the final metric
metric_result = metric.compute()
print(metric_result)

# Print predictions and their probabilities
for pred, prob in zip(all_predictions, all_probabilities):
    print(f"Prediction: {pred}, Probability: {prob}")

{'accuracy': 0.99}
Prediction: 1, Probability: [0.00267159 0.9973284 ]
Prediction: 0, Probability: [0.964124   0.03587591]
Prediction: 1, Probability: [0.0027465 0.9972535]
Prediction: 1, Probability: [0.00256407 0.997436  ]
Prediction: 0, Probability: [0.9823037  0.01769638]
Prediction: 0, Probability: [0.99301094 0.00698908]
Prediction: 0, Probability: [0.99084526 0.00915472]
Prediction: 0, Probability: [0.9909078 0.0090922]
Prediction: 0, Probability: [0.992029   0.00797107]
Prediction: 0, Probability: [0.987932  0.0120679]
Prediction: 1, Probability: [0.00234297 0.99765706]
Prediction: 1, Probability: [0.0024093  0.99759066]
Prediction: 0, Probability: [0.9925532  0.00744678]
Prediction: 0, Probability: [0.9904236  0.00957631]
Prediction: 0, Probability: [0.98800886 0.01199118]
Prediction: 0, Probability: [0.99119824 0.00880179]
Prediction: 0, Probability: [0.9728907  0.02710931]
Prediction: 0, Probability: [0.98988956 0.01011051]
Prediction: 1, Probability: [0.00272435 0.99727565]

In [18]:
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [19]:
def get_pred(text):
    pred = pipe(text)
    print(pred)
    return pred[0][1]['score']

In [20]:
final_df_path = 'test_essays.csv'
test_df = pd.read_csv(final_df_path)
test_df['generated'] = test_df['text'].apply(get_pred)
test_df = test_df[["id", "generated"]]
test_df

[[{'label': 'LABEL_0', 'score': 0.6134911179542542}, {'label': 'LABEL_1', 'score': 0.38650888204574585}]]
[[{'label': 'LABEL_0', 'score': 0.6508035659790039}, {'label': 'LABEL_1', 'score': 0.3491964638233185}]]
[[{'label': 'LABEL_0', 'score': 0.6304649114608765}, {'label': 'LABEL_1', 'score': 0.36953505873680115}]]


Unnamed: 0,id,generated
0,0000aaaa,0.386509
1,1111bbbb,0.349196
2,2222cccc,0.369535


In [21]:
test_df.to_csv('submission.csv', index=False)
test_df

Unnamed: 0,id,generated
0,0000aaaa,0.386509
1,1111bbbb,0.349196
2,2222cccc,0.369535


In [22]:
from transformers import pipeline, AutoModel, AutoTokenizer

In [23]:
#pipe = pipeline("text-classification")
pipe.save_pretrained("my_local_path_new")

In [27]:
model.save_pretrained('NEW_MODEL')
tokenizer.save_pretrained('NEW_TOKEN')

('NEW_TOKEN/tokenizer_config.json',
 'NEW_TOKEN/special_tokens_map.json',
 'NEW_TOKEN/vocab.txt',
 'NEW_TOKEN/added_tokens.json',
 'NEW_TOKEN/tokenizer.json')

In [25]:
#############################################################################################################################################################################################

In [11]:
model = AutoModelForSequenceClassification.from_pretrained('NEW_MODEL')
tokenizer = AutoTokenizer.from_pretrained('NEW_TOKEN')
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer,device = 1, return_all_scores=True)

In [12]:
def get_pred(text):
    pred = pipe(text, padding="max_length", truncation=True)
    return pred[0][1]['score']

In [13]:
final_df_path = 'train_essays.csv'
test_df = pd.read_csv(final_df_path)
test_df = test_df[['id', 'text']]
test_df

Unnamed: 0,id,text
0,0059830c,Cars. Cars have been around since they became ...
1,005db917,Transportation is a large necessity in most co...
2,008f63e3,"""America's love affair with it's vehicles seem..."
3,00940276,How often do you ride in a car? Do you drive a...
4,00c39458,Cars are a wonderful thing. They are perhaps o...
...,...,...
1373,fe6ff9a5,There has been a fuss about the Elector Colleg...
1374,ff669174,Limiting car usage has many advantages. Such a...
1375,ffa247e0,There's a new trend that has been developing f...
1376,ffc237e9,As we all know cars are a big part of our soci...


In [14]:
%%time
test_df['generated'] = np.round(test_df['text'].apply(get_pred), 1)
test_df = test_df[["id", "generated"]]
test_df

CPU times: user 33.3 s, sys: 6.95 s, total: 40.3 s
Wall time: 1min 33s


Unnamed: 0,id,generated
0,0059830c,0.0
1,005db917,0.0
2,008f63e3,0.1
3,00940276,0.0
4,00c39458,0.0
...,...,...
1373,fe6ff9a5,0.0
1374,ff669174,0.0
1375,ffa247e0,0.1
1376,ffc237e9,0.0
