In [26]:
data = "/content/drive/MyDrive/bbc_data"

In [27]:
import pandas as pd
import numpy as np
import os

In [28]:
directory = []
file = []
title = []
text = []
label = []
datapath = data
for dirname, _, filenames in os.walk(datapath):
    try:
        filenames.remove("README.TXT")
    except:
        pass
    for filename in filenames:
        directory.append(dirname)
        file.append(filename)
        label.append(dirname.split('/')[-1])
        fullpathfile = os.path.join(dirname,filename)
        with open(fullpathfile, 'r', encoding="utf8", errors='ignore') as infile:
            intext = ''
            firstline = True
            for line in infile:
                if firstline:
                    title.append(line.replace('\n',''))
                    firstline = False
                else:
                    intext = intext + ' ' + line.replace('\n','')
            text.append(intext)


In [29]:
DATA_COLUMN = 'text'
LABEL_COLUMN = 'label'

fulldf = pd.DataFrame(list(zip(directory, file, title, text, label)),
               columns =['directory', 'file', 'title', 'text', 'label'])

df = fulldf.filter(['text','label'], axis=1)
df.head()

Unnamed: 0,text,label
0,Viewers could soon be rewarded for watching ...,tech
1,"Robots are learning lessons on ""robotiquette...",tech
2,Computer games could enhance learning and ha...,tech
3,"Games aimed at ""casual players"" are set to b...",tech
4,UK mobile owners continue to break records w...,tech


In [30]:
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
sport,511
business,510
politics,417
tech,401
entertainment,386


In [31]:
label_mapping = {"sport": 0, "business": 1, "politics": 2, "tech": 3, "entertainment": 4}
df["label"]=df.label.map(label_mapping)

In [32]:
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,511
1,510
2,417
3,401
4,386


In [33]:
import re

In [34]:
def clean_text(text):
    text = text.lower()
    text = re.sub('<.*?>', '', text)
    text = re.sub('http\S+', '', text)
    text = re.sub("[^a-zA-Z0-9\s]", "", text)
    text = re.sub("\s+", " ", text).strip()
    return text

In [35]:
df["clean_text"]= df["text"].apply(clean_text)

In [36]:
%pip install datasets



In [37]:
%pip install transformers



In [38]:
%pip install accelerate



In [39]:
import torch
torch.cuda.is_available()

True

In [40]:
from sklearn.model_selection import train_test_split

In [91]:
train_df, test_df = train_test_split(df, test_size=0.2)

In [42]:
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

In [43]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Convert datasets to tokenized format
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)
dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset
})

In [44]:
def tokenize_data(examples):
    return tokenizer(examples["clean_text"], truncation=True)
dataset = dataset.map(tokenize_data, batched=True)

Map:   0%|          | 0/1780 [00:00<?, ? examples/s]

Map:   0%|          | 0/445 [00:00<?, ? examples/s]

In [45]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'clean_text', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1780
    })
    test: Dataset({
        features: ['text', 'label', 'clean_text', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 445
    })
})

In [46]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss
500,0.1608
1000,0.003


TrainOutput(global_step=1120, training_loss=0.07329689268288868, metrics={'train_runtime': 919.1173, 'train_samples_per_second': 19.366, 'train_steps_per_second': 1.219, 'total_flos': 2356396125197520.0, 'train_loss': 0.07329689268288868, 'epoch': 10.0})

In [48]:
# Save the trained model
trainer.save_model('model')

In [49]:
# !zip -r export.zip results

In [51]:
device = torch.device("cuda"if torch.cuda.is_available() else"cpu")
device

device(type='cuda')

In [52]:
df1 = df["clean_text"]
df1.head(1)

Unnamed: 0,clean_text
0,viewers could soon be rewarded for watching tv...


In [53]:
df1[1205]

'wales coach mike ruddock has defended his decision not to release any of the international stars for this weekends regional celtic league fixtures ruddock says the players will benefit from the rest and their absence will give youngsters a chance to impress weve got the wru charter in place now which outlines exactly what happens ruddock told bbc wales sport once were in the six nations the players will only be released in his and the wrus best interests the ospreys and scarlets say they are happy to support the wales cause but the dragons have expressed disappointment at not being able to use their national squad players in fridays game with ulster ceri sweeney gareth cooper ian gough and kevin morgan have been used sparingly by ruddock in the opening two six nations wins and captain jason forster believes they would benefit from a game with the dragons im sure the guys would want to come back to get some game time forster told bbc wales sport it would also be a timely reminder to mi

In [54]:
input_text = tokenizer(df1[1], return_tensors="pt", truncation=True).to(device)
prediction = model(**input_text)

In [55]:
prediction

SequenceClassifierOutput(loss=None, logits=tensor([[-1.6756, -2.3231, -2.4569,  5.4186, -1.9118]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [56]:
int(torch.argmax(prediction.logits, axis=1).detach())


3

In [57]:
input_text = tokenizer(df1[1205], return_tensors="pt", truncation=True).to(device)
prediction = model(**input_text)

In [58]:
prediction

SequenceClassifierOutput(loss=None, logits=tensor([[ 6.0985, -2.1404, -2.0249, -1.8135, -2.4597]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [59]:
int(torch.argmax(prediction.logits, axis=1).detach())

0

In [60]:
label_mapping = {0 : "sport", 1: "business", 2: "politics", 3: "tech", 4: "entertainment"}

In [61]:
label_mapping

{0: 'sport', 1: 'business', 2: 'politics', 3: 'tech', 4: 'entertainment'}

In [63]:
len(test_df["clean_text"])

445

In [79]:
y_pred = []
for i in (test_df["clean_text"]):
  input_text = tokenizer(i, return_tensors="pt", truncation=True).to(device)
  prediction = model(**input_text)
  y_pred.append(int(torch.argmax(prediction.logits, axis=1).detach()))


In [86]:
len(y_pred)

445

In [93]:
y_pred = pd.DataFrame(y_pred, columns= ["Pred"])

In [95]:
test_df

Unnamed: 0,text,label,clean_text
414,Air passengers who are unable to board their...,1,air passengers who are unable to board their f...
420,Customers trying to get through to call cent...,1,customers trying to get through to call centre...
1644,Two of Britain's big trade unions could merg...,2,two of britains big trade unions could merge t...
416,"Parmalat, the Italian food group at the cent...",1,parmalat the italian food group at the centre ...
1232,"Murrayfield, Edinburgh Saturday, 26 Februar...",0,murrayfield edinburgh saturday 26 february 140...
...,...,...,...
741,The ongoing public spat between the two heir...,1,the ongoing public spat between the two heirs ...
205,"Faster, better or funkier hardware alone is ...",3,faster better or funkier hardware alone is not...
1102,Martin O'Neill hopes to block Stilian Petrov...,0,martin oneill hopes to block stilian petrovs c...
668,Strong growth in subscriptions to mobile pho...,1,strong growth in subscriptions to mobile phone...


In [69]:
prediction

SequenceClassifierOutput(loss=None, logits=tensor([[-2.2951,  6.4143, -1.3542, -1.8120, -2.1397]], device='cuda:0',
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [70]:
int(torch.argmax(prediction.logits, axis=1).detach())

1

In [103]:
from sklearn.metrics import classification_report, confusion_matrix

In [98]:
y_true = test_df["label"]
y_pred = y_pred["Pred"]

In [100]:
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        99
           1       0.98      0.98      0.98       104
           2       0.98      0.98      0.98        82
           3       1.00      0.99      0.99        85
           4       0.99      1.00      0.99        75

    accuracy                           0.99       445
   macro avg       0.99      0.99      0.99       445
weighted avg       0.99      0.99      0.99       445



In [104]:
print(confusion_matrix(y_true, y_pred))

[[ 99   0   0   0   0]
 [  0 102   2   0   0]
 [  0   1  80   0   1]
 [  0   1   0  84   0]
 [  0   0   0   0  75]]


In [105]:
model.save_pretrained("bbcmodel")

In [106]:
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [107]:
!zip -r export.zip model

  adding: model/ (stored 0%)
  adding: model/vocab.txt (deflated 53%)
  adding: model/config.json (deflated 51%)
  adding: model/tokenizer.json (deflated 71%)
  adding: model/special_tokens_map.json (deflated 42%)
  adding: model/model.safetensors (deflated 8%)
  adding: model/tokenizer_config.json (deflated 76%)
  adding: model/training_args.bin (deflated 51%)


In [108]:
from transformers import pipeline

In [109]:
from transformers import BertTokenizerFast, AutoModelForTokenClassification

In [110]:
model = "distilbert-base-uncased"

In [111]:
tokenizer = BertTokenizerFast.from_pretrained(model)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizerFast'.


In [112]:
model_fine_tuned=AutoModelForTokenClassification.from_pretrained("/content/model")

In [114]:
classify_pipeline=pipeline("text-classification",model=model_fine_tuned,tokenizer=tokenizer)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The model 'DistilBertForTokenClassification' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassifica

In [115]:
example="Sunny is Data Scientist and Generative AI Engineer"

In [None]:
classify_pipeline(example)