In [None]:
!unzip /content/dataset.zip -d /content/dataset

In [None]:
!pip install transformers[torch] datasets evaluate

In [None]:
import os
from sklearn.model_selection import train_test_split
import pandas as pd

In [None]:
# Read every file of the extracted dataset directory into its own DataFrame
# (no header row; the first column is not used as an index).
# os.listdir returns entries in arbitrary order, so they are sorted to make the row
# order of the combined data deterministic, and anything that is not a regular file
# (e.g. a stray sub-directory produced by the archive) is skipped.
dataset_dir = '/content/dataset'
dataframes = [
    pd.read_csv(os.path.join(dataset_dir, filename), header=None, index_col=False)
    for filename in sorted(os.listdir(dataset_dir))
    if os.path.isfile(os.path.join(dataset_dir, filename))
]

In [None]:
# Stack the per-file frames on top of each other and give positional column 0 a
# readable name. The per-file row indices are kept as-is (no ignore_index), so the
# resulting index contains repeated values.
dataframe = pd.concat(dataframes, axis=0).rename({0: 'text'}, axis='columns')

In [None]:
# Collapse columns 1..29 into one integer `label`: the 0-based position of the
# largest value across those columns, after which the source columns are dropped.
# NOTE(review): presumably columns 1..29 are per-category indicator/score columns
# whose order matches the category ids used later — confirm against the dataset.
score_columns = list(range(1, 30))
dataframe['label'] = dataframe[score_columns].to_numpy().argmax(axis=1)
dataframe = dataframe.drop(columns=score_columns)

In [None]:
# Preview the first five rows to sanity-check the `text` / `label` columns.
dataframe.head(n=5)

Unnamed: 0,text,label
0,шерстяной переполох это канал где обитают самы...,10
1,отец работает на рынке сегодня срочно вызвали ...,10
2,украинцы в тиктоке выбирают сторону,10
3,азербайджан начал сво в карабахе следим за соб...,10
4,благодаря интернету и бобрам мы начали понимат...,10


In [None]:
from datasets import Dataset

# Convert the frame to a Hugging Face Dataset with only the `text` and `label` columns.
# preserve_index=False keeps the pandas index out of the dataset in the first place,
# instead of removing an '__index_level_0__' column afterwards — that column only
# exists when the frame has a non-range index, so removing it by name would raise
# as soon as the index is reset upstream.
raw_dataset = Dataset.from_pandas(dataframe, preserve_index=False)

In [None]:
# Show the dataset summary (feature names and row count) as the cell output.
raw_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 70855
})

In [None]:
from transformers import AutoTokenizer

# Tokenizer matching the checkpoint that is fine-tuned further down.
# NOTE(review): "distilbert-base-uncased" is an English checkpoint, while the rows
# previewed by dataframe.head() appear to be Russian. A multilingual checkpoint would
# likely tokenize this text far better; if it is swapped, the model cell must load
# the same checkpoint so vocabulary and weights stay in sync.
checkpoint_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: Mapping with a ``'text'`` entry holding the texts to encode
            (a list of strings when used with ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts, with
        sequences truncated to at most 512 tokens. No padding is requested here.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, max_length=512)
    return encoded

In [None]:
# Apply preprocess() to the whole dataset in batches; the tokenizer outputs are
# added as extra columns next to `text` and `label`.
tokenized_dataset = raw_dataset.map(function=preprocess, batched=True)

Map:   0%|          | 0/70855 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorWithPadding

# Collator passed to the Trainer below. preprocess() does not pad, so padding is
# left to this collator, which is built from the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Hold out 20% of the examples for evaluation. The shuffle seed is fixed so the split
# — and therefore the reported accuracy — is reproducible across kernel restarts.
# After this cell `tokenized_dataset` has "train" and "test" splits.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
import evaluate

# Load the "accuracy" metric from the `evaluate` library; compute_metrics() uses it.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Score a batch of evaluation predictions with the module-level ``accuracy`` metric.

    Args:
        eval_pred: A two-item iterable ``(predictions, labels)``; ``predictions``
            holds one row of per-class scores per example.

    Returns:
        The result of ``accuracy.compute`` for the arg-max class of each row
        against ``labels``.
    """
    scores, references = eval_pred
    # The predicted class is the index of the highest score in each row.
    predicted_ids = np.argmax(scores, axis=1)
    metric_values = accuracy.compute(predictions=predicted_ids, references=references)
    return metric_values

In [None]:
from enum import Enum


class Categories(int, Enum):
    """Text categories predicted by the classifier; each value is the integer class id."""

    BLOGS = 0
    NEWS = 1
    ECONOMICS = 2
    FINANCES = 3
    POLITICS = 4
    LAW = 5
    MARKETING = 6
    CRYPTOCURRENCIES = 7
    BUSINESS = 8
    SHOWBIZ = 9
    ENTERTAINMENT = 10
    ART = 11
    FILMS = 12
    PICTURES = 13
    GAMES = 14
    SPORT = 15
    TRAVELLING = 16
    MUSIC = 17
    FASHION = 18
    CULINARY = 19
    PHRASES = 20
    DIY = 21
    TECHNOLOGIES = 22
    APPLICATIONS = 23
    EDUCATION = 24
    MEDICINE = 25
    PSYCHOLOGY = 26
    DESIGN = 27
    OTHER = 28


# id2label maps integer class id -> label name, and label2id is its inverse.
# The two were previously built the other way round (name -> id stored as `id2label`),
# so the model config received swapped mappings keyed by label names instead of ids.
id2label = {category.value: category.name for category in Categories}
label2id = {name: class_id for class_id, name in id2label.items()}

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained encoder with a fresh classification head sized to the number of
# categories; the label mappings are stored in the model config.
# NOTE(review): the checkpoint name must stay identical to the one the tokenizer was
# loaded from.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning configuration. Checkpoints are written to ./classification_model.
training_args = TrainingArguments(
    output_dir="classification_model",
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint when
    # training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire together the model, the train/test splits, the padding collator and the
# accuracy metric.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.8012,1.635183,0.543222
2,1.3945,1.34992,0.614
3,1.1965,1.269626,0.639616


TrainOutput(global_step=10629, training_loss=1.6566670577763187, metrics={'train_runtime': 8716.9664, 'train_samples_per_second': 19.508, 'train_steps_per_second': 1.219, 'total_flos': 2.253411131213952e+16, 'train_loss': 1.6566670577763187, 'epoch': 3.0})

In [None]:
# Save the fine-tuned model to an absolute path: the zip step below reads
# /content/model, so a relative "model" would only line up when the kernel's working
# directory happens to be /content.
trainer.save_model("/content/model")

In [None]:
# Pack the saved model directory into a single archive so it can be copied out of the runtime.
!zip -r /content/model.zip /content/model

  adding: content/model/ (stored 0%)
  adding: content/model/tokenizer_config.json (deflated 76%)
  adding: content/model/special_tokens_map.json (deflated 42%)
  adding: content/model/vocab.txt (deflated 53%)
  adding: content/model/config.json (deflated 56%)
  adding: content/model/model.safetensors (deflated 8%)
  adding: content/model/tokenizer.json (deflated 71%)
  adding: content/model/training_args.bin (deflated 51%)


In [None]:
import shutil

# Copy the archive to Google Drive. The source is the absolute path the zip step wrote
# to, so the copy does not depend on the kernel's working directory.
# NOTE(review): this assumes Drive is already mounted at /content/drive; no mount call
# is visible in this notebook.
shutil.copyfile('/content/model.zip', '/content/drive/MyDrive/classification_model.zip')

'/content/drive/MyDrive/classification_model.zip'