# Load and Prepare Data

In [None]:
!rm nlp_text.zip
!curl -o nlp_text.zip -L 'https://drive.google.com/uc?export=download&confirm=yes&id=1n6DF2zBa0N9IpXUGNCAXm6m8L26nb6PL'
!unzip -o nlp_text.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:--  0:00:10 --:--:--     0
100 4180k  100 4180k    0     0   366k      0  0:00:11  0:00:11 --:--:-- 9008k
Archive:  nlp_text.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
import pandas as pd
import numpy as np

In [None]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

print(f'Shape of train data : {train_data.shape}')
print(f'Shape of test data : {test_data.shape}')

Shape of train data : (41159, 3)
Shape of test data : (3798, 2)


In [None]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,Text,Sentiment
0,0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,1,advice Talk to your neighbours family to excha...,Positive
2,2,Coronavirus Australia: Woolworths to give elde...,Positive
3,3,My food stock is not the only one which is emp...,Positive
4,4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [None]:
train_data.isna().sum()

Unnamed: 0    1
Text          1
Sentiment     4
dtype: int64

In [None]:
train_data = train_data.dropna()

In [None]:
!pip install evaluate



In [None]:
! pip install transformers



In [None]:
import numpy as np
import pandas as pd
import os
import re
import torch
import evaluate
from sklearn import preprocessing

import transformers
from transformers import AutoTokenizer
from transformers import Trainer
from transformers import TrainingArguments
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    PretrainedConfig,
    default_data_collator
)

import datasets
from datasets import load_dataset
from datasets import load_dataset_builder
from datasets import Dataset

torch.cuda.is_available()
os.environ["WANDB_DISABLED"] = "true"

In [None]:
labels = train_data['Sentiment'].unique()
count_labels = len(train_data['Sentiment'].unique())

In [None]:
labels

array(['Neutral', 'Positive', 'Extremely Negative', 'Negative',
       'Extremely Positive'], dtype=object)

In [None]:
train_data.isna().sum()

Unnamed: 0    0
Text          0
Sentiment     0
dtype: int64

In [None]:
def cleaner(text :str):
    '''will clean from unicode, url, hashtags, numbers, punctuation, emoji,
    nicknames, lower'''
    text = str(text)  # convert to string

    text = re.sub(r'[.,#!$%\^&\*;:{}=\-_`~()]',r'',text) # punct.

    text = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', text)
    text = re.sub(r'[^\x00-\x7f]',r'',text)# unicode

    text = re.sub(r'http\S+', r'', text)# url

    text = re.sub('@[A-Za-z0-9_-]+',r'',text)# nickname

    text = re.sub(r'#([^\s]+)', r'\1', text)# hashtag

    text = re.sub(':\)|;\)|:-\)|\(-:|:-D|=D|:P|xD|X-p|\^\^|:-*|\^\.\^|\^\-\^|\^\_\^|\,-\)|\)-:|:\'\(|:\(|:-\(|:\S|T\.T|\.\_\.|:<|:-\S|:-<|\*\-\*|:O|=O|=\-O|O\.o|XO|O\_O|:-\@|=/|:/|X\-\(|>\.<|>=\(|D:',
                  '', text) # emoji

    text = ''.join([i for i in text if not i.isdigit()])# int.

    text = text.lower()
    return text

In [None]:
train_data['text'] = train_data['Text'].apply(lambda x : cleaner(x))
test_data['text'] = test_data['Text'].apply(lambda x : cleaner(x))

In [None]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,Text,Sentiment,text
0,0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral,and and
1,1,advice Talk to your neighbours family to excha...,Positive,advice talk to your neighbours family to excha...
2,2,Coronavirus Australia: Woolworths to give elde...,Positive,coronavirus australia woolworths to give elder...
3,3,My food stock is not the only one which is emp...,Positive,my food stock is not the only one which is emp...
4,4,"Me, ready to go at supermarket during the #COV...",Extremely Negative,me ready to go at supermarket during the covid...


In [None]:
le = preprocessing.LabelEncoder()
le.fit(labels)

train_data['Sentiment'] = le.transform(train_data['Sentiment'])
train_data.rename(columns={'Sentiment': 'label'}, inplace=True)

In [None]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,Text,label,text
0,0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,3,and and
1,1,advice Talk to your neighbours family to excha...,4,advice talk to your neighbours family to excha...
2,2,Coronavirus Australia: Woolworths to give elde...,4,coronavirus australia woolworths to give elder...
3,3,My food stock is not the only one which is emp...,4,my food stock is not the only one which is emp...
4,4,"Me, ready to go at supermarket during the #COV...",0,me ready to go at supermarket during the covid...


In [None]:
count_labels = len(train_data['label'].unique())
count_labels

5

In [None]:
print(train_data.isnull().sum())

Unnamed: 0    0
Text          0
label         0
text          0
dtype: int64


In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41155 entries, 0 to 41158
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  41155 non-null  object
 1   Text        41155 non-null  object
 2   label       41155 non-null  int64 
 3   text        41155 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.6+ MB


# Text Augmentation

Создадим дополнительные текстовые данные, чтобы количество предложений в каждом классе было одинаково

In [None]:
train_data['label'].value_counts()

4    11422
2     9917
3     7711
1     6624
0     5481
Name: label, dtype: int64

Как мы можем заметить количество текстов разное. Если мы оставим так , то метрика accuracy окажется неточной, поэтому мы дополним сгенерированными текстами все категории кроме 4. Как итог, мы должны получить олинаковое количество текстов в каждой категории(возьмем среднее значение после text augmentation)

In [None]:
! pip install nlpaug



In [None]:
import nlpaug.augmenter.word as naw
import pandas as pd
import numpy as np


class TextAugmentation:
    def __init__(self, df, aug_p = 0.3) -> None:
        """
        Инициализирует атрибуты класса TextAugmentation.

        Параметры:
        df (pandas.DataFrame): DataFrame с исходными данными.
        aug_p (float): вероятность замены слова в предложении на его синонимы.

        Выбрасывает AssertionError, если df не является экземпляром pandas.DataFrame или
        колонок 'text' и 'label' нет в df.
        """

        assert isinstance(df, pd.DataFrame), "Input should be an instance of pandas.DataFrame"
        assert "text" in df.columns, "DataFrame must contain 'text' column"
        assert "label" in df.columns, "DataFrame must contain 'label' column"

        self.df = df.copy()
        self.aug = naw.SynonymAug(aug_p=aug_p)
        self.mean_count = int(np.mean(df['label'].value_counts()))

    def _augment_texts(self) -> tuple:
        """
        Применяет аугментацию к текстам в df.

        Возвращает кортеж из двух списков:
        список аугментированных текстов ('augmented_text') и список соответствующих меток ('labels')
        """

        augmented_text = []
        labels = []
        for label in self.df['label'].unique():
            temp_aug_text = self.df[self.df['label'] == label]['text'].apply(lambda x: " ".join([str(elem) for elem in self.aug.augment(x)]))
            augmented_text.extend(temp_aug_text)
            labels.extend([label] * len(temp_aug_text))
        return augmented_text, labels

    def _create_same_size_labels(self, df) -> pd.DataFrame:
        """
        Создает DataFrame, в котором классы представлены в одинаковом количестве.

        prm df(pandas.DataFrame): DataFrame с исходными данными.

        Возвращает сбалансированный DataFrame.
        """

        shuffled = df.reindex(np.random.permutation(df.index))
        balanced_df = pd.concat([shuffled[shuffled['label'] == label][:self.mean_count] for label in df['label'].unique()], ignore_index=True)
        balanced_df = balanced_df.reindex(np.random.permutation(balanced_df.index))

        return balanced_df

    def get_df_with_text_augmentation(self) -> pd.DataFrame:
        """
        Получает DataFrame с аугментированными и сбалансированными данными.

        Возвращает DataFrame с аугментированными и сбалансированными данными.
        """

        augmented_text, labels = self._augment_texts()
        aug_df = pd.DataFrame({'text': augmented_text, 'label': labels})
        data_copy = pd.concat([self.df, aug_df], ignore_index=True)

        return self._create_same_size_labels(data_copy)

In [None]:
txt_aug = TextAugmentation(train_data)
train_data = txt_aug.get_df_with_text_augmentation()

In [None]:
train_data['label'].value_counts()

2    8231
1    8231
0    8231
3    8231
4    8231
Name: label, dtype: int64

In [None]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41155 entries, 26737 to 9360
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  20601 non-null  object
 1   Text        20601 non-null  object
 2   label       41155 non-null  int64 
 3   text        41155 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.6+ MB


In [None]:
Dataset.from_pandas(train_data[['text','label']])

Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 41155
})

# Load model and Tokenizer

In [None]:
from transformers import Trainer, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, DataCollatorWithPadding
from datasets import Dataset, DatasetDict, load_metric
from sklearn.model_selection import train_test_split
import numpy as np
from scipy.special import softmax
from sklearn.metrics import accuracy_score
import torch

def tokenize_function(example):
    return tokenizer(example["text"], padding='max_length', truncation=True, max_length=128)

SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

checkpoints = ["bert-base-uncased", "roberta-base", "xlnet-base-cased"]
count_labels = len(train_data['label'].unique())

train_ds = []
test_ds = []
tokenizers = []
models = []

train_dt = Dataset.from_pandas(train_data[['text','label']])
splitted_data = train_dt.train_test_split(test_size=0.2, seed=SEED)
train_dataset = splitted_data['train']
test_dataset = splitted_data['test']

for checkpoint in checkpoints:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=count_labels)

    tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
    tokenized_train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)
    tokenized_test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

    train_ds.append(tokenized_train_dataset)
    test_ds.append(tokenized_test_dataset)
    tokenizers.append(tokenizer)
    models.append(model)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32924 [00:00<?, ? examples/s]

Map:   0%|          | 0/8231 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32924 [00:00<?, ? examples/s]

Map:   0%|          | 0/8231 [00:00<?, ? examples/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/32924 [00:00<?, ? examples/s]

Map:   0%|          | 0/8231 [00:00<?, ? examples/s]

In [None]:
# function for metrics count
from datasets import load_metric

def compute_metrics(eval_preds):

    metric = load_metric('accuracy')
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Ensemble

In [None]:
! pip install -U accelerate
! pip install -U transformers



In [None]:
! pip install transformers[torch] -U



In [None]:
import accelerate
import transformers

transformers.__version__, accelerate.__version__

('4.33.3', '0.23.0')

In [None]:
training_args_list = [
    TrainingArguments(
        "test-trainer-1",
        evaluation_strategy="epoch",
        num_train_epochs=3,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=32,
        learning_rate=2e-5,
        lr_scheduler_type='linear',
        warmup_steps=600,
        weight_decay=0.01,
        logging_dir="./logs_1",report_to="none"
    ),

    TrainingArguments(
        "test-trainer-2",
        evaluation_strategy="epoch",
        num_train_epochs=3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        learning_rate=2e-5,
        lr_scheduler_type='cosine',
        warmup_steps=300,
        weight_decay=0.05,
        logging_dir="./logs_2",report_to="none"
    ),
      TrainingArguments(
        "test-trainer-3",
        evaluation_strategy="epoch",
        num_train_epochs=3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        learning_rate=2e-5,
        lr_scheduler_type='cosine_with_restarts',
        warmup_steps=600,
        weight_decay=0.05,
        logging_dir="./logs_3",report_to="none"
    )
]

In [None]:
from transformers import DefaultDataCollator, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
from scipy.special import softmax
from torch import cuda
import numpy as np

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
accuracies = []

for i in range(len(models)):
    model = models[i].to(device)
    trainer_arguments = training_args_list[i]
    tokenizer = tokenizers[i]

    trainer = Trainer(
        model=model,
        args=trainer_arguments,
        train_dataset=train_ds[i],
        eval_dataset=test_ds[i],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics
    )

    trainer.train()
    trainer.save_model(f'model_{i}')

    prediction_output = trainer.predict(test_ds[i])
    predictions = np.argmax(softmax(prediction_output.predictions, axis=-1), axis=-1)

    accuracy = accuracy_score(prediction_output.label_ids, predictions)
    accuracies.append(accuracy)

    print(f'Trained model {i} with accuracy {accuracy}')

best_model_index = np.argmax(accuracies)
best_model = models[best_model_index]

print(f"The best model is model_{best_model_index} with accuracy of {accuracies[best_model_index]}")

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.7898,0.73424,0.72555
2,0.5828,0.660909,0.75981
3,0.4027,0.684423,0.777184


  metric = load_metric('accuracy')


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

Trained model 0 with accuracy 0.7771838172761512


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8996,0.842266,0.666869
2,0.6971,0.71918,0.731503
3,0.5735,0.724798,0.736484


Trained model 1 with accuracy 0.7364840238124165


You're using a XLNetTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,1.002,0.905762,0.632122
2,0.7375,0.745483,0.718625
3,0.563,0.77754,0.721298


Trained model 2 with accuracy 0.7212975337140081
The best model is model_0 with accuracy of 0.7771838172761512


# Get Prediction and Create Submission

In [None]:
test_data.head()

Unnamed: 0,id,Text,text
0,787bc85b-20d4-46d8-84a0-562a2527f684,TRENDING: New Yorkers encounter empty supermar...,trending new yorkers encounter empty supermark...
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,When I couldn't find hand sanitizer at Fred Me...,when i couldn't find hand sanitizer at fred me...
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Find out how you can protect yourself and love...,find out how you can protect yourself and love...
3,cdf06cfe-29ae-48ee-ac6d-be448103ba45,#Panic buying hits #NewYork City as anxious sh...,panic buying hits newyork city as anxious shop...
4,aff63979-0256-4fb9-a2d9-86a3d3ca5470,#toiletpaper #dunnypaper #coronavirus #coronav...,toiletpaper dunnypaper coronavirus coronavirus...


In [60]:
sample_submission = pd.read_csv('sample_submission.csv')

In [65]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

def get_prediction(text):
    inputs = tokenizer(text, truncation=True,padding=True, return_tensors="pt")
    outputs = best_model(**inputs)
    proba = outputs[0].softmax(1)
    return proba.argmax().item()

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Загрузка лучшей модели
best_model_index = np.argmax(accuracies)
model_path = f"model_{best_model_index}"
best_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = tokenizers[best_model_index]

#test
test_dt = Dataset.from_pandas(test_data)
tokenized_test = test_dt.map(tokenize_function, batched=True)
tokenized_test = tokenized_test.remove_columns(['Text'])


predictions = test_data['text'].apply(lambda text: get_prediction(text))
sample_submission['Sentiment'] = le.inverse_transform(predictions)

Map:   0%|          | 0/3798 [00:00<?, ? examples/s]

In [70]:
test_data['Sentiment'] = le.inverse_transform(predictions)
test_data = test_data.drop(['Text','text'], axis = 'columns')
test_data.to_csv('submission.csv',index=False)

In [None]:
import zipfile
import os
import datetime

dt = datetime.datetime.now()  # получаем дату и время!
now_date = dt.date().strftime("%Y-%m-%d")  # Текущая дата
now_time = dt.time().strftime("%H-%M-%S")  # Текущее время
backup_folders = [f'/content/model_{best_model_index}']  # Список папок для архивации
arch_name = "backup_" + str(now_date) + ".zip"  # имя архива!
ignore_file = []  # если надо исключить файлы

def mybackup(arch, folder_list, mode):
    # Счетчики
    num = 0
    num_ignore = 0
    # Создание нового архива
    z = zipfile.ZipFile(arch, mode, zipfile.ZIP_DEFLATED, True)
    # Получаем папки из списка папок.
    for add_folder in folder_list:
        # Список всех файлов и папок в директории add_folder
        for root, dirs, files in os.walk(add_folder):
            for file in files:
                if file in ignore_file:  # Исключаем лишние файлы
                    print("Исключен! ", str(file))
                    num_ignore += 1
                    continue
                # Создание относительных путей и запись файлов в архив
                path = os.path.join(root, file)
                z.write(path)
                print(num, path)
                num += 1
    z.close()
    print("------------------------------")
    print("Добавлено: ", num)
    print("Проигнорировано: ", num_ignore)

print(now_time, now_date)
# создаст архив при наличии перезапишет существующий
mybackup(arch_name, backup_folders, "w")

15-10-58 2023-10-03
0 /content/model_0/tokenizer_config.json
1 /content/model_0/special_tokens_map.json
2 /content/model_0/training_args.bin
3 /content/model_0/config.json
4 /content/model_0/pytorch_model.bin
5 /content/model_0/vocab.txt
6 /content/model_0/tokenizer.json
------------------------------
Добавлено:  7
Проигнорировано:  0
