In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m49.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, accuracy_score
from sklearn.linear_model import LogisticRegression
from transformers import  AutoTokenizer, AutoModel
from tqdm import tqdm
from google.colab import files 

# Загрузка данных

In [3]:
uploaded = files.upload()

Saving test.csv to test.csv
Saving train.csv to train.csv


In [4]:
df_train = pd.read_csv('/content/train.csv')
df_test = pd.read_csv('/content/test.csv')

# Предварительный анализ данных

In [5]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [7]:
df_train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

# Загрузка предобученной модели и токенизатора

In [8]:
model_name = 'aellxx/raw_disaster_tweets' # Украл отсюда https://huggingface.co/aellxx/raw_disaster_tweets
tokenizer = AutoTokenizer.from_pretrained(model_name) # Создание объекта tokenizer, который будет использоваться для токенизации текста
model = AutoModel.from_pretrained(model_name) # Создание объекта model, который будет использоваться для получения эмбеддингов текста

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the model checkpoint at aellxx/raw_disaster_tweets were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at aellxx/raw_disaster_tweets and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Токенизация текста и подготовка входных данных для модели

In [9]:
tokenized = df_train['text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True)) # Добавление специальных токенов, ограничение длины до 512 токенов и обрезка текста при необходимости
padded = np.array([i + [0]*(512-len(i)) for i in tokenized.values]) # Создание массива padded, содержащего последовательности токенов, выровненных по максимальной длине 512 с добавлением нулей для заполнения
attention_mask = np.where(padded != 0, 1, 0) # Создание массива attention_mask, который указывает, какие токены являются реальными токенами (1) и какие являются заполнением (0)

# Определение устройства для обучения модели (GPU или CPU)

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
print(device)

cuda:0


# Генерация эмбеддингов для текстовых данных

In [11]:
batch_size = 200
embeddings = []
# Генерация эмбеддингов для текстовых данных в пакетном режиме
for i in tqdm(range(0, padded.shape[0], batch_size)):
    batch = torch.tensor(padded[i:i+batch_size])
    attention_mask_batch = torch.tensor(attention_mask[i:i+batch_size])

    with torch.no_grad():
        batch_embeddings = model(batch.to(device), attention_mask=attention_mask_batch.to(device))

    embeddings.append(batch_embeddings[0][:, 0, :].cpu().numpy())

embeddings = np.concatenate(embeddings) # Соединение всех полученных эмбеддингов в один массив

100%|██████████| 39/39 [04:01<00:00,  6.19s/it]


# Подготовка данных для обучения и валидации модели

In [12]:
X = embeddings
y = df_train['target']

In [13]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# Обучение и оценка модели с использованием логистической регрессии

In [14]:
lr = LogisticRegression(random_state=42, solver='liblinear')
scores_f1 = cross_val_score(lr, X_train, y_train, cv=5, scoring='f1') # Оценка модели с использованием кросс-валидации (5 фолдов) на обучающей выборке
scores_acc = cross_val_score(lr, X_train, y_train, cv=5, scoring='accuracy')

In [15]:
print(f'LogisticRegression cross-validation accuracy: {scores_acc.mean():.4f} +/- {scores_acc.std():.4f}')
print(f'LogisticRegression cross-validation f1: {scores_f1.mean():.4f} +/- {scores_f1.std():.4f}')

LogisticRegression cross-validation accuracy: 0.8180 +/- 0.0130
LogisticRegression cross-validation f1: 0.7781 +/- 0.0167
