# Import & upload

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, SequentialSampler
import pandas as pd
import numpy as np
import torch.nn.functional as F

In [13]:
# Загрузка данных
train_data = pd.read_csv("train.csv", index_col=0).dropna()
test_data = pd.read_csv("test.csv", index_col=0).dropna().reset_index(drop=True)

In [35]:
test_data1 = pd.read_csv("test.csv").dropna().reset_index(drop=True)
test_data1

Unnamed: 0,id,Text
0,787bc85b-20d4-46d8-84a0-562a2527f684,TRENDING: New Yorkers encounter empty supermar...
1,17e934cd-ba94-4d4f-9ac0-ead202abe241,When I couldn't find hand sanitizer at Fred Me...
2,5914534b-2b0f-4de8-bb8a-e25587697e0d,Find out how you can protect yourself and love...
3,cdf06cfe-29ae-48ee-ac6d-be448103ba45,#Panic buying hits #NewYork City as anxious sh...
4,aff63979-0256-4fb9-a2d9-86a3d3ca5470,#toiletpaper #dunnypaper #coronavirus #coronav...
...,...,...
3793,65712d27-5c41-4863-b74f-0bd66199b7df,Meanwhile In A Supermarket in Israel -- People...
3794,9fd189c5-e79c-49d7-8985-576450a4e6e3,Did you panic buy a lot of non-perishable item...
3795,3a06785f-6f9b-4f4d-9880-22562ad3e296,Asst Prof of Economics @cconces was on @NBCPhi...
3796,dd29ff09-9bc2-40f4-8201-4b6361aca760,Gov need to do somethings instead of biar je r...


# Preproc

In [14]:
label_to_int = {'Extremely Positive': 4, 'Positive': 3, 'Neutral': 2, 'Negative': 1, 'Extremely Negative': 0}
train_data['Sentiment'] = train_data['Sentiment'].replace(label_to_int)

In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_batch(batch):
    return tokenizer(list(batch['Text']), padding=True, truncation=True)

In [16]:
train_data, val_data = train_test_split(train_data, test_size=0.2)

train_batch = tokenize_batch(train_data)
val_batch = tokenize_batch(val_data)

In [18]:
batch_size = 32

train_dataset = TensorDataset(
    torch.tensor(train_batch['input_ids'], dtype=torch.long), 
    torch.tensor(train_batch['attention_mask'], dtype=torch.long), 
    torch.tensor(train_data['Sentiment'].to_numpy(), dtype=torch.long)
)

val_dataset = TensorDataset(
    torch.tensor(val_batch['input_ids'], dtype=torch.long),
    torch.tensor(val_batch['attention_mask'], dtype=torch.long),
    torch.tensor(val_data['Sentiment'].to_numpy(), dtype=torch.long)
)

train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)

In [19]:
# HP

learning_rates = [2e-5, 3e-5]
batch_sizes = [32, 64]

# Model

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Инициализация модели
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)
model.to(device)

# Оптимизатор и sheduler
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader))

# Обучение и валидация
num_epochs = 3
for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
        
        optimizer.zero_grad()
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        
        # Регуляризация и обновление весов
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
    
    model.eval()
    val_preds, val_labels = [], []
    for batch in val_dataloader:
        input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
        
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = labels.to('cpu').numpy()
        
        val_preds.extend(np.argmax(logits, axis=1).flatten())
        val_labels.extend(label_ids.flatten())
        
    val_accuracy = accuracy_score(val_labels, val_preds)
    print(f"Validation Accuracy: {val_accuracy}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Validation Accuracy: 0.6400194387073259
Validation Accuracy: 0.6400194387073259
Validation Accuracy: 0.6400194387073259


# Ensemble

In [22]:
# ф-я ансамбля
def ensemble_predict(models, dataloader):
    ensemble_preds, true_labels = [], []
    
    for batch in dataloader:
        batch_preds = np.zeros((batch[0].shape[0], 5))  # 5 классов
        input_ids, attention_mask, labels = tuple(t.to(device) for t in batch)
        
        for model in models:
            model.eval()
            with torch.no_grad():
                outputs = model(input_ids, attention_mask=attention_mask)
            
            logits = outputs.logits
            logits = F.softmax(logits, dim=1)
            logits = logits.detach().cpu().numpy()
            
            batch_preds += logits
            
        batch_preds /= len(models)  # Усреднение
        ensemble_preds.extend(np.argmax(batch_preds, axis=1).flatten())
        true_labels.extend(labels.to('cpu').numpy().flatten())
    
    return np.array(ensemble_preds), np.array(true_labels)

In [23]:
# models_list = [model1, model2, model3]  # model1, model2 и model3 должны быть предварительно обучены

# ensemble_preds, true_labels = ensemble_predict(models_list, val_dataloader)
# ensemble_accuracy = accuracy_score(true_labels, ensemble_preds)

# print(f"Ensemble Validation Accuracy: {ensemble_accuracy}")

In [None]:
# # Экспорт предсказаний в CSV
# test_batch = tokenize_batch(test_data)  # `test_data` содержит тестовый датасет

# test_dataset = TensorDataset(
#     torch.tensor(test_batch['input_ids'], dtype=torch.long),
#     torch.tensor(test_batch['attention_mask'], dtype=torch.long)
# )
# test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

# ensemble_test_preds, _ = ensemble_predict(models_list, test_dataloader)

# # Обратное преобразование предсказаний (decoding)
# sentiment_map = {0: "Extremely Negative", 1: "Negative", 2: "Neutral", 3: "Positive", 4: "Extremely Positive"}
# decoded_preds = [sentiment_map[pred] for pred in ensemble_test_preds]

# # Сохранение в CSV
# output_df = pd.DataFrame({"Text": test_data['Text'], "Sentiment": decoded_preds})
# output_df.to_csv("ensemble_predictions.csv", index=False)

In [24]:
# Предсказание и экспорт для одной модели
def predict_single_model(model, dataloader):
    model_preds = []
    model.eval()

    for batch in dataloader:
        input_ids, attention_mask = tuple(t.to(device) for t in batch)
        
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask)
        
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        
        model_preds.extend(np.argmax(logits, axis=1).flatten())
        
    return np.array(model_preds)

In [36]:
# Подготовка тестового DataLoader
test_batch = tokenize_batch(test_data1)

In [37]:
test_dataset = TensorDataset(
    torch.tensor(test_batch['input_ids'], dtype=torch.long),
    torch.tensor(test_batch['attention_mask'], dtype=torch.long)
)

In [38]:
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

In [39]:
# Применение функции для предсказания
single_model_preds = predict_single_model(model, test_dataloader)

In [40]:
# Обратное преобразование предсказаний (decoding)
sentiment_map = {0: "Extremely Negative", 1: "Negative", 2: "Neutral", 3: "Positive", 4: "Extremely Positive"}
decoded_preds = [sentiment_map[pred] for pred in single_model_preds]

In [42]:
test_data

Unnamed: 0,Text
0,TRENDING: New Yorkers encounter empty supermar...
1,When I couldn't find hand sanitizer at Fred Me...
2,Find out how you can protect yourself and love...
3,#Panic buying hits #NewYork City as anxious sh...
4,#toiletpaper #dunnypaper #coronavirus #coronav...
...,...
3793,Meanwhile In A Supermarket in Israel -- People...
3794,Did you panic buy a lot of non-perishable item...
3795,Asst Prof of Economics @cconces was on @NBCPhi...
3796,Gov need to do somethings instead of biar je r...


In [43]:
# Экспорт предсказаний в CSV
output_df = pd.DataFrame({"id": test_data1['id'], "Sentiment": decoded_preds})
output_df.to_csv("single_model_bert_predictions.csv", index=False)