In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load the raw dataset from the comma-separated homework file (comma is
# read_csv's default delimiter) and print a truncated preview of the frame.
df = pd.read_csv('homework2_data.csv')

print(df)

            id                                               text author
4097   id12495  Besides, though the violence of her anguish ma...    MWS
15325  id01543  He stopped in his tracks then, flailing his ar...    HPL
2719   id03377  "Extremely singular" said the young Baron, wit...    EAP
5067   id06179  'It is impossible,' it urges, 'that a person s...    EAP
13795  id21333  Thus has a week passed away, while I have list...    MWS
...        ...                                                ...    ...
13451  id09541  The elder partner of the firm, however, would ...    EAP
7556   id10243  It was hard to conceive how all this beauty ha...    EAP
6055   id23719  Mr. Crab having now paused in his discourse, t...    EAP
1923   id12590                 What was it that had enmeshed him?    HPL
4389   id07568  I did not at least during the long period in w...    EAP

[979 rows x 3 columns]


In [23]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

# Remove the row identifier column. errors='ignore' keeps this cell re-runnable:
# `df` is overwritten below, so on a second execution the 'id' column is already
# gone and a plain drop() would raise KeyError.
df = df.drop(columns=['id'], errors='ignore')

# Columns that hold categorical values and must be turned into integer codes.
categorial_features = ['author']

# Encode only the listed columns; every other column is passed through
# unchanged. verbose_feature_names_out=False keeps the original column names
# (no 'ordinal__' / 'remainder__' prefixes) in get_feature_names_out().
ct = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(), categorial_features)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

# fit_transform returns a plain array, so rebuild a DataFrame around it using
# the transformer's output column names (this also yields a fresh 0..n-1 index).
encoded = ct.fit_transform(df)
df = pd.DataFrame(encoded, columns=ct.get_feature_names_out())

# The encoder emits the codes as floats; store the class label as int.
df['author'] = df['author'].astype(int)
df


Unnamed: 0,author,text
0,2,"Besides, though the violence of her anguish ma..."
1,1,"He stopped in his tracks then, flailing his ar..."
2,0,"""Extremely singular"" said the young Baron, wit..."
3,0,"'It is impossible,' it urges, 'that a person s..."
4,2,"Thus has a week passed away, while I have list..."
...,...,...
974,0,"The elder partner of the firm, however, would ..."
975,0,It was hard to conceive how all this beauty ha...
976,0,"Mr. Crab having now paused in his discourse, t..."
977,1,What was it that had enmeshed him?


In [24]:


# Name of the target column; reused by the downsampling step.
class_column = 'author'

# Count rows per class once, show the distribution, and remember the size of
# the rarest class (the per-class sample size used for balancing).
class_counts = df[class_column].value_counts()
print(class_counts)
min_size = class_counts.min()

print('min class size =', min_size)

author
0    427
2    294
1    258
Name: count, dtype: int64
min class size = 258


In [25]:
# Balance the dataset by downsampling every class to the same number of rows.
#
# Rows with a missing label or text are removed BEFORE sampling; dropping them
# afterwards (as a post-filter) could leave the classes with unequal sizes.
df_complete = df[df[class_column].notnull() & df['text'].notnull()]

# Never ask for more rows than the rarest class still has after the null filter.
# When nothing was filtered out this is simply `min_size`.
n_per_class = min(min_size, df_complete[class_column].value_counts().min())

# Collect one sample per class and concatenate once at the end, instead of
# growing a DataFrame with pd.concat inside the loop (repeated copying, and it
# relied on concatenating an empty frame). Classes are visited in order of first
# appearance; the fixed seed makes each per-class sample reproducible.
sampled_parts = []
for class_type in pd.unique(df_complete[class_column].values):
    sampled_class_df = df_complete[df_complete[class_column] == class_type].sample(
        n_per_class, random_state=777
    )
    sampled_parts.append(sampled_class_df)
df_downsampled = pd.concat(sampled_parts, ignore_index=True)

print(df_downsampled)
print(df_downsampled[class_column].value_counts())

# Downstream cells read the balanced data through `df`.
df = df_downsampled

     author                                               text
0         2              We were surely sufficiently degraded.
1         2  "I remember, the first time that I did this, t...
2         2  For the last time we looked on the wide extent...
3         2  At first these agonizing plaints filled me wit...
4         2  Perdita at length subdued her burst of passion...
..      ...                                                ...
769       0  It is therefore evident that, ascend as high a...
770       0  The scene of the two outrages will naturally b...
771       0  In halls such as these in a bridal chamber suc...
772       0  Whenever a rich old hunks or prodigal heir or ...
773       0  "Precisely the one went east and the other wen...

[774 rows x 2 columns]
author
2    258
1    258
0    258
Name: count, dtype: int64


In [26]:
from sklearn.model_selection import train_test_split

# Step 2: split the data into train/test parts (80/20).
# Stratifying on the author column keeps the class proportions in both parts.
text_values = df['text'].values
author_values = df['author'].values

X_train, X_test, y_train, y_test = train_test_split(
    text_values,
    author_values,
    stratify=df['author'],
    test_size=0.2,
    random_state=42,
)

n_total = len(df)
print(f"Training set size: {len(X_train)} ({len(X_train)/n_total*100:.1f}%)")
print(f"Test set size: {len(X_test)} ({len(X_test)/n_total*100:.1f}%)")



Training set size: 619 (80.0%)
Test set size: 155 (20.0%)


In [27]:
# Step 3: set up the RoBERTa tokenizer and encoder used as a feature extractor.

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
bert_model = RobertaModel.from_pretrained('roberta-base')

# Evaluation mode (disables dropout etc.), then move the weights to the device.
bert_model.eval()
bert_model.to(device)


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaModel LOAD REPORT[0m from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
pooler.dense.bias               | MISSING    | 
pooler.dense.weight             | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [28]:
from tqdm import tqdm

def get_bert_embeddings(texts, batch_size=8):
    """Embed texts with the notebook-level encoder, one vector per text.

    Texts are processed in batches: each batch is tokenized with the global
    ``tokenizer`` (padded to the longest item of the batch, truncated to 512
    tokens), moved to the global ``device`` and passed through the global
    ``bert_model`` with gradients disabled. The hidden state at sequence
    position 0 of ``last_hidden_state`` (the sequence-start token, used here
    as a [CLS]-style summary of the whole text) is taken as the embedding.

    Args:
        texts: A single string, or a sized collection of strings (list, tuple,
            NumPy array, pandas Series, ...). A single string is treated as a
            one-element collection.
        batch_size: Number of texts per forward pass; must be positive.

    Returns:
        A 2-D NumPy array with one row per input text, in input order. For an
        empty input the result has shape ``(0, hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Normalise the input to a plain list of strings once, up front. The
    # tokenizer does not accept NumPy arrays or pandas Series, and a bare
    # string would otherwise be sliced into character chunks below.
    if isinstance(texts, str):
        texts = [texts]
    elif isinstance(texts, np.ndarray):
        texts = texts.tolist()
    else:
        texts = list(texts)

    # np.vstack([]) raises, so return a correctly shaped empty matrix instead.
    if not texts:
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    n_batches = (len(texts) + batch_size - 1) // batch_size  # ceil division
    embeddings = []

    # Outer bar counts texts, inner (transient) bar counts batches.
    with tqdm(total=len(texts), desc="Всего текстов", position=0) as pbar_texts, \
         tqdm(total=n_batches, desc="Батчи", position=1, leave=False) as pbar_batches:

        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]

            encoded = tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=512,
                return_tensors='pt'
            )

            # The input tensors must live on the same device as the model.
            encoded = {key: val.to(device) for key, val in encoded.items()}

            # Inference only: no autograd graph is needed.
            with torch.no_grad():
                outputs = bert_model(**encoded)
                cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
                embeddings.append(cls_embeddings)

            # The last batch may be shorter than batch_size.
            pbar_texts.update(len(batch_texts))
            pbar_batches.update(1)

    return np.vstack(embeddings)


In [29]:
# Step 5: compute embeddings for the train and test texts with
# get_bert_embeddings (default batch size); a status line is printed before each run.
print("Получение эмбеддингов для обучающей выборки...")
X_train_embeddings = get_bert_embeddings(X_train)

print("Получение эмбеддингов для тестовой выборки...")
X_test_embeddings = get_bert_embeddings(X_test)

Получение эмбеддингов для обучающей выборки...


Всего текстов: 100%|██████████| 619/619 [00:28<00:00, 22.09it/s]


Получение эмбеддингов для тестовой выборки...


Всего текстов: 100%|██████████| 155/155 [00:06<00:00, 22.98it/s]


In [30]:
# Sanity checks before training: embedding width and the shape of the labels.
unique_labels = np.unique(y_train)

print(f"Размерность эмбеддингов: {X_train_embeddings.shape[1]}")

print("Проверка меток классов:")
print(f"Тип y_train: {type(y_train)}")
# Fall back to 'N/A' when the label container has no dtype attribute.
print(f"Dtype y_train: {getattr(y_train, 'dtype', 'N/A')}")
print(f"Первые 10 элементов y_train: {y_train[:10]}")
print(f"Уникальные значения: {unique_labels}")
print(f"Количество классов: {len(unique_labels)}")


Размерность эмбеддингов: 768
Проверка меток классов:
Тип y_train: <class 'numpy.ndarray'>
Dtype y_train: int64
Первые 10 элементов y_train: [1 2 1 1 1 1 2 2 1 0]
Уникальные значения: [0 1 2]
Количество классов: 3


In [31]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report

banner = "="*50
print(banner)
print("XGBOOST КЛАССИФИКАТОР")
print(banner)

# Hyperparameters of the gradient-boosted classifier trained on the embeddings.
xgb_params = dict(
    n_estimators=300,
    max_depth=7,
    learning_rate=0.1,
    subsample=0.8,          # fraction of rows sampled per tree
    colsample_bytree=0.8,   # fraction of features sampled per tree
    random_state=42,
    n_jobs=-1,              # use all available cores
    eval_metric='mlogloss',
)
xgb_clf = XGBClassifier(**xgb_params)

# Fit on the training embeddings; the fitted estimator is the cell's output.
xgb_clf.fit(X_train_embeddings, y_train)


XGBOOST КЛАССИФИКАТОР


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [32]:
# Predict class labels for the test embeddings with the fitted XGBoost model.
y_pred = xgb_clf.predict(X_test_embeddings)

In [33]:

# Step 8: compute the metrics.
# F1-score for multi-class classification under three averaging schemes.
f1_macro, f1_micro, f1_weighted = (
    f1_score(y_test, y_pred, average=averaging)
    for averaging in ('macro', 'micro', 'weighted')
)

separator = "="*50

print("\n" + separator)
print("РЕЗУЛЬТАТЫ")
print(separator)
print(f"\nF1-score (macro):    {f1_macro:.4f}")
print(f"F1-score (micro):    {f1_micro:.4f}")
print(f"F1-score (weighted): {f1_weighted:.4f}")

# Detailed per-class report (precision / recall / F1 / support).
print("\n" + separator)
print("ДЕТАЛЬНЫЙ ОТЧЕТ ПО КЛАССАМ")
print(separator)
print(classification_report(y_test, y_pred))

# Additionally: accuracy on the training vs. the test set
# (a large gap between the two points to overfitting).
print("\nТочность на обучающей выборке:", xgb_clf.score(X_train_embeddings, y_train))
print("Точность на тестовой выборке:", xgb_clf.score(X_test_embeddings, y_test))


РЕЗУЛЬТАТЫ

F1-score (macro):    0.7471
F1-score (micro):    0.7484
F1-score (weighted): 0.7469

ДЕТАЛЬНЫЙ ОТЧЕТ ПО КЛАССАМ
              precision    recall  f1-score   support

           0       0.76      0.80      0.78        51
           1       0.86      0.62      0.72        52
           2       0.67      0.83      0.74        52

    accuracy                           0.75       155
   macro avg       0.77      0.75      0.75       155
weighted avg       0.77      0.75      0.75       155


Точность на обучающей выборке: 1.0
Точность на тестовой выборке: 0.7483870967741936
