In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from transformers import RobertaTokenizer, RobertaModel
import torch

# Load the whole dataset from a comma-separated file and print it.
# The printed frame shows three columns: id, text and author.
df = pd.read_csv('homework2_data.csv', sep = ',')
# Optional debugging aid: uncomment to continue with a random 5% of the rows.
# df = df.sample(frac = 0.05)
print(df)

            id                                               text author
0      id26305  This process, however, afforded me no means of...    EAP
1      id17569  It never once occurred to me that the fumbling...    HPL
2      id11008  In his left hand was a gold snuff box, from wh...    EAP
3      id27763  How lovely is spring As we looked from Windsor...    MWS
4      id12958  Finding nothing else, not even gold, the Super...    HPL
...        ...                                                ...    ...
19574  id17718  I could have fancied, while I looked at it, th...    EAP
19575  id08973  The lids clenched themselves together as if in...    EAP
19576  id05267  Mais il faut agir that is to say, a Frenchman ...    EAP
19577  id17513  For an item of news like this, it strikes us i...    EAP
19578  id00393  He laid a gnarled claw on my shoulder, and it ...    HPL

[19579 rows x 3 columns]


In [31]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder

# Remove the row identifier; it is not used as a feature.
# `errors='ignore'` keeps this cell re-runnable: `df` is overwritten below, so on
# a second execution the `id` column is already gone and a plain drop would
# raise a KeyError.
df = df.drop(columns=['id'], errors='ignore')

categorial_features = ['author']

# Encode the author labels as ordinal numbers and pass every other column
# through unchanged. `verbose_feature_names_out=False` keeps the original
# column names (no "ordinal__" / "remainder__" prefixes).
ct = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(), categorial_features)
    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

encoded = ct.fit_transform(df)
# Rebuild a DataFrame from the transformed array; the encoded column comes
# first, followed by the passed-through columns.
df = pd.DataFrame(encoded, columns=ct.get_feature_names_out())
# Store the labels as integers (in the displayed output EAP, HPL and MWS
# become 0, 1 and 2).
df['author'] = df['author'].astype(int)
df


Unnamed: 0,author,text
0,0,"This process, however, afforded me no means of..."
1,1,It never once occurred to me that the fumbling...
2,0,"In his left hand was a gold snuff box, from wh..."
3,2,How lovely is spring As we looked from Windsor...
4,1,"Finding nothing else, not even gold, the Super..."
...,...,...
19574,0,"I could have fancied, while I looked at it, th..."
19575,0,The lids clenched themselves together as if in...
19576,0,"Mais il faut agir that is to say, a Frenchman ..."
19577,0,"For an item of news like this, it strikes us i..."


In [32]:


class_column = 'author'

# Count the rows per class once and reuse the result both for the report
# and for the size of the smallest class.
class_counts = df[class_column].value_counts()
print(class_counts)
min_size = class_counts.min()

print('min class size =', min_size)

author
0    7900
2    6044
1    5635
Name: count, dtype: int64
min class size = 5635


In [33]:
# Balance the classes by downsampling every class to the size of the smallest one.

# Drop rows with a missing label or text BEFORE sampling. Filtering afterwards
# could remove a different number of rows from each class and undo the balance.
df_complete = df.dropna(subset=[class_column, 'text'])

# Recompute the smallest class size on the filtered rows so every class can
# provide that many samples (unchanged when nothing was filtered out).
min_size = df_complete[class_column].value_counts().min()

# Sample each class with the same seed and concatenate once, instead of growing
# a DataFrame inside the loop starting from an empty frame.
df_downsampled = pd.concat(
    [
        df_complete[df_complete[class_column] == class_type].sample(min_size, random_state=777)
        for class_type in pd.unique(df_complete[class_column].values)
    ],
    ignore_index=True,
)
print(df_downsampled)
print(df_downsampled[class_column].value_counts())
# Later cells read the balanced data from `df`.
df = df_downsampled

       author                                               text
0           0  Is sure that it was not the voice of an Englis...
1           0  Meantime, our vegetation had perceptibly alter...
2           0               "Be a little more explicit," I said.
3           0  Yet, for some minutes longer I refrained and s...
4           0  Stay here to night, and I will send Jup down f...
...       ...                                                ...
16900       2  As a child I had not been content with the res...
16901       2  I trod air; no doubt, no fear, no hope even, d...
16902       2  A few words from us decided him, and hope and ...
16903       2  She did not in the least resemble either of he...
16904       2  The veil must be thicker than that invented by...

[16905 rows x 2 columns]
author
0    5635
1    5635
2    5635
Name: count, dtype: int64


In [34]:
from sklearn.model_selection import train_test_split

# Step 2: split the data into train/test parts (80/20).
# The texts are the inputs and the encoded author ids are the targets.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'].values,
    df['author'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['author']  # keep the class proportions the same in both parts
)

# Report the absolute and relative size of each part.
print(f"Training set size: {len(X_train)} ({len(X_train)/len(df)*100:.1f}%)")
print(f"Test set size: {len(X_test)} ({len(X_test)/len(df)*100:.1f}%)")



Training set size: 13524 (80.0%)
Test set size: 3381 (20.0%)


In [35]:
# Step 3: initialise the tokenizer and the BERT model.
# NOTE: despite the "BERT" naming, the checkpoint loaded here is RoBERTa ('roberta-base').
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
bert_model = RobertaModel.from_pretrained('roberta-base')

# Switch the model to evaluation mode (disables dropout, etc.).
bert_model.eval()

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# As the last expression of the cell, the returned model is also displayed as the cell output.
bert_model.to(device)


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mRobertaModel LOAD REPORT[0m from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
pooler.dense.bias               | MISSING    | 
pooler.dense.weight             | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [36]:
from tqdm import tqdm

def get_bert_embeddings(texts, batch_size=8):
    """Embed texts with the notebook-level `tokenizer` and `bert_model`.

    The texts are processed in batches: every batch is tokenized with padding
    and truncation to at most 512 tokens, moved to `device` and passed through
    `bert_model` without gradient tracking. The last-layer hidden state at
    sequence position 0 is taken as the embedding of each text.

    Args:
        texts: The texts to embed. A list, a NumPy array, a pandas Series or
            any other iterable of strings is accepted.
        batch_size: Number of texts per forward pass. Must be positive.

    Returns:
        A 2-D NumPy array with one row per text. For empty input the result
        has shape ``(0, bert_model.config.hidden_size)``.

    Raises:
        ValueError: If `batch_size` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    # Normalise the input to a plain Python list once. The tokenizer is then
    # always called with a list, whether the caller passed a NumPy array, a
    # pandas Series / extension array or a generic iterable.
    texts = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    n_texts = len(texts)

    if n_texts == 0:
        # np.vstack([]) raises, so return an explicit empty matrix that can
        # still be stacked with other per-text feature matrices.
        # NOTE(review): float32 is assumed to match the model's output dtype.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    n_batches = (n_texts + batch_size - 1) // batch_size
    embeddings = []

    # Outer progress bar counts texts, the inner (transient) one counts batches.
    with tqdm(total=n_texts, desc="Всего текстов", position=0) as pbar_texts:
        with tqdm(total=n_batches, desc="Батчи", position=1, leave=False) as pbar_batches:
            for start in range(0, n_texts, batch_size):
                batch_texts = texts[start:start + batch_size]

                encoded = tokenizer(
                    batch_texts,
                    padding=True,
                    truncation=True,
                    max_length=512,
                    return_tensors='pt'
                )
                # The model and its inputs have to live on the same device.
                encoded = {key: val.to(device) for key, val in encoded.items()}

                with torch.no_grad():
                    outputs = bert_model(**encoded)
                    # Keep only the vector at position 0 of every sequence.
                    cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()

                embeddings.append(cls_embeddings)

                pbar_texts.update(len(batch_texts))
                pbar_batches.update(1)

    return np.vstack(embeddings)


In [37]:
import numpy as np

def extract_additional_features(texts):
    """Compute simple surface statistics for every text.

    Each item is converted with ``str`` and described by three values, in
    this column order:

    0. number of characters,
    1. number of whitespace-separated words,
    2. mean word length in characters (0 when the text has no words).

    Args:
        texts: Iterable of texts.

    Returns:
        A float64 NumPy array of shape ``(len(texts), 3)``. Empty input gives
        shape ``(0, 3)`` so the result can always be stacked column-wise with
        other per-text feature matrices.
    """
    rows = []

    for text in texts:
        text_str = str(text)
        words = text_str.split()  # split once and reuse for both word statistics
        avg_word_length = sum(len(word) for word in words) / len(words) if words else 0.0
        rows.append((len(text_str), len(words), avg_word_length))

    # Fixed dtype and an explicit reshape keep the output 2-D and float even
    # for empty input or when no text contains a word.
    return np.array(rows, dtype=np.float64).reshape(-1, 3)

# Build the final feature matrices: BERT embeddings plus hand-crafted text statistics.
print("="*50)
print("BERT + ДОПОЛНИТЕЛЬНЫЕ ПРИЗНАКИ")
print("="*50)

# Compute the BERT embeddings.
# This is the slow step: the saved progress output shows roughly 10 minutes
# for the training texts and about 3 minutes for the test texts.
X_train_bert = get_bert_embeddings(X_train)
X_test_bert = get_bert_embeddings(X_test)

# Compute the additional features (text length, word count, average word length).
X_train_extra = extract_additional_features(X_train)
X_test_extra = extract_additional_features(X_test)

# Concatenate both feature groups column-wise.
X_train_embeddings = np.hstack([X_train_bert, X_train_extra])
X_test_embeddings = np.hstack([X_test_bert, X_test_extra])

# Report the number of columns of each feature group and of the combined matrix.
print(f"BERT размерность: {X_train_bert.shape[1]}")
print(f"Дополнительные признаки: {X_train_extra.shape[1]}")
print(f"Итоговая размерность: {X_train_embeddings.shape[1]}")


BERT + ДОПОЛНИТЕЛЬНЫЕ ПРИЗНАКИ


Всего текстов: 100%|██████████| 13524/13524 [10:24<00:00, 21.65it/s]
Всего текстов: 100%|██████████| 3381/3381 [02:46<00:00, 20.30it/s]


BERT размерность: 768
Дополнительные признаки: 3
Итоговая размерность: 771


In [38]:
print(f"Размерность эмбеддингов: {X_train_embeddings.shape[1]}")

# Sanity-check the label array before training: container type, dtype,
# a few sample values and the set of distinct classes.
unique_labels = np.unique(y_train)

print("Проверка меток классов:")
print(f"Тип y_train: {type(y_train)}")
print(f"Dtype y_train: {getattr(y_train, 'dtype', 'N/A')}")
print(f"Первые 10 элементов y_train: {y_train[:10]}")
print(f"Уникальные значения: {unique_labels}")
print(f"Количество классов: {len(unique_labels)}")


Размерность эмбеддингов: 771
Проверка меток классов:
Тип y_train: <class 'numpy.ndarray'>
Dtype y_train: int64
Первые 10 элементов y_train: [2 1 0 0 2 0 0 1 2 1]
Уникальные значения: [0 1 2]
Количество классов: 3


In [39]:
# from sklearn.model_selection import GridSearchCV, StratifiedKFold
# from xgboost import XGBClassifier
# from sklearn.metrics import f1_score, classification_report

# print("="*50)
# print("БЫСТРЫЙ GRID SEARCH ДЛЯ XGBOOST")
# print("="*50)

# param_grid_quick = {
#     'n_estimators': [200, 300],
#     'max_depth': [5, 7, 9],
#     'learning_rate': [0.05, 0.1],
#     'subsample': [0.8, 1.0],
#     'colsample_bytree': [0.8, 1.0],
# }

# print(f"Комбинаций: {2*3*2*2*2} = 48")
# print(f"С CV=3: 144 обучений\n")

# xgb = XGBClassifier(
#     random_state=42,
#     n_jobs=4,
#     eval_metric='mlogloss',
#     verbosity=3
# )

# cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# grid_search = GridSearchCV(
#     estimator=xgb,
#     param_grid=param_grid_quick,
#     cv=cv_strategy,
#     scoring='f1_weighted',
#     n_jobs=4,
#     verbose=3  # Показывать прогресс
# )

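# NOTE: `best_params_` exists only after fitting, so run the
# `grid_search.fit(...)` line at the bottom of this snippet before this report.
# print("\n" + "="*50)
# print("ЛУЧШИЕ ПАРАМЕТРЫ")
# print("="*50)
# for key, value in grid_search.best_params_.items():
#     print(f"  {key}: {value}")

# grid_search.fit(X_train_embeddings, y_train)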


# pip install xgboost

from xgboost import XGBClassifier
from sklearn.metrics import f1_score, classification_report

# Train an XGBoost classifier on the combined features and evaluate it on the test split.
print("="*50)
print("XGBOOST КЛАССИФИКАТОР")
print("="*50)

# Fixed hyperparameters; every value also occurs in the commented-out grid above.
# NOTE(review): presumably taken from a grid-search run — the search result is not shown here.
xgb_clf = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=4,
    eval_metric='mlogloss'
)

# Fit on the training features and predict the held-out test set.
xgb_clf.fit(X_train_embeddings, y_train)
y_pred = xgb_clf.predict(X_test_embeddings)

# Weighted F1 plus the per-class precision/recall/F1 report.
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"\nF1-score (XGBoost): {f1:.4f}")
print(classification_report(y_test, y_pred))


XGBOOST КЛАССИФИКАТОР

F1-score (XGBoost): 0.7739
              precision    recall  f1-score   support

           0       0.76      0.72      0.74      1127
           1       0.79      0.80      0.80      1127
           2       0.77      0.81      0.79      1127

    accuracy                           0.77      3381
   macro avg       0.77      0.77      0.77      3381
weighted avg       0.77      0.77      0.77      3381



In [40]:
# Predict labels for the test features with the fitted classifier.
# The same call is already made right after fitting, so this recomputes `y_pred`.
y_pred = xgb_clf.predict(X_test_embeddings)

In [41]:

# Step 8: compute the metrics.
# F1-score for multi-class classification, with three averaging modes.
f1_macro = f1_score(y_test, y_pred, average='macro')
f1_micro = f1_score(y_test, y_pred, average='micro')
f1_weighted = f1_score(y_test, y_pred, average='weighted')

print("\n" + "="*50)
print("РЕЗУЛЬТАТЫ")
print("="*50)
print(f"\nF1-score (macro):    {f1_macro:.4f}")
print(f"F1-score (micro):    {f1_micro:.4f}")
print(f"F1-score (weighted): {f1_weighted:.4f}")

# Detailed per-class report.
print("\n" + "="*50)
print("ДЕТАЛЬНЫЙ ОТЧЕТ ПО КЛАССАМ")
print("="*50)
print(classification_report(y_test, y_pred))

# Additionally: compare the classifier's `score` on the training and test sets.
# (The original comment announced feature importances, but none are computed here.)
# In the saved output the training score is about 0.999 against about 0.774 on
# the test set, i.e. the model fits the training data far better than unseen data.
print("\nТочность на обучающей выборке:", xgb_clf.score(X_train_embeddings, y_train))
print("Точность на тестовой выборке:", xgb_clf.score(X_test_embeddings, y_test))


РЕЗУЛЬТАТЫ

F1-score (macro):    0.7739
F1-score (micro):    0.7743
F1-score (weighted): 0.7739

ДЕТАЛЬНЫЙ ОТЧЕТ ПО КЛАССАМ
              precision    recall  f1-score   support

           0       0.76      0.72      0.74      1127
           1       0.79      0.80      0.80      1127
           2       0.77      0.81      0.79      1127

    accuracy                           0.77      3381
   macro avg       0.77      0.77      0.77      3381
weighted avg       0.77      0.77      0.77      3381


Точность на обучающей выборке: 0.9990387459331559
Точность на тестовой выборке: 0.7743271221532091
