In [None]:
import random

from tqdm import tqdm

import numpy as np
import pandas as pd

import re
import string

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import log_loss

from sklearn.feature_extraction.text import CountVectorizer

from keras.preprocessing.sequence import pad_sequences

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.nn.functional import relu, sigmoid
from torch import optim

In [None]:
# Display the installed PyTorch version so the run environment is recorded with the notebook.
torch.__version__

'2.6.0+cu124'

In [None]:
# Choose the compute device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print("Number of GPU: ", torch.cuda.device_count())
if torch.cuda.is_available():
    # get_device_name() raises on a machine without CUDA, so only query it when a GPU exists.
    print("GPU Name: ", torch.cuda.get_device_name())
print('Using device:', device)

Number of GPU:  1
GPU Name:  Tesla T4
Using device: cuda


In [None]:
# Run-wide configuration constants.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEBUG = 1                      # NOTE(review): not read anywhere in the visible cells
N_FOLDS = 5                    # number of cross-validation folds
PATH = "/content/sample_data"  # directory that holds train.csv / test.csv
DEVICE

'cuda'

In [None]:
# Seed every random number generator the notebook can touch so reruns are repeatable:
# Python's `random`, NumPy's global RNG, and PyTorch on CPU and on all CUDA devices.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)
# Safe to call without a GPU; the call is ignored when CUDA is unavailable.
torch.cuda.manual_seed_all(42)

In [None]:
# Load the labelled training data from the configured data directory.
df = pd.read_csv(f"{PATH}/train.csv")

In [None]:
# Peek at the first five rows (head() defaults to 5).
df.head()

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL


In [None]:
# Samples per author, plus the number of distinct authors.
value_counts = df['author'].value_counts()
count = value_counts.shape[0]
value_counts,count

(author
 EAP    7900
 MWS    6044
 HPL    5635
 Name: count, dtype: int64,
 3)

In [None]:
# One-hot encode the `author` label into one indicator column per author.
encoded_df = pd.get_dummies(data=df, columns=['author'])
encoded_df.head()

Unnamed: 0,id,text,author_EAP,author_HPL,author_MWS
0,id26305,"This process, however, afforded me no means of...",True,False,False
1,id17569,It never once occurred to me that the fumbling...,False,True,False
2,id11008,"In his left hand was a gold snuff box, from wh...",True,False,False
3,id27763,How lovely is spring As we looked from Windsor...,False,False,True
4,id12958,"Finding nothing else, not even gold, the Super...",False,True,False


In [None]:
# Turn the boolean indicator into a 0/1 integer column.
# astype gives the same result as mapping {True: 1, False: 0} on the first run and,
# unlike the bool-keyed map, still works if this cell is executed again on the int column.
encoded_df['author_EAP'] = encoded_df['author_EAP'].astype('int64')
encoded_df.head(5)

Unnamed: 0,id,text,author_EAP,author_HPL,author_MWS
0,id26305,"This process, however, afforded me no means of...",1,False,False
1,id17569,It never once occurred to me that the fumbling...,0,True,False
2,id11008,"In his left hand was a gold snuff box, from wh...",1,False,False
3,id27763,How lovely is spring As we looked from Windsor...,0,False,True
4,id12958,"Finding nothing else, not even gold, the Super...",0,True,False


In [None]:
# Turn the boolean indicator into a 0/1 integer column.
# astype gives the same result as mapping {True: 1, False: 0} on the first run and,
# unlike the bool-keyed map, still works if this cell is executed again on the int column.
encoded_df['author_HPL'] = encoded_df['author_HPL'].astype('int64')
encoded_df.head(5)

Unnamed: 0,id,text,author_EAP,author_HPL,author_MWS
0,id26305,"This process, however, afforded me no means of...",1,0,False
1,id17569,It never once occurred to me that the fumbling...,0,1,False
2,id11008,"In his left hand was a gold snuff box, from wh...",1,0,False
3,id27763,How lovely is spring As we looked from Windsor...,0,0,True
4,id12958,"Finding nothing else, not even gold, the Super...",0,1,False


In [None]:
# Turn the boolean indicator into a 0/1 integer column.
# astype gives the same result as mapping {True: 1, False: 0} on the first run and,
# unlike the bool-keyed map, still works if this cell is executed again on the int column.
encoded_df['author_MWS'] = encoded_df['author_MWS'].astype('int64')
encoded_df.head(5)

Unnamed: 0,id,text,author_EAP,author_HPL,author_MWS
0,id26305,"This process, however, afforded me no means of...",1,0,0
1,id17569,It never once occurred to me that the fumbling...,0,1,0
2,id11008,"In his left hand was a gold snuff box, from wh...",1,0,0
3,id27763,How lovely is spring As we looked from Windsor...,0,0,1
4,id12958,"Finding nothing else, not even gold, the Super...",0,1,0


🔹 این قسمت کوانتایل‌های مختلف را محاسبه می‌کند:

q = 0.50 (صدک ۵۰٪) → میانه طول جملات

q = 0.90 (صدک ۹۰٪) → ۱۰٪ بلندترین جملات چه اندازه‌ای دارند؟

q = 0.95 (صدک ۹۵٪) → ۵٪ بلندترین جملات

q = 0.99 (صدک ۹۹٪) → ۱٪ بلندترین جملات

q = 0.999 (صدک ۹۹.۹٪) → تقریباً طولانی‌ترین جمله

In [None]:
# Length of every sentence, measured in space-separated tokens.
lens = [len(sentence.split(' ')) for sentence in df['text']]

# Print selected quantiles of the sentence-length distribution.
for q in (.50, .90, .95, .99, .999):
    print (q, np.quantile(lens, q))

0.5 23.0
0.9 48.0
0.95 58.0
0.99 85.0
0.999 147.42199999999866


In [None]:
# Split off 20% of the encoded rows as `test`; `df` is rebound to the remaining 80%.
# Both parts get a fresh 0..n-1 index so positional and label indexing agree afterwards.
df, test = (
    part.reset_index(drop=True)
    for part in train_test_split(encoded_df, test_size=.20, random_state=1723)
)

In [None]:
# First five rows of the training part after the split (head() defaults to 5).
df.head()

Unnamed: 0,id,text,author_EAP,author_HPL,author_MWS
0,id01053,"The opinion of Bob, the devil who kept dark ab...",1,0,0
1,id25613,EH AHHHH AH E'YAAHHHH. . .,0,1,0
2,id14131,I won't say that all this is wholly true in bo...,0,1,0
3,id07785,Curtis Whateley of the undecayed branch was ho...,0,1,0
4,id27141,The bust of the General was unquestionably the...,1,0,0


In [None]:
class TextPreprocessing:
    """Light text normalisation: strip digits, blank out punctuation, lower-case."""

    def transform(self,
                  seq,
                  ys=None):
        """Clean every string in ``seq``.

        Steps, per string:
          1. ASCII digits ``0-9`` are deleted.
          2. Every symbol from ``string.punctuation`` and the extra list below
             is replaced by a single space (so runs of spaces can remain).
          3. The result is lower-cased.

        Args:
            seq: iterable of ``str`` (a pandas Series or a plain list both work).
            ys: unused; kept so existing calls that pass it keep working.

        Returns:
            list[str]: the cleaned strings, in input order.
        """
        regular_punctuation = list(string.punctuation)
        extra_punctuation = [
            ',', '.', '"', ':', ')', '(', '!', '?', '|', ';', "'", '$', '&',
            '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
            '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
            '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
            '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
            '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
            '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
            'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
            '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
            '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤'
        ]
        symbols = set(regular_punctuation + extra_punctuation)

        # One translation table handles digit removal and single-character
        # punctuation in a single pass per string, instead of one full pass
        # over the whole corpus for every symbol.
        table = {ord(digit): None for digit in string.digits}
        table.update({ord(sym): ' ' for sym in symbols if len(sym) == 1})
        # Entries longer than one code point (e.g. a letter plus a combining
        # mark) cannot go through str.translate; replace them as substrings.
        multi_char = [sym for sym in symbols if len(sym) > 1]

        cleaned = []
        for text in seq:
            text = text.translate(table)
            for sym in multi_char:
                text = text.replace(sym, ' ')
            cleaned.append(text.lower())

        return cleaned

In [None]:
# Clean the training sentences (digits, punctuation, case) before they are embedded.
pp = TextPreprocessing()
df = df.assign(text=pp.transform(df['text']))

df.head(5)

Unnamed: 0,id,text,author_EAP,author_HPL,author_MWS
0,id01053,the opinion of bob the devil who kept dark ab...,1,0,0
1,id25613,eh ahhhh ah e yaahhhh,0,1,0
2,id14131,i won t say that all this is wholly true in bo...,0,1,0
3,id07785,curtis whateley of the undecayed branch was ho...,0,1,0
4,id27141,the bust of the general was unquestionably the...,1,0,0


In [None]:
# !D:\kaggle\gpu\myenv\Scripts\pip.exe install transformers

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm

# Compute device for the BERT encoder and, later, the classifier.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained uncased BERT-base tokenizer and encoder; the encoder is only used for
# inference here, so it is moved to the device and put in eval mode.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model1 = BertModel.from_pretrained('bert-base-uncased')
model1.to(device)
model1.eval()

def get_bert_embeddings_batch(texts, batch_size=32, max_length=128):
    """Encode ``texts`` with the module-level BERT model, one batch at a time.

    Args:
        texts: sliceable sequence of strings (e.g. a list).
        batch_size: number of texts tokenised and encoded per forward pass.
        max_length: token limit passed to the tokenizer; longer inputs are truncated.

    Returns:
        list of 1-D numpy arrays, one per input text, holding the hidden state at
        token position 0 of the model's last layer.
    """
    collected = []
    batch_starts = range(0, len(texts), batch_size)

    # Pure inference: no gradients are needed.
    with torch.no_grad():
        for start in tqdm(batch_starts):
            chunk = texts[start:start + batch_size]

            encoded = tokenizer(chunk, return_tensors="pt", truncation=True,
                                padding=True, max_length=max_length)

            # Move the tokenizer outputs to the same device as the model.
            encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

            hidden = model1(**encoded).last_hidden_state

            # Keep only the first-token vector of every sequence.
            first_token = hidden[:, 0, :]

            # Back to CPU / numpy so the results do not hold device memory.
            collected.extend(first_token.cpu().numpy())

    return collected

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Embed the cleaned training sentences once; the vectors are indexed per fold later.
texts = list(df['text'])
bert_embeddings = get_bert_embeddings_batch(texts=texts, batch_size=32)

100%|██████████| 490/490 [01:13<00:00,  6.68it/s]


In [None]:
class BERTVectorDataset(torch.utils.data.Dataset):
    """Pairs precomputed embedding vectors with their target columns.

    Args:
        embeddings: sequence of equal-length vectors (list of numpy arrays,
            2-D array, or tensor), one per row of ``df``.
        df: DataFrame holding the label columns, row-aligned with ``embeddings``.
        targets: list of column names in ``df`` to use as labels.

    Raises:
        ValueError: if ``embeddings`` and ``df`` do not have the same number of rows.
    """

    def __init__(self, embeddings, df, targets):
        if torch.is_tensor(embeddings):
            self.embeddings = embeddings.detach().clone().to(dtype=torch.float32)
        else:
            # Stack into one ndarray first: torch.tensor() on a list of ndarrays
            # converts element by element, which is slow and emits a UserWarning.
            self.embeddings = torch.tensor(np.asarray(embeddings, dtype=np.float32),
                                           dtype=torch.float32)
        self.labels = torch.tensor(df[targets].values, dtype=torch.float32)

        # A silent length mismatch would pair vectors with the wrong labels.
        if len(self.embeddings) != len(self.labels):
            raise ValueError(
                f"embeddings ({len(self.embeddings)}) and labels ({len(self.labels)}) differ in length"
            )

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        # (feature vector, label vector) for one sample.
        return self.embeddings[idx], self.labels[idx]

In [None]:
class MLPClassifier(nn.Module):
    """One-hidden-layer perceptron that maps an embedding vector to raw class logits.

    Args:
        input_dim: size of each input vector.
        hidden_dim: width of the hidden layer.
        n_targets: number of output logits.
        dropout: dropout probability applied after the hidden activation
            (defaults to the previously hard-coded 0.2).
    """

    def __init__(self, input_dim=768, hidden_dim=128, n_targets=3, dropout=0.2):
        super().__init__()
        # Layer order (and therefore state_dict keys model.0 / model.3) is unchanged.
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, n_targets)
        )

    def forward(self, x):
        """Return unnormalised logits of shape (..., n_targets); no sigmoid/softmax is applied."""
        return self.model(x)

In [None]:
EPOCHS = 5
BATCH_SIZE = 64

# K-fold cross-validation over the training part; a fresh classifier is trained per fold.
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
targets = ['author_EAP', 'author_HPL', 'author_MWS']
fold_scores = []  # validation log loss after the last epoch of every fold

for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
    print(f"\n🔁 Fold {fold + 1}")

    # kf.split yields positional indices, which also index the embedding list.
    train_dataset = BERTVectorDataset([bert_embeddings[i] for i in train_idx], df.iloc[train_idx], targets)
    val_dataset = BERTVectorDataset([bert_embeddings[i] for i in val_idx], df.iloc[val_idx], targets)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    model = MLPClassifier()
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    # One independent sigmoid/BCE term per author column.
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(EPOCHS):
        model.train()
        total_loss = 0

        for x_batch, y_batch in tqdm(train_loader, desc=f"Epoch {epoch+1}", leave=False):
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            y_pred = model(x_batch)
            loss = criterion(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"✅ Epoch {epoch+1} - Train Loss: {total_loss/len(train_loader):.4f}")

        # === Validation ===
        model.eval()
        val_loss = 0
        y_true, y_preds = [], []

        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                y_pred = model(x_batch)
                loss = criterion(y_pred, y_batch)
                val_loss += loss.item()

                y_true.append(y_batch.cpu().numpy())
                y_preds.append(torch.sigmoid(y_pred).cpu().numpy())

        y_true = np.concatenate(y_true)
        y_prob = np.concatenate(y_preds).astype(np.float64)
        # The sigmoid scores are independent per class, so a row need not sum to 1.
        # log_loss expects each row to be a probability distribution; rescale the rows
        # so the reported number is a proper multiclass log loss.
        y_prob = y_prob / y_prob.sum(axis=1, keepdims=True)

        fold_log_loss = log_loss(y_true, y_prob)
        print(f"📉 Fold {fold+1} - Validation Loss: {val_loss/len(val_loader):.4f}, LogLoss: {fold_log_loss:.4f}")

    # Keep the last-epoch score of this fold for the cross-validation summary.
    fold_scores.append(fold_log_loss)

print(f"\n📊 CV LogLoss (last epoch): {np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")


🔁 Fold 1


  self.embeddings = torch.tensor(embeddings, dtype=torch.float32)


✅ Epoch 1 - Train Loss: 0.4587
📉 Fold 1 - Validation Loss: 0.4013, LogLoss: 0.6694




✅ Epoch 2 - Train Loss: 0.3821
📉 Fold 1 - Validation Loss: 0.3797, LogLoss: 0.6229




✅ Epoch 3 - Train Loss: 0.3628
📉 Fold 1 - Validation Loss: 0.3566, LogLoss: 0.5769




✅ Epoch 4 - Train Loss: 0.3512
📉 Fold 1 - Validation Loss: 0.3463, LogLoss: 0.5561




✅ Epoch 5 - Train Loss: 0.3404
📉 Fold 1 - Validation Loss: 0.3558, LogLoss: 0.5818

🔁 Fold 2




✅ Epoch 1 - Train Loss: 0.4570
📉 Fold 2 - Validation Loss: 0.3932, LogLoss: 0.6570




✅ Epoch 2 - Train Loss: 0.3815
📉 Fold 2 - Validation Loss: 0.3718, LogLoss: 0.6191




✅ Epoch 3 - Train Loss: 0.3618
📉 Fold 2 - Validation Loss: 0.3633, LogLoss: 0.5917




✅ Epoch 4 - Train Loss: 0.3487
📉 Fold 2 - Validation Loss: 0.3613, LogLoss: 0.5959




✅ Epoch 5 - Train Loss: 0.3395
📉 Fold 2 - Validation Loss: 0.3494, LogLoss: 0.5768

🔁 Fold 3




✅ Epoch 1 - Train Loss: 0.4538
📉 Fold 3 - Validation Loss: 0.4112, LogLoss: 0.6797




✅ Epoch 2 - Train Loss: 0.3795
📉 Fold 3 - Validation Loss: 0.3811, LogLoss: 0.6369




✅ Epoch 3 - Train Loss: 0.3587
📉 Fold 3 - Validation Loss: 0.3749, LogLoss: 0.6031




✅ Epoch 4 - Train Loss: 0.3483
📉 Fold 3 - Validation Loss: 0.3770, LogLoss: 0.6372




✅ Epoch 5 - Train Loss: 0.3359
📉 Fold 3 - Validation Loss: 0.3628, LogLoss: 0.6080

🔁 Fold 4




✅ Epoch 1 - Train Loss: 0.4558
📉 Fold 4 - Validation Loss: 0.3793, LogLoss: 0.5973




✅ Epoch 2 - Train Loss: 0.3831
📉 Fold 4 - Validation Loss: 0.3566, LogLoss: 0.5878




✅ Epoch 3 - Train Loss: 0.3652
📉 Fold 4 - Validation Loss: 0.3490, LogLoss: 0.5811




✅ Epoch 4 - Train Loss: 0.3499
📉 Fold 4 - Validation Loss: 0.3455, LogLoss: 0.5667




✅ Epoch 5 - Train Loss: 0.3407
📉 Fold 4 - Validation Loss: 0.3457, LogLoss: 0.5711

🔁 Fold 5




✅ Epoch 1 - Train Loss: 0.4669
📉 Fold 5 - Validation Loss: 0.4019, LogLoss: 0.6691




✅ Epoch 2 - Train Loss: 0.3853
📉 Fold 5 - Validation Loss: 0.3804, LogLoss: 0.6153




✅ Epoch 3 - Train Loss: 0.3638
📉 Fold 5 - Validation Loss: 0.3709, LogLoss: 0.5949




✅ Epoch 4 - Train Loss: 0.3491
📉 Fold 5 - Validation Loss: 0.3641, LogLoss: 0.5946


                                                           

✅ Epoch 5 - Train Loss: 0.3386
📉 Fold 5 - Validation Loss: 0.3596, LogLoss: 0.5794





In [None]:
# Load the test set from the configured data directory and show its first rows.
df_test = pd.read_csv(f"{PATH}/test.csv")
df_test.head(5)

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."
2,id00134,And when they had broken down the frail door t...
3,id27757,While I was thinking how I should possibly man...
4,id04081,I am not sure to what limit his knowledge may ...


In [None]:
# Clean the test sentences with the same TextPreprocessing instance that was applied to
# the training text, so both sets of embeddings come from identically normalised input.
test_texts = pp.transform(df_test['text'])
test_bert_embeddings = get_bert_embeddings_batch(test_texts, batch_size=32)

100%|██████████| 263/263 [00:48<00:00,  5.45it/s]


In [None]:
class TestBERTDataset(torch.utils.data.Dataset):
    """Label-free dataset over precomputed embedding vectors (used for inference).

    Args:
        embeddings: sequence of equal-length vectors (list of numpy arrays,
            2-D array, or tensor).
    """

    def __init__(self, embeddings):
        if torch.is_tensor(embeddings):
            self.embeddings = embeddings.detach().clone().to(dtype=torch.float32)
        else:
            # Stack into one ndarray first: torch.tensor() on a list of ndarrays
            # converts element by element, which is slow and emits a UserWarning.
            self.embeddings = torch.tensor(np.asarray(embeddings, dtype=np.float32),
                                           dtype=torch.float32)

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        # Only the feature vector; there are no labels at test time.
        return self.embeddings[idx]


In [None]:
# Batched loader over the test embeddings; shuffle stays off so predictions keep row order.
test_dataset = TestBERTDataset(embeddings=test_bert_embeddings)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False)


In [None]:
# NOTE: `model` is whatever the cross-validation loop assigned last, i.e. the
# classifier trained on the final fold only; earlier folds' models are not used.
model.eval()
all_preds = []

with torch.no_grad():
    for x_batch in test_loader:
        x_batch = x_batch.to(device)
        # The classifier was trained with BCEWithLogitsLoss, so sigmoid maps its logits to scores.
        y_pred = torch.sigmoid(model(x_batch))
        # Every sentence has exactly one author, but independent sigmoids need not sum to 1;
        # rescale each row into a probability distribution over the three authors.
        y_pred = y_pred / y_pred.sum(dim=1, keepdim=True).clamp_min(1e-12)
        all_preds.extend(y_pred.cpu().numpy())


In [None]:
# Sanity check: print the type of the first batch the test loader yields, then stop.
for x_batch in test_loader:
    print(type(x_batch))
    break

<class 'torch.Tensor'>


In [None]:
# Build the submission table: one score column per entry of `targets`
# (['author_EAP', 'author_HPL', 'author_MWS']) followed by the test-row id.
# NOTE(review): confirm the column names and order against the competition's sample submission.
preds_df = pd.DataFrame(all_preds, columns=targets).assign(id=df_test['id'].to_numpy())

# Save the CSV that will be submitted.
preds_df.to_csv("submission1.csv", index=False)


In [None]:
# Show the first five submission rows.
preds_df.head(5)

Unnamed: 0,author_EAP,author_HPL,author_MWS,id
0,0.037342,0.164174,0.830265,id02310
1,0.793377,0.158953,0.023739,id24541
2,0.011289,0.992705,0.000752,id00134
3,0.383611,0.665462,0.001784,id27757
4,0.364285,0.446405,0.272205,id04081
