In [2]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer, LineByLineTextDataset, DataCollatorForLanguageModeling, LineByLineWithSOPTextDataset
from sklearn.model_selection import train_test_split
import os
import torch
from torch.utils.data.dataset import Dataset
from transformers.tokenization_utils import PreTrainedTokenizer
from filelock import FileLock
from transformers.utils import logging
from typing import Dict, List, Optional
import pickle
import random
import time
import re

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def text_normalize(text):
    text = text.lower()
    text = text.strip() 
    text = re.sub(r'[^\w\s\n]', '', text)
    text = text.replace('\n\n', '\n')
    text = re.sub(r'\.{3,}', '', text)
    text = re.sub(r' - ', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    text = '\n'.join(['<start> ' + line + ' <end>' for line in text.split('\n') if line != '' and len(line.split()) == 5])
 
    return text

In [4]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base", use_fast=False)
tokenizer.add_tokens('\n')
model = AutoModel.from_pretrained('vinai/phobert-base')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
df = pd.read_csv('processed_test.csv')

combined_text = " ".join(df['content'].dropna().astype(str).tolist())

In [6]:
tokenizer.convert_tokens_to_ids(tokenizer.pad_token)

1

In [7]:
tokenizer

PhobertTokenizer(name_or_path='vinai/phobert-base', vocab_size=64000, model_max_length=256, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True)

In [8]:
tokenizer.convert_tokens_to_ids(tokenizer.bos_token)

0

In [9]:
tokenizer.convert_tokens_to_ids(tokenizer.eos_token)

2

In [13]:
class CustomerDataset(Dataset):
    def __init__(self, tokenizer, file_path, block_size: int, text_column: 'content'):
        block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False)
        self.examples = []
        self.mask = []
        
        # Đọc dữ liệu từ file CSV
        df = pd.read_csv(file_path)
        
        # Kết hợp nội dung của cột văn bản
        combined_text = " ".join(df[text_column].dropna().astype(str).tolist())
        
        # Token hóa và chuyển đổi thành ID
        tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(combined_text))
        
        if len(tokenized_text) < block_size:
            inds = [tokenizer.convert_tokens_to_ids(tokenizer.bos_token)] + tokenized_text + [tokenizer.convert_tokens_to_ids(tokenizer.eos_token)] + \
                   (block_size - len(tokenized_text)) * [tokenizer.convert_tokens_to_ids(tokenizer.pad_token)]
            mask = [x != tokenizer.convert_tokens_to_ids(tokenizer.pad_token) for x in inds]
        else:
            inds = [tokenizer.convert_tokens_to_ids(tokenizer.bos_token)] + tokenized_text[:block_size] + [tokenizer.convert_tokens_to_ids(tokenizer.eos_token)]
            mask = [1] * len(inds)
        
        self.examples.append(inds)
        self.mask.append(mask)
            
    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i) -> torch.Tensor:
        return {
            "input_ids": torch.tensor(self.examples[i], dtype=torch.long), 
            "attention_mask": torch.tensor(self.mask[i], dtype=torch.long)
        }

In [14]:
from transformers import LineByLineTextDataset, DataCollatorForLanguageModeling, LineByLineWithSOPTextDataset

def load_dataset(train_path, tokenizer):
    train_dataset = CustomerDataset(
        tokenizer=tokenizer,
        file_path=train_path,
        block_size=256
    )
    
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    
    return train_dataset, data_collator

train_dataset, data_collator = load_dataset(combined_text, tokenizer)

TypeError: __init__() missing 1 required positional argument: 'text_column'

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')