In [14]:
import transformers
from transformers import BertTokenizer, BertModel, BertForMaskedLM, BertConfig, BertPreTrainedModel
from torch.optim import AdamW
import datasets
import pandas as pd
import numpy as np
import nltk
import sentencepiece
import re
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import contractions

In [15]:
dataset = pd.read_csv("dataset/cnn_dailymail/train.csv", nrows=1000)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/merrick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
## keep the first N articles if you want to keep it lite 
dtf = pd.DataFrame(dataset).rename(columns={"article":"text", 
      "highlights":"y"})[["text","y"]]
dtf.head()

Unnamed: 0,text,y
0,By . Associated Press . PUBLISHED: . 14:11 EST...,"Bishop John Folda, of North Dakota, is taking ..."
1,(CNN) -- Ralph Mata was an internal affairs li...,Criminal complaint: Cop used his role to help ...
2,A drunk driver who killed a young woman in a h...,"Craig Eccleston-Todd, 27, had drunk at least t..."
3,(CNN) -- With a breezy sweep of his pen Presid...,Nina dos Santos says Europe must be ready to a...
4,Fleetwood are the only team still to have a 10...,Fleetwood top of League One after 2-0 win at S...


In [17]:
def utils_preprocess_text(txt, punkt=True, lower=True, slang=True, lst_stopwords=None, stemm=False, lemm=True):
    ### separate sentences with '. '
    txt = re.sub(r'\.(?=[^ \W\d])', '. ', str(txt))
    ### remove punctuations and characters
    txt = re.sub(r'[^\w\s]', '', txt) if punkt is True else txt
    ### strip
    txt = " ".join([word.strip() for word in txt.split()])
    ### lowercase
    txt = txt.lower() if lower is True else txt
    ### slang
    txt = contractions.fix(txt) if slang is True else txt   
    ### tokenize (convert from string to list)
    lst_txt = txt.split()
    ### stemming (remove -ing, -ly, ...)
    if stemm is True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_txt = [ps.stem(word) for word in lst_txt]
    ### lemmatization (convert the word into root word)
    if lemm is True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_txt = [lem.lemmatize(word) for word in lst_txt]
    ### remove Stopwords
    if lst_stopwords is not None:
        lst_txt = [word for word in lst_txt if word not in 
                   lst_stopwords]
    ### back to string
    txt = " ".join(lst_txt)
    return txt

In [18]:
# preprocess text
dtf["text"] = dtf["text"].apply(lambda x: utils_preprocess_text(x))
dtf["y"] = dtf["y"].apply(lambda x: utils_preprocess_text(x))

In [19]:
# Instantiate the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text in the DataFrame
dtf['text'] = dtf['text'].apply(lambda x: tokenizer.encode(x, return_tensors='pt'))

# Tokenize the target summaries if applicable
dtf['y'] = dtf['y'].apply(lambda x: tokenizer.encode(x, return_tensors='pt'))


Token indices sequence length is longer than the specified maximum sequence length for this model (918 > 512). Running this sequence through the model will result in indexing errors


In [20]:
print(dtf["text"][0])

tensor([[  101,  2011,  3378,  2811,  2405, 15471,  2487,  9765,  2423,  2255,
          2286,  7172, 16710,  2575,  9765,  2423,  2255,  2286,  1996,  3387,
          1997,  1996, 23054,  3234,  5801,  1999,  2167,  7734,  5292,  6086,
          9280,  3634,  1997,  2277,  2266,  1999, 23054,  2882,  9292,  1998,
         27435,  2000,  1996, 28389,  1037,  7865,  1999,  2397,  2244,  1998,
          2220,  2255,  1996,  2110,  2740,  2533,  5292,  3843,  2019,  7319,
          1997,  7524,  2005,  3087,  2040,  3230,  2274,  2277,  1998,  2165,
         15661,  3387,  2198, 10671,  2050, 15885,  1997,  1996, 23054,  3234,
          5801,  1999,  2167,  7734,  5292,  6086,  9280,  3634,  1997,  2277,
          2266,  1999, 23054,  2882,  9292,  1998, 27435,  2000,  1996, 28389,
          1037,  2110, 10047, 23041,  3989,  2565,  3208,  9618, 18473,  2360,
          1996,  3891,  2003,  2659,  2021,  2880,  2514,  2009,  2590,  2000,
          9499,  2111,  2000,  1996,  2825,  7524,  

In [21]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # Access the data using the 'text' key
        return self.data[index]['text']

# Construct the data loader with the same key 'text'
train_dataset = CustomDataset(dtf)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, num_workers=4)


In [22]:
# Example for configuring a pre-trained BERT model
# Load pre-trained BERT model and configuration
model_name = 'bert-base-uncased'
config = BertConfig.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name, config=config)

# Modify the model configuration
config.num_labels = 1  # Set number of labels for your specific task (e.g., binary classification)
config.hidden_dropout_prob = 0.2  # Set dropout probability for hidden layers
config.attention_probs_dropout_prob = 0.2  # Set dropout probability for attention layer

# Create a custom architecture by subclassing BertPreTrainedModel
class CustomBertModel(BertPreTrainedModel):
    def __init__(self, config):
        super(CustomBertModel, self).__init__(config)
        # Define the layers of your custom model architecture here
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.linear = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None):
        # Forward pass of your custom model
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs[1]  # Use the pooled output for classification
        pooled_output = self.dropout(pooled_output)
        logits = self.linear(pooled_output)
        return logits

# Example for configuring model hyperparameters
learning_rate = 0.001
batch_size = 16
num_epochs = 10
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    # Load data, prepare data loader, and iterate through batches
    for batch in train_dataloader:
        # Zero gradients, forward pass, compute loss
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        logits = model(input_ids, attention_mask)
        # Compute loss and update weights
        loss = compute_loss(logits, batch['labels'])
        loss.backward()
        optimizer.step()

# # Example for evaluating the model
# # Load test data, prepare data loader
# # Forward pass and compute evaluation metrics
# for batch in test_data_loader:
#     input_ids = batch['input_ids']
#     attention_mask = batch['attention_mask']
#     logits = model(input_ids, attention_mask)
#     # Compute evaluation metrics (e.g., ROUGE, BLEU) using logits and ground truth summaries



Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/homebrew/Cellar/python@3.11/3.11.2_1/Frameworks/Python.framework/Ver