In [7]:
import os
import csv
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.optim import SGD, AdamW
from transformers import BertTokenizer, BertForTokenClassification
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
from transformers.models.roberta.modeling_roberta import RobertaModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm


In [8]:
# Set global variables and load data
max_seq = 75
data_path = "Train_Tagged_Titles.tsv"
df = pd.read_csv(data_path, sep="\t", dtype=str, keep_default_na=False, na_values=[""], quoting=csv.QUOTE_NONE)

In [9]:
# Define dunction to add additional classes
def process_row(row, last_an_entity):
    if pd.isna(row['Tag']):
        if last_an_entity is not None:
            return 'I-' + last_an_entity[0]
        else:
            return row['Tag']
    else:
        last_an_entity[0] = row['Tag']
        return 'B-' + row['Tag']

In [10]:
# Tag data with new classes
# Initialize a list to keep track of the last non-NaN entity
last_non_nan_entity = [None]

# Use apply with a lambda function
df['mod_Tag'] = df.apply(lambda row: process_row(row, last_non_nan_entity), axis=1)

# Check the result
df_entities = df[['Record Number','Token','mod_Tag']]
vocab = ['[PAD]'] + df_entities['mod_Tag'].unique().tolist()
voc_map = {}
for label in vocab:
    voc_map[label] = len(voc_map)

rev_map = {v: (k[2:] if (k[:2]=='B-') else '' ) for k,v in voc_map.items()}


In [11]:
# Define class models
class EntityNamingModel(nn.Module):

    def __init__(self, model_source = './bert_for_ebay'):
        super(EntityNamingModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained(model_source, num_labels=len(voc_map))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)
        
        return output

class RobertaNamingModel(nn.Module):

    def __init__(self, config, dropout = 0.1, num_labels = 70):
        super(RobertaNamingModel, self).__init__()

        # Adds Roberta followed by a classifier layer
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(config.hidden_size, num_labels)
        self.num_labels = num_labels

    def forward(self, input_id, mask, labels):
        # Feed forward
        logits = self.roberta(input_ids=input_id, attention_mask=mask, return_dict=False)
        logits = self.dropout(logits[0])
        logits = self.classifier(logits)

        # Generate Losses
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1 , self.num_labels) , labels.view(-1))

        
        return loss, logits

In [12]:
# Defines DataSets for training ebay listings
class DataSeq(torch.utils.data.Dataset):

    def __init__(self, text, labels):

        self.labels = labels
        self.texts = tokenizer(text,padding='max_length', max_length = max_seq, 
                       truncation=True, return_tensors="pt")

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_data(self, idx):
        batch_data = {key: value[idx] for key, value in self.texts.items()}
        return batch_data

    def get_batch_labels(self, idx):
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_texts = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)
        return batch_texts, batch_labels
    
# Define function for adding a progress bar while tokenizing
def tokenizer_with_progress(large_batch):
    tokenized_texts = {'input_ids' : [],
                       'attention_mask' : [],
                       }
    for text in tqdm(large_batch, desc="Tokenizing", unit="texts"):
        for k,v in tokenizer(text,padding='max_length', max_length = max_seq, 
                       truncation=True, return_tensors="pt").items():
            tokenized_texts[k].append(v[0])
        
    for k,v in tokenized_texts.items():
        if(v != []):
            tokenized_texts[k] = torch.stack(v)
    return tokenized_texts

# Define datasets for inference on ebay listings
class DataInf(torch.utils.data.Dataset):

    def __init__(self, text):
        self.texts = tokenizer_with_progress(text)

    def __len__(self):
        return len(self.texts['input_ids'])

    def get_batch_data(self, idx):
        batch_data = {key: value[idx] for key, value in self.texts.items()}
        return batch_data

    def __getitem__(self, idx):

        batch_texts = self.get_batch_data(idx)
        return batch_texts
    

## BERT (Bidirectional Encoder Representations from Transformers)
- **Release Year**: 2018
- **Key Features**:
  - Utilizes the Transformer architecture.
  - Pre-trained on a large corpus of unlabelled text including the entire Wikipedia (2,500M words) and Book Corpus (800M words).
  - Uses Masked Language Model (MLM) and Next Sentence Prediction (NSP) for training.
- **Applications**: Sentiment analysis, question answering, language inference.

In [None]:
# Bert specific globals
model_name = "bert-base-german-cased"
model_source = "./bert_for_ebay"
tokenizer = BertTokenizer.from_pretrained(model_name)


In [None]:
train_seq = df['Title'].unique().tolist()

df_entities['Tokenized_Length'] = df_entities['Token'].apply(lambda x: len(tokenizer.tokenize(x)))

# Group the DataFrame by 'Record Number'
grouped_entities = df_entities.groupby('Record Number')

# Initialize a numpy array with zeros for label ids
token_labels = np.zeros(shape=(len(train_seq), max_seq), dtype=np.int32)

for i in range(5000):
    if str(i + 1) in grouped_entities.groups:
        curr_entities = grouped_entities.get_group(str(i + 1))
        pointer = 1
        for _, row in curr_entities.iterrows():
            token_len = row['Tokenized_Length']
            token_labels[i, pointer:(pointer + token_len)] = np.array([voc_map[row['mod_Tag']]])
            pointer += token_len

# Split data into train and validation sets
train_ids, val_ids, train_labels, val_labels = train_test_split(train_seq, token_labels, test_size=0.05, random_state=42)

In [None]:
# Hyperparameters
LEARNING_RATE = 5.0e-5
EPOCHS = 40
BATCH_SIZE = 1

# Train Loop
def train_loop(model, train_ids, train_labels, val_ids, val_labels):
  # Initialize a dictionary to store checkpoints in training
  checkpoints = {
    'epoch': [],
    'model_state_dict': [],
    'optimizer_state_dict': [],
    'loss': [],
    'accuracy': [],
  }

  train_dataset = DataSeq(train_ids, train_labels)
  val_dataset = DataSeq(val_ids, val_labels)

  train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
  val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

  # Set device to Metal Performance Shaders
  device = torch.device("mps")

  # Initialize optimizer
  optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

  for epoch_num in range(EPOCHS):

    total_acc_train = 0
    total_loss_train = 0
    model.to(device)  # Move model to GPU
    model.train()     # Set model to training mode

    for train_data, train_label in tqdm(train_dataloader):

      train_label = train_label.to(device)
      mask = train_data['attention_mask'].squeeze(1).to(device)
      input_id = train_data['input_ids'].squeeze(1).to(device)

      optimizer.zero_grad() # Reset gradients
      loss, logits = model(input_id, mask, train_label) # Forward pass

      # Calculate accuracy
      for i in range(logits.shape[0]):

        logits_clean = logits[i][train_label[i] != -100]
        label_clean = train_label[i][train_label[i] != -100]

        predictions = logits_clean.argmax(dim=1)
        acc = (predictions == label_clean).float().mean()
        total_acc_train += acc
        total_loss_train += loss.item()

      loss.backward()   # Backward pass
      optimizer.step()  # Update weights

    model.eval() # Set model to evaluation mode

    total_acc_val = 0
    total_loss_val = 0

    # Lists to store predictions and true labels
    all_predictions = []
    all_true_labels = []

    # Iterate over validation set
    for val_data, val_label in val_dataloader:

      val_label = val_label.to(device)
      mask = val_data['attention_mask'].squeeze(1).to(device)
      input_id = val_data['input_ids'].squeeze(1).to(device)

      loss, logits = model(input_id, mask, val_label)

      # Calculate accuracy
      for i in range(logits.shape[0]):

        logits_clean = logits[i][val_label[i] != -100]
        label_clean = val_label[i][val_label[i] != -100]

        predictions = logits_clean.argmax(dim=1)
        acc = (predictions == label_clean).float().mean()
        total_acc_val += acc
        total_loss_val += loss.item()

        # Collect predictions and true labels for F1 score calculation
        all_predictions.append(predictions.cpu().numpy())
        all_true_labels.append(label_clean.cpu().numpy())

    # Flatten the lists
    all_predictions = np.concatenate(all_predictions)
    all_true_labels = np.concatenate(all_true_labels)

    # Calculate macro-averaged F1 score
    f1 = f1_score(all_true_labels, all_predictions, average='macro')

    val_accuracy = total_acc_val / len(val_ids)
    val_loss = total_loss_val / len(val_ids)
    train_accuracy = total_acc_train / len(train_labels)
    train_loss = total_loss_train / len(train_labels)

    print(
        f'Epochs: {epoch_num + 1} | Loss: {train_loss: .3f} | Accuracy: {train_accuracy: .3f} | Val_Loss: {val_loss: .3f} | Accuracy: {val_accuracy: .3f} | F1-SCORE: {f1: .3f}')
    
    # Save checkpoints
    checkpoints['epoch'].append(epoch_num)
    checkpoints['model_state_dict'].append(model.state_dict())
    checkpoints['optimizer_state_dict'].append(optimizer.state_dict())
    checkpoints['loss'].append(val_loss)
    checkpoints['accuracy'].append(val_accuracy)
  
  torch.save(checkpoints, './fine_tuned_checkpoints')



model = EntityNamingModel()
train_loop(model, train_ids, train_labels, val_ids, val_labels)


## RoBERTa (A Robustly Optimized BERT Pretraining Approach)
- **Developer**: Facebook AI
- **Release Year**: 2019
- **Key Features**:
  - An optimized version of BERT with changes in pretraining procedures.
  - Removes the NSP task and dynamically changes the masking pattern during the pretraining phase.
  - Trained with larger mini-batches and learning rates.
- **Applications**: More effective than BERT on several NLP benchmarks and tasks.


In [36]:
# RoBERTa specific globals
model_name = "xlm-roberta-large-finetuned-conll03-german"
model_source = "./RoBERTa_for_ebay/"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [37]:
train_seq = df['Title'].unique().tolist()

df_entities['Tokenized_Length'] = df_entities['Token'].apply(lambda x: len(tokenizer.tokenize(x)))

# Group the DataFrame by 'Record Number'
grouped_entities = df_entities.groupby('Record Number')

# Initialize a numpy array with zeros for label ids
token_labels = np.zeros(shape=(len(train_seq), max_seq), dtype=np.int32)

for i in range(5000):
    if str(i + 1) in grouped_entities.groups:
        curr_entities = grouped_entities.get_group(str(i + 1))
        pointer = 1
        for _, row in curr_entities.iterrows():
            token_len = row['Tokenized_Length']
            token_labels[i, pointer:(pointer + token_len)] = np.array([voc_map[row['mod_Tag']]])
            pointer += token_len

# Split data into train and validation sets
train_ids, val_ids, train_labels, val_labels = train_test_split(train_seq, token_labels, test_size=0.05, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_entities['Tokenized_Length'] = df_entities['Token'].apply(lambda x: len(tokenizer.tokenize(x)))


In [43]:
# Hyperparameters
LEARNING_RATE = 5e-4
EPOCHS = 10
BATCH_SIZE = 4

# Load pre_trained configuration
config = AutoConfig.from_pretrained(
    model_name,
    num_labels = len(voc_map),
    id2label = rev_map,
    label2id = voc_map
)

# Train Loop
def train_loop(model, train_ids, train_labels, val_ids, val_labels):
  # Initialize a dictionary to store checkpoints in training
  checkpoints = {
    'epoch': [],
    'model_state_dict': [],
    'optimizer_state_dict': [],
    'loss': [],
    'accuracy': [],
  }

  train_dataset = DataSeq(train_ids, train_labels)
  val_dataset = DataSeq(val_ids, val_labels)

  train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
  val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

  device = torch.device("mps") # Set device to Metal Performance Shaders

  # Initialize optimizer
  optimizer = SGD(model.parameters(), lr=LEARNING_RATE)

  for epoch_num in range(EPOCHS):

    total_acc_train = 0
    total_loss_train = 0
    model.to(device)  # Move model to GPU
    model.train()     # Set model to training mode

    for train_data, train_label in tqdm(train_dataloader):

      train_label = train_label.to(device)
      mask = train_data['attention_mask'].squeeze(1).to(device)
      input_id = train_data['input_ids'].squeeze(1).to(device)

      optimizer.zero_grad()                             # Reset gradients
      loss, logits = model(input_id, mask, train_label) # Forward pass

      # Calculate accuracy
      for i in range(logits.shape[0]):

        logits_clean = logits[i][train_label[i] != -100]
        label_clean = train_label[i][train_label[i] != -100]

        predictions = logits_clean.argmax(dim=1)
        acc = (predictions == label_clean).float().mean()
        total_acc_train += acc
        total_loss_train += loss.item()

      loss.backward()   # Backward pass
      optimizer.step()  # Update weights

    model.eval() # Set model to evaluation mode

    total_acc_val = 0
    total_loss_val = 0

    # Lists to store predictions and true labels
    all_predictions = []
    all_true_labels = []


    # Iterate over validation set
    for val_data, val_label in val_dataloader:

      val_label = val_label.to(device)
      mask = val_data['attention_mask'].squeeze(1).to(device)
      input_id = val_data['input_ids'].squeeze(1).to(device)

      loss, logits = model(input_id, mask, val_label)

      # Calculate accuracy
      for i in range(logits.shape[0]):

        logits_clean = logits[i][val_label[i] != -100]
        label_clean = val_label[i][val_label[i] != -100]

        predictions = logits_clean.argmax(dim=1)
        acc = (predictions == label_clean).float().mean()
        total_acc_val += acc
        total_loss_val += loss.item()

        # Collect predictions and true labels for F1 score calculation
        all_predictions.append(predictions.cpu().numpy())
        all_true_labels.append(label_clean.cpu().numpy())

    # Flatten the lists
    all_predictions = np.concatenate(all_predictions)
    all_true_labels = np.concatenate(all_true_labels)

    # Calculate macro-averaged F1 score
    f1 = f1_score(all_true_labels, all_predictions, average='macro')

    val_accuracy = total_acc_val / len(val_ids)
    val_loss = total_loss_val / len(val_ids)
    train_accuracy = total_acc_train / len(train_labels)
    train_loss = total_loss_train / len(train_labels)

    print(
        f'Epochs: {epoch_num + 1} | Loss: {train_loss: .3f} | Accuracy: {train_accuracy: .3f} | Val_Loss: {val_loss: .3f} | Accuracy: {val_accuracy: .3f} | F1-SCORE: {f1: .3f}')
      
    # Save checkpoints
    checkpoints['epoch'].append(epoch_num)
    checkpoints['model_state_dict'].append(model.state_dict())
    checkpoints['optimizer_state_dict'].append(optimizer.state_dict())
    checkpoints['loss'].append(val_loss)
    checkpoints['accuracy'].append(val_accuracy)
  
  torch.save(checkpoints, './fine_tuned_checkpoints')

config = AutoConfig.from_pretrained(
    model_source,
    num_labels = len(voc_map),
    id2label = rev_map,
    label2id = voc_map
)

# model = RobertaNamingModel(config=config)
train_loop(model, train_ids, train_labels, val_ids, val_labels)


100%|██████████| 1188/1188 [03:08<00:00,  6.32it/s]


Epochs: 1 | Loss:  0.440 | Accuracy:  0.878 | Val_Loss:  0.426 | Accuracy:  0.881 | F1-SCORE:  0.204


100%|██████████| 1188/1188 [03:06<00:00,  6.39it/s]


Epochs: 2 | Loss:  0.429 | Accuracy:  0.881 | Val_Loss:  0.415 | Accuracy:  0.887 | F1-SCORE:  0.217


100%|██████████| 1188/1188 [03:04<00:00,  6.44it/s]


Epochs: 3 | Loss:  0.419 | Accuracy:  0.884 | Val_Loss:  0.403 | Accuracy:  0.889 | F1-SCORE:  0.221


100%|██████████| 1188/1188 [03:02<00:00,  6.52it/s]


Epochs: 4 | Loss:  0.408 | Accuracy:  0.886 | Val_Loss:  0.393 | Accuracy:  0.892 | F1-SCORE:  0.226


100%|██████████| 1188/1188 [02:58<00:00,  6.65it/s]


Epochs: 5 | Loss:  0.400 | Accuracy:  0.888 | Val_Loss:  0.390 | Accuracy:  0.892 | F1-SCORE:  0.239


100%|██████████| 1188/1188 [02:58<00:00,  6.64it/s]


Epochs: 6 | Loss:  0.392 | Accuracy:  0.890 | Val_Loss:  0.376 | Accuracy:  0.895 | F1-SCORE:  0.246


100%|██████████| 1188/1188 [02:59<00:00,  6.61it/s]


Epochs: 7 | Loss:  0.383 | Accuracy:  0.892 | Val_Loss:  0.371 | Accuracy:  0.895 | F1-SCORE:  0.248


100%|██████████| 1188/1188 [03:00<00:00,  6.59it/s]


Epochs: 8 | Loss:  0.376 | Accuracy:  0.893 | Val_Loss:  0.369 | Accuracy:  0.898 | F1-SCORE:  0.259


100%|██████████| 1188/1188 [02:59<00:00,  6.62it/s]


Epochs: 9 | Loss:  0.368 | Accuracy:  0.895 | Val_Loss:  0.361 | Accuracy:  0.899 | F1-SCORE:  0.259


100%|██████████| 1188/1188 [02:59<00:00,  6.63it/s]


Epochs: 10 | Loss:  0.362 | Accuracy:  0.897 | Val_Loss:  0.355 | Accuracy:  0.902 | F1-SCORE:  0.267


In [None]:
model = EntityNamingModel()
optimizer = SGD(model.parameters(), lr=LEARNING_RATE)
model_number = 1
checkpoint = torch.load("./fine_tuned_checkpoints")
model.load_state_dict(checkpoint['model_state_dict'][model_number])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'][model_number])
epoch = checkpoint['epoch'][model_number]
loss = checkpoint['loss'][model_number]
accuracy = checkpoint['accuracy'][model_number]

model.eval()


In [17]:
data_path_VAL = "Listing_Titles.tsv"
dfVAL = pd.read_csv(data_path_VAL, sep="\t", dtype=str, keep_default_na=False, na_values=[""], quoting=csv.QUOTE_NONE, skiprows = lambda x : x > 30000)


In [21]:
orig = dfVAL['Title'].to_list()[29500:]#[5000:]
predictions = []
device = torch.device("mps")
inferDat = DataInf(orig)
load = DataLoader(inferDat, batch_size=32)
for data in tqdm(load, desc="Inferencing", unit="batches"):

    mask = data['attention_mask'].squeeze(1).to(device)
    input_id = data['input_ids'].squeeze(1).to(device)
    model.to(device)

    logits = model.forward(input_id, mask, None)[1]
    for i in range(logits.shape[0]):
        logit = logits[i]
        predictions.append(logit.argmax(dim=1).cpu())



Tokenizing: 100%|██████████| 500/500 [00:00<00:00, 13140.21texts/s]
Inferencing: 100%|██████████| 16/16 [00:04<00:00,  3.35batches/s]


In [22]:
tokenizer.tokenize(orig[0])

['▁Adidas',
 '▁Ultra',
 '▁Boost',
 '▁X',
 '▁3',
 'D',
 '▁S',
 '▁Stella',
 '▁Damen',
 '▁Sneaker',
 '▁Lauf',
 'schuhe',
 '▁Turn',
 'schuhe',
 '▁G',
 '28',
 '3',
 '36',
 '▁N',
 'EU']

In [27]:
words=[]
labels=[]
record=[]
for i, seq in tqdm(enumerate(predictions), total=25000):
    tokens = tokenizer.tokenize(orig[i])
    preds = seq.tolist()[1:]
    curr = 0
    for word in orig[i].split(' '):
        words.append(word)
        labels.append(rev_map[preds[curr]])
        record.append(i+5001)
        prelim_word = ""
        for j in range(curr, len(tokens)):
            tok = tokens[j]
            if tok[0] != '▁': # Change condition to =='#' when dealing with Bert
                prelim_word += (tok)
            else:
                prelim_word += (tok[1:])
            if prelim_word == word:
                curr = j+1
                prelim_word += (' ')
                break

  2%|▏         | 500/25000 [00:00<00:01, 15627.53it/s]


['',
 'Adidas ',
 'Ultra ',
 'Boost ',
 'X ',
 '3D ',
 'S ',
 'Stella ',
 'Damen ',
 'Sneaker ',
 'Laufschuhe ',
 'Turnschuhe ',
 'G28336 ',
 '',
 'adidas ',
 'Adilette ',
 'Boost ',
 'Slide ',
 'Sandal ',
 ', ',
 'Black ',
 ', ',
 'Size ',
 '4.0 ',
 'nv0t ',
 '',
 'Diesel ',
 'BIKKREN ',
 'Herrenschuhe ',
 'Turnschuhe ',
 'Leder ',
 'Freizeit ',
 'Men ',
 'Sneaker ',
 'Gr ',
 '. ',
 '',
 'Asics ',
 'Dynaflyte ',
 '3 ',
 '1011A253001 ',
 'schwarz ',
 '',
 'Nike ',
 'Air ',
 'Force ',
 '1 ',
 'Low ',
 'Damen ',
 '! ',
 'CQ7511 ',
 '071 ',
 '! ',
 'US ',
 '8 ',
 'EU ',
 '39 ',
 '! ',
 '25 ',
 'CM ',
 '! ',
 'NEU ',
 '',
 'nike ',
 'air ',
 'force ',
 '1 ',
 '07 ',
 'lv8 ',
 '3 ',
 'ungetragen ',
 '',
 'Nike ',
 'Sneakers ',
 'Damen ',
 'Freizeitschuhe ',
 'Turnschuhe ',
 'Gr ',
 '. ',
 'DE ',
 '37.5 ',
 'kein ',
 'Etike ',
 '... ',
 '#da ',
 '',
 'Converse ',
 'Sneakers ',
 'Damen ',
 'Freizeitschuhe ',
 'Turnschuhe ',
 'Gr ',
 '. ',
 'DE ',
 '40 ',
 'kein ',
 'Eti ',
 '... ',
 '# ',
 ''

In [62]:
fixed_words = []
fixed_labels = []
fixed_records = []
temp_string = ""

for i in tqdm(range(len(words)), desc="Record Mapping", unit="words"):
    if labels[i] != "":
        temp_string = words[i]
        for j in range(len(words[i+1:])):
            temp = words[j+i+1]
            if(labels[j+i+1] != ""):
                break
            else:
                temp_string += " " + temp
        fixed_words.append(temp_string)
        fixed_labels.append(labels[i])
        fixed_records.append(record[i])

dffer = {'Record Number' : fixed_records,
         'Aspect Name' : fixed_labels,
         'Aspect Value' : fixed_words,
        }

dffer = pd.DataFrame(dffer)

dffer.to_csv('sub9.tsv', sep="\t", header=None, index=None) 

Record Mapping: 100%|██████████| 265261/265261 [01:19<00:00, 3338.45words/s] 


In [None]:
test = {'Record Number' : record,
         'Aspect Name' : labels,
         'Aspect Value' : words,
        }

test = pd.DataFrame(test)


dffer = {'Record Number' : fixed_records,
         'Aspect Name' : fixed_labels,
         'Aspect Value' : fixed_words,
        }

dffer = pd.DataFrame(dffer)


dffer.to_csv('sub1.tsv', sep="\t", header=None, index=None) 

dffer.head(25)


In [None]:
test.head(10)

In [None]:
dfVAL['check'] = dfVAL.apply(lambda row: len(tokenizer.tokenize(row['Title'])), axis=1)

In [None]:
dfVAL['check'].max()