In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before
# each cell executes, so edits to the project's .py files are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import sys
# Put ./src (relative to the current working directory) first on the import path.
# Presumably this is where the project-local modules imported later
# (utils, eval_metric, next_token_dataset, LSTM, train_LSTM) live — verify.
sys.path.insert(0, './src')

: 

: 

# Import libs

In [None]:
import os
from pathlib import Path

import pandas as pd
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm


from utils import dataset_processed
from eval_metric import rouge1_2
from next_token_dataset import NextTokenDataset, ValTokenDataset
from LSTM import LSTMAutocomplete

# Absolute path of the current working directory; the data file paths in this
# notebook are built relative to it.
BASE_DIR = Path().resolve()
# Passed as `seq_length` to the train/val/test token datasets.
MAX_LEN = 140
# Batch size of the training DataLoader.
BATCH_SIZE = 128


: 

: 

# 1. Clean raw data

In [None]:
# Build the input/output paths under <BASE_DIR>/data, then run the project's
# preprocessing helper on them (raw tweets in, cleaned CSV out — see
# utils.dataset_processed for the exact cleaning rules).
data_dir = os.path.join(BASE_DIR, 'data')
raw_tweets_path = os.path.join(data_dir, 'tweets.txt')
cleaned_tweets_path = os.path.join(data_dir, 'cleaned_tweets.csv')

dataset_processed(raw_tweets_path, cleaned_tweets_path)

: 

: 

In [None]:
# Read the cleaned tweets back into a DataFrame and preview the first rows.
# index_col=False keeps pandas from using the first column as the index.
cleaned_csv_path = os.path.join(BASE_DIR, 'data', 'cleaned_tweets.csv')
dataset = pd.read_csv(cleaned_csv_path, index_col=False)
dataset.head()

: 

: 

# 2. Split the dataset into train, val, and test sets

In [None]:
# 80/10/10 split: hold out 20% of the rows first, then halve that hold-out
# into validation and test. Fixed random_state keeps the split reproducible.
train, holdout = train_test_split(dataset, test_size=0.2, random_state=42)
val, test = train_test_split(holdout, test_size=0.5, random_state=42)

print(f"Train texts: {len(train)}, Val texts: {len(val)}, Test texts: {len(test)}")

: 

: 

In [None]:
# To keep evaluation affordable on limited compute, score only a small random
# subset of the validation and test splits.
EVAL_SAMPLE_SIZE = 100

# min() guards against splits smaller than the cap: DataFrame.sample raises
# ValueError when n exceeds the number of rows (sampling without replacement).
val = val.sample(n=min(EVAL_SAMPLE_SIZE, len(val)), random_state=42)
test = test.sample(n=min(EVAL_SAMPLE_SIZE, len(test)), random_state=42)

: 

: 

# 3. Create datasets and data loaders

In [None]:
# Tokenizer shared by every dataset (and later by the model's vocabulary size).
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Training uses NextTokenDataset; validation and test share ValTokenDataset.
# All three receive the same tokenizer and sequence length.
train_dataset = NextTokenDataset(train['text'], tokenizer, seq_length=MAX_LEN)
val_dataset, test_dataset = (
    ValTokenDataset(split['text'], tokenizer, seq_length=MAX_LEN)
    for split in (val, test)
)

# Large batches for training, small fixed batches of 4 for the two eval loaders.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=4)
    for eval_dataset in (val_dataset, test_dataset)
)

: 

: 

# 4. Train LSTM model

In [None]:
from train_LSTM import train_model as train_LSTM_model

# The model's vocabulary size is taken from the tokenizer's vocabulary.
vocab_size = tokenizer.vocab_size
model = LSTMAutocomplete(vocab_size)

# Train on CPU with the project's training routine; the validation loader and
# tokenizer are handed to it as well (see train_LSTM.train_model for their use).
train_LSTM_model(
    model,
    train_loader,
    val_loader,
    tokenizer,
    learning_rate=0.01,
    device='cpu',
)

: 

: 

# 5. Pretrained model

In [None]:
from transformers import pipeline

# Pre-trained distilgpt2 text-generation pipeline used as the baseline model.
generator = pipeline("text-generation", model="distilgpt2")

# Quick smoke test of the pipeline on a single prompt.
# max_new_tokens bounds only the generated continuation; the previous
# max_length=20 also counted the prompt's tokens, so the room left for
# generation shrank with the prompt length.
result = generator("Я собираюсь", max_new_tokens=20, do_sample=True, top_k=50)
print(result[0]["generated_text"])

: 

: 

In [None]:
# Score the pre-trained `generator` on the validation set with ROUGE-1 / ROUGE-2.
# For every sample: decode the input prefix and the target, let the generator
# continue the prefix, and accumulate the per-sample ROUGE scores.
from transformers import set_seed

# Dedicated batch_size=1 loader under its own name: rebinding `val_loader`
# would silently change the loader the training cell sees on a re-run.
pretrained_val_loader = DataLoader(val_dataset, batch_size=1)

# Generation samples (do_sample=True); seed it so the reported metrics are reproducible.
set_seed(42)


def _strip_padding(token_ids, pad_id):
    """Return the 1-D tensor `token_ids` cut at the first `pad_id` (unchanged if none)."""
    pad_positions = (token_ids == pad_id).nonzero(as_tuple=True)[0]
    if len(pad_positions) == 0:
        return token_ids
    return token_ids[:pad_positions[0].item()]


# NOTE(review): assumes ValTokenDataset pads with the tokenizer's pad token — confirm.
pad_id = tokenizer.pad_token_id

rouge1 = 0.0
rouge2 = 0.0

val_pbar = tqdm(pretrained_val_loader, desc='Calc metrics pre-trained model...')

for batch in val_pbar:
    # With batch_size=1, flatten() yields the single token sequence as a 1-D
    # tensor, even when the sequence has length 1 (a bare squeeze() would
    # collapse that case to a 0-d tensor). No device transfer is needed:
    # the ids are only decoded back to text here.
    input_seq = _strip_padding(batch['input_ids'].flatten(), pad_id)
    target = _strip_padding(batch['target'].flatten(), pad_id)

    input_text = tokenizer.decode(input_seq, skip_special_tokens=True)
    target_text = tokenizer.decode(target, skip_special_tokens=True)

    # max_new_tokens bounds only the continuation (max_length also counts the
    # prompt, which may itself be longer than 20 tokens).
    # return_full_text=False returns just the continuation.
    # NOTE(review): assumes batch['target'] holds only the continuation — confirm.
    pred = generator(
        input_text,
        max_new_tokens=20,
        do_sample=True,
        top_k=50,
        return_full_text=False,
    )
    generated_text = pred[0]["generated_text"]

    # NOTE(review): assumes rouge1_2(prediction, reference) — verify the
    # argument order against eval_metric.rouge1_2.
    b_rouge1, b_rouge2 = rouge1_2(generated_text, target_text)

    rouge1 += b_rouge1
    rouge2 += b_rouge2

# Average the per-sample scores (one sample per batch).
n_samples = len(pretrained_val_loader)
print(f'Pre-trained model: rouge-1: {rouge1/n_samples:.4f}, val rouge2: {rouge2/n_samples:.4f}')

: 

: 

# 6. Conclusions

Based on the ROUGE-1 and ROUGE-2 metrics, we can draw the following conclusion: the lightweight LSTM model performs reasonably well on the task of autocompleting tweet text and can be recommended for use on mobile devices.