In [48]:
# Установка зависимостей
%pip freeze -> requirements.txt

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [24]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
import sys
sys.path.insert(0, './src')

# Import libs

In [45]:
import os
from pathlib import Path

import pandas as pd
import torch
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm


from utils import dataset_processed
from eval_metric import rouge1_2, find_padding_start_np
from next_token_dataset import NextTokenDataset, ValTokenDataset
from LSTM import LSTMAutocomplete

BASE_DIR = Path().resolve()
MAX_LEN = 140
BATCH_SIZE = 128
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 1. Clean raw data

In [27]:
dataset_processed(
    os.path.join(BASE_DIR, 'data', 'tweets.txt'),
    os.path.join(BASE_DIR, 'data', 'cleaned_tweets.csv')
)

In [28]:
dataset = pd.read_csv(os.path.join(BASE_DIR, 'data', 'cleaned_tweets.csv'), index_col=False)
dataset.head()

Unnamed: 0,text
0,switchfoot awww thats a bummer you shoulda got...
1,is upset that he cant update his facebook by t...
2,kenichan i dived many times for the ball manag...
3,my whole body feels itchy and like its on fire
4,nationwideclass no its not behaving at all im ...


# 2. Split dataset by train, val, test

In [29]:
train, val = train_test_split(dataset, test_size=0.2, random_state=42)
val, test  = train_test_split(val, test_size=0.5, random_state=42)
print(f"Train texts: {len(train)}, Val texts: {len(val)}, Test texts: {len(test)}")

Train texts: 1280398, Val texts: 160050, Test texts: 160050


In [30]:
# for limit of calc resources make val and test selection shoter
train = train.sample(n=100, random_state=42)
val = val.sample(n=100, random_state=42)
test = test.sample(n=100, random_state=42)

# 3. Create datasets and data loader 

In [31]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


train_dataset = NextTokenDataset(train['text'], tokenizer, seq_length=MAX_LEN)
val_dataset = ValTokenDataset(val['text'], tokenizer, seq_length=MAX_LEN)
test_dataset = ValTokenDataset(test['text'], tokenizer, seq_length=MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=4)
test_loader = DataLoader(test_dataset, batch_size=4)

# 4. Train LSTM model

In [42]:
from train_LSTM import train_model as train_LSTM_model
model = LSTMAutocomplete(tokenizer.vocab_size)
train_LSTM_model(model, train_loader, val_loader, num_epochs=2, tokenizer=tokenizer, learning_rate=0.01, device=device)

Starting training on cpu
Training samples: 1733
Validation samples: 100


Epoch 1/2 [Train]: 100%|██████████| 14/14 [00:14<00:00,  1.04s/it, Loss=10.1479]
Calc metrics...: 100%|██████████| 25/25 [01:02<00:00,  2.49s/it, rouge1=0.0644  rouge2: 0.0000]

Epoch 1/2:
  Train Loss: 9.6796
  Val loss: 10.5958, rouge-1: 0.0644, val rouge2: 0.0000





Model and tokenizer saved!


Epoch 2/2 [Train]: 100%|██████████| 14/14 [00:14<00:00,  1.07s/it, Loss=7.8102]
Calc metrics...: 100%|██████████| 25/25 [01:02<00:00,  2.48s/it, rouge1=0.0644  rouge2: 0.0000]

Epoch 2/2:
  Train Loss: 7.4670
  Val loss: 10.3082, rouge-1: 0.0644, val rouge2: 0.0000





Model and tokenizer saved!


# 5. Pretrained model

In [43]:
from transformers import pipeline

generator = pipeline("text-generation", model="distilgpt2")
result = generator("Я собираюсь", max_length=20, do_sample=True, top_k=50)
print(result[0]["generated_text"])

TypeError: Too few arguments for numpy.ndarray

In [47]:
from transformers import pipeline

generator = pipeline("text-generation", model="distilgpt2")
val_loader = DataLoader(val_dataset, batch_size=1)
rouge1 = 0.0
rouge2 = 0.0

val_pbar = tqdm(val_loader, desc=f'Calc metrics pre-trained model...')

for batch in val_pbar:
    input_ids = batch['input_ids'].to(device)
    targets = batch['target'].to(device)
    # masks = batch['masks'].to(device)

    # Remove extra dimension 
    targets = targets.squeeze()
    input_ids = input_ids.squeeze()

    n_pad = find_padding_start_np(targets)
    target = targets[:n_pad]
    n_pad = find_padding_start_np(input_ids)
    input_seq = input_ids[:n_pad]
    
    target_text = tokenizer.decode(target, skip_special_tokens=True)
    input_text = tokenizer.decode(input_seq, skip_special_tokens=True)

    pred = generator(input_text, max_length=20, do_sample=True, top_k=50)

    b_rouge1, b_rouge2 = rouge1_2(pred[0][["generated_text"]])

    rouge1 += b_rouge1
    rouge2 += b_rouge2
    
        
print(f'Pre-trained model: rouge-1: {rouge1/len(val_loader):.4f}, val rouge2: {rouge2/len(val_loader):.4f}')

TypeError: Too few arguments for numpy.ndarray

# 5. Conclusions

Based on the metric ROUGE 1 and ROUGE 2, we can draw the following conclusion: the lightweight model LSTM is not bad for the task of autocompletion of text in tweets and can be recommended for use in mobile devices.