In [1]:
# Установка зависимостей
#%pip freeze -> requirements.txt

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import sys
sys.path.insert(0, './src')

# Import libs

In [4]:
import os
from pathlib import Path

import pandas as pd
import torch
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm


from utils import dataset_processed
from eval_metric import rouge1_2, find_padding_start_np
from next_token_dataset import NextTokenDataset, ValTokenDataset
from LSTM import LSTMAutocomplete

BASE_DIR = Path().resolve()
MAX_LEN = 140
BATCH_SIZE = 128
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 1. Clean raw data

In [5]:
dataset_processed(
    os.path.join(BASE_DIR, 'data', 'tweets.txt'),
    os.path.join(BASE_DIR, 'data', 'cleaned_tweets.csv')
)

In [6]:
dataset = pd.read_csv(os.path.join(BASE_DIR, 'data', 'cleaned_tweets.csv'), index_col=False)
dataset.head()

Unnamed: 0,text
0,switchfoot awww thats a bummer you shoulda got...
1,is upset that he cant update his facebook by t...
2,kenichan i dived many times for the ball manag...
3,my whole body feels itchy and like its on fire
4,nationwideclass no its not behaving at all im ...


# 2. Split dataset by train, val, test

In [7]:
train, val = train_test_split(dataset, test_size=0.2, random_state=42)
val, test  = train_test_split(val, test_size=0.5, random_state=42)
print(f"Train texts: {len(train)}, Val texts: {len(val)}, Test texts: {len(test)}")

Train texts: 1280398, Val texts: 160050, Test texts: 160050


In [8]:
# for limit of calc resources make val and test selection shoter
train = train.sample(n=1_000, random_state=42)
val = val.sample(n=100, random_state=42)
test = test.sample(n=100, random_state=42)

# 3. Create datasets and data loader 

In [9]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


train_dataset = NextTokenDataset(train['text'], tokenizer, seq_length=MAX_LEN)
val_dataset = ValTokenDataset(val['text'], tokenizer, seq_length=MAX_LEN)
test_dataset = ValTokenDataset(test['text'], tokenizer, seq_length=MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=4)
test_loader = DataLoader(test_dataset, batch_size=4)

# 4. Train LSTM model

In [10]:
from train_LSTM import train_model as train_LSTM_model
model = LSTMAutocomplete(tokenizer.vocab_size)
train_LSTM_model(model, train_loader, val_loader, num_epochs=5, tokenizer=tokenizer, learning_rate=0.01, device=device)

Starting training on cpu
Training samples: 16332
Validation samples: 100


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
Epoch 1/5 [Train]:   0%|          | 0/128 [00:00<?, ?it/s]TOKENIZERS_PARALLELISM=(true | false)
Epoch 1/5 [Train]: 100%|██████████| 128/128 [00:45<00:00,  2.84it/s, Loss=7.9942]
Calc metrics...: 100%|██████████| 25/25 [00:49<00:00,  1.96s/it, rouge1=0.0311  rouge2: 0.0000]


Epoch 1/5:
  Train Loss: 8.3584
  Val loss: 8.0203, rouge-1: 0.0311, val rouge2: 0.0000
Model and tokenizer saved!


Epoch 2/5 [Train]: 100%|██████████| 128/128 [00:43<00:00,  2.97it/s, Loss=7.8628]
Calc metrics...: 100%|██████████| 25/25 [00:49<00:00,  1.99s/it, rouge1=0.0644  rouge2: 0.0000]


Epoch 2/5:
  Train Loss: 7.6127
  Val loss: 8.1064, rouge-1: 0.0644, val rouge2: 0.0000
Model and tokenizer saved!


Epoch 3/5 [Train]: 100%|██████████| 128/128 [00:45<00:00,  2.82it/s, Loss=7.9024]
Calc metrics...: 100%|██████████| 25/25 [00:51<00:00,  2.06s/it, rouge1=0.0311  rouge2: 0.0000]


Epoch 3/5:
  Train Loss: 7.6711
  Val loss: 8.5320, rouge-1: 0.0311, val rouge2: 0.0000
Model and tokenizer saved!


Epoch 4/5 [Train]: 100%|██████████| 128/128 [00:46<00:00,  2.77it/s, Loss=7.1967]
Calc metrics...: 100%|██████████| 25/25 [00:51<00:00,  2.06s/it, rouge1=0.0311  rouge2: 0.0000]


Epoch 4/5:
  Train Loss: 7.4289
  Val loss: 8.8110, rouge-1: 0.0311, val rouge2: 0.0000
Model and tokenizer saved!


Epoch 5/5 [Train]: 100%|██████████| 128/128 [00:43<00:00,  2.94it/s, Loss=7.8620]
Calc metrics...: 100%|██████████| 25/25 [00:49<00:00,  1.99s/it, rouge1=0.0311  rouge2: 0.0000]


Epoch 5/5:
  Train Loss: 7.6890
  Val loss: 8.4456, rouge-1: 0.0311, val rouge2: 0.0000
Model and tokenizer saved!


# 5. Pretrained model

In [11]:
from transformers import pipeline

generator = pipeline("text-generation", model="distilgpt2")
result = generator("Я собираюсь", max_length=20, do_sample=True, top_k=50)
print(result[0]["generated_text"])

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=20) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Я собираюсь исказ изеков сфи самитраюи залекся жезам изеков сореку изеков симитраюи залекся залекся залекся залекся залекся залекся залекся залекся залекся залекся залекся собираюсь изеков сореку изеков сореку изеков сореку изеков сореку изеков сореку и


In [15]:
from transformers import pipeline

generator = pipeline("text-generation", model="distilgpt2")
val_loader = DataLoader(val_dataset, batch_size=1)
rouge1 = 0.0
rouge2 = 0.0

val_pbar = tqdm(val_loader, desc=f'Calc metrics pre-trained model...')

for batch in val_pbar:
    input_ids = batch['input_ids'].to(device)
    targets = batch['target'].to(device)
    # masks = batch['masks'].to(device)

    # Remove extra dimension 
    targets = targets.squeeze()
    input_ids = input_ids.squeeze()

    n_pad = find_padding_start_np(targets)
    target = targets[:n_pad]
    n_pad = find_padding_start_np(input_ids)
    input_seq = input_ids[:n_pad]
    
    target_text = tokenizer.decode(target, skip_special_tokens=True)
    input_text = tokenizer.decode(input_seq, skip_special_tokens=True)

    pred = generator(input_text, max_new_tokens=MAX_LEN, do_sample=True, top_k=50)

    b_rouge1, b_rouge2 = rouge1_2(
        predictions=pred[0]["generated_text"],
        references=target_text 
    )

    rouge1 += b_rouge1
    rouge2 += b_rouge2
    
        
print(f'Pre-trained model: rouge-1: {rouge1/len(val_loader):.4f}, val rouge2: {rouge2/len(val_loader):.4f}')

Device set to use mps:0
Calc metrics pre-trained model...:   0%|          | 0/100 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Calc metrics pre-trained model...:   1%|          | 1/100 [00:01<03:11,  1.94s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Calc metrics pre-trained model...:   2%|▏         | 2/100 [00:03<03:02,  1.87s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Calc metrics pre-trained model...:   3%|▎         | 3/100 [00:05<02:57,  1.83s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Calc metrics pre-trained model...:   4%|▍         | 4/100 [00:06<02:09,  1.35s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Calc metrics pre-trained model...:   5%|▌         | 5/100 [00:07<02:25,  1.53s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Calc metrics pre-trained model...:   6%|▌         | 6/100 [0

Pre-trained model: rouge-1: 0.0405, val rouge2: 0.0000





# 5. Conclusions

Based on the metric ROUGE 1 and ROUGE 2, we can draw the following conclusion: the lightweight model LSTM is not bad for the task of autocompletion of text in tweets and can be recommended for use in mobile devices.