In [9]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
import sys
sys.path.insert(0, './src')

# Import libs

In [11]:
import os
from pathlib import Path

import pandas as pd
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split


from utils import dataset_processed
from next_token_dataset import NextTokenDataset, ValTokenDataset
from LSTM import LSTMAutocomplete

BASE_DIR = Path().resolve()
MAX_LEN = 140
BATCH_SIZE = 128


# 1. Clean raw data

In [12]:
dataset_processed(
    os.path.join(BASE_DIR, 'data', 'tweets.txt'),
    os.path.join(BASE_DIR, 'data', 'cleaned_tweets.csv')
)

In [13]:
dataset = pd.read_csv(os.path.join(BASE_DIR, 'data', 'cleaned_tweets.csv'), index_col=False)
dataset.head()

Unnamed: 0,text
0,switchfoot awww thats a bummer you shoulda got...
1,is upset that he cant update his facebook by t...
2,kenichan i dived many times for the ball manag...
3,my whole body feels itchy and like its on fire
4,nationwideclass no its not behaving at all im ...


# 2. Split dataset by train, val, test

In [14]:
train, val = train_test_split(dataset, test_size=0.2, random_state=42)
val, test  = train_test_split(val, test_size=0.5, random_state=42)
print(f"Train texts: {len(train)}, Val texts: {len(val)}, Test texts: {len(test)}")

Train texts: 1280398, Val texts: 160050, Test texts: 160050


# for limit of calc resources make val and test selection shoter
val = val.sample(n=100, random_state=42)
test = test.sample(n=100, random_state=42)

# 3. Create datasets and data loader 

In [None]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


train_dataset = NextTokenDataset(train['text'], tokenizer, seq_length=MAX_LEN)
val_dataset = ValTokenDataset(val['text'], tokenizer, seq_length=MAX_LEN)
test_dataset = ValTokenDataset(test['text'], tokenizer, seq_length=MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=4)
test_loader = DataLoader(test_dataset, batch_size=4)

# 4. Train LSTM model

In [None]:
from train_LSTM import train_model as train_LSTM_model
model = LSTMAutocomplete(tokenizer.vocab_size)
train_LSTM_model(model, train_loader, val_loader, tokenizer, learning_rate=0.01, device='cpu')

Starting training on cpu
Training samples: 1
Validation samples: 1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Epoch 1/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  4.27it/s, Loss=10.3263]
Calc metrics...: 100%|██████████| 1/1 [00:01<00:00,  1.39s/it, rouge1=0.0000  rouge2: 0.0000]


Epoch 1/10:
  Train Loss: 10.3263
  Val loss: 10.3256, rouge-1: 0.0000, val rouge2: 0.0000


Epoch 2/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  9.18it/s, Loss=10.2980]
Calc metrics...: 100%|██████████| 1/1 [00:01<00:00,  1.79s/it, rouge1=0.0000  rouge2: 0.0000]


Epoch 2/10:
  Train Loss: 10.2980
  Val loss: 10.3250, rouge-1: 0.0000, val rouge2: 0.0000


Epoch 3/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  5.37it/s, Loss=10.2245]
Calc metrics...: 100%|██████████| 1/1 [00:01<00:00,  1.59s/it, rouge1=0.0000  rouge2: 0.0000]


Epoch 3/10:
  Train Loss: 10.2245
  Val loss: 10.3238, rouge-1: 0.0000, val rouge2: 0.0000


Epoch 4/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  8.61it/s, Loss=10.0536]
Calc metrics...:   0%|          | 0/1 [00:00<?, ?it/s]


KeyboardInterrupt: 

In [None]:
len(train_loader)