In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.insert(0, './src')

# Import libs

In [3]:
import os
from pathlib import Path

import pandas as pd
from transformers import BertTokenizerFast
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split


from utils import dataset_processed
from next_token_dataset import NextTokenDataset, ValTokenDataset
from LSTM import LSTMAutocomplete

BASE_DIR = Path().resolve()
MAX_LEN = 140
BATCH_SIZE = 256


  from .autonotebook import tqdm as notebook_tqdm


# 1. Clean raw data

In [4]:
dataset_processed(
    os.path.join(BASE_DIR, 'data', 'tweets.txt'),
    os.path.join(BASE_DIR, 'data', 'cleaned_tweets.csv')
)
dataset = pd.read_csv(os.path.join(BASE_DIR, 'data', 'cleaned_tweets.csv'), index_col=False)
dataset.head()

Unnamed: 0,text
0,switchfoot awww thats a bummer you shoulda got...
1,is upset that he cant update his facebook by t...
2,kenichan i dived many times for the ball manag...
3,my whole body feels itchy and like its on fire
4,nationwideclass no its not behaving at all im ...


# 2. Split dataset by train, val, test

In [5]:
train, val = train_test_split(dataset, test_size=0.2, random_state=42)
val, test  = train_test_split(val, test_size=0.5, random_state=42)
print(f"Train texts: {len(train)}, Val texts: {len(val)}, Test texts: {len(test)}")

Train texts: 1280398, Val texts: 160050, Test texts: 160050


In [6]:
# for limit of calc resources make val and test selection shoter
val = val.sample(n=100, random_state=42)
test = test.sample(n=100, random_state=42)

# 3. Create datasets and data loader 

In [7]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")


train_dataset = NextTokenDataset(train, tokenizer, seq_length=MAX_LEN)
val_dataset = ValTokenDataset(val, tokenizer, seq_length=MAX_LEN)
test_dataset = ValTokenDataset(test, tokenizer, seq_length=MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, batch_size=1)
test_loader = DataLoader(test_dataset, batch_size=1)



# 4. Train LSTM model

In [None]:
test_list = [1,2,3,4,5,6,7]
test_list[:-3]

[1, 2, 3, 4]

: 

In [9]:
from train_LSTM import train_model as train_LSTM_model
model = LSTMAutocomplete(tokenizer.vocab_size)
train_LSTM_model(model, train_loader, val_loader, tokenizer.vocab, device='cpu')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable 

Starting training on cpu
Training samples: 3
Validation samples: 1


TOKENIZERS_PARALLELISM=(true | false)
Epoch 1/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  4.62it/s, Loss=10.3465]


Epoch 1/10:
  Train Loss: 10.3465
  Val loss: 10.3241, rouge-1: 0.0000, val rouge2: 0.0000


Epoch 2/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  9.28it/s, Loss=10.3675]


Epoch 2/10:
  Train Loss: 10.3675
  Val loss: 10.3220, rouge-1: 0.0000, val rouge2: 0.0000


Epoch 3/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  9.56it/s, Loss=10.2899]


Epoch 3/10:
  Train Loss: 10.2899
  Val loss: 10.3194, rouge-1: 0.0000, val rouge2: 0.0000


Epoch 4/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  9.06it/s, Loss=10.2950]


Epoch 4/10:
  Train Loss: 10.2950
  Val loss: 10.3167, rouge-1: 0.0000, val rouge2: 0.0000


Epoch 5/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  9.78it/s, Loss=10.2050]


Epoch 5/10:
  Train Loss: 10.2050
  Val loss: 10.3136, rouge-1: 0.0000, val rouge2: 0.0000


Epoch 6/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  9.37it/s, Loss=10.1418]


Epoch 6/10:
  Train Loss: 10.1418
  Val loss: 10.3103, rouge-1: 0.0000, val rouge2: 0.0000


Epoch 7/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  9.85it/s, Loss=10.0311]


Epoch 7/10:
  Train Loss: 10.0311
  Val loss: 10.3066, rouge-1: 0.0000, val rouge2: 0.0000


Epoch 8/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  9.81it/s, Loss=9.9809]


Epoch 8/10:
  Train Loss: 9.9809
  Val loss: 10.3026, rouge-1: 0.0000, val rouge2: 0.0000


Epoch 9/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  9.09it/s, Loss=9.8839]


Epoch 9/10:
  Train Loss: 9.8839
  Val loss: 10.2984, rouge-1: 0.0000, val rouge2: 0.0000


Epoch 10/10 [Train]: 100%|██████████| 1/1 [00:00<00:00,  6.87it/s, Loss=9.8156]


Epoch 10/10:
  Train Loss: 9.8156
  Val loss: 10.2937, rouge-1: 0.0000, val rouge2: 0.0000


: 

: 

: 

: 

: 

: 