In [1]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
import pandas as pd

In [2]:
df = pd.read_csv('../data/messages.csv')
df.head()

Unnamed: 0,Member,Message
0,Zain,Ok so all this server is a hub for friends to ...
1,Zain,Do you have an Ikea?
2,Zain,yee boy
3,Zain,Yeah
4,Zain,Noice


In [3]:
pretrained_weights = 'gpt2'
tokenizer = GPT2TokenizerFast.from_pretrained(pretrained_weights)
model = GPT2LMHeadModel.from_pretrained(pretrained_weights)

In [4]:
messages = df['Message'].to_list()

In [5]:
ids = tokenizer.encode(messages[0])
ids

[18690,
 523,
 477,
 428,
 4382,
 318,
 257,
 12575,
 329,
 2460,
 284,
 711,
 1830,
 11,
 8537,
 11,
 8181,
 448,
 11,
 290,
 2429,
 81,
 453,
 423,
 1257,
 13,
 770,
 4382,
 318,
 6229,
 284,
 307,
 257,
 9014,
 329,
 262,
 8745,
 1292,
 21682,
 37018,
 4382,
 326,
 468,
 1716,
 845,
 1588,
 290,
 7379,
 1164,
 88,
 13,
 383,
 8745,
 1292,
 21682,
 37018,
 4382,
 468,
 635,
 1716,
 845,
 28621,
 290,
 23196,
 378,
 355,
 257,
 1255,
 286,
 663,
 2546,
 13,
 1320,
 318,
 1521,
 716,
 2911,
 278,
 284,
 1394,
 428,
 2055,
 5365,
 1402,
 523,
 326,
 356,
 460,
 307,
 4075,
 290,
 423,
 1257,
 13]

In [6]:
decode_result = tokenizer.decode(ids)
print(decode_result)

Ok so all this server is a hub for friends to play games, chat, hangout, and genrally have fun. This server is ment to be a replacement for the orginal frostbite server that has become very large and unwieldy. The orginal frostbite server has also become very inactive and stagnate as a result of its size. That is why am hopeing to keep this community relatively small so that we can be active and have fun.


# Generate Predictions

In [7]:
import torch

In [8]:
t = torch.LongTensor(ids)[None]
preds = model.generate(t)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 94, but ``max_length`` is set to 20.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.


In [9]:
preds.shape,preds[0]

(torch.Size([1, 94]),
 tensor([18690,   523,   477,   428,  4382,   318,   257, 12575,   329,  2460,
           284,   711,  1830,    11,  8537,    11,  8181,   448,    11,   290,
          2429,    81,   453,   423,  1257,    13,   770,  4382,   318,  6229,
           284,   307,   257,  9014,   329,   262,  8745,  1292, 21682, 37018,
          4382,   326,   468,  1716,   845,  1588,   290,  7379,  1164,    88,
            13,   383,  8745,  1292, 21682, 37018,  4382,   468,   635,  1716,
           845, 28621,   290, 23196,   378,   355,   257,  1255,   286,   663,
          2546,    13,  1320,   318,  1521,   716,  2911,   278,   284,  1394,
           428,  2055,  5365,  1402,   523,   326,   356,   460,   307,  4075,
           290,   423,  1257,    13]))

In [10]:
prediction_result = tokenizer.decode(preds[0].numpy())
print(prediction_result)

Ok so all this server is a hub for friends to play games, chat, hangout, and genrally have fun. This server is ment to be a replacement for the orginal frostbite server that has become very large and unwieldy. The orginal frostbite server has also become very inactive and stagnate as a result of its size. That is why am hopeing to keep this community relatively small so that we can be active and have fun.


# Bridging the gap with fastai

In [11]:
from fastai.text.all import *

In [12]:
df_train = df.head(66488)
df_val = df.tail(16622)
all_texts = df['Message'].values

In [13]:
class TransformersTokenizer(Transform):
    def __init__(self, tokenizer): self.tokenizer = tokenizer
    def encodes(self, x): 
        toks = self.tokenizer.tokenize(x)
        return tensor(self.tokenizer.convert_tokens_to_ids(toks))
    def decodes(self, x): return TitledStr(self.tokenizer.decode(x.cpu().numpy()))

In [14]:
splits = [range_of(df_train), list(range(len(df_train), len(all_texts)))]
tls = TfmdLists(all_texts, TransformersTokenizer(tokenizer), splits=splits, dl_type=LMDataLoader)

In [15]:
tls.train[0],tls.valid[0]

(tensor([18690,   523,   477,   428,  4382,   318,   257, 12575,   329,  2460,
           284,   711,  1830,    11,  8537,    11,  8181,   448,    11,   290,
          2429,    81,   453,   423,  1257,    13,   770,  4382,   318,  6229,
           284,   307,   257,  9014,   329,   262,  8745,  1292, 21682, 37018,
          4382,   326,   468,  1716,   845,  1588,   290,  7379,  1164,    88,
            13,   383,  8745,  1292, 21682, 37018,  4382,   468,   635,  1716,
           845, 28621,   290, 23196,   378,   355,   257,  1255,   286,   663,
          2546,    13,  1320,   318,  1521,   716,  2911,   278,   284,  1394,
           428,  2055,  5365,  1402,   523,   326,   356,   460,   307,  4075,
           290,   423,  1257,    13]),
 tensor([ 5171,   470,  2652,  2208,   890, 42796]))

In [16]:
tls.tfms(tls.train.items[0]).shape, tls.tfms(tls.valid.items[0]).shape

(torch.Size([94]), torch.Size([6]))

In [17]:
show_at(tls.train, 0)

Ok so all this server is a hub for friends to play games, chat, hangout, and genrally have fun. This server is ment to be a replacement for the orginal frostbite server that has become very large and unwieldy. The orginal frostbite server has also become very inactive and stagnate as a result of its size. That is why am hopeing to keep this community relatively small so that we can be active and have fun.


In [18]:
show_at(tls.valid, 0)

can't stay super long tho


In [20]:
print(type(tls))

<class 'fastai.data.core.TfmdLists'>


In [19]:
bs,sl = (4,256)
dls = tls.dataloaders(bs=bs, sl=sl)

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [None]:
dls.show_batch(max_n=2)