# Text Encoding Model

In [1]:
# Record the installed fastai version so the outputs below can be tied to a specific release.
import fastai
print(fastai.__version__) # version check

2.7.15


In [2]:
from fastai.text.all import *

In [3]:
# Location of the training data, relative to the notebook's working directory.
# Nothing is read here; the text files under this folder are collected when the DataLoaders are built.
path = Path('data/txt/train')

In [4]:
batch_size  = 64 # batch size handed to dataloaders(bs=...); 64 is noted here as the fastai v2 default

In [5]:
# Assemble the language-modelling DataBlock from named pieces.
def _always_train(_item):
    """Splitter predicate: answer False for every item so nothing is held out for validation."""
    return False

# Text block configured for language modelling over the files in `path`
lm_text_block = TextBlock.from_folder(path, is_lm=True)

dblock = DataBlock(
    blocks=lm_text_block,
    get_items=get_text_files,              # collect the text files
    splitter=FuncSplitter(_always_train),  # no train/validation split
)

# Build the DataLoaders with the configured batch size
dls = dblock.dataloaders(path, bs=batch_size)


In [6]:
# Sanity-check the DataLoaders by displaying up to 3 rows of one batch.
# In the output, the `text_` column is the `text` column shifted by one token (the next-token target).
dls.show_batch(max_n=3)

Unnamed: 0,text,text_
0,"xxbos a small bird with a yellow and black belly and a grey back and head . xxbos this bird is blue with red on it and has a long , pointy beak . xxbos large bird with a flat beak and mostly brown feathers . xxbos this bird has wings that are black and has an orange belly xxbos the bird is small with long black bill and white ring around","a small bird with a yellow and black belly and a grey back and head . xxbos this bird is blue with red on it and has a long , pointy beak . xxbos large bird with a flat beak and mostly brown feathers . xxbos this bird has wings that are black and has an orange belly xxbos the bird is small with long black bill and white ring around nape"
1,"wings , crown and tail with white stripe nape . xxbos this bird has a long , thick pink bill with a black tip and mostly white plumage with brown on the coverts , secondaries & primaries . xxbos white bird with black topped wings and a yellow curved beak . xxbos this bird is mostly grey with a black crown , dark grey tailfeathers , and large black eyes . xxbos",", crown and tail with white stripe nape . xxbos this bird has a long , thick pink bill with a black tip and mostly white plumage with brown on the coverts , secondaries & primaries . xxbos white bird with black topped wings and a yellow curved beak . xxbos this bird is mostly grey with a black crown , dark grey tailfeathers , and large black eyes . xxbos this"
2,has a large crest with a big white spot in it . xxbos a yellow bird with a white patch on it 's wings . black tail feathers and a green beak xxbos this bird 's colors are a xxunk contrast of burnt orange and iridescent green . xxbos this is a bird with a white belly and grey wings . xxbos a brown bird with xxunk brown crown and a brown,a large crest with a big white spot in it . xxbos a yellow bird with a white patch on it 's wings . black tail feathers and a green beak xxbos this bird 's colors are a xxunk contrast of burnt orange and iridescent green . xxbos this is a bird with a white belly and grey wings . xxbos a brown bird with xxunk brown crown and a brown outer


### What This Means:
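- `xxbos` marks the beginning of each text (here, each bird description). Because language-model batches are cut from one continuous token stream, a row can begin in the middle of a description, as rows 1 and 2 above do.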
- `xxunk` replaces words that are not in the model's vocabulary, allowing the model to handle unseen words gracefully.

In [10]:
## debugging ssl certificate error
# Building an SSLContext on its own changes nothing: no other cell in this notebook passes
# `ssl_context` to a downloader. Exporting the certifi CA bundle through SSL_CERT_FILE lets
# default HTTPS contexts created later in this kernel verify against it.
# setdefault() leaves any value the user has already exported untouched.
import os
import ssl
import certifi

os.environ.setdefault('SSL_CERT_FILE', certifi.where())

# Still defined so that any code expecting this name keeps working.
ssl_context = ssl.create_default_context(cafile=certifi.where())

In [11]:
# Create a learner for language modeling using the AWD_LSTM architecture on the DataLoaders built above.
# drop_mult=0.3 is passed through to fastai — presumably a multiplier on the architecture's
# default dropout rates; confirm against the fastai documentation.
learn = language_model_learner(dls, AWD_LSTM, drop_mult=0.3)

In [9]:
# Smoke test: ask the learner to continue the prompt with 30 words at temperature 0.75.
# NOTE(review): no training call appears in this notebook before this cell, so the generated
# text presumably reflects the weights the learner was created with.
learn.predict('this bird has', n_words=30, temperature=0.75)

'this bird has a relatively large body size and a body size of over 3 feet ( 3 & ) . It is also the head - body , body , and'

In [None]:
# Take the first module of the learner's model as the encoder
# (presumably the AWD_LSTM backbone, with the decoder head after it — TODO confirm).
encoder = learn.model[0]

In [None]:
# Run a single training batch through the encoder and print the tensor shapes.
# eval() + no_grad(): inference only, no gradient bookkeeping.
encoder.eval()
with torch.no_grad():
    # fastai v2 learners expose their DataLoaders as `learn.dls`; the v1-style
    # `learn.data.train_dl` attribute does not exist and raises AttributeError.
    for xb,yb in progress_bar(learn.dls.train):
        print(xb.shape)
        # NOTE(review): the [0][-1] indexing is kept from the original; whether it selects
        # the intended slice depends on what the encoder's forward returns in this
        # fastai version — confirm.
        print(encoder(xb)[0][-1].shape)
        break  # one batch is enough for a shape check

In [None]:
# Save the learner's encoder under the name 'enc'.
# NOTE(review): no fit/fine-tune call is visible in the cells above, so this presumably stores
# the encoder exactly as language_model_learner created it — confirm that is intended.
learn.save_encoder('enc')