In [17]:
!pip install opendatasets
import opendatasets as od
od.download("https://www.kaggle.com/eemanmajumder/the-anime-dataset")

Collecting opendatasets
  Downloading opendatasets-0.1.20-py3-none-any.whl (14 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.20
Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: eemanmajumder
Your Kaggle Key: ··········
Downloading the-anime-dataset.zip to ./the-anime-dataset


100%|██████████| 3.17M/3.17M [00:00<00:00, 159MB/s]







In [3]:
import pandas as pd
import random
import re
from tqdm import tqdm
import numpy as np
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import torch
import spacy
nlp = English()
import torch.nn as nn
import nltk
pd.options.display.max_columns = 500
import warnings
warnings.filterwarnings(action='ignore')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Read the anime table; the first CSV column becomes the index.
data = pd.read_csv('eda-data.csv', index_col=0)

# Keep a handle on the raw synopsis column for the inspection cell that follows.
synopsis = data['synopsis']
print('Number of Anime synopsis we have: ', len(synopsis))

Number of Anime synopsis we have:  16610


In [5]:
# Pick one random row to eyeball. random.randint is inclusive on BOTH ends, so
# randint(0, len(synopsis)) could return len(synopsis) and index one past the
# end; randrange(len(synopsis)) only ever yields valid positions.
i = random.randrange(len(synopsis))
print('Synopsis example\n\nAnime:{} \nSynopsis:{}\n'.format(data['anime_name'].values[i],synopsis.values[i]))

Synopsis example

Anime:Flanders no Inu, Boku no Patrasche 
Synopsis:A boy named Nello becomes an orphan at the age of two when his mother dies.  His grandfather, who lives in a small village takes him in. One day, Nello finds a dog who was almost beaten to death and named him Patrasche. Due to the good care of his grandfather, the dog recovers, and from then on, Nello and Patrasche are inseparable. Since they are very poor, Nello has to help his grandfather by selling milk. Patrasche helps him pull the milk cart that Nello uses to sell milk in the town.(Source: Wikipedia)



In [6]:
def remove_source(text):
    """Strip a trailing attribution note from a synopsis.

    Everything from the first ``(Source`` or ``[Written `` marker onwards is
    dropped. Both markers are looked up and the cut is made at whichever one
    occurs first, so a synopsis that carries both notes loses both of them
    (an if/elif on the markers would only ever remove one).

    Args:
        text: the synopsis string.

    Returns:
        The part of ``text`` before the earliest marker (whitespace in front
        of the marker is kept), or ``text`` unchanged when neither marker is
        present.
    """
    markers = ('(Source', '[Written ')
    # str.find returns -1 for "not present"; keep only real hits
    cut_points = [pos for pos in map(text.find, markers) if pos != -1]
    if not cut_points:
        return text
    return text[:min(cut_points)]

In [7]:
def clean_synopsis(data):
    """Filter the anime table and return a cleaned Series of synopsis strings.

    Steps, in order: drop rows flagged Hentai or Kids, keep synopses with more
    than 30 and at most 300 whitespace-separated words, cut attribution notes
    via ``remove_source``, replace runs of non-ASCII characters with a space,
    strip a handful of symbols, and drop synopses mentioning adaptations,
    music videos, "based on" or spin-offs.

    Args:
        data: DataFrame with ``Hentai``, ``Kids`` and ``synopsis`` columns.

    Returns:
        The surviving synopses as a Series with a fresh 0..n-1 index.
    """
    # drop rows tagged Hentai or Kids
    keep_rows = (data.Hentai != 1) & (data.Kids != 1)
    synopsis = data[keep_rows].synopsis

    # length filter: more than 30 and at most 300 words
    word_counts = synopsis.apply(lambda x: len(str(x).strip().split()))
    synopsis = synopsis[(word_counts > 30) & (word_counts <= 300)]

    # cut the trailing attribution text
    synopsis = synopsis.apply(remove_source)

    # every run of non-ASCII characters (e.g. Japanese text) becomes one space
    non_ascii = re.compile("([^\x00-\x7F])+")
    synopsis = synopsis.apply(lambda x: non_ascii.sub(" ", x))

    # NOTE(review): the pattern is anchored with ^, so it only removes a single
    # symbol at the very start of a synopsis - confirm that is the intent.
    rx = re.compile('^[&#/@`)(;<=\'"$%>]')
    # these four characters are deleted wherever they occur
    drop_chars = str.maketrans('', '', '>`)(')
    synopsis = synopsis.apply(lambda x: rx.sub('', x).translate(drop_chars))

    # drop adaptation-style entries (a few relevant ones may go too, but not many)
    for phrase in ('adaptation', 'music video', 'based on', 'spin-off'):
        synopsis = synopsis[synopsis.apply(lambda x: phrase not in str(x).lower())]

    return synopsis.reset_index(drop=True)

# Run the cleaning pipeline on the loaded table and report how many synopses survive.
cleaned_synopsis = clean_synopsis(data)
print('Size: ',len(cleaned_synopsis))

Size:  7309


In [8]:
class config:
    """Shared settings for tokenisation, batching, model size and training."""

    # text -> list of word tokens
    tokenizer = nltk.word_tokenize

    # batching
    batch_size = 32
    seq_len = 30

    # model size
    emb_dim = 100
    hidden_dim = 512

    # training
    epochs = 15
    model_path = 'lm_lrdecay_drop.bin'

In [9]:
def create_dataset(synopsis,batch_size,seq_len):
    """Turn the cleaned synopses into input/target id matrices for language modelling.

    The synopses are lower-cased, joined with single spaces and tokenised with
    ``config.tokenizer``. The token stream is truncated to a whole number of
    ``batch_size * seq_len`` windows, mapped to integer ids (vocabulary sorted
    alphabetically) and reshaped to ``(batch_size, -1)``. The target stream is
    the input shifted left by one token, with the very first token wrapped
    around into the last position.

    Side effects: seeds NumPy's global RNG with 0, sets the module-level
    ``num_batches`` and prints the shapes of both matrices.

    Args:
        synopsis: pandas Series of synopsis texts.
        batch_size: number of rows in the returned matrices.
        seq_len: window length used to decide how many tokens are kept.

    Returns:
        ``(input_tok, target_tok, vocab_size, w2i, i2w)``.

    Raises:
        ValueError: if the corpus has fewer than ``batch_size * seq_len``
            tokens, i.e. not even one full batch can be formed.
    """
    global num_batches
    np.random.seed(0)
    synopsis_text = ' '.join(synopsis.apply(lambda x: str(x).lower()).values)

    tokens = config.tokenizer(synopsis_text)

    # Validate before touching the global: with zero full batches the code
    # below would otherwise fail with an opaque IndexError on tokens[0].
    n_full_batches = len(tokens) // (seq_len*batch_size)
    if n_full_batches == 0:
        raise ValueError(
            'corpus has {} tokens, fewer than one batch of batch_size*seq_len = {}'.format(
                len(tokens), batch_size*seq_len))
    num_batches = n_full_batches
    tokens = tokens[:num_batches*batch_size*seq_len]

    words = sorted(set(tokens))
    w2i = {w:i for i,w in enumerate(words)}
    i2w = dict(enumerate(words))

    token_ids = [w2i[tok] for tok in tokens]
    # next-token targets: shift left by one, wrap the first token to the end
    target = np.zeros_like(token_ids)
    target[:-1] = token_ids[1:]
    target[-1] = token_ids[0]

    input_tok = np.reshape(token_ids,(batch_size,-1))
    target_tok = np.reshape(target,(batch_size,-1))

    print(input_tok.shape)
    print(target_tok.shape)

    vocab_size = len(i2w)
    return input_tok,target_tok,vocab_size,w2i,i2w

def create_batches(input_tok,target_tok,batch_size,seq_len):
    """Yield aligned (input, target) column windows of width ``seq_len``.

    The number of windows is the total element count of ``input_tok`` divided
    (floor) by ``batch_size * seq_len``. Each yielded pair keeps all rows and
    takes the same column slice from both matrices.
    """
    total_elements = np.prod(input_tok.shape)
    n_windows = total_elements // (batch_size*seq_len)

    for window in range(n_windows):
        start = window*seq_len
        stop = start + seq_len
        yield input_tok[:,start:stop], target_tok[:,start:stop]
               

In [10]:
class LSTMModel(nn.Module):
    """Word-level language model: embedding -> LSTM -> dropout -> linear logits."""

    def __init__(self,hid_dim,emb_dim,vocab_size,num_layers=1):
        """Build the layers.

        Args:
            hid_dim: LSTM hidden size.
            emb_dim: embedding width.
            vocab_size: number of output classes of the final linear layer.
            num_layers: number of stacked LSTM layers.
        """
        super().__init__()
        self.hid_dim = hid_dim
        self.emb_dim = emb_dim
        self.num_layers = num_layers
        # the embedding table gets one more row than the output layer has classes
        self.vocab_size = vocab_size + 1
        self.embedding = nn.Embedding(
            num_embeddings=self.vocab_size,
            embedding_dim=self.emb_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.emb_dim,
            hidden_size=self.hid_dim,
            num_layers=self.num_layers,
            batch_first=True,
        )
        self.drop = nn.Dropout(p=0.3)
        # per-position scores over the vocabulary; generation samples words from these
        self.linear = nn.Linear(in_features=self.hid_dim, out_features=vocab_size)

    def forward(self,x,prev_hid):
        """Return ``(logits, new_state)`` for token ids ``x`` and LSTM state ``prev_hid``."""
        embedded = self.embedding(x)
        lstm_out, hid = self.lstm(embedded, prev_hid)
        logits = self.linear(self.drop(lstm_out))
        return logits, hid

    def zero_state(self,batch_size):
        """Return an all-zero ``(hidden, cell)`` pair shaped (num_layers, batch_size, hid_dim)."""
        state_shape = (self.num_layers, batch_size, self.hid_dim)
        return (torch.zeros(state_shape), torch.zeros(state_shape))

In [11]:
class AverageMeter:
    """Keeps the most recent value plus a running weighted mean of all values seen."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the mean, the weighted sum and the sample count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` as the latest value, counted ``n`` times in the mean."""
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count

In [12]:
def loss_fn(predicted,target):
    """Mean cross-entropy between class scores ``predicted`` and class indices ``target``."""
    # functional form of nn.CrossEntropyLoss() with its default settings
    return nn.functional.cross_entropy(predicted, target)

In [13]:
def train_fn(model,device,dataloader,optimizer):
    """Run one pass over ``dataloader`` and return the mean per-batch loss.

    The LSTM state is initialised to zeros once and then carried from batch to
    batch (detached each step). Gradients are clipped to a max norm of 2 before
    every optimizer step. The progress bar total comes from the module-level
    ``num_batches``.
    """
    model.train()
    progress = tqdm(dataloader, position=0, leave=True, total=num_batches)
    running = AverageMeter()
    batch_losses = []

    # all-zero recurrent state, moved onto the training device
    hid_state, cell_state = (s.to(device) for s in model.zero_state(config.batch_size))

    for inp, target in progress:
        inp = torch.tensor(inp, dtype=torch.long).to(device)
        target = torch.tensor(target, dtype=torch.long).to(device)

        optimizer.zero_grad()
        pred, (hid_state, cell_state) = model(inp, (hid_state, cell_state))

        # swap the last two dims so the loss sees the score dimension second
        loss = loss_fn(pred.transpose(1, 2), target)

        # keep the state values for the next batch, but cut them off from this batch's graph
        hid_state, cell_state = hid_state.detach(), cell_state.detach()

        loss.backward()
        # to avoid gradient explosion
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2)
        optimizer.step()

        batch_loss = loss.detach().item()
        running.update(batch_loss)
        progress.set_postfix(loss=running.avg)
        batch_losses.append(batch_loss)
    return np.mean(batch_losses)

In [14]:
# Tokenise the cleaned synopses once and keep the id matrices plus both vocabulary lookups.
input_tok, target_tok, vocab_size, w2i, i2w = create_dataset(
    cleaned_synopsis,
    batch_size=config.batch_size,
    seq_len=config.seq_len,
)

(32, 25380)
(32, 25380)


In [20]:
def run():
    """Train the LSTM language model and return it as it stands after the last epoch.

    Uses the module-level ``input_tok``, ``target_tok`` and ``vocab_size`` and
    the settings in ``config``. After every epoch the weights are saved to
    ``config.model_path`` if the epoch's mean training loss is the lowest so
    far, and the loss is handed to a ReduceLROnPlateau scheduler (halves the
    learning rate after 2 epochs without improvement).

    Returns:
        The trained model (final-epoch weights, which may differ from the
        checkpoint on disk if a later epoch was not the best one).
    """
    # Fall back to CPU when no GPU is visible instead of failing on .to('cuda').
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = LSTMModel(vocab_size=vocab_size,emb_dim=config.emb_dim,hid_dim=config.hidden_dim,num_layers=3).to(device)
    optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
    # NOTE(review): `verbose` looks to be deprecated in recent PyTorch releases - confirm against the installed version.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode = 'min', patience=2, verbose=True, factor=0.5)
    epochs = config.epochs

    # inf instead of a magic 999 so the first epoch always counts as an improvement
    best_loss = float('inf')
    for i in range(1,epochs+1):
        # create_batches is a generator, so it has to be rebuilt for every epoch
        train_dataloader = create_batches(batch_size=config.batch_size,input_tok=input_tok,seq_len=config.seq_len,target_tok=target_tok)
        print('Epoch..',i)
        loss = train_fn(model,device,train_dataloader,optimizer)
        if loss<best_loss:
            best_loss = loss
            torch.save(model.state_dict(),config.model_path)
        scheduler.step(loss)
        if device == 'cuda':
            torch.cuda.empty_cache()
    return model

In [21]:
# Train for config.epochs epochs; run() also writes the lowest-loss weights to config.model_path.
model = run()

Epoch.. 1


100%|██████████| 846/846 [01:52<00:00,  7.55it/s, loss=7.23]


Epoch.. 2


100%|██████████| 846/846 [01:55<00:00,  7.32it/s, loss=6.54]


Epoch.. 3


100%|██████████| 846/846 [01:57<00:00,  7.18it/s, loss=6.05]


Epoch.. 4


100%|██████████| 846/846 [01:58<00:00,  7.16it/s, loss=5.78]


Epoch.. 5


100%|██████████| 846/846 [01:58<00:00,  7.16it/s, loss=5.58]


Epoch.. 6


100%|██████████| 846/846 [01:58<00:00,  7.17it/s, loss=5.38]


Epoch.. 7


100%|██████████| 846/846 [01:58<00:00,  7.16it/s, loss=5.21]


Epoch.. 8


100%|██████████| 846/846 [01:58<00:00,  7.16it/s, loss=5.04]


Epoch.. 9


100%|██████████| 846/846 [01:58<00:00,  7.16it/s, loss=4.89]


Epoch.. 10


100%|██████████| 846/846 [01:58<00:00,  7.16it/s, loss=4.74]


Epoch.. 11


100%|██████████| 846/846 [01:58<00:00,  7.17it/s, loss=4.6]


Epoch.. 12


100%|██████████| 846/846 [01:58<00:00,  7.17it/s, loss=4.48]


Epoch.. 13


100%|██████████| 846/846 [01:58<00:00,  7.17it/s, loss=4.36]


Epoch.. 14


100%|██████████| 846/846 [01:57<00:00,  7.17it/s, loss=4.26]


Epoch.. 15


100%|██████████| 846/846 [01:57<00:00,  7.17it/s, loss=4.16]


In [22]:
def inference(model,input_text,device,top_k=5,length = 100):
    """Generate text by priming the model with a prompt and sampling ``length`` more tokens.

    The prompt is tokenised with ``config.tokenizer`` and fed to the model one
    token at a time (lower-cased for the vocabulary lookup in ``w2i``). Each
    generated token is then drawn uniformly at random from the ``top_k``
    highest-scoring candidates and fed back in as the next input.

    Args:
        model: trained model exposing ``zero_state`` and a forward pass
            returning ``(scores, (h, c))``.
        input_text: prompt string.
        device: device the model lives on.
        top_k: number of highest-scoring candidates to sample from.
        length: number of tokens to generate after the prompt.

    Returns:
        The prompt tokens followed by the generated tokens, each followed by
        a single space.

    Raises:
        ValueError: if tokens are requested but no prompt token is in the
            vocabulary (this includes an empty prompt), because there is
            then no prediction to start sampling from.
    """
    model.eval()
    pieces = []
    pred = None

    h,c = model.zero_state(1)
    h = h.to(device)
    c = c.to(device)

    # Generation needs no gradients; without no_grad every step would keep
    # extending the autograd graph through h and c.
    with torch.no_grad():
        for t in config.tokenizer(input_text):
            pieces.append(t)
            ix = w2i.get(t.lower())
            if ix is None:
                # out-of-vocabulary prompt word: keep it in the text, but it cannot be fed to the model
                continue
            pred,(h,c) = model(torch.tensor(ix,dtype=torch.long).view(1,-1).to(device),(h,c))

        if length > 0 and pred is None:
            raise ValueError('input_text must contain at least one token from the training vocabulary')

        for step in range(length):
            _,top_ix = torch.topk(pred[0],k = top_k)

            choice = np.random.choice(top_ix[0].tolist())
            pieces.append(i2w[choice])
            pred,(h,c) = model(torch.tensor(choice,dtype=torch.long).view(1,-1).to(device),(h,c))
    return ''.join(piece + ' ' for piece in pieces)

In [24]:
# Rebuild the architecture on CPU and load the checkpoint written during training.
device = 'cpu'
mod = LSTMModel(emb_dim=config.emb_dim,hid_dim=config.hidden_dim,vocab_size=vocab_size,num_layers=3).to(device)
# map_location remaps tensors that were saved from a CUDA model, so the
# checkpoint also loads on a machine without a GPU.
mod.load_state_dict(torch.load(config.model_path, map_location=device))
print('AI generated Anime synopsis:')
inference(model = mod, input_text = 'Bob the Architect  ', top_k = 30, length = 1000, device = device)

AI generated Anime synopsis:


"Bob the Architect in this series ; '' he finds how many friends about and falls along when demons must show it at any time with a mysterious young man she gets up , while dealing in his new lives ... as such he is transported a secret where these kids can seem for an entire ally.years before this . despite completing another life when you must find an old quest that was just an energetic stall.though the wrestling world to its serialization ... at st. 's past academy during class and finds the message 's lair from this unusual enemy for a life she was told out when he finds his friends ; one after him after something about more from being known until `` mcfat ... as there means you ... is the smartest , athletic school worker whose son s wife will now watch any new treasure for those ' `` i would survive together with something it wo expected his way true : his family for an uncanny kurta spirit who goes at the sky by any eccentric bread at school city usually called texhnolyze and be