In [11]:
# base
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# matplotlib
%matplotlib inline
plt.style.use('ggplot')

# display
from IPython.display import display

# autoreload
%load_ext autoreload
%autoreload 2

# warnings
import warnings
warnings.filterwarnings('ignore')

# fix random seed
from numpy.random import seed as set_random_seed
set_random_seed(42)

# explainability
# import shap, lime, eli5
# shap.initjs()

#mlflow
import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:5000/")

from datetime import datetime
import emoji

import streamlit as st
def cache(f):
    return f
st.cache = cache

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
from utils import read_chat
chat = read_chat()

In [13]:
chat

Unnamed: 0,datetime,name,text
0,2018-05-10 13:45:00,Alon Wolf 🐺,איזה אתרים קדושים יש בירושלים\r
1,2018-05-10 13:45:00,Alon Wolf 🐺,לפי מקומות - תל-אביב - 'משימה שקשורה לים'\r\nי...
2,2018-05-10 13:46:00,Alon Wolf 🐺,לדוגמא 👆🏼\r
3,2018-05-10 13:47:00,גורגמל ״מצחיקונת״ 🍠,זה מה שאני אמרתי!\r
4,2018-05-10 13:47:00,גורגמל ״מצחיקונת״ 🍠,תעשה בירושלים את הפלאפל\r
...,...,...,...
39994,2020-11-20 07:46:00,Alon Wolf 🐺,זה * בול * מה שרציתי שיהיה לנו\r
39995,2020-11-20 07:46:00,Alon Wolf 🐺,<המדיה לא נכללה>\r
39996,2020-11-20 07:47:00,Alon Wolf 🐺,איזה כיף (:\r
39997,2020-11-20 07:47:00,Alon Wolf 🐺,תאמרי לה לשים אותם גבוה גבוה\r


In [14]:
import re

In [15]:

MEDIA_TOKEN = 'MEDIA'
URL_TOKEN = 'URL'
TOKENS = [MEDIA_TOKEN, URL_TOKEN]

def preprocess_text(text):
    text = re.sub('<המדיה לא נכללה>', MEDIA_TOKEN, text)
    text = re.sub('https?://\S+|www\.\S+', URL_TOKEN, text)
    return text[:-1] # removes end
text = chat['text'].apply(preprocess_text)

In [16]:
text

0                            איזה אתרים קדושים יש בירושלים
1        לפי מקומות - תל-אביב - 'משימה שקשורה לים'\r\nי...
2                                                לדוגמא 👆🏼
3                                        זה מה שאני אמרתי!
4                                  תעשה בירושלים את הפלאפל
                               ...                        
39994                       זה * בול * מה שרציתי שיהיה לנו
39995                                                MEDIA
39996                                          איזה כיף (:
39997                         תאמרי לה לשים אותם גבוה גבוה
39998                                                MEDIA
Name: text, Length: 39999, dtype: object

In [17]:
from extras.nlp.hebtokenizer import tokenize as nlp_tokenize

def tokenize_text(txt):
    tok = [word for lng, word in nlp_tokenize(txt)]
    return ['###'] + tok + ['$$$']

tokenized = text.apply(tokenize_text)

In [18]:
max_len = tokenized.apply(len).max()
max_len

721

In [19]:
tokenized

0            [###, איזה, אתרים, קדושים, יש, בירושלים, $$$]
1        [###, לפי, מקומות, -, תל-אביב, -, ', משימה, שק...
2                                   [###, לדוגמא, 👆🏼, $$$]
3                       [###, זה, מה, שאני, אמרתי, !, $$$]
4                   [###, תעשה, בירושלים, את, הפלאפל, $$$]
                               ...                        
39994    [###, זה, * , בול, * , מה, שרציתי, שיהיה, לנו,...
39995                                    [###, MEDIA, $$$]
39996                          [###, איזה, כיף, (, :, $$$]
39997        [###, תאמרי, לה, לשים, אותם, גבוה, גבוה, $$$]
39998                                    [###, MEDIA, $$$]
Name: text, Length: 39999, dtype: object

In [3]:

# with mlflow.start_run(run_name="test"+ str(datetime.now())):
#      mlflow.log_param('n_features', 5)

In [7]:
def save_experiment(experiment_name, **params):
    with mlflow.start_run(run_name= experiment_name+ '_' +str(datetime.now())):
        for k, v in params.items():
            mlflow.log_param(k, v)

In [1]:
1

1

In [45]:
import torch
from torch import nn

class Model(nn.Module):
    def __init__(self, dataset):
        super(Model, self).__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 3

        n_vocab = len(dataset.uniq_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.lstm_size,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)
        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, sequence_length, self.lstm_size),
                torch.zeros(self.num_layers, sequence_length, self.lstm_size))

In [46]:
TEXT = text
TOKENIZED = tokenized

In [47]:
import torch
import pandas as pd
from collections import Counter

class Dataset(torch.utils.data.Dataset):
    def __init__(
        self,
        args = {},
    ):
        self.args = args
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        return TOKENIZED.apply(lambda l : ' '.join(l))

    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.words_indexes) - self.args['sequence_length']

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.args['sequence_length']]),
            torch.tensor(self.words_indexes[index+1:index+self.args['sequence_length']+1]),
        )


In [None]:
import argparse
import torch
import numpy as np
from torch import nn, optim
from torch.utils.data import DataLoader
# from model import Model
# from dataset import Dataset

def train(dataset, model, args):
    model.train()

    dataloader = DataLoader(dataset, batch_size=args['batch_size'])
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    for epoch in range(args['max_epochs']):
        state_h, state_c = model.init_state(args['sequence_length'])

        for batch, (x, y) in enumerate(dataloader):
            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            loss = criterion(y_pred.transpose(1, 2), y)

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()

            print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })
            
def predict(dataset, model, text, next_words=100):
    model.eval()

    words = text.split(' ')
    state_h, state_c = model.init_state(len(words))

    for i in range(0, next_words):
        x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]])
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))

        last_word_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        words.append(dataset.index_to_word[word_index])

    return words

In [None]:
# parser = argparse.ArgumentParser()
# parser.add_argument('--max-epochs', type=int, default=10)
# parser.add_argument('--batch-size', type=int, default=256)
# parser.add_argument('--sequence-length', type=int, default=4)
# args = parser.parse_args()

args = {
    'max_epochs' : 10,
    'batch_size' : 32,
    'sequence_length' : max_len,
    
}

dataset = Dataset(args)
model = Model(dataset)

train(dataset, model, args)
print(predict(dataset, model, text='בוקר טוב'))

{'epoch': 0, 'batch': 0, 'loss': 10.138138771057129}
{'epoch': 0, 'batch': 1, 'loss': 10.123454093933105}
{'epoch': 0, 'batch': 2, 'loss': 10.108195304870605}
{'epoch': 0, 'batch': 3, 'loss': 10.091282844543457}
{'epoch': 0, 'batch': 4, 'loss': 10.07086181640625}
{'epoch': 0, 'batch': 5, 'loss': 10.043609619140625}
{'epoch': 0, 'batch': 6, 'loss': 10.003386497497559}
{'epoch': 0, 'batch': 7, 'loss': 9.936370849609375}
{'epoch': 0, 'batch': 8, 'loss': 9.820688247680664}
{'epoch': 0, 'batch': 9, 'loss': 9.643900871276855}
{'epoch': 0, 'batch': 10, 'loss': 9.406193733215332}
{'epoch': 0, 'batch': 11, 'loss': 9.14319133758545}
{'epoch': 0, 'batch': 12, 'loss': 8.913790702819824}
{'epoch': 0, 'batch': 13, 'loss': 8.714279174804688}
{'epoch': 0, 'batch': 14, 'loss': 8.538949012756348}
{'epoch': 0, 'batch': 15, 'loss': 8.399476051330566}
{'epoch': 0, 'batch': 16, 'loss': 8.312835693359375}
{'epoch': 0, 'batch': 17, 'loss': 8.234450340270996}
