<a href="https://colab.research.google.com/github/DaryaTereshchenko/ukr/blob/main/embedding_ukr_pickle.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install tokenize_uk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenize_uk
  Downloading tokenize_uk-0.2.0.tar.gz (22 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tokenize_uk
  Building wheel for tokenize_uk (setup.py) ... [?25l[?25hdone
  Created wheel for tokenize_uk: filename=tokenize_uk-0.2.0-py2.py3-none-any.whl size=4589 sha256=4f10481a6c825c580d5e9e8a73212b4ba4c4077f69391bc9e9b661108fd25ea3
  Stored in directory: /root/.cache/pip/wheels/40/76/70/7307eebd4479b6d3d3eba430016755ccf53334beaa6b86b470
Successfully built tokenize_uk
Installing collected packages: tokenize_uk
Successfully installed tokenize_uk-0.2.0


In [None]:
import bz2
import pandas as pd
import numpy as np
import unicodedata
import tokenize_uk
from pathlib import Path
import string
from tqdm import tqdm
import re
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
from keras.utils import to_categorical
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Flatten, Input, Dropout, Embedding, BatchNormalization

In [None]:
!wget https://lang.org.ua/static/downloads/models/news.cased.tokenized.glove.300d.bz2

--2023-02-14 15:41:23--  https://lang.org.ua/static/downloads/models/news.cased.tokenized.glove.300d.bz2
Resolving lang.org.ua (lang.org.ua)... 95.216.74.77
Connecting to lang.org.ua (lang.org.ua)|95.216.74.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 338940856 (323M) [application/octet-stream]
Saving to: ‘news.cased.tokenized.glove.300d.bz2’


2023-02-14 15:41:40 (20.6 MB/s) - ‘news.cased.tokenized.glove.300d.bz2’ saved [338940856/338940856]



In [None]:
!unzip /content/drive/MyDrive/train_large.zip

Archive:  /content/drive/MyDrive/train_large.zip
  inflating: train_large.csv         


In [None]:
# Open the bz2-compressed GloVe file for reading; iterating it yields bytes lines.
glove = bz2.BZ2File(filename="news.cased.tokenized.glove.300d.bz2", mode="r")

In [None]:
# Load the training table (rows that fail to parse are skipped), then order
# the rows by the "source" column.
data = pd.read_csv("train_large.csv", on_bad_lines='skip', low_memory=False)
data = data.sort_values(by="source")

In [None]:
# Keep only the rows whose "source" value is one of 0, 1, 2, 3.
source_mask = data["source"].isin([0, 1, 2, 3])
subset = data[source_mask]

In [None]:
# Texts to embed, and their "source" labels as an (n, 1) column vector.
X = subset.loc[:, "text"]
y = np.reshape(subset["source"].values, (-1, 1))

In [None]:
def clean_text(line):
    """Normalise a raw text and return it as space-separated tokens.

    Newline characters are replaced by spaces, curly double quotation marks
    are replaced by the ASCII double quote, and the result is passed through
    ``tokenize_uk``'s ``tokenize_words`` and re-joined with single spaces.

    Args:
        line: Raw text as a ``str``.

    Returns:
        The tokens produced by ``tokenize_words`` joined by single spaces.
    """
    # Substitute a space rather than '' so that "a\nb" does not collapse into
    # the single token "ab" (which would then miss every vocabulary lookup).
    replaced = line.replace('\n', ' ')
    right_quote = unicodedata.lookup('RIGHT DOUBLE QUOTATION MARK')
    left_quote = unicodedata.lookup('LEFT DOUBLE QUOTATION MARK')
    normalized = replaced.replace(right_quote, '"').replace(left_quote, '"')
    return ' '.join(tokenize_uk.tokenize_uk.tokenize_words(normalized))

In [None]:
# Map each word of the GloVe file to its float32 vector.
embedding_dict = {}

# Rewind first: the handle is opened in an earlier cell, so without this a
# re-run of this cell would iterate an exhausted file and leave the dict empty.
glove.seek(0)
for line in glove:
    values = line.split()
    if not values:
        # Skip blank lines instead of failing on values[0].
        continue
    # The file is read as bytes, so the word has to be decoded explicitly.
    word = values[0].decode('utf-8')
    embedding_dict[word] = np.asarray(values[1:], "float32")


In [None]:
# Number of word vectors kept per text: transform() stops once it has collected
# this many known words and zero-pads texts that have fewer.
# NOTE(review): the name looks like a typo for MAX_WORDS; it is kept because
# transform() reads it under this spelling.
MAX_WORS = 150

In [None]:
# Tokenise first, then strip ASCII punctuation from the *tokenised* text.
# The second step must map over `sentences`: mapping over X again would throw
# the clean_text result away.
sentences = X.map(clean_text)
# Build the deletion table once instead of once per row.
punctuation_table = str.maketrans('', '', string.punctuation)
sentences = sentences.map(lambda x: x.translate(punctuation_table))

In [None]:
# Display the preprocessed texts.
sentences

7744     Все почалося з того що капітан Мур вирішив зіб...
19989    Тетяна і її син Темур\r\nНайдорожче розлучення...
45998    Конституційний Суд України позбавив Національн...
39936    Карантин локдаун став словом 2020 року Його ст...
26770    Ювілей станції Вернадський Українські вчені ві...
                               ...                        
43321    Протягом 1418 вересня Нацбанк продав на міжбан...
48603    У Кабміні вважають що після падіння обсягів ім...
43614    Сервіс доставки їжі UberEats 3 червня припиняє...
48605    Ринкова вартість криптовалюти Bitcoin сягнула ...
2686     Великі торговельні центри в столиці треба відк...
Name: text, Length: 25677, dtype: object

In [None]:
def transform(sent, max_words=None, dim=300):
  """Convert a text into a fixed-size matrix of word vectors.

  Words are taken in order from ``sent.split()``; words missing from
  ``embedding_dict`` are ignored. At most ``max_words`` known words are kept
  (the first ones), and shorter texts are padded with zero rows at the top.

  Args:
    sent: Whitespace-separated text.
    max_words: Number of rows in the result. Defaults to the module-level
      ``MAX_WORS`` when omitted.
    dim: Length of each word vector (300 by default).

  Returns:
    A float32 array of shape ``(max_words, dim)``. A text with no known
    words yields an all-zero matrix.
  """
  limit = MAX_WORS if max_words is None else max_words
  # float32 matches the dtype of the vectors stored in embedding_dict, so
  # padded and unpadded results share one dtype and stacking them later does
  # not upcast the whole dataset to float64.
  matrix = np.zeros((limit, dim), dtype="float32")
  vectors = []
  for word in sent.split():
    if len(vectors) >= limit:
      break
    vector = embedding_dict.get(word)
    if vector is not None:
      vectors.append(vector)
  if vectors:
    # Fill the bottom rows so that the zero padding stays in front.
    matrix[limit - len(vectors):] = np.stack(vectors, axis=0)
  return matrix


In [None]:
# Turn every text into its fixed-size embedding matrix, showing a progress bar.
embeddig_matrix = [transform(text) for text in tqdm(sentences)]

100%|██████████| 25677/25677 [00:09<00:00, 2699.78it/s]


In [None]:
# Stack the per-text matrices into one array, flatten each text's matrix into
# a single row, and attach the labels as a "source" column.
embeddig_matrix = np.stack(embeddig_matrix, axis=0)
n_texts = embeddig_matrix.shape[0]
embeddigs = pd.DataFrame(embeddig_matrix.reshape(n_texts, -1))
embeddigs["source"] = y

In [None]:
# Display the flattened embedding table together with its "source" column.
embeddigs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44991,44992,44993,44994,44995,44996,44997,44998,44999,source
0,0.289065,-0.302258,-0.067125,-0.142829,-0.571341,-0.247752,-0.647140,-0.315832,-0.309583,0.056869,...,-0.062587,0.220745,0.188114,-0.219599,0.187794,-0.533731,0.141170,-0.203350,0.103120,2
1,0.107319,-0.163065,-0.572483,-0.187558,0.308994,0.008620,-0.755999,0.303792,-0.714780,0.386464,...,0.182555,0.053285,0.197741,-0.467305,0.795056,-0.307584,0.037243,0.173997,-0.184379,3
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.293032,0.042476,0.620857,-0.338957,0.628428,-0.500935,-0.217840,0.239745,-0.007407,3
3,0.639948,0.139452,-0.255787,0.325295,-0.217746,0.383579,0.142693,-0.187305,0.103691,0.258715,...,-0.453204,0.287243,-0.354645,0.239500,-0.215203,-0.056409,0.205022,0.159679,-0.440074,2
4,-0.179089,-0.271164,-0.179615,-0.101294,-0.351352,-0.237466,-0.519836,0.181826,-0.415627,-0.777196,...,0.081289,-0.448468,-0.023365,-0.410670,-0.664822,-0.677202,0.071613,0.081858,-0.489330,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25672,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.133993,0.431885,-0.373035,-0.187254,-0.400546,0.641448,0.211965,-0.133053,0.394565,1
25673,-0.011475,-0.362595,0.400265,-0.096101,0.508484,0.030144,-0.127819,-0.825140,0.263326,0.011580,...,-0.009434,0.190915,-0.211210,0.443357,-0.651554,-0.401812,-0.788020,-0.321245,-0.436221,2
25674,-0.083649,-0.256452,-0.117438,0.246674,-1.133102,-0.857174,-0.106594,0.375665,0.361931,0.167915,...,-0.019849,0.060704,-0.539611,-0.321925,-0.482078,0.295565,-0.239073,0.281052,-0.128144,3
25675,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,-0.178413,0.102045,0.460402,0.688739,0.330359,-0.024408,-0.240008,0.013830,-0.491883,2


In [None]:
import pickle
from google.colab import drive
# Mount Google Drive so the result can be written to it.
drive.mount(r'/content/gdrive/')
# Directory prefix used when building the output file path.
path = "/content/gdrive/MyDrive/"

Mounted at /content/gdrive/


In [None]:
# Write the embedding DataFrame to Drive. The context manager closes the file
# even if pickle.dump raises part-way through.
with open(path + "embraw.pickle", "wb") as pickle_out:
    pickle.dump(embeddigs, pickle_out)