This notebook converts raw text into token files: it preprocesses the text, trains SentencePiece models, and applies them to tokenize the data.

In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile
import os
import csv
import re
import sentencepiece as spm
import time
import logging
import emoji
import json
import copy

from spm_train import spm_train
from spm_apply import spm_apply
from data_convert import clean_input_text, convert_into_train_file

In [2]:
# Show up to 150 characters per cell so long sentences stay readable in DataFrame previews.
pd.set_option("display.max_colwidth", 150)

Additional data loading

In [3]:
# Collect the rows of every tab-separated file found under dataset/.
# Entries are sorted so the row order does not depend on the OS directory listing,
# and non-file entries (e.g. a .ipynb_checkpoints folder) are skipped instead of
# crashing open().
list_of_files = sorted(os.listdir("dataset/"))
text_file = []
for file_name in list_of_files:
    file_path = os.path.join("dataset/", file_name)
    if not os.path.isfile(file_path):
        continue
    # newline="" lets the csv module do its own line-ending handling, as its docs require.
    with open(file_path, "r", encoding="utf-8", newline="") as source:
        text_file.extend(csv.reader(source, delimiter="\t"))

print(len(text_file))

114239


Keep only the texts that are longer than 30 characters.

In [4]:
# Flatten the parsed rows and keep only the cells longer than 30 characters.
texts = [
    cell
    for row in text_file
    if row != []
    for cell in row
    if len(cell) > 30
]
text_df = pd.DataFrame(texts, columns=['text'])

Clean the texts

In [5]:
def clean_input_text(df, column_name: str):
    """Return the values of one DataFrame column as lower-cased strings.

    NOTE(review): this definition shadows the ``clean_input_text`` imported from
    ``data_convert`` in the import cell -- confirm which one is intended.

    Args:
        df: pandas DataFrame holding the texts.
        column_name: name of the column to read.

    Returns:
        list[str]: one lower-cased string per row, in row order.

    Raises:
        KeyError: if ``column_name`` is not a column of ``df``.
    """
    # Coerce to str *before* lower-casing, so that non-string cells (numbers,
    # NaN) are converted instead of raising AttributeError on ``.lower()``.
    return [str(text).lower() for text in df[column_name]]

In [6]:
# Lower-case every collected sentence.
clean_text = clean_input_text(
    text_df,
    column_name='text',
)

In [7]:
# Convert the cleaned texts to a NumPy array; the shape is shown as the cell output.
clean_text_arr = np.array(clean_text)
np.shape(clean_text_arr)

(92359,)

In [32]:
# Turn the flat array into a column: one row per text, a single element per row.
clean_text_array = clean_text_arr.reshape((len(clean_text_arr), 1))

In [14]:
# Write the training file: one text per line.
with open('spm_train_file.txt', "w", encoding='utf-8') as output:
    # Each row of clean_text_array is joined with spaces and newline-terminated.
    output.writelines(" ".join(row) + "\n" for row in clean_text_array)

Train SentencePiece (BPE) models on the additional dataset with different vocabulary sizes (10,000, 30,000 and 50,000)

In [25]:
# Opening/closing markers of the first (e1) and second (e2) entity.
ADDITIONAL_SPECIAL_TOKENS = (
    "<e1>",
    "</e1>",
    "<e2>",
    "</e2>",
)

In [None]:
# BPE model with a 10000-entry vocabulary.
spm_train(
    input_file='spm_train_file.txt',
    model_type='bpe',
    vocab_size='10000',
    model_prefix='model_bpe_dip_10000',
)

In [None]:
# BPE model with a 30000-entry vocabulary.
spm_train(
    input_file='spm_train_file.txt',
    model_type='bpe',
    vocab_size='30000',
    model_prefix='model_bpe_dip_30000',
)

In [26]:
# BPE model with a 50000-entry vocabulary.
spm_train(
    input_file='spm_train_file.txt',
    model_type='bpe',
    vocab_size='50000',
    model_prefix='model_bpe_dip_50000',
)

Model: model_bpe_dip_50000
Time: 24.254483222961426


'The model was trained saccessfuly'

In [3]:
def read_examples_from_file(data_dir):
    """Read texts and labels from a tab-separated text file.

    Every line is stripped and split on tabs. A line with exactly two fields is
    read as ``text<TAB>label``; any other line keeps its first field as the text
    and receives the label ``"NONE"`` (this includes blank lines, which yield an
    empty text).

    Args:
        data_dir: path of the file to read (a file path, despite the name).

    Returns:
        tuple[list[str], list[str]]: the texts and their labels, in file order;
        both lists always have the same length.
    """
    texts = []
    labels = []
    with open(data_dir, "r", encoding="utf-8") as f:
        # Iterate the handle lazily instead of materialising f.readlines().
        for line in f:
            fields = line.strip().split("\t")
            texts.append(fields[0])
            labels.append(fields[1] if len(fields) == 2 else "NONE")

    return texts, labels

In [4]:
# Load the three splits; every call returns a (texts, labels) pair.
train_data = read_examples_from_file(
    'data/train_balanced.csv'
)
eval_data = read_examples_from_file(
    'data/eval_balanced.csv'
)
test_data = read_examples_from_file(
    'data/test_file.csv'
)

# Two classes.
labels = [0, 1]
num_labels = len(labels)

In [5]:
# Wrap the training texts in a one-column DataFrame and lower-case them.
# NOTE(review): train_text_ is not read by any later cell visible here -- confirm it is still needed.
(train_text, train_labels) = train_data
train_text = pd.DataFrame(
    train_text,
    columns=['text'],
)
train_text_ = clean_input_text(
    train_text,
    column_name='text',
)

Tokenization and vectorization with SentencePiece

In [8]:
# Apply the SentencePiece model to the training texts.
# NOTE(review): the training cells use the prefixes model_bpe_dip_10000/30000/50000;
# confirm that 'model_bpe_dip.model' is the intended model file.
(train_text, train_labels) = train_data

(train_tokens, train_vectors) = spm_apply(
    train_text,
    'model_bpe_dip.model',
)

Time: 0.08305883407592773
List with subword was written successfuly!
Len of input_text: 140
Len of subword_tokens_list: 140



In [9]:
# Apply the SentencePiece model to the evaluation texts.
(eval_text, eval_labels) = eval_data

(eval_tokens, eval_vectors) = spm_apply(
    eval_text,
    'model_bpe_dip.model',
)

Time: 0.04002809524536133
List with subword was written successfuly!
Len of input_text: 123
Len of subword_tokens_list: 123



In [10]:
# Apply the SentencePiece model to the test texts.
(test_text, test_labels) = test_data

(test_tokens, test_vectors) = spm_apply(
    test_text,
    'model_bpe_dip.model',
)

Time: 0.02601766586303711
List with subword was written successfuly!
Len of input_text: 74
Len of subword_tokens_list: 74



In [11]:
# Inspect one example: its subword tokens and the container type of its ids.
print(train_tokens[11], type(train_vectors[11]), sep="\n")

['▁', '<e1>', 'система', '</e1>', '▁предназначена', '▁для', '▁', '<e2>', 'анализа', '▁множества', '▁ген', 'ом', 'ов', '</e2>', '▁и', '▁их', '▁классификации', ',', '▁а', '▁именно', '▁для', '▁анализа', '▁ген', 'оти', 'пов', '▁внутри', '▁одного', '▁вида', '▁жи', 'вых', '▁органи', 'з', 'мов', ',', '▁поскольку', '▁методы', '▁направл', 'ены', '▁для', '▁выделения', '▁различий', '▁ген', 'ом', 'ов', ',', '▁имеющих', '▁схо', 'ж', 'ую', '▁структуру', '.']
<class 'list'>


Extract the marked words from the tokenized text

In [12]:
def get_words_vectors(data_vectors, vectors_ot_tokens, marker_ids=(3, 4, 5, 6)):
    """Extract the items enclosed by two marker pairs in every row.

    For each row of ``data_vectors`` the positions of the four marker ids are
    located; the items of the same-index row of ``vectors_ot_tokens`` that lie
    strictly between the opening and the closing marker of each pair are
    collected.

    Marker positions are resolved independently for every row. Previously they
    were appended to lists shared by all rows and looked up by row number, so a
    single row with a missing or repeated marker raised ``IndexError`` or
    silently shifted the spans of every following row.

    Args:
        data_vectors: sequence of rows of ids in which the markers are searched.
        vectors_ot_tokens: sequence of rows aligned with ``data_vectors`` from
            which the enclosed items are taken (pass ``data_vectors`` again to
            get ids, or the token rows to get tokens).
        marker_ids: ids of (first-open, first-close, second-open, second-close).
            Defaults to the previously hard-coded ``(3, 4, 5, 6)`` --
            presumably the vocabulary ids of <e1>, </e1>, <e2>, </e2>; confirm
            against the trained model.

    Returns:
        tuple[list[list], list[list]]: per row, the items inside the first pair
        and the items inside the second pair. A pair whose opening or closing
        marker is absent from the row yields an empty list; when a marker
        occurs more than once, its first occurrence is used.
    """
    open1, close1, open2, close2 = marker_ids

    def _first_position(row, marker):
        # Index of the first occurrence of ``marker`` in ``row``, or None.
        return next((pos for pos, item in enumerate(row) if item == marker), None)

    def _between(source_row, start, end):
        # Items strictly between ``start`` and ``end``; empty if a marker is missing.
        if start is None or end is None:
            return []
        return [source_row[pos] for pos in range(start + 1, end)]

    f_w = []
    s_w = []
    for idx, row in enumerate(data_vectors):
        source_row = vectors_ot_tokens[idx]
        f_w.append(
            _between(source_row, _first_position(row, open1), _first_position(row, close1))
        )
        s_w.append(
            _between(source_row, _first_position(row, open2), _first_position(row, close2))
        )
    return f_w, s_w

In [13]:
# For every split, extract the first/second marked word twice:
# once as ids (ids passed as both arguments) and once as subword tokens.
(f_w_vectors_train, s_w_vectors_train) = get_words_vectors(
    train_vectors, train_vectors
)
(f_w_tokens_train, s_w_tokens_train) = get_words_vectors(
    train_vectors, train_tokens
)

(f_w_vectors_eval, s_w_vectors_eval) = get_words_vectors(
    eval_vectors, eval_vectors
)
(f_w_tokens_eval, s_w_tokens_eval) = get_words_vectors(
    eval_vectors, eval_tokens
)

(f_w_vectors_test, s_w_vectors_test) = get_words_vectors(
    test_vectors, test_vectors
)
(f_w_tokens_test, s_w_tokens_test) = get_words_vectors(
    test_vectors, test_tokens
)

In [14]:
# One row per training example: text, label, subword tokens and ids.
train_text_df = pd.DataFrame(train_text, columns=['text']).assign(
    label=train_labels,
    tokens=train_tokens,
    vectors=train_vectors,
)

In [15]:
# One row per evaluation example: text, label, subword tokens and ids.
eval_text_df = pd.DataFrame(eval_text, columns=['text']).assign(
    label=eval_labels,
    tokens=eval_tokens,
    vectors=eval_vectors,
)

In [16]:
# One row per test example: text, label, subword tokens and ids.
test_text_df = pd.DataFrame(test_text, columns=['text']).assign(
    label=test_labels,
    tokens=test_tokens,
    vectors=test_vectors,
)

In [32]:
# Persist the three frames as comma-separated files without the index column.
# NOTE(review): the test file carries the suffix _3 while train/eval use _5 -- confirm this is intended.
train_text_df.to_csv(
    'train_data_with_vectors_5.csv',
    sep=',',
    index=False,
)
eval_text_df.to_csv(
    'eval_data_with_vectors_5.csv',
    sep=',',
    index=False,
)
test_text_df.to_csv(
    'test_data_with_vectors_3.csv',
    sep=',',
    index=False,
)

Split the first and second words into Is-A and Other relations

In [17]:
# Partition the first words of the eval split by label ('1' vs. everything else).
isa_f = [
    pieces
    for pieces, label in zip(f_w_tokens_eval, eval_labels)
    if label == '1'
]
nonisa_f = [
    pieces
    for pieces, label in zip(f_w_tokens_eval, eval_labels)
    if label != '1'
]

# Glue the subword pieces back together (label-'1' words first, then the rest):
# commas are dropped and the SentencePiece space marker becomes a plain space.
new_f = [
    ["".join(pieces).replace(',', '').replace('▁', ' ')]
    for pieces in isa_f + nonisa_f
]

In [18]:
# Partition the second words of the eval split by label ('1' vs. everything else).
isa_s = [
    pieces
    for pieces, label in zip(s_w_tokens_eval, eval_labels)
    if label == '1'
]
nonisa_s = [
    pieces
    for pieces, label in zip(s_w_tokens_eval, eval_labels)
    if label != '1'
]

# Glue the subword pieces back together (label-'1' words first, then the rest):
# commas are dropped and the SentencePiece space marker becomes a plain space.
new_s = [
    ["".join(pieces).replace(',', '').replace('▁', ' ')]
    for pieces in isa_s + nonisa_s
]

In [24]:
# Put the reassembled first ('f') and second ('s') words side by side.
new_f = np.array(new_f)
new_s = np.array(new_s)
dsf = pd.DataFrame(new_f, columns=['f']).assign(s=new_s)

In [24]:
# Save the eval word pairs as a comma-separated file without the index column.
dsf.to_csv(
    'words_eval.csv',
    sep=',',
    index=False,
)

In [57]:
def df_words(f_w, s_w):
    """Build a two-column DataFrame holding the first and second words.

    Args:
        f_w: sequence whose i-th element holds the pieces (ids or tokens) of
            the first word of row i.
        s_w: the same for the second word; must be as long as ``f_w``.

    Returns:
        pandas.DataFrame: object columns ``'f_w'`` and ``'s_w'``, one row per
        input element, every cell holding the corresponding element unchanged
        (a single-piece word stays a one-element list).

    Raises:
        ValueError: if ``f_w`` and ``s_w`` differ in length.
    """
    first_words = list(f_w)
    second_words = list(s_w)
    if len(first_words) != len(second_words):
        raise ValueError(
            "f_w and s_w must have the same length, got "
            f"{len(first_words)} and {len(second_words)}"
        )
    # Build object columns directly. Going through np.array() triggers NumPy's
    # ragged-sequence deprecation (an error on recent versions) when the words
    # have different numbers of pieces, and yields a 2-D array that cannot be
    # reshaped to (n, 1) when they all happen to have the same number.
    return pd.DataFrame({
        'f_w': pd.Series(first_words, dtype=object),
        's_w': pd.Series(second_words, dtype=object),
    })

In [58]:
# Word-pair frames holding ids, one per split.
words_vectors_train = df_words(
    f_w_vectors_train, s_w_vectors_train
)
words_vectors_eval = df_words(
    f_w_vectors_eval, s_w_vectors_eval
)
words_vectors_test = df_words(
    f_w_vectors_test, s_w_vectors_test
)

# Word-pair frames holding subword tokens, one per split.
words_tokens_train = df_words(
    f_w_tokens_train, s_w_tokens_train
)
words_tokens_eval = df_words(
    f_w_tokens_eval, s_w_tokens_eval
)
words_tokens_test = df_words(
    f_w_tokens_test, s_w_tokens_test
)

  
  This is separate from the ipykernel package so we can avoid doing imports until


In [59]:
# Save the id-based word pairs (comma-separated, no index column).
words_vectors_train.to_csv(
    'words_vectors_train.csv', sep=',', index=False
)
words_vectors_eval.to_csv(
    'words_vectors_eval.csv', sep=',', index=False
)
words_vectors_test.to_csv(
    'words_vectors_test.csv', sep=',', index=False
)

# Save the token-based word pairs (comma-separated, no index column).
words_tokens_train.to_csv(
    'words_tokens_train.csv', sep=',', index=False
)
words_tokens_eval.to_csv(
    'words_tokens_eval.csv', sep=',', index=False
)
words_tokens_test.to_csv(
    'words_tokens_test.csv', sep=',', index=False
)