In [137]:
# Root directory of the annotated training corpus that process_directory() walks;
# process_file() expects tab-separated token/stem/lemma lines in each file.
# NOTE(review): this name is reassigned in later cells, so running cells out of
# order changes which directory it points to.
dataset_path = "../../nechkasova-tokenizer/assets/annotated-corpus/train"

In [138]:
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally before loading it.
nltk.download('stopwords')
# English stop words as a set; process_file() drops cleaned tokens found in it.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/onechkasova/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [139]:
import re
import string

def read_data(file_path):
    """Return every line of ``file_path`` as a list of strings.

    The file is decoded as ISO-8859-1 and each line keeps its line ending.
    """
    with open(file_path, encoding="ISO-8859-1") as handle:
        return list(handle)

def clean_text(text):
    """Remove every ``string.punctuation`` character from ``text`` and lowercase the rest."""
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))
    return without_punctuation.lower()

In [140]:
def process_file(file_path):
    """Collect the cleaned, non-stop-word tokens of one annotated file.

    Every non-blank line is expected to hold exactly three tab-separated
    fields (token, stem, lemma). Only the token is used: it is passed through
    ``clean_text`` and kept when the result is non-empty and not in
    ``stop_words``. Lines with any other number of fields are skipped.

    Args:
        file_path: Path of the file to read (via ``read_data``).

    Returns:
        list[str]: The kept tokens, in file order.
    """
    tokens = []

    for line in read_data(file_path):
        fields = line.strip().split('\t')
        # An explicit field-count check replaces the former bare ``except:``,
        # which also swallowed KeyboardInterrupt and genuine programming errors.
        # A blank line yields a single empty field and is rejected here too.
        if len(fields) != 3:
            continue

        cleaned_token = clean_text(fields[0])
        if cleaned_token and cleaned_token not in stop_words:
            tokens.append(cleaned_token)

    return tokens

In [141]:
import os

def process_directory(directory_path):
    """Run ``process_file`` on every file found below ``directory_path``.

    The tree is traversed with ``os.walk``; each visited path is printed
    before it is processed.

    Returns:
        list[list[str]]: One token list per file, in traversal order.
    """
    token_lists = []
    for current_dir, _subdirs, file_names in os.walk(directory_path):
        for file_name in file_names:
            path = os.path.join(current_dir, file_name)
            print(f"Process file {path}")
            token_lists.append(process_file(path))
    return token_lists

In [142]:
# One token list per corpus file; this list of lists is later passed to Word2Vec as `sentences`.
tokens = process_directory(dataset_path)
# print(len(tokens))

# Sanity check: show the tokens extracted from the first file that was visited.
print(tokens[0])

Process file ../../nechkasova-tokenizer/assets/annotated-corpus/train/talk.politics.mideast/75384.tsv
Process file ../../nechkasova-tokenizer/assets/annotated-corpus/train/talk.politics.mideast/75390.tsv
Process file ../../nechkasova-tokenizer/assets/annotated-corpus/train/talk.politics.mideast/75421.tsv
Process file ../../nechkasova-tokenizer/assets/annotated-corpus/train/talk.politics.mideast/76114.tsv
Process file ../../nechkasova-tokenizer/assets/annotated-corpus/train/talk.politics.mideast/76100.tsv
Process file ../../nechkasova-tokenizer/assets/annotated-corpus/train/talk.politics.mideast/76316.tsv
Process file ../../nechkasova-tokenizer/assets/annotated-corpus/train/talk.politics.mideast/76302.tsv
Process file ../../nechkasova-tokenizer/assets/annotated-corpus/train/talk.politics.mideast/75958.tsv
Process file ../../nechkasova-tokenizer/assets/annotated-corpus/train/talk.politics.mideast/76289.tsv
Process file ../../nechkasova-tokenizer/assets/annotated-corpus/train/talk.politic

In [143]:
from gensim.models import Word2Vec

# Train word embeddings on the per-file token lists produced above.
# vector_size=100 is the dimensionality of every word vector and therefore of the
# text vectors derived from them.
# NOTE(review): no seed is passed and workers=4, so the vectors (and the similarity
# values printed further down) presumably differ between runs — confirm whether
# reproducibility matters here.
model = Word2Vec(sentences=tokens, vector_size=100, window=5, min_count=1, workers=4)

In [144]:
def get_sentences(content):
    """Split raw text into stripped, non-empty sentence-like chunks.

    A boundary is either a whitespace character that follows '.', '?' or '!'
    (subject to the lookaround exclusions in the pattern, which appear aimed
    at abbreviation-like contexts), or a newline that is not preceded by a
    comma and not followed by an ASCII letter or digit.

    A chunk that starts with a capitalised ``Key: value`` shape is split
    further at every newline not preceded by a comma.

    Returns:
        list[str]: The resulting chunks in document order.
    """
    boundary_re = re.compile(r'(?<!\w\.\w.)(?<!\w\. \w.)(?<![A-Z][a-z]\.)(?<!\s\.\s)(?<=\.|\?|\!)\s(?![A-Z][A-Za-z]\.)(?!\w\. \w.)|(?<![,])\n(?![a-zA-Z0-9])')
    key_value_re = re.compile(r'[A-Z](\w+-)*\w+(\s\w+)*:[ ]*([\W\w]+[ ,])+(?!,\n)')
    line_break = r'(?<!,)\n'

    result = []
    for chunk in boundary_re.split(content):
        chunk = chunk.strip()
        if not chunk:
            continue
        if key_value_re.match(chunk):
            # Header-like chunk: break it into its individual lines.
            result.extend(re.split(line_break, chunk))
        else:
            result.append(chunk)
    return result

def tokenize_sentence(sentence):
    """Split ``sentence`` into lowercase tokens.

    Alternatives are tried in this order at each position:
      1. phone-like digit runs (optional '+', digits, dashes, parentheses, spaces);
      2. e-mail addresses;
      3. a title (Mr/Ms/Mrs/Dr/Prof/St) followed by '.', one whitespace and a capitalised word;
      4. clock times such as ``10:30 pm``;
      5. a run of word characters, or any single non-word, non-space character.

    Args:
        sentence: Text to tokenize.

    Returns:
        list[str]: Tokens in order of appearance, lowercased.
    """
    # The TLD class used to be ``[A-Z|a-z]``; inside a character class ``|`` is a
    # literal pipe, so "co|m" was accepted as a top-level domain.
    pattern = r'\+?\d[\d\-\(\)\s]{7,}\d' \
              r'|\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b' \
              r'|\b(?:Mr|Ms|Mrs|Dr|Prof|St)\.\s[A-Z][a-z]+' \
              r'|\b\d{1,2}:\d{2}\s?(?:[AaPp]\.?[Mm]\.?)\b' \
              r'|\w+|[^\w\s]'
    # Every group is non-capturing, so findall returns the full matches.
    return [token.lower() for token in re.findall(pattern, sentence)]

In [145]:
import numpy as np

def vectorize_text(text, model):
    """Embed ``text`` as the mean of its sentence vectors.

    The text is split with ``get_sentences``; each sentence is cleaned with
    ``clean_text`` and tokenized with ``tokenize_sentence``. A sentence vector
    is the mean of its word vectors, where a word missing from ``model.wv``
    contributes a zero vector (so unknown words pull the mean toward zero).
    Sentences that yield no tokens are ignored.

    Args:
        text: Raw text to embed.
        model: Trained model exposing ``wv`` (membership test, item lookup
            and ``vector_size``).

    Returns:
        numpy.ndarray: 1-D vector of length ``model.wv.vector_size``. A zero
        vector is returned when the text produces no tokens at all; the mean
        of an empty list would otherwise be a NaN scalar that callers cannot
        iterate over.
    """
    # Take the dimensionality from the model instead of hard-coding 100, so the
    # zero-vector fallback stays valid if the model is retrained with another size.
    dim = model.wv.vector_size
    sentence_vectors = []

    for sentence in get_sentences(text):
        tokens = tokenize_sentence(clean_text(sentence))
        if not tokens:
            continue
        word_vectors = [model.wv[word] if word in model.wv else np.zeros(dim) for word in tokens]
        sentence_vectors.append(np.mean(word_vectors, axis=0))

    if not sentence_vectors:
        return np.zeros(dim)

    return np.mean(sentence_vectors, axis=0)

In [146]:
# Smoke test: vectorize a short sample string with the trained model.
text = "programs applications developing. Sentence"
# NOTE(review): file_tsv_content is not used anywhere in the visible notebook.
file_tsv_content = {}

vector = vectorize_text(text, model)
# The length should equal the model's vector size.
print(len(vector))
print(vector)

100
[-0.13126607  0.8730702  -0.06724253 -0.885519    0.08366035 -0.30814037
  0.35347325  0.903188   -0.16618164  0.36020273 -0.7230122  -0.68214893
 -0.6995809  -0.37656865 -0.41267174 -0.03892402 -0.4163221   0.25145212
  0.07673503 -0.16314605  0.9750533   0.09192957  0.6985528  -0.32583922
 -0.6530629   0.65714455  1.0465605  -0.56862974  0.21398452 -0.22796029
  0.19867368 -0.3149768  -0.14391652  0.18632062  0.3421592   0.24810258
 -0.03822811 -0.5836861   0.19549377 -1.2125963   0.95326644 -0.6653902
 -0.42790714 -0.10863309  0.40729508  0.0820406  -0.9431194  -0.0998101
 -0.5804617   0.02275422 -0.5076728  -0.47001517  0.15881845 -0.04672308
 -0.6333916  -0.14354447  0.3217734  -0.22995335 -0.96305084 -0.31937885
  0.5656536   0.36967504 -0.19960733  0.10369359 -0.21936794  0.32454956
  0.4123906   0.09131682 -0.10854468 -0.17188214 -1.2623756  -0.4196571
  0.12608686 -0.2406549  -0.22845912  0.16012797 -0.07090542 -0.27815485
  0.03591631  0.6676295   0.25190914 -0.63243634 -

programs = applications, developing (programming)
interface, buggy, functions, UNIX, code, signals, Internet
people, Canada, sport, tatoo


email = phone, FAX
address, message, contact
team, games, baseball

lecture = seminar, courses
education, laboratories, conference
situation, comments, season

In [147]:
from scipy.spatial.distance import cosine

def cosine_similarity(vector_a, vector_b):
    """Return the cosine similarity of two vectors, i.e. 1 minus SciPy's cosine distance."""
    cosine_distance = cosine(vector_a, vector_b)
    return 1 - cosine_distance

def calculate_distances(word, similar_words, domain_words, different_words, model, cosine_similarity_function):
    """Score ``word`` against three labelled groups of comparison words.

    Each comparison word that is present in ``model.wv`` is scored with
    ``cosine_similarity_function(vector_of_word, vector_of_candidate)``;
    candidates missing from ``model.wv`` are skipped silently. ``word`` itself
    is looked up without a membership check.

    Returns:
        list[tuple]: ``(group_label, candidate, score)`` triples ordered by
        score, highest first. Despite the function name, the scores are
        whatever the supplied similarity function returns.
    """
    reference = model.wv[word]
    labelled_groups = (
        ('Похожие слова', similar_words),
        ('Слова из той же области', domain_words),
        ('Совершенно другие слова', different_words),
    )

    scored = [
        (label, candidate, cosine_similarity_function(reference, model.wv[candidate]))
        for label, candidates in labelled_groups
        for candidate in candidates
        if candidate in model.wv
    ]
    scored.sort(key=lambda entry: entry[2], reverse=True)
    return scored

In [148]:
# Probe word 'programs': near-synonyms, same-domain words and unrelated words.
similar_words = ["applications", "developing"]
# NOTE(review): 'UNIX', 'Internet' and 'Canada' never show up in the printed results.
# calculate_distances() skips words absent from model.wv, and the training tokens
# were lowercased by clean_text() — presumably these should be given in lowercase.
domain_words = ["interface", "buggy", "functions", "UNIX", "code", "signals", "Internet"]
different_words = ["people", "Canada", "sport", "tatoo"]

distances = calculate_distances('programs', similar_words, domain_words, different_words, model, cosine_similarity)

# Results are already sorted by similarity, highest first.
for group, word, distance in distances:
    print(f"{group}: {word} - Косинусное сходство: {distance:.4f}")

Похожие слова: applications - Косинусное сходство: 0.9297
Слова из той же области: interface - Косинусное сходство: 0.8781
Слова из той же области: functions - Косинусное сходство: 0.8515
Слова из той же области: code - Косинусное сходство: 0.7499
Похожие слова: developing - Косинусное сходство: 0.7332
Слова из той же области: signals - Косинусное сходство: 0.7328
Слова из той же области: buggy - Косинусное сходство: 0.7085
Совершенно другие слова: sport - Косинусное сходство: 0.6177
Совершенно другие слова: people - Косинусное сходство: 0.3908
Совершенно другие слова: tatoo - Косинусное сходство: 0.3477


In [150]:
# Probe word 'email'.
# NOTE(review): the notes cell lists 'phone' as a similar word and 'address' as a
# same-domain word; here the two are swapped — confirm which grouping is intended.
# NOTE(review): 'FAX' is absent from the printed results; calculate_distances() skips
# words missing from model.wv, presumably because the vocabulary is lowercase.
similar_words = ["address", "FAX"]
domain_words = ["phone", "message", "contact"]
different_words = ["team", "games", "baseball"]

distances = calculate_distances('email', similar_words, domain_words, different_words, model, cosine_similarity)

# Results are already sorted by similarity, highest first.
for group, word, distance in distances:
    print(f"{group}: {word} - Косинусное сходство: {distance:.4f}")

Слова из той же области: contact - Косинусное сходство: 0.9125
Похожие слова: address - Косинусное сходство: 0.8907
Слова из той же области: phone - Косинусное сходство: 0.7829
Слова из той же области: message - Косинусное сходство: 0.7118
Совершенно другие слова: baseball - Косинусное сходство: 0.2654
Совершенно другие слова: games - Косинусное сходство: 0.1018
Совершенно другие слова: team - Косинусное сходство: 0.0334


In [152]:
# Probe word 'lecture', scored with the SciPy-based cosine_similarity().
similar_words = ["seminar", "courses"]
domain_words = ["education", "laboratories", "conference"]
different_words = ["situation", "comments", "season"]

distances = calculate_distances('lecture', similar_words, domain_words, different_words, model, cosine_similarity)

# Results are already sorted by similarity, highest first.
for group, word, distance in distances:
    print(f"{group}: {word} - Косинусное сходство: {distance:.4f}")

Похожие слова: seminar - Косинусное сходство: 0.8976
Похожие слова: courses - Косинусное сходство: 0.8141
Слова из той же области: laboratories - Косинусное сходство: 0.7191
Слова из той же области: education - Косинусное сходство: 0.6684
Совершенно другие слова: situation - Косинусное сходство: 0.5917
Совершенно другие слова: comments - Косинусное сходство: 0.5840
Слова из той же области: conference - Косинусное сходство: 0.5772
Совершенно другие слова: season - Косинусное сходство: 0.4353


In [153]:
import numpy as np

def cosine_similarity_manual(vector_a, vector_b):
    """Cosine similarity computed directly with NumPy: dot(a, b) / (|a| * |b|).

    No guard is applied for zero-length vectors.
    """
    norm_product = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    return np.dot(vector_a, vector_b) / norm_product

In [154]:
# Same 'lecture' probe as the earlier cell, but scored with cosine_similarity_manual()
# so the hand-written formula can be compared against the SciPy-based result.
similar_words = ["seminar", "courses"]
domain_words = ["education", "laboratories", "conference"]
different_words = ["situation", "comments", "season"]

distances = calculate_distances('lecture', similar_words, domain_words, different_words, model, cosine_similarity_manual)

# Results are already sorted by similarity, highest first.
for group, word, distance in distances:
    print(f"{group}: {word} - Косинусное сходство: {distance:.4f}")

Похожие слова: seminar - Косинусное сходство: 0.8976
Похожие слова: courses - Косинусное сходство: 0.8141
Слова из той же области: laboratories - Косинусное сходство: 0.7191
Слова из той же области: education - Косинусное сходство: 0.6684
Совершенно другие слова: situation - Косинусное сходство: 0.5917
Совершенно другие слова: comments - Косинусное сходство: 0.5840
Слова из той же области: conference - Косинусное сходство: 0.5772
Совершенно другие слова: season - Косинусное сходство: 0.4353


In [165]:
def read_content(file_path):
    """Read a whole file as ISO-8859-1 text.

    Args:
        file_path: Path of the file to read.

    Returns:
        tuple[str, str] | None: ``(content, filename)`` on success, where
        ``filename`` is the last path component. On any failure a message is
        printed and ``None`` is returned, so callers must check for ``None``
        before unpacking the result.
    """
    # os.path.basename honours the platform's path separator; splitting on '/'
    # returned the whole path for backslash-separated paths.
    filename = os.path.basename(file_path)
    try:
        with open(file_path, encoding="ISO-8859-1") as file:
            return file.read(), filename
    except Exception as e:
        # Deliberately best-effort: report the problem and let the caller decide.
        print(f"Couldn't read file {filename}: {e}")
        return None

def vectorize_file(file_path):
    """Build one TSV row (file name followed by vector components) for a file.

    The content is read with ``read_content`` and embedded with
    ``vectorize_text`` using the notebook-level ``model``.

    Args:
        file_path: Path of the document to vectorize.

    Returns:
        str | None: ``"<filename>\\t<c1>\\t<c2>..."``, or ``None`` when
        ``read_content`` could not read the file.
    """
    loaded = read_content(file_path)
    if loaded is None:
        # read_content returns None (not a tuple) on failure and has already
        # printed the reason; unpacking it here used to raise TypeError.
        return None

    content, filename = loaded
    vector = vectorize_text(content, model)

    vector_str = "\t".join(str(component) for component in vector)
    return f"{filename}\t{vector_str}"

def vectorize_directory(directory_path, file_name):
    """Vectorize every file under ``directory_path`` and write the rows to a TSV file.

    The output goes to ``../assets/annotated-corpus/<file_name>.tsv`` (relative
    to the working directory); the directory is created if needed. Rows are
    separated by newlines, with no trailing newline. Files that cannot be read
    are skipped.

    Args:
        directory_path: Root of the directory tree to walk.
        file_name: Output file name without the ``.tsv`` extension.
    """
    all_lines = []
    for root, dirs, files in os.walk(directory_path):
        for file in files:
            line = vectorize_file(os.path.join(root, file))
            # Unreadable files produce None; skip them so the final join cannot fail.
            if line is not None:
                all_lines.append(line)

    tsv_filepath = os.path.join('..', 'assets', 'annotated-corpus', file_name + '.tsv')

    os.makedirs(os.path.dirname(tsv_filepath), exist_ok=True)

    # An explicit encoding keeps the output independent of the platform default.
    with open(tsv_filepath, 'w', encoding='utf-8') as f:
        f.write("\n".join(all_lines))

In [166]:
# Vectorize every file under the 20news-bydate-test directory; the rows are written
# to ../assets/annotated-corpus/test.tsv.
# NOTE(review): this overwrites the dataset_path defined at the top of the notebook.
dataset_path = "../../dataset/20news-bydate-test"

vectorize_directory(dataset_path, 'test')

In [167]:
# Vectorize every file under the 20news-bydate-train directory; the rows are written
# to ../assets/annotated-corpus/train.tsv.
# NOTE(review): dataset_path is reassigned once more here.
dataset_path = "../../dataset/20news-bydate-train"

vectorize_directory(dataset_path, 'train')