<font size="5">2. Simple Token-Based Search Using Lemmatization and Stemming. This task involves creating a simple token-based search using lemmatization and stemming techniques. Below is a Python function template that takes a user's input and returns the most relevant sentence from a set of 50 sentences.</font>

In [39]:
banking_sentences = []
with open('banking_sentences.txt', 'r', encoding='utf-8') as file:
    banking_sentences = file.readlines()

In [40]:
# Punctuation and newlines
import string
exclude = set(string.punctuation)
exclude.add('\n')

def remove_punctuation(text):
    return ''.join(char for char in text if char not in exclude)

In [41]:
# stopwords removing
import nltk
nltk.download('stopwords')

from stop_words import get_stop_words
from nltk.corpus import stopwords

stop_words_en = list(get_stop_words('en'))        
nltk_words_en = list(stopwords.words('english')) 
stop_words_ru = list(get_stop_words('ru'))         
nltk_words_ru = list(stopwords.words('russian')) 

stop_words_en = set(get_stop_words('en')) | set(stopwords.words('english'))
stop_words_ru = set(get_stop_words('ru')) | set(stopwords.words('russian'))

stop_words_en_ru = set(stop_words_en) | set(stop_words_ru)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
def remove_stopwords(text):
    return ' '.join(word for word in text.split() if word.lower() not in stop_words_en_ru)


In [43]:
# Lemmatizers
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

en_lemmatizer = WordNetLemmatizer()

from pymystem3 import Mystem
ru_lemmatizer = Mystem()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [44]:
# Stemmers
from nltk.stem import PorterStemmer
en_stemmer = PorterStemmer()

from nltk.stem.snowball import SnowballStemmer
ru_stemmer = SnowballStemmer("russian")

In [45]:
# Language detector
from lingua import Language, LanguageDetectorBuilder
languages = [Language.RUSSIAN, Language.ENGLISH]
lang_detector = LanguageDetectorBuilder.from_languages(*languages).build()

In [46]:
# Tokenizer
def nltk_tokenize(text):
    words = nltk.word_tokenize(text)
    return words

In [47]:
ru_lemmatizer.lemmatize('кошек')

['кошка', '\n']

In [48]:
# Lemmatized sentences
# with open('banking_sentences_lemmatized.txt', 'a', encoding='utf-8') as file:
#     file.truncate(0)
#     for index, sentence in enumerate(banking_sentences):
#         tokenized_sentence = nltk_tokenize(remove_punctuation(remove_stopwords(sentence)))
#         root_sentence = []
#         for word in tokenized_sentence:
#             detected_lang = lang_detector.detect_language_of(word.lower())
#             if detected_lang == Language.ENGLISH:
#                 root_sentence.append(en_lemmatizer.lemmatize(word.lower()))
#             elif detected_lang == Language.RUSSIAN:
#                 root_sentence.append(ru_lemmatizer.lemmatize(word.lower())[0])

#         print(index, "/", len(banking_sentences), ": ", root_sentence)
#         file.write(' '.join(root_sentence) + '\n')

In [49]:
# Stemmed sentences
# with open('banking_sentences_stemmed.txt', 'a', encoding='utf-8') as file:
#     file.truncate(0)
#     for index, sentence in enumerate(banking_sentences):
#         tokenized_sentence = nltk_tokenize(remove_punctuation(remove_stopwords(sentence)))
#         root_sentence = []
#         for word in tokenized_sentence:
#             detected_lang = lang_detector.detect_language_of(word.lower())
#             if detected_lang == Language.ENGLISH:
#                 root_sentence.append(en_stemmer.stem(word.lower()))
#             elif detected_lang == Language.RUSSIAN:
#                 root_sentence.append(ru_stemmer.stem(word.lower()))

#         print(index, "/", len(banking_sentences), ": ", root_sentence)
#         file.write(' '.join(root_sentence) + '\n')

In [50]:
def preprocess(sentence, approach='lemmatize'):
    root_sentence = []
    if approach == 'lemmatize':
        tokenized_sentence = nltk_tokenize(remove_punctuation(remove_stopwords(sentence)))
        for word in tokenized_sentence:
            detected_lang = lang_detector.detect_language_of(word.lower())
            if detected_lang == Language.ENGLISH:
                root_sentence.append(en_lemmatizer.lemmatize(word.lower()))
            elif detected_lang == Language.RUSSIAN:
                root_sentence.append(ru_lemmatizer.lemmatize(word.lower())[0])
    elif approach == 'stemming':
        tokenized_sentence = nltk_tokenize(remove_punctuation(remove_stopwords(sentence)))
        root_sentence = []
        for word in tokenized_sentence:
            detected_lang = lang_detector.detect_language_of(word.lower())
            if detected_lang == Language.ENGLISH:
                root_sentence.append(en_stemmer.stem(word.lower()))
            elif detected_lang == Language.RUSSIAN:
                root_sentence.append(ru_stemmer.stem(word.lower())) 
    print(root_sentence)         
    return root_sentence

In [51]:
banking_sentences_lemmatized = [line.split() for line in open('banking_sentences_lemmatized.txt', 'r', encoding='utf-8').readlines()]
banking_sentences_stemmed = [line.split() for line in open('banking_sentences_stemmed.txt', 'r', encoding='utf-8').readlines()]

In [52]:
def search(sentence, approach='lemmatize'):
    root_sentence_set = set(preprocess(sentence, approach))
    base = banking_sentences_lemmatized if approach == 'lemmatize' else banking_sentences_stemmed

    match_scores = [len(root_sentence_set.intersection(base_sentence)) for base_sentence in base]

    max_score = max(match_scores)

    max_index = match_scores.index(max_score)

    print('Approach: ', approach)
    print(banking_sentences[max_index])
    print("Max score: ", max_score)

    return max_score, max_index

In [53]:
query = 'сберегательные счета банка'
search(query, 'lemmatize')

['сберегательный', 'счет', 'банка']
Approach:  lemmatize
Банк предлагает разнообразные сберегательные счета с разными процентными ставками, чтобы удовлетворить индивидуальные потребности.

Max score:  2


(2, 51)

In [54]:
search(query, 'stemming')

['сберегательн', 'счет', 'банк']
Approach:  stemming
Банк предлагает разнообразные сберегательные счета с разными процентными ставками, чтобы удовлетворить индивидуальные потребности.

Max score:  3


(3, 51)

<font size="5">3. Text Cleaning by Removing Blacklist Words or Phrases</font>
 

In [86]:
from nltk.util import bigrams
from difflib import SequenceMatcher

blacklist = ['nigga', 'uzbek', 'гей', 'Токаев лучший', 'Путин', 'Токаев Тигр']

def clean_text(text, threshold=0.6):
    # tokens = nltk_tokenize(remove_punctuation(remove_stopwords(text.lower())))
    # bigram_list = list(bigrams(tokens))

    split_text = text.split(' ')
    hide_indices = []
    for i in range(len(split_text)):
        match_scores = [SequenceMatcher(None, split_text[i], bad_word).ratio() for bad_word in blacklist]
        if max(match_scores) > threshold and split_text[i] not in stop_words_en_ru:
            hide_indices.append((i, i))

    for i in range(len(split_text) - 1):
        match_scores_bigram = [SequenceMatcher(None, split_text[i] + split_text[i + 1], bad_word).ratio() for bad_word in blacklist]
        if max(match_scores_bigram) > threshold and split_text[i] not in stop_words_en_ru and split_text[i + 1] not in stop_words_en_ru:
            hide_indices.append((i, i + 1))

    for i, j in hide_indices:
        split_text[i] = '*' * len(split_text[i])
        split_text[j] = '*' * len(split_text[j])

    return ' '.join(split_text)

# Example usage
sample_text = "Here is a sample text including the word Путин is niggers and гей and Токаев Тигр uzbek."
print(clean_text(sample_text))

Here is a sample text including the **** ***** is ******* and *** and ****** **** ******
