### NLP2_1 https://www.hackerrank.com/challenges/detect-the-email-addresses/problem?isFullScreen=true

In [75]:
import regex


def print_matches(pattern, file_path):
    matches = set()
    with open(file_path, encoding="utf-8") as file:
        for line in file:
            matches.update(regex.findall(pattern, line))
    for match in matches:
        print(match)

In [76]:
s = '[a-zA-Z0-9_.]*'
pattern = regex.compile(rf'{s}@{s}')
print_matches(pattern, "./text1.txt")

interviewstreet@hackerrank.com
hackers@hackerrank.com
product@hackerrank.com


### NLP2_2 https://www.hackerrank.com/challenges/detect-the-domain-name/problem?isFullScreen=true

In [77]:
pattern = regex.compile(r'https?:\/\/(?:www.|ww2.)?([^\/]*)\/')
print_matches(pattern, "./text2.txt")

askoxford.com
bnsf.com
hydrogencarsnow.com
web.archive.org
mrvc.indianrail.gov.in


### ML1_4: Реализовать stemming, lemmatization & BoW на следующем датасете: https://cloud.mail.ru/public/Z4L3/vB8GcgTtK (Russian Toxic-abuse comments)
#### https://www.kaggle.com/datasets/blackmoon/russian-language-toxic-comments
#### Дубликат файла: https://cloud.mail.ru/public/Z4L3/vB8GcgTtK



In [86]:
import csv

with open("./labeled.csv", "r") as f:
    reader = csv.reader(f, delimiter=",")
    data = [comment for [comment, toxic] in reader]

data = data[1:]

In [87]:
from nltk.tokenize import WordPunctTokenizer

tokenizer = WordPunctTokenizer()

data_tok = []
for sentence in data:
    data_tok.append(tokenizer.tokenize(sentence.lower()))

print(tokenizer.tokenize(data[0]))

['Верблюдов', '-', 'то', 'за', 'что', '?', 'Дебилы', ',', 'бл', '...']


In [88]:
import nltk
import ssl
from collections import Counter

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

#nltk.download('wordnet')
#nltk.download('omw-1.4')

lemmatizer = nltk.WordNetLemmatizer()
lemms = []
for words in data_tok:
    gen = (lemmatizer.lemmatize(word) for word in words)
    lemms.extend(gen)

counter = Counter(lemms)
dictionary = dict(counter)
print(counter.most_common(10))

[(',', 33932), ('.', 26863), ('и', 12684), ('в', 11974), ('не', 10301), ('-', 7906), ('на', 7003), ('что', 5986), ('а', 5008), ('?', 4395)]


In [89]:
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer(language="russian")
stemmed = []

for words in data_tok:
    d = [stemmer.stem(word) for word in words]
    stemmed.extend(d)

In [97]:
flat_list = [word for words in data_tok for word in words]

print('total number of words: ', len(set(flat_list)))
print('after lemmatization: ', len(set(lemms)))
print('after stemming: ', len(set(stemmed)))

vocabulary = dict(Counter(flat_list))  #словарь для BoW

#special_char=[",",":"," ",";",".","?"]
#for key in special_char:
#          vocabulary.pop(key, None)

word_to_search = 'и'
print(f'"{word_to_search}" встречается {vocabulary[word_to_search]} раз')

total number of words:  68638
after lemmatization:  68602
after stemming:  33642
"и" встречается 12684 раз


In [105]:
def vectorize(tokens):
    return [tokens.count(w) for w in vocabulary]


n = 20
k = 20
print(f'Первые {n} элементов словаря:')
for i, (key, value) in enumerate(vocabulary.items()):
    print(key)
    if i > n:
        break

comment = data_tok[0]
print(f'Токены комментария:{comment}')
bow_vector = vectorize(comment)
print(f'Первые {k} компонент вектора слов, соответствующего комментарию:{bow_vector[:k]}')

Первые 20 элементов словаря:
верблюдов
-
то
за
что
?
дебилы
,
бл
...
хохлы
это
отдушина
затюканого
россиянина
мол
вон
а
у
хохлов
еще
хуже
Токены комментария:['верблюдов', '-', 'то', 'за', 'что', '?', 'дебилы', ',', 'бл', '...']
Первые 20 компонент вектора слов, соответствующего комментарию:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [134]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

vectorizer = CountVectorizer()
word_count = vectorizer.fit_transform(data)
print(len(vectorizer.get_feature_names()))
print(f'{data[0][:-1]}  : {sum(word_count.toarray()[0])}')

68423
Верблюдов-то за что? Дебилы, бл...  : 6


In [135]:
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count)
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=vectorizer.get_feature_names(), columns=["idf_weights"])
df_idf.sort_values(by=['idf_weights'], ascending=False)

Unnamed: 0,idf_weights
обновиться,9.882739
пещеры,9.882739
печёнку,9.882739
пешеблядь,9.882739
пешее,9.882739
...,...
то,2.639941
это,2.575201
что,2.295682
на,2.169848


In [142]:
tf_idf_vector = tfidf_transformer.transform(word_count)
first_document_vector = tf_idf_vector[0]
df_tfifd = pd.DataFrame(first_document_vector.T.todense(), index=vectorizer.get_feature_names(), columns=["tfidf"])
df_tfifd.sort_values(by=["tfidf"], ascending=False)

Unnamed: 0,tfidf
верблюдов,0.613101
бл,0.558288
дебилы,0.470959
за,0.198487
то,0.170782
...,...
киселева,0.000000
киселевым,0.000000
киселем,0.000000
кисель,0.000000


In [143]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfIdfVectorizer=TfidfVectorizer(smooth_idf=True, use_idf=True)
tfIdf = tfIdfVectorizer.fit_transform(data)
df = pd.DataFrame(tfIdf[0].T.todense(), index=tfIdfVectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"], ascending=False)

Unnamed: 0,tfidf
верблюдов,0.613101
бл,0.558288
дебилы,0.470959
за,0.198487
то,0.170782
...,...
киселева,0.000000
киселевым,0.000000
киселем,0.000000
кисель,0.000000
