In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

import re

# 1. Токенизация по предложениям

In [2]:
text = 'Тут похардкорнее. Ищем ботов в социальных сетях. Как отличить бота, который оставляет проплаченные коментарии под постами? Тут очень инетересно!)'

In [3]:
# Разбить текст на предложения, по точкам

sentences = nltk.sent_tokenize(text)
for sentence in sentences:
    print(sentence)
    print()

Тут похардкорнее.

Ищем ботов в социальных сетях.

Как отличить бота, который оставляет проплаченные коментарии под постами?

Тут очень инетересно!)



# 2. Токенизация по словам

In [4]:
# Разделим предыдущие предложения на слова

for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    print(words)
    print()

['Тут', 'похардкорнее', '.']

['Ищем', 'ботов', 'в', 'социальных', 'сетях', '.']

['Как', 'отличить', 'бота', ',', 'который', 'оставляет', 'проплаченные', 'коментарии', 'под', 'постами', '?']

['Тут', 'очень', 'инетересно', '!', ')']



# 3. Лемматизация и стемминг текста

In [5]:
# Не совсем понятно

def compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word, pos):
    """
    Print the results of stemmind and lemmitization using the passed stemmer, lemmatizer, word and pos (part of speech)
    """
    print("Stemmer:", stemmer.stem(word))
    print("Lemmatizer:", lemmatizer.lemmatize(word, pos))
    print()

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word = "seen", pos = wordnet.VERB)
compare_stemmer_and_lemmatizer(stemmer, lemmatizer, word = "drove", pos = wordnet.VERB)

Stemmer: seen
Lemmatizer: see

Stemmer: drove
Lemmatizer: drive



# 4. Стоп-слова

In [6]:
# Выводит список стоп слов

print(stopwords.words("russian"))

['и', 'в', 'во', 'не', 'что', 'он', 'на', 'я', 'с', 'со', 'как', 'а', 'то', 'все', 'она', 'так', 'его', 'но', 'да', 'ты', 'к', 'у', 'же', 'вы', 'за', 'бы', 'по', 'только', 'ее', 'мне', 'было', 'вот', 'от', 'меня', 'еще', 'нет', 'о', 'из', 'ему', 'теперь', 'когда', 'даже', 'ну', 'вдруг', 'ли', 'если', 'уже', 'или', 'ни', 'быть', 'был', 'него', 'до', 'вас', 'нибудь', 'опять', 'уж', 'вам', 'ведь', 'там', 'потом', 'себя', 'ничего', 'ей', 'может', 'они', 'тут', 'где', 'есть', 'надо', 'ней', 'для', 'мы', 'тебя', 'их', 'чем', 'была', 'сам', 'чтоб', 'без', 'будто', 'чего', 'раз', 'тоже', 'себе', 'под', 'будет', 'ж', 'тогда', 'кто', 'этот', 'того', 'потому', 'этого', 'какой', 'совсем', 'ним', 'здесь', 'этом', 'один', 'почти', 'мой', 'тем', 'чтобы', 'нее', 'сейчас', 'были', 'куда', 'зачем', 'всех', 'никогда', 'можно', 'при', 'наконец', 'два', 'об', 'другой', 'хоть', 'после', 'над', 'больше', 'тот', 'через', 'эти', 'нас', 'про', 'всего', 'них', 'какая', 'много', 'разве', 'три', 'эту', 'моя', 'впр

In [7]:
# Уберем стоп слова

stop_words = set(stopwords.words("russian"))
sentence = "Ну что же ты делаешь то? А?"

words = nltk.word_tokenize(sentence)
without_stop_words = [word for word in words if not word in stop_words]
print(without_stop_words)

['Ну', 'делаешь', '?', 'А', '?']


In [8]:
# Более быстрый метод

stop_words = set(stopwords.words("russian"))
sentence = "Ну что же ты делаешь то? А?"

words = nltk.word_tokenize(sentence)
without_stop_words = []
for word in words:
    if word not in stop_words:
        without_stop_words.append(word)

print(without_stop_words)

['Ну', 'делаешь', '?', 'А', '?']


# 5. Регулярные выражения.

In [9]:
# Уберем знаки припинания из нашего предложения

pattern = r"[^\w]"
print(re.sub(pattern, " ", text))

Тут похардкорнее  Ищем ботов в социальных сетях  Как отличить бота  который оставляет проплаченные коментарии под постами  Тут очень инетересно  


# 6. Мешок слов

In [10]:
# Прочитаем файл и разделим на строки

with open("Text_test_1", "r") as file:
    documents = file.read().splitlines()
    
print(documents)

['Уже сейчас есть несколько проектов:', '', '1. Анализ рынка аналитиков. Попарсим каналы в тедеге, где аналитики ищут работу, составим профиль идеального кандидата, выясним, какие навыки и скиды требуют работадатели. Будет отличным украшением вашего резюме!)', '', '2. Тут похардкорнее. Ищем ботов в социальных сетях. Как отличить бота, который оставляет проплаченные коментарии под постами? Тут очень инетересно!)', '', '3. Коронавирус потряс мир, а что там с графиками? Давайте изучим, как заболеваемость коррелировала с основными показателями экономики в странах, научимся прогнозировать эти метрики. Будет интересно!)', '', 'Вписывайтесь в эти проекты, или предлагайте свои!)']


In [11]:
# Составим словарь

count_vectorizer = CountVectorizer()

In [12]:
# Создадим модель мешка слов

bag_of_words = count_vectorizer.fit_transform(documents)

In [13]:
# Показать модель мешка слов как фрейм данных pandas

feature_names = count_vectorizer.get_feature_names()
pd.DataFrame(bag_of_words.toarray(), columns = feature_names)

Unnamed: 0,анализ,аналитики,аналитиков,бота,ботов,будет,вашего,вписывайтесь,выясним,где,...,странах,там,тедеге,требуют,тут,уже,украшением,что,экономики,эти
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,1,0,0,1,1,0,1,1,...,0,0,1,1,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,1,0,0,0,0,...,1,1,0,0,0,0,0,1,1,1
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [14]:
# Посчитаем частоту встречаеммости слов

tfidf_vectorizer = TfidfVectorizer()
values = tfidf_vectorizer.fit_transform(documents)

# Показать модель как датафрейм pandas

feature_names = tfidf_vectorizer.get_feature_names()
pd.DataFrame(values.toarray(), columns = feature_names)

Unnamed: 0,анализ,аналитики,аналитиков,бота,ботов,будет,вашего,вписывайтесь,выясним,где,...,странах,там,тедеге,требуют,тут,уже,украшением,что,экономики,эти
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.447214,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.201156,0.201156,0.201156,0.0,0.0,0.1699,0.201156,0.0,0.201156,0.201156,...,0.0,0.0,0.201156,0.201156,0.0,0.0,0.201156,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.225227,0.225227,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.450453,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.188204,0.0,0.0,0.0,0.0,...,0.222828,0.222828,0.0,0.0,0.0,0.0,0.0,0.222828,0.222828,0.188204
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.418363,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.353356
