# Инструменты для работы с языком 

## Задача: классификация твитов по тональности

У нас есть датасет из твитов, про каждый указано, как он эмоционально окрашен: положительно или отрицательно. Задача: предсказывать эмоциональную окраску.

Скачиваем куски датасета ([источник](http://study.mokoron.com/)): [положительные](https://www.dropbox.com/s/fnpq3z4bcnoktiv/positive.csv?dl=0), [отрицательные](https://www.dropbox.com/s/r6u59ljhhjdg6j0/negative.csv).

In [7]:
!wget https://www.dropbox.com/s/fnpq3z4bcnoktiv/positive.csv
!wget https://www.dropbox.com/s/r6u59ljhhjdg6j0/negative.csv

--2022-06-14 11:14:48--  https://www.dropbox.com/s/fnpq3z4bcnoktiv/positive.csv
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:601b:18::a27d:812
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/fnpq3z4bcnoktiv/positive.csv [following]
--2022-06-14 11:14:48--  https://www.dropbox.com/s/raw/fnpq3z4bcnoktiv/positive.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc4825e5c4682409a20c59dcba6c.dl.dropboxusercontent.com/cd/0/inline/BnKOd5Kd47dqtALXkJBv3Co_VohQyxLIoKDI0mdbH7_zvxyC6Zh9mP_PSfs7gpYXsxAsEhZxsfwoZpZKqns3euuLLjiSwPIa_byiwfkRbZraLZfKO7HP2Dc7K8Ie0jumnScfvMuHaP5iIZVPUPnGi9wvoc2jZtb5M6FWF-K20MPvOg/file# [following]
--2022-06-14 11:14:48--  https://uc4825e5c4682409a20c59dcba6c.dl.dropboxusercontent.com/cd/0/inline/BnKOd5Kd47dqtALXkJBv3Co_VohQyxLIoKDI0mdbH7_zvxyC6Zh9mP_PSfs7gpYXsxAs

In [3]:
!pip install pymorphy2

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymorphy2
  Downloading pymorphy2-0.9.1-py3-none-any.whl (55 kB)
[K     |████████████████████████████████| 55 kB 2.5 MB/s 
[?25hCollecting dawg-python>=0.7.1
  Downloading DAWG_Python-0.7.2-py2.py3-none-any.whl (11 kB)
Collecting pymorphy2-dicts-ru<3.0,>=2.4
  Downloading pymorphy2_dicts_ru-2.4.417127.4579844-py2.py3-none-any.whl (8.2 MB)
[K     |████████████████████████████████| 8.2 MB 10.7 MB/s 
[?25hInstalling collected packages: pymorphy2-dicts-ru, dawg-python, pymorphy2
Successfully installed dawg-python-0.7.2 pymorphy2-0.9.1 pymorphy2-dicts-ru-2.4.417127.4579844


In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from string import punctuation

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from string import punctuation
from nltk.stem import WordNetLemmatizer

from collections import Counter

from pymorphy2 import MorphAnalyzer

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, Conv1D, GRU, LSTM, Dropout
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import warnings
warnings.filterwarnings("ignore")

In [8]:
# считываем данные и заполняем общий датасет
positive = pd.read_csv('positive.csv', sep=';', usecols=[3], names=['text'])
positive['label'] = ['positive'] * len(positive)
negative = pd.read_csv('negative.csv', sep=';', usecols=[3], names=['text'])
negative['label'] = ['negative'] * len(negative)
df = positive.append(negative)

In [9]:
df.tail()

Unnamed: 0,text,label
111918,Но не каждый хочет что то исправлять:( http://...,negative
111919,скучаю так :-( только @taaannyaaa вправляет мо...,negative
111920,"Вот и в школу, в говно это идти уже надо(",negative
111921,"RT @_Them__: @LisaBeroud Тауриэль, не грусти :...",negative
111922,Такси везет меня на работу. Раздумываю приплат...,negative


In [10]:
x_train, x_test, y_train, y_test = train_test_split(df.text, df.label)

## Задание 1.

**Задание**: обучите три классификатора: 

1) на токенах с высокой частотой 

2) на токенах со средней частотой 

3) на токенах с низкой частотой

Сравните полученные результаты, оцените какие токены наиболее важные для классификации.

In [11]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
corpus = [token for tweet in df.text for token in word_tokenize(tweet) if token not in punctuation]
freq_dict = Counter(corpus)
freq_dict_sorted= sorted(freq_dict.items(), key=lambda x: -x[1])
list(freq_dict_sorted)[:10]



freq_dict_800 = sorted(freq_dict.items(), key=lambda x: -x[1])[:800]

In [13]:
pymorphy2_analyzer = MorphAnalyzer()

In [14]:
def text_preparation(text, start_board, end_board):
    tokens = word_tokenize(text)
    return [token for token in tokens if token in [item[0] for item in freq_dict_800][start_board:end_board] 
            and token not in punctuation]

Классификатор на токенах с высокой частотой

In [18]:
%%time
vec = TfidfVectorizer(ngram_range=(1, 1), tokenizer=lambda x: text_preparation(x, 0, 150))
x_train_bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=42)
clf.fit(x_train_bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.62      0.63      0.63     27571
    positive       0.65      0.64      0.64     29138

    accuracy                           0.63     56709
   macro avg       0.63      0.63      0.63     56709
weighted avg       0.63      0.63      0.63     56709

CPU times: user 3min 44s, sys: 542 ms, total: 3min 44s
Wall time: 3min 54s


Классификатор на токенах со средней частотой

In [15]:
%%time
vec = TfidfVectorizer(ngram_range=(1, 1), tokenizer=lambda x: text_preparation(x, 151, 350))
x_train_bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=42)
clf.fit(x_train_bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.37      0.62      0.46     16603
    positive       0.78      0.56      0.65     40106

    accuracy                           0.58     56709
   macro avg       0.57      0.59      0.56     56709
weighted avg       0.66      0.58      0.60     56709

CPU times: user 3min 19s, sys: 466 ms, total: 3min 19s
Wall time: 3min 20s


Классификатор на токенах с низкой частотой

In [16]:
%%time
vec = TfidfVectorizer(ngram_range=(1, 1), tokenizer=lambda x: text_preparation(x, 351, 800))
x_train_bow = vec.fit_transform(x_train)
clf = LogisticRegression(random_state=42)
clf.fit(x_train_bow, y_train)
pred = clf.predict(vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.36      0.66      0.46     15153
    positive       0.82      0.57      0.67     41556

    accuracy                           0.59     56709
   macro avg       0.59      0.61      0.57     56709
weighted avg       0.70      0.59      0.62     56709

CPU times: user 3min 36s, sys: 454 ms, total: 3min 36s
Wall time: 3min 37s


Наилучший результат показали высокочастотные токены

## Задание 2.

найти фичи с наибольшей значимостью, и вывести их

In [19]:
feature_names = vec.get_feature_names()
feature_names

["''",
 '..',
 '...',
 '....',
 '2',
 '3',
 '``',
 'http',
 'а',
 'без',
 'блин',
 'больше',
 'будет',
 'буду',
 'бы',
 'был',
 'была',
 'было',
 'быть',
 'в',
 'вас',
 'вообще',
 'вот',
 'время',
 'все',
 'всегда',
 'всего',
 'всем',
 'всех',
 'всё',
 'вы',
 'где',
 'год',
 'да',
 'даже',
 'делать',
 'день',
 'для',
 'до',
 'дома',
 'его',
 'ее',
 'если',
 'есть',
 'еще',
 'ещё',
 'же',
 'за',
 'завтра',
 'знаю',
 'и',
 'из',
 'или',
 'их',
 'к',
 'как',
 'когда',
 'кто',
 'лучше',
 'люблю',
 'меня',
 'мне',
 'много',
 'могу',
 'может',
 'можно',
 'мой',
 'моя',
 'мы',
 'на',
 'надо',
 'нас',
 'не',
 'нет',
 'ни',
 'ничего',
 'но',
 'ну',
 'о',
 'один',
 'он',
 'она',
 'они',
 'опять',
 'от',
 'очень',
 'плохо',
 'по',
 'пока',
 'после',
 'потом',
 'почему',
 'про',
 'просто',
 'раз',
 'с',
 'себе',
 'себя',
 'сегодня',
 'сейчас',
 'со',
 'спасибо',
 'спать',
 'так',
 'такая',
 'такие',
 'такое',
 'такой',
 'там',
 'тебе',
 'тебя',
 'теперь',
 'то',
 'тоже',
 'только',
 'тут',
 'ты',


### Задание 3.

1) сравнить count/tf-idf/hashing векторайзеры/полносвязанную сетку (построить classification_report)

2) подобрать оптимальный размер для hashing векторайзера 

3) убедиться что для сетки нет переобучения

In [20]:
import nltk
nltk.download('stopwords')

noise = stopwords.words('russian') + list(punctuation)

def text_preparation(text):
    return [t for t in [pymorphy2_analyzer.parse(token)[0].normal_form 
                                for token in word_tokenize(text)] if t not in noise]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [21]:
%%time
# CountVectorizer

c_vec = CountVectorizer(ngram_range=(1, 1), tokenizer = text_preparation)

x_train_bow = c_vec.fit_transform(x_train)
clf = LogisticRegression(random_state=42)
clf.fit(x_train_bow, y_train)
pred = clf.predict(c_vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.79      0.76      0.78     28954
    positive       0.76      0.79      0.78     27755

    accuracy                           0.78     56709
   macro avg       0.78      0.78      0.78     56709
weighted avg       0.78      0.78      0.78     56709

CPU times: user 9min 59s, sys: 9.97 s, total: 10min 9s
Wall time: 10min 13s


In [22]:
%%time
# TfidfVectorizer

tfidf_vec = TfidfVectorizer(ngram_range=(1, 1), tokenizer = text_preparation)
x_train_bow = tfidf_vec.fit_transform(x_train)
clf = LogisticRegression(random_state=42)
clf.fit(x_train_bow, y_train)
pred = clf.predict(tfidf_vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.76      0.77      0.76     27620
    positive       0.78      0.77      0.77     29089

    accuracy                           0.77     56709
   macro avg       0.77      0.77      0.77     56709
weighted avg       0.77      0.77      0.77     56709

CPU times: user 9min 49s, sys: 7.19 s, total: 9min 56s
Wall time: 9min 51s


In [23]:
%%time
# HashingVectorizer

h_vec = HashingVectorizer(n_features = 80, tokenizer = text_preparation)
x_train_bow = h_vec.fit_transform(x_train)
clf = LogisticRegression(random_state=42)
clf.fit(x_train_bow, y_train)
pred = clf.predict(h_vec.transform(x_test))
print(classification_report(pred, y_test))

              precision    recall  f1-score   support

    negative       0.63      0.59      0.61     29401
    positive       0.58      0.62      0.60     27308

    accuracy                           0.60     56709
   macro avg       0.60      0.61      0.60     56709
weighted avg       0.61      0.60      0.60     56709

CPU times: user 9min 50s, sys: 1.32 s, total: 9min 51s
Wall time: 9min 50s


In [24]:
# Полносвязная нейронная сеть

# labelEncode целевую переменную
encoder = LabelEncoder()
y_train = encoder.fit_transform(y_train)
y_test = encoder.fit_transform(y_test)

# Создадим Датасет tensorflow
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
valid_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))

#Выделим batch-и
train_data = train_data.batch(16)
valid_data = valid_data.batch(16)

AUTOTUNE = tf.data.AUTOTUNE

train_data = train_data.cache().prefetch(buffer_size=AUTOTUNE)
valid_data = valid_data.cache().prefetch(buffer_size=AUTOTUNE)

In [25]:
vocab_size = 10000
seq_len = 100

def custom_standardization(input_data):
    return input_data


vectorize_layer = TextVectorization(  
    standardize = custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=seq_len)


# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
text_data = train_data.map(lambda x, y: x)
vectorize_layer.adapt(text_data)

In [26]:
embedding_dim=200

class myNet(tf.keras.Model):
    def __init__(self):
        super(myNet, self).__init__()
        self.emb = Embedding(vocab_size, embedding_dim, name="embedding")
        self.conv1 = Conv1D(200, (3))
        self.conv2 = Conv1D(200, (3))
        self.gPool = GlobalAveragePooling1D()
        self.fc1 = Dense(100, activation='relu')
        self.fc2 = Dense(1)


    def call(self, x):
        x = vectorize_layer(x)
        x = self.emb(x)
        x1 = self.conv1(x)
        x = self.conv2(x)
        x = self.gPool((x + x1)/2)
        x = self.fc1(x)
        return self.fc2(x)

In [27]:
%%time
model = myNet()


model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(train_data, validation_data=valid_data, epochs=7)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7
CPU times: user 9min 8s, sys: 43.2 s, total: 9min 51s
Wall time: 8min 45s


Наилучший результат показала нейросеть