In [None]:
!pip install pymorphy2

In [24]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pymorphy2
from tqdm import tqdm
import re
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras import layers
from keras.models import Sequential
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Machine Learning (ITHUB) /ДИ 2023/lessons/data/Tweets.csv')
df.dropna(inplace=True)
df.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [4]:
norm_text=[]
# 1 - только буквы
# 2 - перевод в нижний регистр
# 3 - привести все слова в единую форму
morph=pymorphy2.MorphAnalyzer()
for s in tqdm(df['text']):
    s1 = re.sub(r'[^\w\s]+|[\d]+', r'',s).strip()
    s1 = s1.lower()
    s1 = word_tokenize(s1)
    words=[]
    for i in s1:
        pv = morph.parse(i)
        words.append(pv[0].normal_form)
    sentence=' '.join(words)
    norm_text.append(sentence)

100%|██████████| 27480/27480 [00:13<00:00, 1968.76it/s]


In [5]:
df['norm_text']=norm_text
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['norm_text'])

X = tokenizer.texts_to_sequences(df['norm_text'])

vocab_size = len(tokenizer.word_index) + 1

In [6]:
maxlen = 30
X = pad_sequences(X, padding='post', maxlen=maxlen)
print(X[0, :])

[285  16  68   1 119  47   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0]


In [7]:
encoder = LabelEncoder() #кодируем каждую метку класса числом
y = encoder.fit_transform(df['sentiment'])
y = to_categorical(y)

In [8]:
embedding_dim = 50

model_emb = Sequential()
model_emb.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=maxlen))
model_emb.add(layers.Flatten())
model_emb.add(layers.Dense(50, activation='relu'))
model_emb.add(layers.Dense(y.shape[1], activation='softmax'))

model_emb.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=["accuracy"])
model_emb.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 50)            1395700   
                                                                 
 flatten (Flatten)           (None, 1500)              0         
                                                                 
 dense (Dense)               (None, 50)                75050     
                                                                 
 dense_1 (Dense)             (None, 3)                 153       
                                                                 
Total params: 1470903 (5.61 MB)
Trainable params: 1470903 (5.61 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [9]:
X_train_full,X_test,y_train_full,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
X_train,X_valid,y_train,y_valid = train_test_split(X_train_full,y_train_full,test_size=0.2,random_state=0)

In [10]:
history=model_emb.fit(X_train, y_train, epochs =5,batch_size=5,validation_data=(X_valid,y_valid))

acc=model_emb.evaluate(X_test,y_test)
print(np.array(acc)[1].round(2))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.61


## N-граммы

In [19]:
stopwords_en = stopwords.words("english")
vectorizer = CountVectorizer(max_features=500, min_df=20, max_df=0.7, stop_words=stopwords_en,ngram_range=(1,1))
text_cv_1g = vectorizer.fit_transform(df['norm_text'])
text_cv_1g = pd.DataFrame(text_cv_1g.toarray(),columns=vectorizer.get_feature_names_out())
text_cv_1g.head()

Unnamed: 0,able,account,actually,ago,ah,aint,almost,alone,already,also,...,yea,yeah,year,years,yes,yesterday,yet,youll,youre,youtube
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
vectorizer_2g = CountVectorizer(max_features=500, min_df=20, max_df=0.7, stop_words=stopwords_en,ngram_range=(2,2))
text_cv_2g = vectorizer_2g.fit_transform(df['norm_text'])
text_cv_2g = pd.DataFrame(text_cv_2g.toarray(),columns=vectorizer_2g.get_feature_names_out())
text_cv_2g.head()

Unnamed: 0,back home,back work,bad day,bank holiday,best friend,better soon,britains got,cant believe,cant even,cant find,...,well im,wish could,wont let,work today,work tomorrow,would like,would love,yeah im,year old,youre welcome
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
df_full = pd.concat([text_cv_1g,text_cv_2g],axis=1)
df_full.shape

(27480, 717)

In [25]:
scal_X_cv = StandardScaler().fit_transform(df_full)
scal_X_cv = pd.DataFrame(scal_X_cv, columns=df_full.columns)

In [26]:
X_train_full,X_test,y_train_full,y_test = train_test_split(scal_X_cv,y,test_size=0.2,random_state=0)
X_train,X_valid,y_train,y_valid = train_test_split(X_train_full,y_train_full,test_size=0.2,random_state=0)

In [28]:
model = Sequential()
model.add(layers.Dense(500, activation='relu', input_dim=X_train.shape[1]))
model.add(layers.Dense(256, activation='relu'))
model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(y.shape[1], activation='softmax'))

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=["accuracy"])

history=model.fit(np.array(X_train), y_train, epochs =5,batch_size=5,validation_data=(np.array(X_valid),y_valid))

acc=model.evaluate(np.array(X_test),y_test)
print(np.array(acc)[1].round(2))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.64


## Рекуррентные нейронные сети (RNN,LSTM)

Теория: https://sysblok.ru/knowhow/mama-myla-lstm-kak-ustroeny-rekurrentnye-nejroseti-s-dolgoj-kratkosrochnoj-pamjatju/

In [32]:
X_train_full,X_test,y_train_full,y_test = train_test_split(X,y,test_size=0.2,random_state=0)
X_train,X_valid,y_train,y_valid = train_test_split(X_train_full,y_train_full,test_size=0.2,random_state=0)

In [29]:
model_lstm = Sequential()
model_lstm.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=maxlen))

model_lstm.add(layers.LSTM(64))
model_lstm.add(layers.Dense(y.shape[1], activation='softmax'))

model_lstm.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=["accuracy"])
model_lstm.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 30, 50)            1395700   
                                                                 
 lstm (LSTM)                 (None, 64)                29440     
                                                                 
 dense_10 (Dense)            (None, 3)                 195       
                                                                 
Total params: 1425335 (5.44 MB)
Trainable params: 1425335 (5.44 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [33]:
history=model_lstm.fit(X_train, y_train, epochs =5,batch_size=5,validation_data=(X_valid,y_valid))

acc=model_lstm.evaluate(X_test,y_test)
print(np.array(acc)[1].round(2))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.7


## DROPOUT слой

Теория: https://habr.com/ru/companies/wunderfund/articles/330814/

In [34]:
model_lstm_drop = Sequential()
model_lstm_drop.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=maxlen))
model_lstm_drop.add(layers.LSTM(64))
model_lstm_drop.add(layers.Dropout(0.5))
model_lstm_drop.add(layers.Dense(y.shape[1], activation='softmax'))

model_lstm_drop.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=["accuracy"])
model_lstm_drop.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 30, 50)            1395700   
                                                                 
 lstm_1 (LSTM)               (None, 64)                29440     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_11 (Dense)            (None, 3)                 195       
                                                                 
Total params: 1425335 (5.44 MB)
Trainable params: 1425335 (5.44 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [35]:
history=model_lstm_drop.fit(X_train, y_train, epochs =5,batch_size=5,validation_data=(X_valid,y_valid))

acc=model_lstm_drop.evaluate(X_test,y_test)
print(np.array(acc)[1].round(2))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
0.7
