3. Рекурентні нейронні мережі

In [100]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from sklearn.utils import class_weight

data = pd.read_csv('article_level_data.csv')

df = data.drop(['Unnamed: 0'], axis=1)
data.head(10)

Unnamed: 0.1,Unnamed: 0,article,class
0,0,NLP is a multidisciplinary field that draws fr...,0
1,1,There are a variety of emerging applications f...,0
2,2,As each new means of communication and social ...,0
3,3,"These suggestions include:, Learn about the pu...",0
4,4,In recent years there has been growing concern...,0
5,5,"By the late 1970s, researchers at many major u...",0
6,6,Netnews postings are simply text files that be...,0
7,7,The servers are linked into a branching distri...,0
8,8,"In the 1940s, the main objective in developing...",0
9,9,According to the OSI (open systems interconnec...,0


In [101]:
nltk.download('stopwords')
nltk.download('punkt_tab')
nltk.download('wordnet')
stopWords = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dmytro\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Dmytro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dmytro\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [102]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub("@\S+", " ", text)
    text = re.sub("https*\S+", " ", text)
    text = re.sub("#\S+", " ", text)
    text = re.sub("\d", " ", text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\s{2,}',' ', text)
    text = ' '.join([word for word in text.split(' ') if word not in stopWords])
    return text

df['article'] = df['article'].apply(preprocess_text)

df.head(10)

Unnamed: 0,article,class
0,nlp multidisciplinary field draws linguistics ...,0
1,variety emerging applications nlp including fo...,0
2,new means communication social interaction int...,0
3,suggestions include learn purpose newsgroup po...,0
4,recent years growing concern internet users ma...,0
5,late researchers many major universities using...,0
6,netnews postings simply text files begin set s...,0
7,servers linked branching distribution system m...,0
8,main objective developing first digital comput...,0
9,according osi open systems interconnection mod...,0


In [103]:
X = df['article']
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vocab = 10000
max_length = 100
tokenizer = Tokenizer(num_words=vocab, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)


In [104]:
X_train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')
X_test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

embedding_dim = 200

model = Sequential([
    layers.Embedding(vocab, embedding_dim),
    layers.LSTM(64, return_sequences=True),
    layers.LSTM(32),
    layers.Dense(1, activation='sigmoid')
])

In [117]:
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
history = model.fit(X_train_padded, y_train, epochs=10,  batch_size=128, shuffle = True, validation_data=(X_test_padded, y_test))

Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 82ms/step - accuracy: 0.9855 - loss: 0.0784 - val_accuracy: 0.8039 - val_loss: 0.6298
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.9913 - loss: 0.0485 - val_accuracy: 0.8284 - val_loss: 0.6165
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.9920 - loss: 0.0387 - val_accuracy: 0.8284 - val_loss: 0.6525
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.9923 - loss: 0.0346 - val_accuracy: 0.8431 - val_loss: 0.7489
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.9913 - loss: 0.0528 - val_accuracy: 0.8431 - val_loss: 0.7617
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.9964 - loss: 0.0226 - val_accuracy: 0.8382 - val_loss: 0.6885
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━

In [118]:
loss, accuracy = model.evaluate(X_test_padded, y_test)
print('Test accuracy :', accuracy)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6935 - loss: 1.3265 
Test accuracy : 0.6911764740943909


б) використовуючи pretrained word embeddings

In [110]:
embedding_dim = 200
glove_file = './glove.6B.200d.txt'  
embeddings_index = {}

with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i < vocab:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


In [111]:
model_pretrained = Sequential([
    layers.Embedding(vocab, embedding_dim, weights=[embedding_matrix], trainable=False),
    layers.LSTM(64, return_sequences=True),
    layers.LSTM(32),
    layers.Dense(1, activation='sigmoid')
])


model_pretrained.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

history_pretrained = model_pretrained.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_data=(X_test_padded, y_test))


Epoch 1/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step - accuracy: 0.6490 - loss: 0.6559 - val_accuracy: 0.6912 - val_loss: 0.5790
Epoch 2/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.7634 - loss: 0.5135 - val_accuracy: 0.6127 - val_loss: 0.6500
Epoch 3/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.6574 - loss: 0.6066 - val_accuracy: 0.7108 - val_loss: 0.5478
Epoch 4/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - accuracy: 0.8089 - loss: 0.4459 - val_accuracy: 0.7598 - val_loss: 0.5076
Epoch 5/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.8705 - loss: 0.3440 - val_accuracy: 0.7696 - val_loss: 0.5063
Epoch 6/10
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.8685 - loss: 0.3309 - val_accuracy: 0.7843 - val_loss: 0.4854
Epoch 7/10
[1m26/26[0m [32m━━━━

In [112]:
loss, accuracy = model_pretrained.evaluate(X_test_padded, y_test)
print('Test accuracy :', accuracy)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7431 - loss: 0.5849 
Test accuracy : 0.7892156839370728
