In [111]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, LSTM, Dense
import spacy

nlp = spacy.load("ru_core_news_sm")

fake = Faker('ru_RU')
num_samples = 4000
data = {'message': [fake.sentence() for _ in range(num_samples)],
        'label': [fake.random_element(elements=('Общение', 'Спам',)) for _ in range(num_samples)]}

df = pd.DataFrame(data)

# Функция лемматизации текста
def lemmatize_text(text):
    doc = nlp(text)
    lemmatized_text = " ".join([token.lemma_ for token in doc])
    return lemmatized_text

# Применение лемматизации к столбцу 'message'
df['lemmatized_message'] = df['message'] #.apply(lemmatize_text)


train_texts, test_texts, train_labels, test_labels = train_test_split(df['lemmatized_message'], df['label'], test_size=0.3, random_state=42)

le = LabelEncoder()
train_labels_encoded = le.fit_transform(train_labels)
test_labels_encoded = le.transform(test_labels)

max_words = 55000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

maxlen = 20
train_data = pad_sequences(train_sequences, maxlen=maxlen, padding='post')
test_data = pad_sequences(test_sequences, maxlen=maxlen, padding='post')

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True)

# model = Sequential()
# model.add(Embedding(input_dim=max_words, output_dim=250, input_length=maxlen))
# model.add(LSTM(units=256, return_sequences=True))
# model.add(Dropout(0.4))
# model.add(LSTM(units=256, return_sequences=True))
# model.add(Dropout(0.4))
# model.add(LSTM(units=128, return_sequences=True))
# model.add(Dropout(0.5))
# model.add(BatchNormalization())
# model.add(LSTM(units=128))
# model.add(Dense(units=64, activation='relu'))
# model.add(Dense(units=len(df['label'].unique()), activation='softmax'))

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=250, input_length=maxlen))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'))
model.add(Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(256, return_sequences=True))  # Return sequences for stacking LSTM layers
model.add(Dropout(0.5))
model.add(LSTM(256))  # Another LSTM layer
model.add(Dropout(0.5))
model.add(Dense(units=len(df['label'].unique()), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(train_data, train_labels_encoded, epochs=100, batch_size=34, validation_split=0.2,
                    callbacks=[early_stopping, model_checkpoint])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


In [112]:
test_loss, test_accuracy = model.evaluate(test_data, test_labels_encoded)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

train_loss, train_accuracy = model.evaluate(train_data, train_labels_encoded)
print(f'Train Accuracy: {train_accuracy * 100:.2f}%')

test_predictions = model.predict(test_data)
test_predictions_classes = np.argmax(test_predictions, axis=1)

test_labels_original = le.inverse_transform(test_labels_encoded)
test_predictions_classes_original = le.inverse_transform(test_predictions_classes)

print("\nClassification Report:")
print(classification_report(test_labels_original, test_predictions_classes_original))

train_predictions = model.predict(train_data)
train_predictions_classes = np.argmax(train_predictions, axis=1)

train_predictions_classes_original = le.inverse_transform(train_predictions_classes)

print("\nTrain Classification Report:")
print(classification_report(train_labels, train_predictions_classes_original))

Test Accuracy: 48.83%
Train Accuracy: 50.82%

Classification Report:
              precision    recall  f1-score   support

     Общение       0.00      0.00      0.00       614
        Спам       0.49      1.00      0.66       586

    accuracy                           0.49      1200
   macro avg       0.24      0.50      0.33      1200
weighted avg       0.24      0.49      0.32      1200

20/88 [=====>........................] - ETA: 0s

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Train Classification Report:
              precision    recall  f1-score   support

     Общение       0.00      0.00      0.00      1377
        Спам       0.51      1.00      0.67      1423

    accuracy                           0.51      2800
   macro avg       0.25      0.50      0.34      2800
weighted avg       0.26      0.51      0.34      2800



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [26]:
from tensorflow.keras.models import load_model

# Загрузка сохраненной модели
loaded_model = load_model('best_model.keras')

text_to_predict = ['Где мои деньги']
sequences = tokenizer.texts_to_sequences(text_to_predict)
maxlen = 20
padded_sequences = pad_sequences(sequences, maxlen=maxlen, padding='post')
predictions = loaded_model.predict(padded_sequences)
predicted_classes = np.argmax(predictions, axis=1)

print("Predicted Classes:", le.inverse_transform(predicted_classes))

Predicted Classes: ['Спам']


In [47]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, UpSampling1D, LSTM, Dropout, BatchNormalization, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd
from faker import Faker
from sklearn.metrics import classification_report
import numpy as np

def unet_lstm(input_shape):
    inputs = Input(shape=input_shape)
    conv1 = Conv1D(64, 3, activation='relu', padding='same')(inputs)
    conv1 = Conv1D(64, 3, activation='relu', padding='same')(conv1)
    pool1 = MaxPooling1D(pool_size=2)(conv1)
    conv2 = Conv1D(128, 3, activation='relu', padding='same')(pool1)
    conv2 = Conv1D(128, 3, activation='relu', padding='same')(conv2)
    pool2 = MaxPooling1D(pool_size=2)(conv2)
    lstm1 = LSTM(256, return_sequences=True)(pool2)
    lstm1 = Dropout(0.4)(lstm1)
    up1 = UpSampling1D(size=2)(lstm1)
    conv3 = Conv1D(128, 3, activation='relu', padding='same')(up1)
    conv3 = Conv1D(128, 3, activation='relu', padding='same')(conv3)
    up2 = UpSampling1D(size=2)(conv3)
    conv4 = Conv1D(64, 3, activation='relu', padding='same')(up2)
    conv4 = Conv1D(64, 3, activation='relu', padding='same')(conv4)
    decoded = Conv1D(1, 3, activation='sigmoid', padding='same')(conv4)
    model = Model(inputs, decoded)
    return model


fake = Faker('ru_RU')
num_samples = 4000
data = {'message': [fake.sentence() for _ in range(num_samples)],
        'label': [fake.random_element(elements=('Общение', 'Спам',)) for _ in range(num_samples)]}

df = pd.DataFrame(data)
train_texts, test_texts, train_labels, test_labels = train_test_split(df['message'], df['label'], test_size=0.3, random_state=42)

le = LabelEncoder()
train_labels_encoded = le.fit_transform(train_labels)
test_labels_encoded = le.transform(test_labels)

max_words = 55000
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_texts)

train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

maxlen = 20
train_data = pad_sequences(train_sequences, maxlen=maxlen, padding='post')
test_data = pad_sequences(test_sequences, maxlen=maxlen, padding='post')

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True)

input_shape = (maxlen, 1)
model = unet_lstm(input_shape)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

le = LabelEncoder()
train_labels_encoded = le.fit_transform(train_labels)
test_labels_encoded = le.transform(test_labels)

train_labels_encoded = train_labels_encoded.reshape((-1, 1))
history = model.fit(train_data, train_labels_encoded, epochs=100, batch_size=64, validation_split=0.2,
                    callbacks=[early_stopping, model_checkpoint])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100


In [48]:

test_loss, test_accuracy = model.evaluate(test_data, test_labels_encoded)
print(f'Test Accuracy: {test_accuracy * 100:.2f}%')

train_loss, train_accuracy = model.evaluate(train_data, train_labels_encoded)
print(f'Train Accuracy: {train_accuracy * 100:.2f}%')

test_predictions = model.predict(test_data)
# test_predictions_classes = np.argmax(test_predictions, axis=1)

# # Use inverse_transform directly on the LabelEncoder
# test_predictions_classes_original = le.inverse_transform(test_predictions_classes)

# print("\nClassification Report:")
# print(classification_report(test_labels, test_predictions_classes_original))

Test Accuracy: 50.25%
Train Accuracy: 50.51%
