#Задание 1

In [None]:
import nltk
import keras
from nltk.corpus import brown

nltk.download('brown')

selected_categories = ['government', 'hobbies', 'humor', 'news']
texts = []
labels = []

for category in selected_categories:
    fileids = brown.fileids(categories=category)
    for fileid in fileids:
        texts.append(' '.join(brown.words(fileid)))
        labels.append(category)

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [None]:
from sklearn.preprocessing import LabelEncoder

# Преобразуем метки в числовой формат
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Токенизация текстов
tokenizer = Tokenizer(num_words=10000)  # Ограничиваем словарь 10 000 слов
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Добавление padding
max_sequence_length = 200  # Максимальная длина текста
data = pad_sequences(sequences, maxlen=max_sequence_length, padding='post')

In [None]:
from sklearn.model_selection import train_test_split

# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(data, encoded_labels, test_size=0.2, random_state=42)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.regularizers import l2

vocab_size = 10000  # Размер словаря
embedding_dim = 128  # Размерность эмбеддингов
lstm_units = 64
num_classes = len(label_encoder.classes_)

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length),
    LSTM(lstm_units),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
# Обучение модели
history = model.fit(X_train, y_train, epochs=20, batch_size=10, validation_data=(X_test, y_test))

Epoch 1/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.3107 - loss: 1.3780 - val_accuracy: 0.2500 - val_loss: 1.3591
Epoch 2/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.4570 - loss: 1.3173 - val_accuracy: 0.3750 - val_loss: 1.2766
Epoch 3/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.3970 - loss: 1.2211 - val_accuracy: 0.3750 - val_loss: 1.2680
Epoch 4/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.5219 - loss: 1.1292 - val_accuracy: 0.3750 - val_loss: 1.2540
Epoch 5/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.6540 - loss: 0.9797 - val_accuracy: 0.4167 - val_loss: 1.2181
Epoch 6/20
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - accuracy: 0.7236 - loss: 0.8133 - val_accuracy: 0.4167 - val_loss: 1.1944
Epoch 7/20
[1m10/10[0m [32m━━━━

In [None]:
# Оценка модели
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 43ms/step - accuracy: 0.5417 - loss: 1.6595
Test Accuracy: 0.5417


In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import brown

# Загрузка данных
nltk.download('brown')
selected_categories = ['government', 'hobbies', 'humor', 'news']
texts = []
labels = []

for category in selected_categories:
    fileids = brown.fileids(categories=category)
    for fileid in fileids:
        texts.append(' '.join(brown.words(fileid)))
        labels.append(category)

# Преобразуем метки в числовой формат
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(labels)

tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Упорядочивание данных по длине текстов
text_lengths = [len(seq) for seq in sequences]
sorted_indices = np.argsort(text_lengths)
sorted_sequences = [sequences[i] for i in sorted_indices]
sorted_labels = [encoded_labels[i] for i in sorted_indices]

# Разбиение на мини-блоки
def create_mini_batches(sequences, labels, batch_size):
    batches = []
    batch_labels = []
    current_batch = []
    current_labels = []
    max_len_in_batch = 0

    for seq, label in zip(sequences, labels):
        if len(current_batch) < batch_size:
            current_batch.append(seq)
            current_labels.append(label)
            max_len_in_batch = max(max_len_in_batch, len(seq))
        else:
            padded_batch = pad_sequences(current_batch, maxlen=max_len_in_batch, padding='post')
            batches.append(padded_batch)
            batch_labels.append(np.array(current_labels))
            current_batch = [seq]
            current_labels = [label]
            max_len_in_batch = len(seq)

    if current_batch:
        padded_batch = pad_sequences(current_batch, maxlen=max_len_in_batch, padding='post')
        batches.append(padded_batch)
        batch_labels.append(np.array(current_labels))

    return batches, batch_labels

batch_size = 32
mini_batches, mini_batch_labels = create_mini_batches(sorted_sequences, sorted_labels, batch_size)

vocab_size = 10000
embedding_dim = 128
lstm_units = 64
num_classes = len(label_encoder.classes_)

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max(text_lengths)),
    LSTM(lstm_units),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Обучение модели на мини-блоках
for epoch in range(30):
    for batch, batch_labels in zip(mini_batches, mini_batch_labels):
        model.train_on_batch(batch, batch_labels)

    test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
    print(f"Epoch {epoch + 1}, Test Accuracy: {test_accuracy:.4f}")

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


Epoch 1, Test Accuracy: 0.4167
Epoch 2, Test Accuracy: 0.4167
Epoch 3, Test Accuracy: 0.4583
Epoch 4, Test Accuracy: 0.4583
Epoch 5, Test Accuracy: 0.4583
Epoch 6, Test Accuracy: 0.4583
Epoch 7, Test Accuracy: 0.4583
Epoch 8, Test Accuracy: 0.4583
Epoch 9, Test Accuracy: 0.5000
Epoch 10, Test Accuracy: 0.4583
Epoch 11, Test Accuracy: 0.4583
Epoch 12, Test Accuracy: 0.4583
Epoch 13, Test Accuracy: 0.5000
Epoch 14, Test Accuracy: 0.5000
Epoch 15, Test Accuracy: 0.5417
Epoch 16, Test Accuracy: 0.5417
Epoch 17, Test Accuracy: 0.5833
Epoch 18, Test Accuracy: 0.6250
Epoch 19, Test Accuracy: 0.7083
Epoch 20, Test Accuracy: 0.6250
Epoch 21, Test Accuracy: 0.6250
Epoch 22, Test Accuracy: 0.6250
Epoch 23, Test Accuracy: 0.6667
Epoch 24, Test Accuracy: 0.7500
Epoch 25, Test Accuracy: 0.7917
Epoch 26, Test Accuracy: 0.7500
Epoch 27, Test Accuracy: 0.7917
Epoch 28, Test Accuracy: 0.7500
Epoch 29, Test Accuracy: 0.7500
Epoch 30, Test Accuracy: 0.7083


#Задание 2

In [None]:
import pandas as pd

data = pd.read_csv('train_FD001.txt', sep=" ", header=None)
data = data.dropna(axis=1, how='all')

# Группировка по engine_id
grouped_data = data.groupby(0)

In [None]:
X = data.iloc[:, 2:26]

In [None]:
rul = grouped_data[1].max()

In [None]:
data['RUL'] = rul[data[0]].values - data[1]

In [None]:
X = X.loc[:, X.nunique() > 1]  # Удаление константных колонок

In [None]:
sorted_data = data.groupby(0).apply(lambda x: x.sort_values(by=[1])).reset_index(drop=True)

  sorted_data = data.groupby(0).apply(lambda x: x.sort_values(by=[1])).reset_index(drop=True)


In [None]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_normalized = scaler.fit_transform(X)

# Обрезка RUL
data['RUL'] = data['RUL'].clip(upper=150)

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization

X_train, X_test, y_train, y_test = train_test_split(X_normalized, data['RUL'], test_size=0.2, random_state=42)

model = Sequential()
model.add(LSTM(50, input_shape=(X_train.shape[1], 1)))
model.add(Dropout(0.5))
model.add(BatchNormalization())
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/20


  super().__init__(**kwargs)


[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8ms/step - loss: 10379.1846 - mae: 91.5785 - val_loss: 4507.8018 - val_mae: 58.4598
Epoch 2/20
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 4341.9321 - mae: 57.4926 - val_loss: 965.9366 - val_mae: 26.7212
Epoch 3/20
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 1209.1276 - mae: 28.3811 - val_loss: 926.4371 - val_mae: 24.6302
Epoch 4/20
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 921.1943 - mae: 24.7441 - val_loss: 812.6607 - val_mae: 22.9313
Epoch 5/20
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - loss: 898.9640 - mae: 24.4302 - val_loss: 772.5686 - val_mae: 22.3843
Epoch 6/20
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - loss: 919.0493 - mae: 24.7937 - val_loss: 778.7954 - val_mae: 22.2541
Epoch 7/20
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x7fd160462250>

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

y_pred = model.predict(X_test)

# Расчет метрик
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse}")
print(f"R²: {r2}")

[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
RMSE: 25.242717032514424
R²: 0.7388875484466553


In [None]:
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(X_normalized, data['RUL'], test_size=0.2, random_state=42)

model_raw = Sequential()
model_raw.add(LSTM(50, input_shape=(X_train_raw.shape[1], 1)))
model_raw.add(Dropout(0.5))
model_raw.add(BatchNormalization())
model_raw.add(Dense(1))
model_raw.compile(optimizer='adam', loss='mse')

#early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

model_raw.fit(X_train_raw, y_train_raw, epochs=10, batch_size=32, validation_data=(X_test_raw, y_test_raw))

y_pred_raw = model_raw.predict(X_test_raw)

# Расчет метрик
rmse_raw = np.sqrt(mean_squared_error(y_test_raw, y_pred_raw))
mae_raw = mean_absolute_error(y_test_raw, y_pred_raw)
r2_raw = r2_score(y_test_raw, y_pred_raw)

print(f"RMSE (без упорядочивания): {rmse_raw}")
print(f"MAE (без упорядочивания): {mae_raw}")
print(f"R² (без упорядочивания): {r2_raw}")

Epoch 1/10


  super().__init__(**kwargs)


[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - loss: 10378.6914 - val_loss: 6821.0088
Epoch 2/10
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 4239.3799 - val_loss: 1778.2062
Epoch 3/10
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 1224.3434 - val_loss: 923.7646
Epoch 4/10
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 1002.7106 - val_loss: 829.2434
Epoch 5/10
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 947.3810 - val_loss: 820.9564
Epoch 6/10
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 925.9230 - val_loss: 771.2502
Epoch 7/10
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - loss: 869.8957 - val_loss: 930.5582
Epoch 8/10
[1m516/516[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 841.2140 - val_loss: 795.3683
Epoch 9/10
