In [3]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import nltk
from nltk.corpus import stopwords

In [29]:
def load_data(data_dir):
    reviews = []
    labels = []
    ratings = []
    
    for sentiment in ['neg', 'pos']:
        folder_path = os.path.join(data_dir, sentiment)
        for filename in os.listdir(folder_path):
            if filename.endswith('.txt'):
                with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                    review = file.read()
                    reviews.append(review)
                    if sentiment == 'neg':
                        labels.append(0)
                        rating = int(filename.split('_')[1].replace('.txt', '')) 
                        ratings.append(rating)
                    else:
                        labels.append(1)
                        rating = int(filename.split('_')[1].replace('.txt', '')) 
                        ratings.append(rating)
    
    return reviews, labels, ratings

In [31]:
data_dir = 'Dataset/train'
reviews, labels, ratings = load_data(data_dir)

In [35]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
reviews = [' '.join([word for word in review.lower().split() if word not in stop_words]) for review in reviews]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dogde\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(reviews)
X = tokenizer.texts_to_sequences(reviews)
X = pad_sequences(X)

In [39]:
y_class = np.array(labels)
y_rank = np.array(ratings)
X_train, X_test, y_class_train, y_class_test, y_rank_train, y_rank_test = train_test_split(X, y_class, y_rank, test_size=0.2, random_state=42)


In [43]:
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))  # Для бинарной классификации

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [45]:
model.fit(X_train, y_class_train, epochs=5, batch_size=64, validation_data=(X_test, y_class_test))


Epoch 1/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m457s[0m 1s/step - accuracy: 0.7354 - loss: 0.5044 - val_accuracy: 0.8830 - val_loss: 0.3033
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m482s[0m 2s/step - accuracy: 0.9414 - loss: 0.1647 - val_accuracy: 0.8844 - val_loss: 0.3024
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m492s[0m 2s/step - accuracy: 0.9764 - loss: 0.0756 - val_accuracy: 0.8652 - val_loss: 0.3660
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m517s[0m 2s/step - accuracy: 0.9815 - loss: 0.0580 - val_accuracy: 0.8746 - val_loss: 0.5074
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m478s[0m 2s/step - accuracy: 0.9949 - loss: 0.0209 - val_accuracy: 0.8720 - val_loss: 0.5928


<keras.src.callbacks.history.History at 0x1d119262690>

In [49]:
loss, accuracy = model.evaluate(X_test, y_class_test)
print(f'Accuracy: {accuracy * 100:.2f}%')

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 86ms/step - accuracy: 0.8742 - loss: 0.5895
Accuracy: 87.20%


In [51]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = (model.predict(X_test) > 0.5).astype("int32")
print(confusion_matrix(y_class_test, y_pred))
print(classification_report(y_class_test, y_pred))

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 89ms/step
[[2160  355]
 [ 285 2200]]
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      2515
           1       0.86      0.89      0.87      2485

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



In [53]:
model.save('model.h5')



In [59]:
model.save('model.keras')

In [61]:
import pickle 
with open('tokenizer.pickle', 'wb') as handle: pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)