In [68]:
import os 

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.optimizers import Adam


In [69]:
arc_path = "C:\\Programming\\univer\\NPL\\Lab2\\aclImdb"

pos_train_path = arc_path + "\\train\\pos"
neg_train_path = arc_path + "\\train\\neg"

pos_file_names = os.listdir(pos_train_path)
neg_file_names = os.listdir(neg_train_path)

In [70]:
bound = 30000

error_list = []

pos_list = []
for index, pos_file_name in enumerate(pos_file_names):
    with open(f"{pos_train_path}\\{pos_file_name}", "r") as file:
        try:
            pos_list.append(file.read())
        except:
            error_list.append(f"pos: {pos_file_name}")
    if index + 1 >= bound:
        break

neg_list = []
for index, neg_file_name in enumerate(neg_file_names):
    with open(f"{neg_train_path}\\{neg_file_name}", "r") as file:
        try:
            neg_list.append(file.read())
        except:
            error_list.append(f"neg: {neg_file_name}")
    if index + 1 >= bound:
        break

In [71]:
min_len = min(len(pos_list), len(neg_list))
df = pd.DataFrame.from_dict({"pos" : pos_list[:min_len], "neg": neg_list[:min_len]})

print("Head:")
print(df.head(1))

print("\nTail")
print(df.tail(1))

Head:
                                                 pos  \
0  Bromwell High is a cartoon comedy. It ran at t...   

                                                 neg  
0  Story of a man who has unnatural feelings for ...  

Tail
                                                     pos  \
12496  Working-class romantic drama from director Mar...   

                                                     neg  
12496  This is one of the dumbest films, I've ever se...  


In [72]:
pos_docs = df['pos'].tolist() 
neg_docs = df['neg'].tolist()

reviews = pos_docs + neg_docs
labels = [1] * len(pos_docs) + [0] * len(neg_docs)

In [73]:
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.2, random_state=42)

In [74]:
max_words = 10000 
maxlen = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

In [75]:
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# print(X_train_sequences[100])
# print(type(tokenizer.index_word))
# print(type(tokenizer))

In [76]:
maxlen = 100
X_train_padded = pad_sequences(X_train_sequences, maxlen=maxlen)
X_test_padded = pad_sequences(X_test_sequences, maxlen=maxlen)
y_train = np.array(y_train)
y_test = np.array(y_test)

In [80]:
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=128))
model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [78]:
batch_size = 32
epochs = 10
model.fit(X_train_padded, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_test_padded, y_test))

Epoch 1/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 32ms/step - accuracy: 0.7205 - loss: 0.5162 - val_accuracy: 0.8520 - val_loss: 0.3372
Epoch 2/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 31ms/step - accuracy: 0.9069 - loss: 0.2393 - val_accuracy: 0.8508 - val_loss: 0.3576
Epoch 3/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 31ms/step - accuracy: 0.9437 - loss: 0.1534 - val_accuracy: 0.8412 - val_loss: 0.4583
Epoch 4/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 30ms/step - accuracy: 0.9725 - loss: 0.0839 - val_accuracy: 0.8324 - val_loss: 0.5858
Epoch 5/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 30ms/step - accuracy: 0.9840 - loss: 0.0516 - val_accuracy: 0.8328 - val_loss: 0.6301
Epoch 6/10
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 30ms/step - accuracy: 0.9898 - loss: 0.0312 - val_accuracy: 0.8338 - val_loss: 0.7526
Epoch 7/10
[1m6

<keras.src.callbacks.history.History at 0x22dd4c8a750>

In [79]:
new_reviews = [
    "This movie was absolutely fantastic and kept me on the edge of my seat!",
    "I found the plot to be boring and predictable.",
    "The acting was superb, and the storyline was captivating.",
    "It was a total waste of time; I couldn't wait for it to end.",
    "An emotional rollercoaster that left me speechless.",
    "A brilliant masterpiece that I would recommend to everyone.",
    "I loved the character development throughout the movie.",
    "Unfortunately, it didn't live up to the hype; very disappointing.",
    "That was the worst film I've ever seen",
    "I don't think that was good choise to split this film in two parts, but in general I liked it"
]

new_reviews_sequences = tokenizer.texts_to_sequences(new_reviews)
new_reviews_padded = pad_sequences(new_reviews_sequences, maxlen=maxlen)

predictions = model.predict(new_reviews_padded)

for i, prediction in enumerate(predictions):
    predicted_class = (prediction[0] > 0.5).astype(int)
    if predicted_class == 1:
        print(f"pos: {new_reviews[i]}")
    else:
        print(f"neg: {new_reviews[i]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 250ms/step
pos: This movie was absolutely fantastic and kept me on the edge of my seat!
neg: I found the plot to be boring and predictable.
pos: The acting was superb, and the storyline was captivating.
neg: It was a total waste of time; I couldn't wait for it to end.
pos: An emotional rollercoaster that left me speechless.
pos: A brilliant masterpiece that I would recommend to everyone.
pos: I loved the character development throughout the movie.
neg: Unfortunately, it didn't live up to the hype; very disappointing.
neg: That was the worst film I've ever seen
pos: I don't think that was good choise to split this film in two parts, but in general I liked it
