In [10]:
import spacy
import pandas as pd
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import chardet

In [33]:

# Load Spacy model
nlp = spacy.load("en_core_web_sm")

# Load and preprocess data
with open("C:/Users/fedor/Documents/NLP/task1/tripadvisor_hotel_reviews.csv", 'rb') as f:
    result = chardet.detect(f.read())
df = pd.read_csv("C:/Users/fedor/Documents/NLP/task1/tripadvisor_hotel_reviews.csv", encoding=result['encoding'], sep=',')
df = df[['Review', 'Rating']]
df = df[df["Rating"] != 3]
df["Rating"] = df["Rating"].apply(lambda x: 0 if x in [1, 2] else 1)
df = df.sample(frac=1)
# Prepare text and label data
texts = df['Review'].values
labels = df['Rating'].values

In [34]:
i = 1
print(texts[i])
print(labels[i])

bravo overrated stayed bravo 3 nights martineau bay 3 nights glad did n't stay bravo stay, ok parts hotel bravo quite pretty rooms totally overrated, beds uncomfortable hard frette linen world n't help bed cheap foam matress hard board, no proper drapes blinds seaview room 5 6 strips white cloth not pull right outside window terrace guests restaurant eat, n't count privacy room people stare straight, used room sleeping 6 7 sun blaring room, assume minimalist just low standard.also not use beach rocky swimming conditions bad, nobody used 3 days, staff friendly, restaurant good food, packed lunches pretty good half listed menu n't picked, martineau bay heaven stay bravo, hotel 2 pristine beaches room beautiful luxurius bathtub bedroom bathroom, rooms clean times size rooms bravo, spa really nice too.blue beach red beach amazing,  
0


In [36]:
# Tokenize the text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index

# Pad sequences to the same length
data = pad_sequences(sequences, maxlen=1000)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(data, labels, test_size=0.2)

# Define CNN architecture
model = Sequential()
model.add(Embedding(len(word_index) + 1, 100, input_length=1000))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(1, activation='sigmoid'))

In [37]:

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=128, epochs=20, validation_data=(X_val, y_val), verbose=2)


Epoch 1/20
115/115 - 13s - loss: 0.3538 - accuracy: 0.8600 - val_loss: 0.1741 - val_accuracy: 0.9347 - 13s/epoch - 113ms/step
Epoch 2/20
115/115 - 14s - loss: 0.1247 - accuracy: 0.9548 - val_loss: 0.1283 - val_accuracy: 0.9465 - 14s/epoch - 119ms/step
Epoch 3/20
115/115 - 14s - loss: 0.0683 - accuracy: 0.9777 - val_loss: 0.1223 - val_accuracy: 0.9500 - 14s/epoch - 118ms/step
Epoch 4/20
115/115 - 14s - loss: 0.0371 - accuracy: 0.9907 - val_loss: 0.1303 - val_accuracy: 0.9500 - 14s/epoch - 120ms/step
Epoch 5/20
115/115 - 14s - loss: 0.0184 - accuracy: 0.9970 - val_loss: 0.1502 - val_accuracy: 0.9500 - 14s/epoch - 122ms/step
Epoch 6/20
115/115 - 14s - loss: 0.0085 - accuracy: 0.9992 - val_loss: 0.1584 - val_accuracy: 0.9500 - 14s/epoch - 120ms/step
Epoch 7/20
115/115 - 14s - loss: 0.0042 - accuracy: 0.9998 - val_loss: 0.1674 - val_accuracy: 0.9468 - 14s/epoch - 119ms/step
Epoch 8/20
115/115 - 14s - loss: 0.0023 - accuracy: 1.0000 - val_loss: 0.1780 - val_accuracy: 0.9478 - 14s/epoch - 122

<keras.src.callbacks.History at 0x1a8edfca8f0>

In [38]:
model.save_weights('model_weights.h5')

In [42]:
# Load the saved weights
model.load_weights('model_weights.h5')

# Now you can use this model to predict the sentiment of a new text:
text = "super good window room"
# Tokenize and pad the text so it can be used as input to the model
sequence = tokenizer.texts_to_sequences([text])
data = pad_sequences(sequence, maxlen=1000)
# Use the model to predict the sentiment of the text
prediction = model.predict(data)
sentiment = "positive" if prediction > .5 else "negative"
print(f"The sentiment is {sentiment}")

The sentiment is positive
