In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Colab helpers for uploading files and mounting Google Drive
from google.colab import files, drive

In [None]:
# Mount Google Drive and read the labelled training reviews.
drive.mount('/content/drive')
file_path = "/content/drive/MyDrive/Online Hackathon/Hotel Review Sentiment Analysis/train_data.csv"
df = pd.read_csv(file_path)

# Features are the review texts (forced to str); targets are the ratings
# mapped to integer class ids by the label encoder.
X = df['Review'].astype(str)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['Rating'])

# Hold out 20% of the rows for evaluation, using a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Tokenisation and padding settings.
max_words = 20000  # vocabulary size cap handed to the tokenizer
max_len = 300  # maxlen handed to pad_sequences for every split

# The vocabulary is fitted on the training split only.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Convert both splits to integer sequences, then pad them at the end
# ('post') to max_len.
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)
X_train_pad, X_test_pad = (
    pad_sequences(seqs, maxlen=max_len, padding='post')
    for seqs in (X_train_seq, X_test_seq)
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# โหลด GloVe Pretrained Word Embeddings (100D)
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

In [None]:
# Dimensionality of the GloVe vectors read below (glove.6B.100d.txt).
embedding_dim = 100
embeddings_index = {}

# Parse the GloVe text file: each line is a token followed by its
# whitespace-separated coefficients.
with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coef = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coef

# Build the embedding matrix: row i holds the GloVe vector of the token with
# tokenizer index i. Tokens without a GloVe vector keep an all-zero row, and
# indices >= max_words are skipped because the matrix only has max_words rows.
word_index = tokenizer.word_index
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in word_index.items():
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are repeatable, in line with the seeded train/test split.
tf.keras.utils.set_random_seed(42)

# Frozen embedding layer initialised from GloVe. The weights are assigned
# with build() + set_weights() rather than the `weights=` / `input_length=`
# constructor arguments, because `input_length` is deprecated in Keras 3.
# NOTE(review): sequences are post-padded with 0 and no mask is applied, so
# the LSTMs also read the padding steps; consider mask_zero=True - confirm
# it behaves on the target runtime before enabling.
embedding_layer = Embedding(input_dim=max_words, output_dim=embedding_dim, trainable=False)
embedding_layer.build((1,))
embedding_layer.set_weights([embedding_matrix])

# One output unit per rating class known to the label encoder.
num_classes = len(label_encoder.classes_)

# Two stacked bidirectional LSTMs followed by a small dense classifier head.
model = Sequential([
    embedding_layer,
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.3),
    BatchNormalization(),
    Bidirectional(LSTM(64)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(num_classes, activation='softmax')
])

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model.
# NOTE(review): the held-out split is used both as validation data here and
# for the final evaluate() below, so the two reports cover the same rows.
history = model.fit(X_train_pad, y_train, validation_data=(X_test_pad, y_test), epochs=5, batch_size=32)

# Evaluate on the held-out split.
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f'Accuracy: {accuracy:.2f}')

# ฟังก์ชันทำนายรีวิวใหม่
def predict_review(review_text):
    """Predict the rating label for a single review.

    The text is tokenised with the fitted ``tokenizer``, post-padded to
    ``max_len`` and scored by ``model``; the most probable class id is then
    mapped back to its original rating with ``label_encoder``.

    Args:
        review_text: The review text to classify.

    Returns:
        The predicted rating, in the same form as the labels the encoder
        was fitted on.
    """
    seq = tokenizer.texts_to_sequences([review_text])
    pad_seq = pad_sequences(seq, maxlen=max_len, padding='post')
    # verbose=0 keeps Keras from printing one progress bar per call, which
    # floods the output when this function is applied row by row.
    pred = model.predict(pad_seq, verbose=0)
    # pred has a single row; take the argmax over the class axis.
    score = np.argmax(pred, axis=-1)[0]
    return label_encoder.inverse_transform([score])[0]



Epoch 1/5
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m860s[0m 2s/step - accuracy: 0.4719 - loss: 1.2099 - val_accuracy: 0.5679 - val_loss: 0.9918
Epoch 2/5
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m856s[0m 2s/step - accuracy: 0.5788 - loss: 0.9761 - val_accuracy: 0.6061 - val_loss: 0.8681
Epoch 3/5
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m845s[0m 2s/step - accuracy: 0.6165 - loss: 0.9002 - val_accuracy: 0.6229 - val_loss: 0.8659
Epoch 4/5
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m865s[0m 2s/step - accuracy: 0.6297 - loss: 0.8501 - val_accuracy: 0.6362 - val_loss: 0.8146
Epoch 5/5
[1m461/461[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m851s[0m 2s/step - accuracy: 0.6444 - loss: 0.8199 - val_accuracy: 0.6273 - val_loss: 0.8343
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 538ms/step - accuracy: 0.6383 - loss: 0.8066
Accuracy: 0.63


In [None]:
# Smoke-test the classifier on a few hand-picked reviews: one short
# free-form sentence and two longer, dataset-style reviews.
sample_reviews = [
    "The hotel was amazing and the staff was very friendly!",
    "beautifull little hotel montmartre june 2008reviewer season travelers middle age couple canadawe just returned 10 day trip paris stayed littre rive gauche montmartre, hotel recommended airline booked agent, surprise room large unusual europe especially paris, appointed immaculately clean, location 1.5 block main metro lines good district shopping eating just walking, staff helpfull breakfast expected courteous attentive staff, 3rd trip paris best budget hotel stayed, recommend,  ",
    "money waiste right ahead book barcele punta cana start n't speak spanish forget thank god husband did help lot people, start check-in need lot patient good room oh boy old smelly tv n't work remote n't work suppose satellite channel spanish ask water pop card towels, bring bc wo n't able shower notowels bring hide did facecloths iron does'nt work hairdryer forget soap shampoo weeks wash tub wipe mirrors not expecting clean vacation n't want clean vacation n't tip no service including bars buffet attitude dirty extremely dirty like eat food salt bingo, place go.toronto canada,  ",
]

# Predict and report each review in turn.
for test_review in sample_reviews:
    predicted_score = predict_review(test_review)
    print(f'Predicted Score: {predicted_score}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Predicted Score: 5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 241ms/step
Predicted Score: 5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 270ms/step
Predicted Score: 1


In [None]:
# Load the unlabelled test reviews from Drive.
file_path = "/content/drive/MyDrive/Online Hackathon/Hotel Review Sentiment Analysis/test_data.csv"
df = pd.read_csv(file_path)

# The predictions need both an 'ID' and a 'Review' column.
if 'ID' in df.columns and 'Review' in df.columns:
    # Score all reviews with one batched model.predict call instead of one
    # call per row: same tokenise -> pad -> argmax -> inverse_transform
    # steps, without the per-row Keras overhead and progress-bar output.
    review_texts = df['Review'].astype(str).tolist()
    if review_texts:
        review_seq = tokenizer.texts_to_sequences(review_texts)
        review_pad = pad_sequences(review_seq, maxlen=max_len, padding='post')
        class_probs = model.predict(review_pad, batch_size=256, verbose=0)
        class_ids = np.argmax(class_probs, axis=1)
        df['Rating'] = label_encoder.inverse_transform(class_ids)
    else:
        # Nothing to score: keep the column so the selection below still works.
        df['Rating'] = []

    # Keep only the ID and Rating columns.
    result_df = df[['ID', 'Rating']]

    # Show a sample of the results.
    print(result_df.head())

    # Optionally save the results as a CSV file on Drive.
    # result_df.to_csv("/content/drive/MyDrive/Hotel_Review_Predictions.csv", index=False)
else:
    print("Error: CSV file must contain 'ID' and 'Review' columns.")



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 290ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 309ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 309ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 313ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 300ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 148ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 146ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [None]:
# Write the ID/Rating predictions to output.csv in the working directory,
# leaving out the DataFrame index column.
result_df.to_csv(path_or_buf="output.csv", index=False)