In [None]:
import re

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from langdetect import DetectorFactory
from langdetect import detect
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
# Read the Airbnb reviews CSV (path is relative to the notebook's working
# directory) into the working frame `df`, then echo the frame.
print("loading airbnb reviews dataset...")
df = pd.read_csv("reviews.csv")
print("Orginal dataset\n {}".format(df))

loading airbnb reviews dataset...
Orginal dataset
                  listing_id                   id        date  reviewer_id  \
0                    164448               407660  2011-07-30       870312   
1                    164448               451097  2011-08-16       901633   
2                    164448               472271  2011-08-24       894674   
3                    164448               521708  2011-09-11       232485   
4                    164448               568347  2011-09-26       896712   
...                     ...                  ...         ...          ...   
144943  1305729897949100039  1317299844322463966  2024-12-22    666963478   
144944  1306393577432561110  1315873156254706149  2024-12-20    442177472   
144945  1310885179474906446  1316587888366107935  2024-12-21     53058534   
144946  1312171107579620356  1320280905319488958  2024-12-26    595482193   
144947  1316786620019656619  1322401494741586698  2024-12-29    135251598   

       reviewer_name    

In [None]:
# Clean html tags
def clean_html(text):
    """Remove HTML tags from a review comment.

    Each run of tags (together with the whitespace around it) is replaced by
    a single space rather than by nothing, so that words separated only by
    markup such as ``<br/>`` are not fused into one token
    (``"great<br/>place"`` -> ``"great place"``). Leading and trailing
    whitespace is stripped from the result.

    Args:
        text: The value to clean. Only ``str`` values are processed.

    Returns:
        The cleaned string, or ``text`` unchanged when it is not a ``str``
        (e.g. NaN / None).
    """
    if not isinstance(text, str):
        return text
    # `<.*?>` is the same non-greedy tag pattern as before; the surrounding
    # group collapses consecutive tags and adjacent whitespace into one space.
    without_tags = re.sub(r"(?:\s*<.*?>)+\s*", " ", text)
    return without_tags.strip()

In [None]:
# Drop rows whose "comments" value is missing, then strip HTML tags from the
# remaining comments. `.assign` builds the cleaned column as part of one
# expression instead of writing it into the frame in a second statement.
df = df.dropna(subset=["comments"]).assign(
    comments=lambda frame: frame["comments"].apply(clean_html)
)
print("After remoing empty comments\n {}".format(df))

After remoing empty comments
                  listing_id                   id        date  reviewer_id  \
0                    164448               407660  2011-07-30       870312   
1                    164448               451097  2011-08-16       901633   
2                    164448               472271  2011-08-24       894674   
3                    164448               521708  2011-09-11       232485   
4                    164448               568347  2011-09-26       896712   
...                     ...                  ...         ...          ...   
144943  1305729897949100039  1317299844322463966  2024-12-22    666963478   
144944  1306393577432561110  1315873156254706149  2024-12-20    442177472   
144945  1310885179474906446  1316587888366107935  2024-12-21     53058534   
144946  1312171107579620356  1320280905319488958  2024-12-26    595482193   
144947  1316786620019656619  1322401494741586698  2024-12-29    135251598   

       reviewer_name                         

In [None]:
# Keep only the rows whose comment is detected as English
def is_english(text):
    """Return True when ``text`` is detected as English by ``langdetect``.

    Args:
        text: The comment to classify.

    Returns:
        bool: ``True`` if ``detect(text)`` reports ``"en"``. ``False`` for
        non-string input, blank strings, or when detection raises.
    """
    # Non-strings (e.g. NaN) and blank strings carry no language signal, so
    # skip the detector entirely.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == "en"
    except Exception:
        # Detection failure is treated as "not English". Catching Exception
        # (not a bare except) lets KeyboardInterrupt propagate, so the long
        # row-by-row apply can still be interrupted from the kernel.
        return False
    
# langdetect's detector is randomized by default, so short or ambiguous
# comments can be classified differently from run to run. Pinning the seed
# makes the set of rows kept below reproducible.
DetectorFactory.seed = 0

df["is_english"] = df["comments"].apply(is_english)
df = df[df["is_english"]]
print(f"After removing non-english comments\n {df}")

After removing non-english comments
                  listing_id                   id        date  reviewer_id  \
0                    164448               407660  2011-07-30       870312   
1                    164448               451097  2011-08-16       901633   
2                    164448               472271  2011-08-24       894674   
3                    164448               521708  2011-09-11       232485   
4                    164448               568347  2011-09-26       896712   
...                     ...                  ...         ...          ...   
144934  1298010749055182234  1312307766873062259  2024-12-15    436029491   
144935  1298347645930946959  1307248295819579395  2024-12-08    479920506   
144938  1301950247646962698  1316668854181848371  2024-12-21     34223838   
144942  1304284803854492401  1307949554237811713  2024-12-09     86342696   
144944  1306393577432561110  1315873156254706149  2024-12-20    442177472   

       reviewer_name                  

In [None]:
# The URL below points at the Universal Sentence Encoder (v4) on TensorFlow
# Hub, not at a BERT model, and nothing is downloaded in this cell: the URL is
# only consumed later by build_classifier_model(). The message says so.
print("Configuring pre-trained Universal Sentence Encoder for sentiment analysis...")
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"

Loading pre-trained BERT model for sentiment analysis...


In [None]:
# build the model

def build_classifier_model():
    """Assemble and compile a binary classifier on top of a frozen TF-Hub layer.

    The first layer is a ``hub.KerasLayer`` created from the module-level
    ``model_url`` with ``trainable=False``. It is followed by
    Dense(64, relu) -> Dense(32, relu) -> Dense(1, sigmoid).

    NOTE(review): this builder is not called in the cells visible here.

    Returns:
        The ``tf.keras.Sequential`` model, compiled with the "adam"
        optimizer, "binary_crossentropy" loss and the "accuracy" metric.
    """
    classifier = tf.keras.Sequential()

    # Pre-trained layer, kept frozen during training.
    classifier.add(hub.KerasLayer(model_url, trainable=False))

    # Two hidden layers of decreasing width, then one sigmoid output unit.
    for units in (64, 32):
        classifier.add(tf.keras.layers.Dense(units, activation="relu"))
    classifier.add(tf.keras.layers.Dense(1, activation="sigmoid"))

    classifier.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )
    return classifier

In [None]:
# Keyword lexicons used by simple_sentiment() to derive weak sentiment labels
# (no model is created in this cell).
print("Creating labels for training model...")

# Terms counted as evidence of a positive review.
positive_words = [
    'great', 'good', 'nice', 'excellent', 'perfect', 'happy', 'wonderful',
    'fantastic', 'amazing', 'love', 'best', 'beautiful', 'clean',
    'comfortable',
]

# Terms counted as evidence of a negative review.
negative_words = [
    'bad', 'poor', 'terrible', 'horrible', 'awful', 'worst', 'dirty',
    'disappointing', 'disappointment', 'uncomfortable', 'problem',
    'not clean',
]

Creating labels for training model...


In [None]:
def _lexicon_pattern(term):
    """Return a regex matching ``term`` at the start of a word, or None if blank."""
    parts = term.lower().split()
    if not parts:
        return None
    # The look-behind anchors the term to a word start, so "comfortable" does
    # not match inside "uncomfortable" and "nice" does not match inside
    # "venice", while inflections such as "loved" / "problems" still match.
    # Multi-word terms tolerate any whitespace between their words.
    return r"(?<!\w)" + r"\s+".join(re.escape(part) for part in parts)


def simple_sentiment(text, positive=None, negative=None):
    """Derive a weak binary sentiment label from keyword lexicons.

    The label is negative only when more distinct negative terms than
    distinct positive terms occur in ``text``; ties default to positive.
    Negative matches are removed from the text before positive terms are
    counted, so a negated phrase such as "not clean" is not also counted as
    the positive term "clean".

    Args:
        text: The review comment. Non-string values are labelled positive.
        positive: Optional iterable of positive terms; defaults to the
            module-level ``positive_words``.
        negative: Optional iterable of negative terms; defaults to the
            module-level ``negative_words``.

    Returns:
        int: ``0`` for negative, ``1`` for positive (the default).
    """
    if not isinstance(text, str):
        return 1

    pos_terms = positive_words if positive is None else positive
    neg_terms = negative_words if negative is None else negative

    lowered = text.lower()
    remainder = lowered  # text with negative matches blanked out
    neg_count = 0
    for term in neg_terms:
        pattern = _lexicon_pattern(term)
        if pattern and re.search(pattern, lowered):
            neg_count += 1
            remainder = re.sub(pattern, " ", remainder)

    pos_count = 0
    for term in pos_terms:
        pattern = _lexicon_pattern(term)
        if pattern and re.search(pattern, remainder):
            pos_count += 1

    if neg_count > pos_count:
        return 0  # negative
    return 1  # default to positive

In [None]:
# Label every comment with simple_sentiment() (0 = negative, 1 = positive)
df["sentiment_label"] = df["comments"].map(simple_sentiment)

In [None]:
# Print distribution of initial labels
print("\nInitial sentiment distribution:")
label_counts = df["sentiment_label"].value_counts()
print(label_counts)
# .get avoids a KeyError when no row carries label 1, and the length guard
# avoids dividing by zero when every row was filtered out upstream.
positive_count = label_counts.get(1, 0)
positive_pct = positive_count / len(df) * 100 if len(df) else 0.0
print(f"Positive percentage: {positive_pct:.2f}%")


Initial sentiment distribution:
sentiment_label
1    99320
0      386
Name: count, dtype: int64
Positive percentage: 99.61%


In [None]:
# Draw the training subset: at most 10,000 rows, with a fixed random_state so
# the same rows are drawn on every run.
sample_size = min(len(df), 10000)
sample_df = df.sample(n=sample_size, random_state=42)

In [None]:
# Text preprocessing settings
print("Preprocessing text data...")
# max_features: vocabulary size passed to the Tokenizer and the Embedding layer
# max_length: length every padded sequence is given
max_features, max_length = 10000, 200

Preprocessing text data...


In [None]:
# Build the tokenizer, limited to `max_features` words, and fit its
# vocabulary on the sampled comments.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts=sample_df["comments"])

In [None]:
# Encode each comment as a sequence of integer word ids, pad the sequences to
# `max_length`, and pull the labels out as a NumPy array.
x_sequences = tokenizer.texts_to_sequences(sample_df["comments"])
x_padded = pad_sequences(x_sequences, maxlen=max_length)
y = sample_df["sentiment_label"].to_numpy()

In [None]:
# split into training and validation sets
# The labels are heavily imbalanced (see the distribution printed earlier), so
# stratify on `y` to keep the rare negative class represented in both splits.
# Stratification needs at least two samples of every class; fall back to a
# plain random split when that is not the case.
_label_values, _label_counts = np.unique(y, return_counts=True)
_stratify = y if len(_label_values) > 1 and _label_counts.min() >= 2 else None
x_train, x_val, y_train, y_val = train_test_split(
    x_padded, y, test_size=0.2, random_state=42, stratify=_stratify
)
print(f"Training data shape: {x_train.shape}")
print(f"Validation data shape: {x_val.shape}")

Training data shape: (8000, 200)
Validation data shape: (2000, 200)


In [None]:
# build a simple neural network model
print("Building and training sentiment model...")
model = tf.keras.Sequential([
    # Declare the input shape explicitly instead of via Embedding's
    # `input_length` argument, which is deprecated in Keras 3. This also
    # builds the model immediately, so model.summary() can report shapes.
    tf.keras.Input(shape=(max_length,), dtype="int32"),
    # Map each word id (< max_features) to a 128-dimensional vector.
    tf.keras.layers.Embedding(max_features, 128),
    # 128 filters over windows of 5 consecutive tokens.
    tf.keras.layers.Conv1D(128, 5, activation="relu"),
    # Keep the strongest response of each filter across the sequence.
    tf.keras.layers.GlobalMaxPool1D(),
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dropout(0.5),
    # Single sigmoid unit: probability of the positive class (label 1).
    tf.keras.layers.Dense(1, activation="sigmoid")
])

Building and training sentiment model...




In [None]:
# Configure training: binary cross-entropy loss for the single sigmoid
# output, Adam optimizer, and accuracy as the reported metric.
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
# Print the summary of `model` (the network defined and compiled above)
model.summary()

In [None]:
# GPU diagnostics. `tf.test.is_gpu_available()` is deprecated (TensorFlow's
# own notice says to use `tf.config.list_physical_devices('GPU')`), so the
# availability check is derived from the device list instead.
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))
print(tf.test.is_built_with_cuda())  # Check if TensorFlow is built with CUDA
print(tf.test.is_built_with_gpu_support())  # Check if GPU support is enabled
print(bool(gpus))  # Check if at least one GPU is visible to TensorFlow

Num GPUs Available:  1
True
True
Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True


I0000 00:00:1745488941.578492   28069 gpu_device.cc:2019] Created device /device:GPU:0 with 6687 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1080, pci bus id: 0000:2b:00.0, compute capability: 6.1


In [None]:
# Fit the model for 5 epochs in mini-batches of 16, evaluating on the
# validation split after each epoch; the returned History is kept in `history`.
# NOTE(review): the recorded run of this cell failed with "DNN library
# initialization failed" -- the log reports cuDNN runtime 9.1.0 while
# TensorFlow was compiled against 9.3.0.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=16,
    epochs=5,
    verbose=1,
    validation_data=(x_val, y_val),
)

Epoch 1/5


E0000 00:00:1745489013.764935   29330 cuda_dnn.cc:522] Loaded runtime CuDNN library: 9.1.0 but source was compiled with: 9.3.0.  CuDNN library needs to have matching major version and equal or higher minor version. If using a binary install, upgrade your CuDNN library.  If building from sources, make sure the library loaded at runtime is compatible with the version specified during compile configuration.
E0000 00:00:1745489013.811076   29330 cuda_dnn.cc:522] Loaded runtime CuDNN library: 9.1.0 but source was compiled with: 9.3.0.  CuDNN library needs to have matching major version and equal or higher minor version. If using a binary install, upgrade your CuDNN library.  If building from sources, make sure the library loaded at runtime is compatible with the version specified during compile configuration.
2025-04-24 12:03:33.814901: W tensorflow/core/framework/op_kernel.cc:1857] OP_REQUIRES failed at xla_ops.cc:591 : FAILED_PRECONDITION: DNN library initialization failed. Look at the er

FailedPreconditionError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/usr/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/usr/lib/python3.10/runpy.py", line 86, in _run_code

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/usr/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/usr/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/usr/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3077, in run_cell

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3132, in _run_cell

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3336, in run_cell_async

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3519, in run_ast_nodes

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3579, in run_code

  File "/tmp/ipykernel_28069/2429203810.py", line 2, in <module>

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 371, in fit

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 219, in function

  File "/mnt/c/pa2572/assignment_1/pa2572/venv/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 132, in multi_step_on_iterator

DNN library initialization failed. Look at the errors above for more details.
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_multi_step_on_iterator_9410]