# Downloading Libraries

In [1]:
! pip install nltk
! pip install vaderSentiment
! pip install pytrends
! pip install textblob
! pip install wordcloud
! pip install gensim
! pip install seaborn
! pip install TextBlob
! pip install joblib
! pip install transformers

Collecting transformers
  Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.8/126.8 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting filelock (from transformers)
  Downloading filelock-3.13.1-py3-none-any.whl.metadata (2.8 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers)
  Downloading huggingface_hub-0.20.1-py3-none-any.whl.metadata (12 kB)
Collecting pyyaml>=5.1 (from transformers)
  Downloading PyYAML-6.0.1.tar.gz (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m125.2/125.2 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting tokenizers<0.19,>=0.14 (from transformers)
  Downloading tokenizers-0.15.0-cp38-cp38-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting s

# Importing Libraries

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ModuleNotFoundError: No module named 'tensorflow'

# Importing Dataset

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Datasets/Cleaned Data/reviews.csv')
df.head()

Unnamed: 0,listing_id,reviewer_name,comments,cleaned_comments,polarity,sentiment
0,4326511,Laura,Das Zimmer und das Bad waren sauber und komfor...,da zimmer und da bad waren sauber und komforta...,-0.7,negative
1,603032069621870277,Fatima,Bad service,bad servic,-0.7,negative
2,8629818,Andrew,"In Paul's absence, Lucy and Jack were amazing ...",paul absenc luci jack amaz host brthe secret g...,-0.4,negative
3,25012636,Kate,I used Nicholas’ room as a base whilst I was i...,use nicholas’ room base whilst properti busi f...,-0.8,negative
4,12725143,Yvon,You can’t go wrong with the townhouse. It has ...,can’t go wrong townhous amen need place spotle...,-0.5,negative


# Date Preprocessing

In [None]:
df.isnull().sum()

listing_id            0
reviewer_name        63
comments             63
cleaned_comments    209
polarity            163
sentiment           163
dtype: int64

In [None]:
# Dropping the null values
df.dropna(inplace=True)
df.isnull().sum()

listing_id          0
reviewer_name       0
comments            0
cleaned_comments    0
polarity            0
sentiment           0
dtype: int64

## **<center><u>Assessment: Experimentations (using Deep Learning)</u></center>**


In [None]:
# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Tokenize and encode sequences in the dataset
tokens = tokenizer.batch_encode_plus(
    df['cleaned_comments'].tolist(),
    max_length=128,  # You can adjust this depending on your text length
    padding='max_length',
    truncation=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors='tf'
  )

In [None]:
# Prepare input data
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']

In [None]:
# Prepare labels
df['sentiment'] = df['sentiment'].astype('category')
labels = df['sentiment'].cat.codes

In [None]:
input_ids_np = input_ids.numpy()
attention_mask_np = attention_mask.numpy()

labels_np = labels.to_numpy()

In [None]:
train_input_ids, test_input_ids, train_attention_mask, test_attention_mask, train_labels, test_labels = train_test_split(
    input_ids_np,
    attention_mask_np,
    labels_np,  # assuming labels_np is already a numpy array
    test_size=0.2,
    random_state=42
)

In [None]:
train_input_ids = tf.convert_to_tensor(train_input_ids)
train_attention_mask = tf.convert_to_tensor(train_attention_mask)
train_labels = tf.convert_to_tensor(train_labels)

test_input_ids = tf.convert_to_tensor(test_input_ids)
test_attention_mask = tf.convert_to_tensor(test_attention_mask)
test_labels = tf.convert_to_tensor(test_labels)

In [None]:
train_labels = tf.cast(train_labels, dtype=tf.int32)

In [None]:
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(df['sentiment'].cat.categories))

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
optimizer = Adam(learning_rate=2e-5, epsilon=1e-08)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
history = model.fit(
    {'input_ids': train_input_ids, 'attention_mask': train_attention_mask},
    train_labels,
    batch_size=32,  # Adjust based on your GPU memory
    epochs=10,  # BERT typically requires only a few epochs
    validation_split=0.1  # Use a part of the training set for validation
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
result = model.evaluate(
    x={'input_ids': test_input_ids, 'attention_mask': test_attention_mask},
    y=test_labels
)
print(f"Test loss: {result[0]}, Test accuracy: {result[1]}")

Test loss: 1.0986123085021973, Test accuracy: 0.3271551728248596
