## LLM

This notebook uses the llama-3-405b-instruct model to predict the sentiment of Trip Advisor hotel reviews.

Note that most of the code in this file is inspired by the LLM guide in MA2

In [None]:
import pandas as pd
from decouple import config
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai.foundation_models import ModelInference
from sklearn.metrics import classification_report 
from tqdm import tqdm
from ibm_watsonx_ai.foundation_models.schema import TextGenParameters
from sklearn.model_selection import train_test_split

## Loading in the data

In [5]:
# Relative location of the preprocessed review file
csv_path = '../Preprocessing/data_preprocessed_general.csv'

# Read the CSV into a DataFrame
data = pd.read_csv(filepath_or_buffer=csv_path)

# Preview the first five rows
data.head(n=5)

Unnamed: 0,text,rating_overall
0,rooms are fine. service tries hard but does no...,3.0
1,best place to stay in nyc. want to go back mis...,5.0
2,it's a great place. i'll always check to see i...,5.0
3,this hotel has some of the biggest rooms in ma...,5.0
4,if you want to stay on the upper west side thi...,4.0


In [6]:
# Collapse the numeric ratings into three sentiment labels:
#   0-2 -> Negative, 3 -> Neutral, 4-5 -> Positive
data['rating_overall'] = (
    data['rating_overall']
    .replace(range(0, 3), 'Negative')
    .replace(3, 'Neutral')
    .replace(range(4, 6), 'Positive')
)

# Number of reviews per sentiment label
result = data.groupby('rating_overall').size()

result

rating_overall
Negative     3263
Neutral      3982
Positive    38291
dtype: int64

In [7]:

# Work on a reproducible 5% random sample of the reviews
fraction = 0.05
df_sample = data.sample(random_state=42, frac=fraction)

# Report the sample's dimensions, then preview its first rows
print(df_sample.shape)
print(df_sample.head())

(2277, 2)
                                                    text rating_overall
10586  great location for what we were seeing in ny, ...       Positive
28420  the hotel staff are very good. friendly, accom...       Positive
30457  clean and comfortable hotel - staff very helpf...       Positive
43765  hotel was sound. was in town for pistons and i...       Positive
14475  great location on the magnificent mile. beauti...       Positive


In [None]:


# Split df_sample into a small "train" split (20%) and a large test split (80%).
# NOTE: test_size=0.8 sends 80% of the rows to test_df. No model is fitted in this
# notebook; train_df is only the subset on which the prompts are compared.
train_df, test_df = train_test_split(df_sample, test_size=0.8, random_state=42)

# Display the shapes of the resulting DataFrames
print("Train DataFrame shape:", train_df.shape)
print("Test DataFrame shape:", test_df.shape)

# Display the first few rows of each DataFrame
print("Train DataFrame:")
print(train_df.head())
print("\nTest DataFrame:")
print(test_df.head())

Train DataFrame shape: (455, 2)
Test DataFrame shape: (1822, 2)
Train DataFrame:
                                                    text rating_overall
42449  my stay at the baymont was more than i expecte...       Positive
10955  great location near irish bars and flying sauc...       Positive
6233   i stay here at least once a month for a due du...       Positive
37721  excellent. comfortable and friendly.\nstaff wa...       Positive
39127  nice, clean, upscale but expensive. i was look...       Positive

Test DataFrame:
                                                    text rating_overall
1655   everthing was superb and our room was exceptio...       Positive
44129  very cosy location, but not quite up to standa...       Positive
13254  fabulous location, great rooms and they know i...       Positive
19543  this was the best staff i have experienced @ a...       Positive
39045  i just liked the location other than that, ver...       Negative


In [9]:
# Class distribution of train_df (note: despite its name, result_test holds the
# counts of the train split, not of the test split)
result_test = train_df.groupby(by='rating_overall').size()

result_test

rating_overall
Negative     46
Neutral      38
Positive    371
dtype: int64

## Connecting to WatsonX and checking the connection

In [10]:
# watsonx.ai endpoint used for all requests
WX_API_URL = 'https://us-south.ml.cloud.ibm.com'

# The API key and project id are looked up through decouple's config()
# instead of being hard-coded in the notebook
WX_API_KEY = config("WX_API_KEY")
WX_PROJECT_ID = config("WX_PROJECT_ID")

In [11]:


# Bundle the endpoint URL and the API key into a credentials object
credentials = Credentials(api_key=WX_API_KEY, url=WX_API_URL)

# API client created with those credentials and the configured project id
client = APIClient(project_id=WX_PROJECT_ID, credentials=credentials)


In [12]:
# Inference handle for Llama 3 405B Instruct; no generation parameters are passed here
model = ModelInference(model_id="meta-llama/llama-3-405b-instruct", api_client=client)

In [13]:
# Smoke test: send an arbitrary prompt to check that the connection works
prompt = "How do I make a pizza?"
generated_response = model.generate(prompt=prompt)

# Show the full raw response dictionary
generated_response



{'model_id': 'meta-llama/llama-3-405b-instruct',
 'model_version': '3.1.0',
 'created_at': '2025-05-07T12:45:57.971Z',
 'results': [{'generated_text': " Here's a step-by-step guide to making a delicious homemade pizza:\nIngredients:\n\n* 1 ",
   'generated_token_count': 20,
   'input_token_count': 8,
   'stop_reason': 'max_tokens'}],
    'more_info': 'https://dataplatform.cloud.ibm.com/docs/content/wsj/analyze-data/fm-models.html?context=wx'},
   {'message': "The value of 'parameters.max_new_tokens' for this model was set to value 20",
    'id': 'unspecified_max_new_tokens',
    'additional_properties': {'limit': 0,
     'new_value': 20,
     'parameter': 'parameters.max_new_tokens',
     'value': 0}}]}}

### Building a model 

In [17]:
# Generation settings shared by every prompting strategy below.
# NOTE(review): temperature and top_p presumably only take effect when sampling is
# the active decoding method; decoding_method is not set here — confirm the default.
PARAMS = TextGenParameters(
    max_new_tokens=20,            # upper bound on the number of generated tokens
    stop_sequences=[".", "\n"],  # stop generating at the first full stop or newline
    temperature=0.2,              # low temperature: randomness is not wanted here
    top_p=0.7,
)

# Re-create the inference handle so that it uses PARAMS
model = ModelInference(
    model_id="meta-llama/llama-3-405b-instruct",
    params=PARAMS,
    api_client=client,
)

#### Using zero shot prompting

In [18]:
# Zero-shot prompt template. Placeholders filled via str.format():
#   {categories} - bullet list of the allowed sentiment labels
#   {text}       - the review to classify
# NOTE(review): the prompt text contains the typos "You task" and "inlcude". They are
# part of what is sent to the model, so correcting them changes the experiment and
# requires re-running the evaluation cells.
zero_shot = """You task is to classify the sentiment of hotel reviews from Trip Advisor provided in the text into one of three sentiment categories.

CATEGORIES:
{categories}

TEXT:
{text}

Please assign the correct sentiment category to the review. Your answer should only inlcude the correct sentiment category and nothing else.

Category:
"""

In [19]:
# Create a string with all categories
CATEGORIES = "- " + "\n- ".join(train_df["rating_overall"].unique())

predictions = []

for text in tqdm(train_df["text"]):

    # format the prompt with the categories and the text
    prompt = zero_shot.format(categories=CATEGORIES, text=text)
    
    # generate the response from the model
    response = model.generate(prompt)

    # extract the generated text from the response
    prediction = response["results"][0]["generated_text"].strip()

    # append the prediction to the list of predictions
    predictions.append(prediction)

100%|██████████| 455/455 [03:03<00:00,  2.48it/s]


In [20]:
# Per-class precision / recall / F1 of the zero-shot answers against the train_df labels
print(classification_report(y_true=train_df["rating_overall"], y_pred=predictions))

              precision    recall  f1-score   support

    Negative       0.81      0.91      0.86        46
     Neutral       0.46      0.45      0.45        38
    Positive       0.96      0.95      0.96       371

    accuracy                           0.91       455
   macro avg       0.74      0.77      0.76       455
weighted avg       0.91      0.91      0.91       455



#### Using few shot prompting

In [22]:
# Few-shot prompt template: same structure as the zero-shot prompt, preceded by six
# labelled example reviews (two per sentiment category).
# Placeholders filled via str.format():
#   {categories} - bullet list of the allowed sentiment labels
#   {text}       - the review to classify
# NOTE(review): the prompt text contains the typos "You task" and "inlcude". They are
# part of what is sent to the model, so correcting them changes the experiment and
# requires re-running the evaluation cells.
few_shot = """You task is to classify the sentiment of hotel reviews from Trip Advisor provided in the text into one of three sentiment categories. You are given some review examples.

EXAMPLES:
- Text: "the hotel was great and the staff were friendly and helpful."
  Sentiment category: Positive
- Text: "the service was amazing and the food was delicious."
  Sentiment category: Positive
- Text: "the room was dirty and the service was terrible, bad value."
  Sentiment category: Negative
- Text: "the location was perfect, but the staff were rude."
  Sentiment category: Negative
- Text: "the location was pretty good, but the amenities were lacking."
  Sentiment category: Neutral
- Text: "the hotel was average, breakfast was good but the rooms were a bit dirty."
  Sentiment category: Neutral

CATEGORIES:
{categories}

TEXT:
{text}

Please assign the correct sentiment category to the review. Your answer should only inlcude the correct sentiment category and nothing else.

Category:
"""

In [23]:
# Raw few-shot answers, one per row of train_df
# (the "_test" suffix notwithstanding, this loop runs over the train split)
fs_predictions_test = []

for text in tqdm(train_df["text"]):
    # fill the few-shot template for this review
    fs_prompt = few_shot.format(text=text, categories=CATEGORIES)

    # query the model and keep the whitespace-stripped text of its first result
    fs_response = model.generate(fs_prompt)
    fs_first_result = fs_response["results"][0]
    fs_prediction = fs_first_result["generated_text"].strip()

    fs_predictions_test.append(fs_prediction)

100%|██████████| 455/455 [02:56<00:00,  2.57it/s]


In [24]:
# Per-class metrics of the few-shot answers against the train_df labels.
# The answers are raw model strings, so any answer outside the three labels shows up
# as an extra class with support 0 (the recorded output contains a "1" row).
print(classification_report(y_true=train_df["rating_overall"], y_pred=fs_predictions_test))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00         0
    Negative       0.86      0.80      0.83        46
     Neutral       0.33      0.63      0.44        38
    Positive       0.98      0.89      0.93       371

    accuracy                           0.86       455
   macro avg       0.54      0.58      0.55       455
weighted avg       0.91      0.86      0.88       455



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### CoT Prompting

In [25]:
# "Chain-of-thought" prompt template: the zero-shot wording plus "Think step by step."
# Placeholders filled via str.format():
#   {categories} - bullet list of the allowed sentiment labels
#   {text}       - the review to classify
# NOTE(review): the prompt still asks for the category "and nothing else", and PARAMS
# limits generation to 20 new tokens and stops at the first "." or newline, so no
# intermediate reasoning is actually generated — this is effectively a zero-shot variant.
# NOTE(review): the prompt text contains the typo "inlcude"; it is part of what is sent
# to the model, so correcting it requires re-running the evaluation cells.
CoT = """Your task is to classify the sentiment of hotel reviews from Trip Advisor into one of three categories. Think step by step.

CATEGORIES:
{categories}

TEXT:
{text}

Assign the correct sentiment category to the review. Your answer should only inlcude the correct sentiment category and nothing else.

Category:
"""

In [26]:
# Raw CoT-prompt answers, one per row of train_df
# (the "_test" suffix notwithstanding, this loop runs over the train split)
CoT_predictions_test = []

for text in tqdm(train_df["text"]):
    # fill the CoT template for this review
    CoT_prompt = CoT.format(text=text, categories=CATEGORIES)

    # query the model and keep the whitespace-stripped text of its first result
    CoT_response = model.generate(CoT_prompt)
    CoT_first_result = CoT_response["results"][0]
    CoT_prediction = CoT_first_result["generated_text"].strip()

    CoT_predictions_test.append(CoT_prediction)

100%|██████████| 455/455 [02:45<00:00,  2.74it/s]


In [27]:
# Per-class precision / recall / F1 of the CoT-prompt answers against the train_df labels
print(classification_report(y_true=train_df["rating_overall"], y_pred=CoT_predictions_test))

              precision    recall  f1-score   support

    Negative       0.79      0.89      0.84        46
     Neutral       0.50      0.42      0.46        38
    Positive       0.96      0.96      0.96       371

    accuracy                           0.91       455
   macro avg       0.75      0.76      0.75       455
weighted avg       0.91      0.91      0.91       455



### Testing the best prompt (CoT) on the remaining test data

In [28]:
# Create a string with all categories
CATEGORIES = "- " + "\n- ".join(test_df["rating_overall"].unique())

Final_CoT_predictions_test = []

for text in tqdm(test_df["text"]):

    # format the prompt with the categories and the text
    Final_CoT_prompt = CoT.format(categories=CATEGORIES, text=text)
    
    # generate the response from the model
    Final_CoT_response = model.generate(Final_CoT_prompt)

    # extract the generated text from the response
    Final_CoT_prediction = Final_CoT_response["results"][0]["generated_text"].strip()

    # append the prediction to the list of predictions
    Final_CoT_predictions_test.append(Final_CoT_prediction)

100%|██████████| 1822/1822 [10:48<00:00,  2.81it/s]


In [41]:
# Per-class precision / recall / F1 of the final CoT-prompt answers against the test_df labels
print(classification_report(y_true=test_df["rating_overall"], y_pred=Final_CoT_predictions_test))

              precision    recall  f1-score   support

    Negative       0.72      0.90      0.80       111
     Neutral       0.53      0.48      0.51       180
    Positive       0.96      0.95      0.96      1531

    accuracy                           0.90      1822
   macro avg       0.74      0.78      0.75      1822
weighted avg       0.90      0.90      0.90      1822

