### Recommendation

- I would recommend that you run this notebook in Google Colab.

- To use the GPU that Colab provides, open the `Runtime` menu and select `Change runtime type`.

- In the pop-up window ensure that `Runtime type` is `Python` and then mark the checkbox `T4 GPU` under `Hardware accelerator`.

- Also make sure the dataset has been uploaded where the notebook expects it: the reviews are read from `data/kenya-airways.csv`, relative to the notebook's working directory.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from scipy.special import softmax
import torch

# apply matplotlib's 'ggplot' style sheet to any figures drawn with plt
plt.style.use('ggplot')

In [2]:
# read the raw Kenya Airways reviews and preview the first five rows
kq_data = pd.read_csv(filepath_or_buffer="data/kenya-airways.csv")
kq_data.head(5)

Unnamed: 0.1,Unnamed: 0,date_published,summary_title,country,trip_verified,review,other_reviews,ratings_10,aircraft,type_of_traveller,cabin_flown,route,recommended,seat_comfort,cabin_staff_service,food_and_beverages,inflight_entertainment,ground_service,value_for_money,wifi_and_connectivity
0,0,2024-06-27,Will fly again,Denmark,Trip Verified,Joburg-Nairobi-Amsterdam 26-27 June 2024 on Bo...,,8.0,Boeing 787-8,Business,Business Class,Johannesburg to Amsterdam via Nairobi,yes,5.0,4.0,4.0,4.0,5.0,4.0,
1,1,2024-06-27,keep on cancelling the flights,Kenya,Not Verified,Horrible airline. They keep on cancelling the ...,,1.0,,Solo Leisure,Economy Class,London Heathrow to Mombasa via Nairobi,no,2.0,3.0,2.0,2.0,2.0,1.0,
2,2,2024-06-27,One of the worst flights,South Africa,Trip Verified,One of the worst flights i have ever experienc...,,3.0,,Family Leisure,Economy Class,Johannesburg to Nairobi,no,1.0,3.0,3.0,2.0,2.0,2.0,1.0
3,3,2024-06-22,Very unprofessional ground staff,India,Not Verified,1. Had paid for seating after which was reques...,,4.0,,Family Leisure,Economy Class,Nairobi to Mumbai,no,2.0,2.0,1.0,,1.0,2.0,
4,4,2024-06-13,refused to compensate me,Canada,Not Verified,I was returning a trip to Kenya. Kenyan Airway...,,1.0,,Family Leisure,Economy Class,Nairobi to Amsterdam,no,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
# keep only the row identifier and the two text columns used for sentiment
review_columns = ['Unnamed: 0', 'summary_title', 'review']
# take an explicit copy: pandas treats a plain column selection as derived from
# the original frame, so adding a column to it later would otherwise emit
# SettingWithCopyWarning
kq_data = kq_data[review_columns].copy()
kq_data.head()

Unnamed: 0.1,Unnamed: 0,summary_title,review
0,0,Will fly again,Joburg-Nairobi-Amsterdam 26-27 June 2024 on Bo...
1,1,keep on cancelling the flights,Horrible airline. They keep on cancelling the ...
2,2,One of the worst flights,One of the worst flights i have ever experienc...
3,3,Very unprofessional ground staff,1. Had paid for seating after which was reques...
4,4,refused to compensate me,I was returning a trip to Kenya. Kenyan Airway...


In [4]:
# count the missing entries in the review text column
kq_data['review'].isnull().sum()

0

In [5]:
# join title and body into a single text field; a missing title or body is
# replaced by an empty string (na_rep='') so the combined text is never NaN.
# assign/drop build a new frame rather than mutating the column selection made
# earlier, which avoids SettingWithCopyWarning and in-place side effects.
kq_data = (
  kq_data
  .assign(full_review=lambda df: df['summary_title']
          .str.cat(df['review'], sep=' ', na_rep=''))
  # the two source columns are redundant once they are merged
  .drop(columns=['summary_title', 'review'])
)
kq_data.head()

Unnamed: 0.1,Unnamed: 0,full_review
0,0,Will fly again Joburg-Nairobi-Amsterdam 26-27 ...
1,1,keep on cancelling the flights Horrible airlin...
2,2,One of the worst flights One of the worst flig...
3,3,Very unprofessional ground staff 1. Had paid f...
4,4,refused to compensate me I was returning a tri...


## Sentiment Analysis

In [6]:
# run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
# pick one review (row label 300) to try the tokenizer and model on
example_review = kq_data.loc[300, 'full_review']
example_review

'Kenya Airways customer review ["DXB to JNB via NBO. Check-in in Dubai took longer than expected, but as most are 3rd party they don\'t care that much to be efficient. Brand new Dreamliner in both legs, good seat arrangement with fair pitch. Onboard entertainment was ok and service above standards (great surprise), good crew. Connection in NBO very smooth. Great value, recommended."]'

In [8]:
# load the pretrained sentiment checkpoint and its matching tokenizer
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
# place the model on the selected device (GPU if available)
model = model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [9]:
# run roberta model on our example review
# truncate with the same 512-token cap that sentiment_polarity_score applies,
# so an over-long example is shortened instead of failing inside the model
encoded_text = tokenizer(example_review, return_tensors='pt',
                         truncation=True, max_length=512)
# tensors must live on the same device as the model
encoded_text = {key: val.to(device) for key, val in encoded_text.items()}
with torch.no_grad():
  model_output = model(**encoded_text)

# first (only) row of logits -> probabilities via softmax
scores = softmax(model_output.logits[0].cpu().numpy())

scores_dict = {
  'roberta_neg' : scores[0],
  'roberta_neu' : scores[1],
  'roberta_pos' : scores[2]
}
print(pd.Series(scores_dict))

roberta_neg    0.015551
roberta_neu    0.119947
roberta_pos    0.864502
dtype: float32


In [10]:
# function to use for the whole dataset
def sentiment_polarity_score(example_review, max_length=512):
  """Score a single review with the loaded RoBERTa sentiment model.

  Args:
    example_review: text of the review to classify.
    max_length: maximum number of tokens kept; longer reviews are truncated.

  Returns:
    pd.Series holding the softmax probabilities under the keys
    'roberta_neg', 'roberta_neu' and 'roberta_pos'.
  """
  # tokenize one review; a batch of one needs no padding, so the model only
  # processes the review's own tokens instead of always max_length of them
  encoded_text = tokenizer(example_review, return_tensors='pt',
                           truncation=True, max_length=max_length)
  # move the tokenized review to the same device as the model
  encoded_text = {key: val.to(device) for key, val in encoded_text.items()}
  # inference only: no gradient bookkeeping
  with torch.no_grad():
    model_output = model(**encoded_text)
  # logits of the single input, moved to the cpu and turned into probabilities
  scores = softmax(model_output.logits[0].cpu().numpy())
  return pd.Series({
    'roberta_neg' : scores[0],
    'roberta_neu' : scores[1],
    'roberta_pos' : scores[2]
  })

In [11]:
# applying roberta model to whole dataset
# errors are deliberately left to propagate: if scoring fails the three score
# columns are never created, and printing-and-continuing would only postpone
# the failure to a confusing KeyError in the cells that read those columns
kq_data[['roberta_neg', 'roberta_neu', 'roberta_pos']] = (
  kq_data['full_review'].apply(sentiment_polarity_score)
)

In [12]:
# uncomment if you want to see the progress

# from tqdm.notebook import tqdm
# roberta_results = {}
# for i, row in tqdm(kq_data.iterrows(), total=len(kq_data)):
#   text = row['full_review']
#   myid = row['Unnamed: 0']
#   roberta_result = sentiment_polarity_score(text)
#   roberta_results[myid] = roberta_result
# print(roberta_results)

In [13]:
# score column -> human-readable sentiment label
sentiment_mapping = {
    'roberta_neg': 'negative',
    'roberta_neu': 'neutral',
    'roberta_pos': 'positive'
}
roberta_columns = list(sentiment_mapping)

# highest of the three probabilities for each review
kq_data['overall_sentiment'] = kq_data[roberta_columns].max(axis='columns')

# name of the winning score column, translated to its readable label
kq_data['sentiment_label'] = (
    kq_data[roberta_columns]
    .idxmax(axis='columns')
    .map(sentiment_mapping)
)
kq_data.head()

Unnamed: 0.1,Unnamed: 0,full_review,roberta_neg,roberta_neu,roberta_pos,overall_sentiment,sentiment_label
0,0,Will fly again Joburg-Nairobi-Amsterdam 26-27 ...,0.015191,0.178933,0.805876,0.805876,positive
1,1,keep on cancelling the flights Horrible airlin...,0.946588,0.048273,0.005139,0.946588,negative
2,2,One of the worst flights One of the worst flig...,0.973179,0.023846,0.002976,0.973179,negative
3,3,Very unprofessional ground staff 1. Had paid f...,0.877634,0.113443,0.008923,0.877634,negative
4,4,refused to compensate me I was returning a tri...,0.883592,0.109669,0.006739,0.883592,negative


In [14]:
# write the scored reviews to disk, leaving out the DataFrame index
kq_data.to_csv(path_or_buf="review_trained.csv", index=False)