In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Directory that holds the input CSV files read by the cells below.
data_path = "./data"
# Seed value; no cell in this part of the notebook references it.
random_state = 69

In [3]:
# Load the training reviews from <data_path>/train_reviews.csv.
review_path = os.path.join(data_path, "train_reviews.csv")
review_df = pd.read_csv(review_path)
# Preview the first rows of the loaded frame.
review_df.head()

Unnamed: 0,Hotel_Address,Average_Score,Hotel_Name,Negative_Review,Positive_Review,Reviewer_Score,Tags,Hotel_Nation,Negative_Review_Nouns,Positive_Review_Nouns
0,11 Cadogan Gardens Sloane Square Kensington an...,8.7,11 Cadogan Gardens,empty,The staff are so friendly and helpful,10.0,"[' Leisure trip ', ' Solo traveler ', ' Superi...",UK,empty,staff
1,11 Cadogan Gardens Sloane Square Kensington an...,8.7,11 Cadogan Gardens,Sadly the day of my presentation the woman on...,The room and location were absolutely brillia...,9.6,"[' Business trip ', ' Solo traveler ', ' Super...",UK,day presentation woman desk note inbox driver ...,room location staff way bit bathroom snob touc...
2,11 Cadogan Gardens Sloane Square Kensington an...,8.7,11 Cadogan Gardens,empty,The whole experience from start to finish was...,10.0,"[' Leisure trip ', ' Couple ', ' Deluxe King R...",UK,empty,experience start finish service none room brea...
3,11 Cadogan Gardens Sloane Square Kensington an...,8.7,11 Cadogan Gardens,Still under renovation so building noise stil...,Breakfast amazing location perfect for west e...,8.3,"[' Leisure trip ', ' Couple ', ' Deluxe King R...",UK,renovation building noise problem outside,breakfast location west end king road area
4,11 Cadogan Gardens Sloane Square Kensington an...,8.7,11 Cadogan Gardens,The renovations were still on going and I tho...,Staff amazing very helpful and couldn t have ...,7.1,"[' Leisure trip ', ' Group ', ' Superior Queen...",UK,renovation december january sound drilling bre...,staff


In [4]:
# The hotel-level frame keeps the extracted noun columns but not the raw
# review text or the tags.
drop_labels = [
    "Negative_Review",
    "Positive_Review",
    "Tags",
]
hotel_df = review_df.drop(labels=drop_labels, axis=1)

# Aspect nouns dictionary

In [5]:
# Load the noun -> aspect-label vocabulary from <data_path>/aspect_nouns.csv.
vocab_path = os.path.join(data_path, "aspect_nouns.csv")
vocab_df = pd.read_csv(vocab_path)
# Display the vocabulary table.
vocab_df

Unnamed: 0,word,label
0,room,3
1,staff,2
2,hotel,1
3,location,1
4,breakfast,5
...,...,...
685,hip,1
686,circus,1
687,church,1
688,police,1


In [6]:
# Map each noun to its aspect label. The columns are selected by name so the
# mapping does not depend on the CSV having exactly two columns in
# (word, label) order, e.g. if the file is re-exported with an index column.
aspect_nouns = dict(zip(vocab_df["word"], vocab_df["label"]))
aspect_nouns

{'room': 3,
 'staff': 2,
 'hotel': 1,
 'location': 1,
 'breakfast': 5,
 'bed': 3,
 'service': 2,
 'bathroom': 3,
 'bar': 1,
 'restaurant': 1,
 'station': 1,
 'time': 2,
 'reception': 2,
 'area': 1,
 'food': 5,
 'view': 1,
 'shower': 3,
 'price': 2,
 'facility': 4,
 'floor': 3,
 'coffee': 5,
 'city': 1,
 'place': 1,
 'minute': 2,
 'water': 5,
 'door': 3,
 'metro': 1,
 'window': 3,
 'check': 2,
 'size': 3,
 'air': 3,
 'money': 2,
 'pool': 1,
 'tea': 5,
 'street': 1,
 'noise': 3,
 'value': 2,
 'walk': 1,
 'people': 2,
 'parking': 1,
 'work': 2,
 'tube': 3,
 'star': 1,
 'choice': 2,
 'quality': 2,
 'desk': 2,
 'train': 1,
 'problem': 2,
 'guest': 2,
 'bath': 3,
 'distance': 1,
 'way': 1,
 'space': 1,
 'park': 1,
 'access': 1,
 'hour': 2,
 'bedroom': 3,
 'drink': 5,
 'experience': 2,
 'bus': 1,
 'arrival': 1,
 'decor': 3,
 'center': 1,
 'toilet': 3,
 'wall': 3,
 'buffet': 5,
 'towel': 4,
 'sleep': 3,
 'card': 2,
 'car': 1,
 'spa': 1,
 'evening': 1,
 'design': 3,
 'building': 1,
 'suite': 3,

# Hotel feature extraction

In [7]:
# Distinct hotel names (sorted ascending by np.unique) and the number of
# review rows each one has.
hotel_names, review_counts = np.unique(
    hotel_df["Hotel_Name"].to_numpy(),
    return_counts=True,
)

In [8]:
# Two-column table of (hotel name, review count), ordered by review count
# from most to fewest. The stable argsort on the negated counts keeps hotels
# with equal counts in their original (name-sorted) order.
arr = np.stack([hotel_names, review_counts], axis=1)[
    np.argsort(-review_counts, kind="stable")
]

In [9]:
# Row index into the count-sorted `arr`; -1 selects the last row, i.e. the
# hotel with the fewest reviews.
hotel_code = -1
arr[hotel_code]

array(['Hotel Gallitzinberg', 6], dtype=object)

In [27]:
def _count_aspect_mentions(noun_strings, aspect_nouns, n_aspects):
  """Tally how often each aspect label is mentioned across several reviews.

  Args:
    noun_strings: Iterable of space-separated noun strings, one per review.
    aspect_nouns: Mapping noun -> 1-based aspect label.
    n_aspects: Length of the returned count vector.

  Returns:
    int64 array whose position ``label - 1`` holds the number of nouns that
    carry ``label``.
  """
  counts = np.zeros(n_aspects, dtype=np.int64)
  for nouns in noun_strings:
    if not isinstance(nouns, str):
      # read_csv turns blank / NA-like cells into NaN (a float): nothing to count.
      continue
    for noun in nouns.split(" "):
      if noun in aspect_nouns:
        counts[aspect_nouns[noun] - 1] += 1
  return counts


# Loop-invariant: number of distinct aspect labels (labels index the vector
# as label - 1).
nola = len(np.unique(list(aspect_nouns.values())))
# Split the reviews per hotel once instead of re-scanning the whole frame
# with a boolean mask for every hotel.
reviews_by_hotel = hotel_df.groupby("Hotel_Name")

# hotel name -> list of per-aspect scores in [-1, 1]
hotel_features = {}
for hotel in arr[:, 0]:
  hotel_data = reviews_by_hotel.get_group(hotel)

  pos_counts = _count_aspect_mentions(
      hotel_data["Positive_Review_Nouns"].to_numpy(), aspect_nouns, nola)
  neg_counts = _count_aspect_mentions(
      hotel_data["Negative_Review_Nouns"].to_numpy(), aspect_nouns, nola)

  # Net mentions per aspect (positive minus negative). Kept in int64: the
  # previous int8 cast wrapped around for any net count outside [-128, 127],
  # which silently corrupted the features of hotels with many reviews.
  feats = pos_counts - neg_counts

  # Normalise by the total absolute net mentions; fall back to 1 so a hotel
  # with no net mentions gets an all-zero vector instead of NaNs.
  denom = np.sum(np.abs(feats))
  denom = denom if denom != 0 else 1
  hotel_features[hotel] = list(feats / denom)

In [28]:
# Spot-check the feature vector computed for one hotel.
hotel_features["11 Cadogan Gardens"]

[0.3940520446096654,
 0.40148698884758366,
 -0.10780669144981413,
 -0.05947955390334572,
 0.03717472118959108]

In [18]:
def review_feats_extract(review, aspect_nouns):
  """Turn one review's noun string into a normalised aspect-frequency vector.

  Args:
    review: Space-separated nouns of a single review. A non-string value
      (e.g. the NaN that pandas produces for a blank or NA-like CSV cell) is
      treated as a review without nouns instead of raising AttributeError.
    aspect_nouns: Mapping noun -> 1-based aspect label.

  Returns:
    List with one float per distinct label in ``aspect_nouns``. Entry
    ``label - 1`` is the share of matched nouns that carry ``label``; the
    vector is all zeros when no noun matches.
  """
  # Number of distinct labels determines the vector length.
  nola = len(set(aspect_nouns.values()))
  feat_counts = np.zeros(nola)

  if isinstance(review, str):
    for noun in review.split(" "):
      if noun in aspect_nouns:
        feat_counts[aspect_nouns[noun] - 1] += 1

  # Counts are non-negative, so the plain sum is the normaliser; use 1 when
  # nothing matched to avoid dividing by zero.
  denom = feat_counts.sum()
  denom = denom if denom != 0 else 1
  return list(feat_counts / denom)

In [21]:
# One aspect-frequency vector per review row, built from the nouns of the
# positive review text.
review_features = [
    review_feats_extract(nouns, aspect_nouns)
    for nouns in review_df["Positive_Review_Nouns"]
]

In [37]:
# Display the per-review feature vectors built from Positive_Review_Nouns.
review_features

[[0.0, 1.0, 0.0, 0.0, 0.0],
 [0.2, 0.3, 0.3, 0.1, 0.1],
 [0.0, 0.6, 0.2, 0.0, 0.2],
 [0.6666666666666666, 0.0, 0.16666666666666666, 0.0, 0.16666666666666666],
 [0.0, 1.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0],
 [0.2, 0.6, 0.2, 0.0, 0.0],
 [0.0, 0.3333333333333333, 0.6666666666666666, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.5, 0.5, 0.0, 0.0],
 [0.0, 0.25, 0.75, 0.0, 0.0],
 [0.2727272727272727,
  0.45454545454545453,
  0.18181818181818182,
  0.0,
  0.09090909090909091],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.5, 0.0, 0.5, 0.0, 0.0],
 [0.0, 0.5, 0.5, 0.0, 0.0],
 [0.0, 0.0, 1.0, 0.0, 0.0],
 [0.1, 0.2, 0.5, 0.0, 0.2],
 [0.25, 0.25, 0.25, 0.0, 0.25],
 [0.0, 0.6, 0.4, 0.0, 0.0],
 [0.4, 0.4, 0.2, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [1.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.5, 0.5, 0.0, 0.0],
 [0.3333333333333333, 0.6666666666666666, 0.0, 0.0, 0.0],
 [0.5, 0.375, 0.125, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.4, 0.2, 0.2, 0.0, 0.2],
 [0.6666666666666666, 0.3333333333333333, 0.0, 0.0, 0.0]

In [29]:
# Repeat each hotel's feature vector once per review row so it lines up with
# review_df; a hotel name missing from hotel_features raises KeyError.
hotel_features_list = [
    hotel_features[name] for name in review_df["Hotel_Name"]
]
hotel_features_list

[[0.3940520446096654,
  0.40148698884758366,
  -0.10780669144981413,
  -0.05947955390334572,
  0.03717472118959108],
 [0.3940520446096654,
  0.40148698884758366,
  -0.10780669144981413,
  -0.05947955390334572,
  0.03717472118959108],
 [0.3940520446096654,
  0.40148698884758366,
  -0.10780669144981413,
  -0.05947955390334572,
  0.03717472118959108],
 [0.3940520446096654,
  0.40148698884758366,
  -0.10780669144981413,
  -0.05947955390334572,
  0.03717472118959108],
 [0.3940520446096654,
  0.40148698884758366,
  -0.10780669144981413,
  -0.05947955390334572,
  0.03717472118959108],
 [0.3940520446096654,
  0.40148698884758366,
  -0.10780669144981413,
  -0.05947955390334572,
  0.03717472118959108],
 [0.3940520446096654,
  0.40148698884758366,
  -0.10780669144981413,
  -0.05947955390334572,
  0.03717472118959108],
 [0.3940520446096654,
  0.40148698884758366,
  -0.10780669144981413,
  -0.05947955390334572,
  0.03717472118959108],
 [0.3940520446096654,
  0.40148698884758366,
  -0.10780669144981

In [46]:
# Column names for the five hotel-level and five review-level aspect scores.
hotel_columns = [
    "Hotel_Location",
    "Hotel_Service",
    "Hotel_Room",
    "Hotel_F&A",
    "Hotel_Meal",
]
customer_columns = [
    "Customer_Location",
    "Customer_Service",
    "Customer_Room",
    "Customer_F&A",
    "Customer_Meal",
]

# Wrap both feature lists in frames, then append them column-wise to the
# original reviews (rows are matched on the index).
feature_frames = [
    pd.DataFrame(values, columns=names)
    for values, names in (
        (hotel_features_list, hotel_columns),
        (review_features, customer_columns),
    )
]
train_df = pd.concat([review_df, *feature_frames], axis=1)
train_df.head()

Unnamed: 0,Hotel_Address,Average_Score,Hotel_Name,Negative_Review,Positive_Review,Reviewer_Score,Tags,Hotel_Nation,Negative_Review_Nouns,Positive_Review_Nouns,Hotel_Location,Hotel_Service,Hotel_Room,Hotel_F&A,Hotel_Meal,Customer_Location,Customer_Service,Customer_Room,Customer_F&A,Customer_Meal
0,11 Cadogan Gardens Sloane Square Kensington an...,8.7,11 Cadogan Gardens,empty,The staff are so friendly and helpful,10.0,"[' Leisure trip ', ' Solo traveler ', ' Superi...",UK,empty,staff,0.394052,0.401487,-0.107807,-0.05948,0.037175,0.0,1.0,0.0,0.0,0.0
1,11 Cadogan Gardens Sloane Square Kensington an...,8.7,11 Cadogan Gardens,Sadly the day of my presentation the woman on...,The room and location were absolutely brillia...,9.6,"[' Business trip ', ' Solo traveler ', ' Super...",UK,day presentation woman desk note inbox driver ...,room location staff way bit bathroom snob touc...,0.394052,0.401487,-0.107807,-0.05948,0.037175,0.2,0.3,0.3,0.1,0.1
2,11 Cadogan Gardens Sloane Square Kensington an...,8.7,11 Cadogan Gardens,empty,The whole experience from start to finish was...,10.0,"[' Leisure trip ', ' Couple ', ' Deluxe King R...",UK,empty,experience start finish service none room brea...,0.394052,0.401487,-0.107807,-0.05948,0.037175,0.0,0.6,0.2,0.0,0.2
3,11 Cadogan Gardens Sloane Square Kensington an...,8.7,11 Cadogan Gardens,Still under renovation so building noise stil...,Breakfast amazing location perfect for west e...,8.3,"[' Leisure trip ', ' Couple ', ' Deluxe King R...",UK,renovation building noise problem outside,breakfast location west end king road area,0.394052,0.401487,-0.107807,-0.05948,0.037175,0.666667,0.0,0.166667,0.0,0.166667
4,11 Cadogan Gardens Sloane Square Kensington an...,8.7,11 Cadogan Gardens,The renovations were still on going and I tho...,Staff amazing very helpful and couldn t have ...,7.1,"[' Leisure trip ', ' Group ', ' Superior Queen...",UK,renovation december january sound drilling bre...,staff,0.394052,0.401487,-0.107807,-0.05948,0.037175,0.0,1.0,0.0,0.0,0.0
