In [2]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification
import pandas as pd

from torchtext.data.utils import get_tokenizer


## Transforming TrustYou matches data to input data for polarity and multilabel classification

This notebook converts csv data from SEMA system to input data ready to be used on the new ecosystem

In [29]:
columns = ['category', 'sentiment', 'match', 'text']


en_q1_matches = pd.read_csv('../data/en_q1.csv', names=columns)
en_q2_matches = pd.read_csv('../data/en_q2.csv', names=columns)
en_q3_matches = pd.read_csv('../data/en_q3.csv', names=columns)
frames = [en_q1_matches, en_q2_matches, en_q3_matches]
df = pd.concat(frames)

#df = pd.read_csv('../data/en_q1.csv', names=columns)

print(df['match'].isnull().sum())
df.dropna(subset=['match'], inplace=True)
print(df['match'].isnull().sum())
df.size

14
0


318872

## CATEGORY

In [54]:
data_aux = df[['category', 'match']]
df2 = data_aux.groupby(['match'])['category'].unique()

agrupated_categories = pd.DataFrame(data=df2)

agrupated_categories = agrupated_categories.reset_index()

agrupated_categories

Unnamed: 0,match,category
0,$19 CAD is crazy stupid for a 20 oz,[22]
1,$800 dollars a night and you dont even get a b...,[11]
2,$85 for a 90 min massage,"[21, 22]"
3,& birthday desert,[13]
4,& cleanliness,"[333, 11, 111]"
...,...,...
34837,從硬件到軟件 是比較契合自己的 也都是成為這間酒店常客的原因,[244]
34838,柳传贵 and 吕美are extremely helpful and friendly,[15]
34839,環境的氛圍,[171]
34840,禮貌周到而貼心的照顧與服務,[15]


In [57]:
from sklearn.preprocessing import MultiLabelBinarizer

one_hot = MultiLabelBinarizer()

encoded_categories = one_hot.fit_transform(agrupated_categories["category"])
print(one_hot.classes_)

encoded_cat_df = pd.DataFrame(data=encoded_categories, columns=one_hot.classes_)

df1 = agrupated_categories["match"]
df2 = encoded_cat_df.reset_index(drop=True)

df_out = df2
df_out["match"] = df1

df_out.head()

['11' '111' '12' '13' '14' '15' '16' '171' '18' '201' '21' '22' '244'
 '333' '36' '444' '63']


Unnamed: 0,11,111,12,13,14,15,16,171,18,201,21,22,244,333,36,444,63,match
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,$19 CAD is crazy stupid for a 20 oz
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,$800 dollars a night and you dont even get a b...
2,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,$85 for a 90 min massage
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,& birthday desert
4,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,& cleanliness


In [60]:
df_out.to_csv('hotel_train_mul_cls.csv', sep=',', index=False, header=True)

In [53]:
train = pd.DataFrame(data=X_train)
train.to_csv('hotel_train_mul_cls.txt', sep=' ', index=False, header=False)

test = pd.DataFrame(data=X_val)
test.to_csv('hotel_test_mul_cls.txt', sep=' ', index=False, header=False)

36

## SENTIMENT

In [14]:
#df_aux = df.groupby(['text'])['match'].apply(lambda x: ' SEP '.join(x)).reset_index()

df["match_sentiment"] = df["match"] + " sent(" + df["sentiment"] + ")"

df_aux = df.groupby(['text'])['match_sentiment'].apply(lambda x: ' SEP '.join(x)).reset_index()
df_aux.head(20)

df_aux.head(20)



text               , beautiful pools and gardens and very delicio...
match_sentiment    beautiful pools and gardens sent(p) SEP beauti...
Name: 3, dtype: object

In [60]:
def get_sentiment(sentiment):
    if sentiment == 'p':
        return 'POS'
    if sentiment == 'n':
        return 'NEG'
    if sentiment == 'o':
        return 'NEU'


def get_tagged_sequence(row):
    a = row.tokenized_text
    matches = row.tokenized_match
    # comprobar si hay varios matches
    index_matches = []
    for j in range(0, len(matches)):
        b = matches[j][:-3]
        print(b)
        index_matches.append([[i, i+len(b)] for i in range(len(a)) if a[i:i+len(b)] == b]) 
        row["tagged_text"] = [a[i]+"=O" for i in range(len(a))]
    # si el match no coincide no lo queremos
    if index_matches == []:
        return None
    for w in range(0, len(index_matches)):
        index_match = index_matches[w]
        if index_match == []:
            continue
        for i in range(index_match[0][0], index_match[0][1]):
            row["tagged_text"][i] = "{}{}{}".format(a[i], "=T-", get_sentiment(matches[w][-1]))
            
    return row['tagged_text']

def tokenize_matches(matches):
    tokenizer = get_tokenizer("basic_english")
    matches = matches.split(" SEP ")
    # remove redundant matches
    # dicts contains unique keys, then back to list
    matches_list = list(dict.fromkeys(matches))
    for i in range(0, len(matches_list)):
        matches_list[i] = tokenizer(matches_list[i].strip(",.!)(-"))
    return matches_list
        
#print(get_tagged_sequence(df_aux.iloc[3]))
# print(df_aux.iloc[3].sentiment)
# print(get_sentiment(df_aux.iloc[3].sentiment))

df_aux.astype('object')
tokenizer = get_tokenizer("basic_english")
df_aux['tokenized_text'] = df_aux.apply(lambda row: tokenizer(row.text.strip(",.!)(-")), axis=1)
df_aux['tokenized_match'] = df_aux.apply(lambda row: tokenize_matches(row.match_sentiment), axis=1)
df_aux['tagged_text'] = df_aux.apply(lambda row: get_tagged_sequence(row), axis=1)


# print(df_aux['tagged_text'].isnull().sum())
# df_aux.dropna(subset=['tagged_text'], inplace=True)
# print(df_aux['tagged_text'].isnull().sum())
df_aux.head()


['be', 'coming', 'back']
['private', 'transfer']
['cannot', 'say', 'enough', 'good', 'things', 'about', 'the', 'place']
['beautiful', 'pools', 'and', 'gardens']
['very', 'delicious', 'food']
['best', 'guest', 'relation', 'and', 'restaurant']
['services', 'was', 'fantastic']
['cleanliness']
['staff', 'were', 'helpful']
['excellent', 'services']
['sumptuous', 'afternoon', 'tea']
['make', 'you', 'feel', 'most', 'welcome', 'in', 'a', 'friendly', 'and', 'sincere', 'way', '.']
['a', 'huge', 'choice', 'of', 'restaurants']
['the', 'front', 'of', 'house', 'were', 'happy']
['tons', 'of', 'towels']
['kids', 'entertainment', 'needs', 'big', 'improvement']
['animation', 'team']
['comfortable', 'and', 'good', 'location']
['location', 'is', 'a', 'plus']
['no', 'greetings']
['private', 'butler', 'service']
['room', 'it', 'is', 'spacious', 'for', 'a', 'family', 'of', 'four']
['staff', 'are', 'attentive', 'and', 'helpful']
['club', 'were', 'highly']
['heater', 'was', 'not', 'working']
['pillow', 'not', 

['got', 'a', '$150', 'to', 'spend', 'on', 'food', 'and', 'beverage']
['one', 'of', 'the', 'best', 'breakfasts']
['grateful', 'for', 'care', 'of', 'people', 'booking']
['great']
['great', 'a/c']
['small', 'fridge', 'in', 'the', 'room', 'was', 'useful']
['great', 'food']
['beautiful', 'view', 'from', 'the', 'roof']
['restaurant']
['excellent', 'cleaning', 'service']
['thank', 'you']
['beautiful', 'gardens']
['beautiful', 'rooms']
['great', 'hotel']
['reception']
['room']
['life', 'guards', 'good', 'surveillance']
['beach', 'wery', 'well', 'organized']
['good', 'variety', 'of', 'food']
['friendly', 'staff']
['great', 'hotel']
['satisfied', 'facilities']
['great', 'qscv']
['excellent', 'service']
['great', 'and', 'cozy', 'room']
['clean', 'room']
['very', 'good', 'dinner']
['great', 'bed']
['great', 'breakfast']
['quiet', 'location']
['great', 'city', 'views']
['great', 'clean', 'rooms']
['great', 'family', 'friendly', 'environment']
['great', 'getaway']
['amazing', 'view']
['clean', 'pool

['nice', 'place', 'to', 'stay', '.']
['spacious', 'gym', 'facilities']
['nice', 'pool']
['thankful', 'ibis', 'style', 'braga', 'bandung']
['nice', 'room']
['nice', 'room']
['nice', 'rooms']
['lovely', 'shower', 'and', 'a', 'balcony']
['nice', 'rooms']
['nice', 'selection', 'of', 'teas']
['nice', 'staff']
['nice', ',', 'clean', ',', 'compact', 'hotel']
['night', 'atmosphere']
['night', 'market', 'not', 'open']
['no', 'activities', 'on', 'the', 'beach', 'and', 'the', 'animation', 'team']
['no', 'internet']
['no', 'mini', 'bar']
['more', 'expensive', 'hotel']
['wonderful', 'stay', '.']
['no', 'complementary', 'water', 'in', 'room']
['no', 'cutlery', 'or', 'crockery']
['no', 'fresh', 'juices', 'or', 'fruits']
['no', 'fruit', 'juices']
['no', 'information', 'given', 'about', 'the', 'children’s']
['no', 'internet']
['no', 'kettle']
['no', 'kind', 'of', 'entertainment', 'there']
['very', 'basic', 'and', 'easy', 'animation']
['no', 'private', 'parking']
['no', 'ala', 'carte', 'restaurants']
['

['food', 'quality', 'is', 'below', 'average']
['furniture', 'is', 'outdated']
['problems', 'in', 'the', 'room']
['chefs', 'are', 'great']
['food', 'tastes', 'really', 'delicious']
['food', 'that', 'they', 'serve', 'are', 'all', 'amazing']
['friendly', 'staffs', 'too']
['bakery', 'is', 'freshly']
['food', 'there', 'was', 'extraordinary']
['food', 'was', 'amazing']
['there', 'are', 'lots', 'of', 'activities', 'available']
['more', 'varied', 'menu']
['food', 'was', 'much', 'better']
['waitress', 'was', 'very', 'accommodating']
['views', 'were', 'amazing']
['food', 'was', 'nice']
['most', 'of', 'the', 'staff', 'seemed', 'not', 'motivated']
['food', 'was', 'not', 'good']
['the', 'food', 'was', 'simply', 'superb', 'and', 'costed', 'perfectly']
['food', 'was', 'super']
['food', 'was', 'tasty']
['excellent', 'work', 'the', 'staff']
['the', 'food', ',', 'the', 'view', 'and', 'the', 'location', 'are', 'amazing']
['friendliness', 'and', 'helpfulness', 'of', 'the', 'staff', '.']
['the', 'friendlin

['a', 'hotel', 'porter', 'arranged', 'fresh', 'towels', 'for', 'us']
['we', 'really', 'enjoyed', 'our', 'stay', 'and', 'felt', 'completely', 'safe', 'with', 'the', 'precautions', 'carried', 'out', 'by', 'your', 'attentive', 'staff']
['we', 'really', 'enjoyed', 'our', 'stay', '.']
['we', 'really', 'enjoyed', 'the', 'stay']
['absolutely', 'beautiful', 'place', '.']
['really', 'love', 'the', 'room', "'", 's', 'layout', ',', 'interior', 'and', 'vibe', 'as', 'well']
['nice', 'bath']
['we', 'were', 'upgraded', 'to', 'a', 'spacious', 'modern', 'and', 'elegant', 'family', 'room']
['an', 'amazing', '3', 'nights']
['staff', 'are', 'friendly', 'and', 'helpful']
['novotel', 'new']
['grand', 'harbour', 'view']
['room', 'was', 'fantastic', 'very', 'clean', 'and', 'lots', 'of']
['really', 'impressed', 'with', 'the', 'hotel']
['we', 'stayed', 'in', 'the', 'cabin', 'and', 'it', 'was', 'everything', 'i', 'hoped', 'for']
['shopping']
['for', '3', 'nights', ',', 'it', 'was', 'perfect', 'from', 'start', 't

Unnamed: 0,text,match_sentiment,tokenized_text,tokenized_match,tagged_text
0,", I’ll be coming back very soon inshallah",be coming back sent(p),"[,, i’ll, be, coming, back, very, soon, inshal...","[[be, coming, back, sent, (, p]]","[,=O, i’ll=O, be=T-POS, coming=T-POS, back=T-P..."
1,"(Taxi might be HK$300, private transfer $800.",private transfer sent(p),"[taxi, might, be, hk$300, ,, private, transfer...","[[private, transfer, sent, (, p]]","[taxi=O, might=O, be=O, hk$300=O, ,=O, private..."
2,) Cannot say enough good things about the pla...,Cannot say enough good things about the place ...,"[cannot, say, enough, good, things, about, the...","[[cannot, say, enough, good, things, about, th...","[cannot=T-POS, say=T-POS, enough=T-POS, good=T..."
3,", beautiful pools and gardens and very delicio...",beautiful pools and gardens sent(p) SEP beauti...,"[beautiful, pools, and, gardens, and, very, de...","[[beautiful, pools, and, gardens, sent, (, p],...","[beautiful=T-POS, pools=T-POS, and=T-POS, gard..."
4,", the hotel have best guest relation and rest...",best guest relation and restaurant sent(p) SEP...,"[the, hotel, have, best, guest, relation, and,...","[[best, guest, relation, and, restaurant, sent...","[the=O, hotel=O, have=O, best=T-POS, guest=T-P..."


In [61]:
df_aux.iloc[3].tagged_text

['beautiful=T-POS',
 'pools=T-POS',
 'and=T-POS',
 'gardens=T-POS',
 'and=O',
 'very=T-POS',
 'delicious=T-POS',
 'food=T-POS',
 '.=O']

In [62]:
# randomize rows
df_random = df_aux.sample(frac=1).reset_index(drop=True)

df_random

Unnamed: 0,text,match_sentiment,tokenized_text,tokenized_match,tagged_text
0,Loved the atmosphere.,Loved the atmosphere sent(p),"[loved, the, atmosphere]","[[loved, the, atmosphere, sent, (, p]]","[loved=T-POS, the=T-POS, atmosphere=T-POS]"
1,The chalets are kept immaculate and the suppli...,chalets are kept immaculate sent(p) SEP chalet...,"[the, chalets, are, kept, immaculate, and, the...","[[chalets, are, kept, immaculate, sent, (, p],...","[the=O, chalets=T-POS, are=T-POS, kept=T-POS, ..."
2,The hotel have clearly thought of every way po...,hotel clean and safe sent(p) SEP hotel clean a...,"[the, hotel, have, clearly, thought, of, every...","[[hotel, clean, and, safe, sent, (, p], [excel...","[the=O, hotel=O, have=O, clearly=O, thought=O,..."
3,The house staff for the cabins were very helpf...,house staff for the cabins were very helpful s...,"[the, house, staff, for, the, cabins, were, ve...","[[house, staff, for, the, cabins, were, very, ...","[the=O, house=T-POS, staff=T-POS, for=T-POS, t..."
4,The room was very nice and clean.,room was very nice and clean sent(p) SEP room ...,"[the, room, was, very, nice, and, clean]","[[room, was, very, nice, and, clean, sent, (, p]]","[the=O, room=T-POS, was=T-POS, very=T-POS, nic..."
...,...,...,...,...,...
3171,Big room...,Big room sent(p) SEP Big room sent(p),"[big, room]","[[big, room, sent, (, p]]","[big=T-POS, room=T-POS]"
3172,"It could have been rated as 10, however, the f...",fridge on the room was not working sent(n) SEP...,"[it, could, have, been, rated, as, 10, ,, howe...","[[fridge, on, the, room, was, not, working, se...","[it=O, could=O, have=O, been=O, rated=O, as=O,..."
3173,The gourmet food hall flowed with all the opti...,gourmet food hall sent(p),"[the, gourmet, food, hall, flowed, with, all, ...","[[gourmet, food, hall, sent, (, p]]","[the=O, gourmet=T-POS, food=T-POS, hall=T-POS,..."
3174,I spent one night at Rosewood for my birthday ...,the best experience sent(p),"[i, spent, one, night, at, rosewood, for, my, ...","[[the, best, experience, sent, (, p]]","[i=O, spent=O, one=O, night=O, at=O, rosewood=..."


In [63]:
# generate output in QA format
def get_output(row):
    return row.text + '####'+ " ".join(row["tagged_text"])

df_random['output'] = df_random.apply(lambda row: get_output(row), axis=1)

df_random['output'].head()  
    

0    Loved the atmosphere.####loved=T-POS the=T-POS...
1    The chalets are kept immaculate and the suppli...
2    The hotel have clearly thought of every way po...
3    The house staff for the cabins were very helpf...
4    The room was very nice and clean.####the=O roo...
Name: output, dtype: object

In [64]:
from sklearn.model_selection import train_test_split
df_random["sentiment"] = df["sentiment"]
possible_labels = df_random.sentiment.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

df_random['label'] = df_random.sentiment.replace(label_dict)
df_random.iloc[0].tagged_text

['loved=T-POS', 'the=T-POS', 'atmosphere=T-POS']

In [65]:
X_train, X_val, y_train, y_val = train_test_split(df_random.output.values, 
                                                  df_random.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df_random.label.values)


In [66]:
train = pd.DataFrame(data=X_train)
train.to_csv('hotel_train_mul_polarity.txt', sep=' ', index=False, header=False)

test = pd.DataFrame(data=X_val)
test.to_csv('hotel_test_mul_polarity.txt', sep=' ', index=False, header=False)