In [1]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification
import pandas as pd

from torchtext.data.utils import get_tokenizer


## Transforming TrustYou matches data to input data for span extraction

This notebook converts csv data from SEMA system to input data ready to be used on the new ecosystem

In [2]:
columns = ['category', 'sentiment', 'match', 'text']

data_dir = '../data/'

en_q1_matches = pd.read_csv(data_dir + 'en_q1.csv', names=columns)
en_q2_matches = pd.read_csv(data_dir + 'en_q2.csv', names=columns)
en_q3_matches = pd.read_csv(data_dir + 'en_q3.csv', names=columns)
frames = [en_q1_matches, en_q2_matches, en_q3_matches]
df = pd.concat(frames)

#df = pd.read_csv('../data/en_q1.csv', names=columns)

# remove rows with empty matches
print(df['match'].isnull().sum())
df.dropna(subset=['match'], inplace=True)
print(df['match'].isnull().sum())


14
0


In [3]:
df.groupby(['text'])['match'].apply(lambda x: ','.join(x)).reset_index()
df.head()

Unnamed: 0,category,sentiment,match,text
0,16,n,lifts and the rooms need to be renovated,Cons:- I think the lifts and the rooms need to...
1,244,n,lifts and the rooms need to be renovated,Cons:- I think the lifts and the rooms need to...
2,15,p,Concierge staffs were friendly and attentive,Concierge staffs were friendly and attentive.
3,11,n,can't adjust the aircon,The aircon was too cold and we can't adjust th...
4,14,p,location makes it one of the most popular,Even though the rooms and the bathrooms are sm...


In [5]:
def get_sentiment(sentiment):
    if sentiment == 'p':
        return 'POS'
    if sentiment == 'n':
        return 'NEG'
    if sentiment == 'o':
        return 'NEU'

# function de output data in order to feed the models
def get_tagged_sequence(row):
    a = row.tokenized_text
    b = row.tokenized_match
    index_match = [[i, i+len(b)] for i in range(len(a)) if a[i:i+len(b)] == b]    
    row["tagged_text"] = [a[i]+"=O" for i in range(len(a))]
    # si el match no coincide no lo queremos
    if index_match == []:
        return None
    for i in range(index_match[0][0], index_match[0][1]):
        row["tagged_text"][i] = a[i]+"=T-"+get_sentiment(row["sentiment"]) 
    return row['tagged_text']

# def get_tagged_sequence(row):
#     a = row.tokenized_text
#     b = row.tokenized_match
#     index_match = [[i, i+len(b)] for i in range(len(a)) if a[i:i+len(b)] == b]    
#     row["tagged_text"] = [a[i]+"=O" for i in range(len(a))]
#     # si el match no coincide no lo queremos
#     if index_match == []:
#         return None
#     row["tagged_text"][index_match[0][0]] = a[index_match[0][0]]+"=T-"+get_sentiment(row["sentiment"]) 
#     row["tagged_text"][index_match[0][1] -1] = a[index_match[0][1] -1]+"=T-"+get_sentiment(row["sentiment"]) 
#     return row['tagged_text']
    
  
# result = get_tagged_sequence(df_aux.iloc[11])
# result

In [10]:
tokenizer = get_tokenizer("basic_english")
# delete these problematic lines
df_aux = df.drop([df.index[81], df.index[387]] )

#tokenize text and match in order to tag the sequence

df_aux['tokenized_text'] = df_aux.apply(lambda row: tokenizer(row.text), axis=1)
df_aux['tokenized_match'] = df_aux.apply(lambda row: tokenizer(row.match.strip(",.!")), axis=1)
df_aux['tagged_text'] = df_aux.apply(lambda row: get_tagged_sequence(row), axis=1)


print(df_aux['tagged_text'].isnull().sum())
df_aux.dropna(subset=['tagged_text'], inplace=True)
print(df_aux['tagged_text'].isnull().sum())
df_aux.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


154
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,category,sentiment,match,text,tokenized_text,tokenized_match,tagged_text
21740,63,p,staff were really helpful,The girl who greeted us was lovely and really ...,"[the, girl, who, greeted, us, was, lovely, and...","[staff, were, really, helpful]","[the=O, girl=O, who=O, greeted=O, us=O, was=O,..."
20957,11,p,Satisfied with room and restroom,"Satisfied with room and restroom, spacious, ne...","[satisfied, with, room, and, restroom, ,, spac...","[satisfied, with, room, and, restroom]","[satisfied=T-POS, with=T-POS, room=T-POS, and=..."
5068,16,p,We enjoyed our stay,We enjoyed our stay and hope to return one day...,"[we, enjoyed, our, stay, and, hope, to, return...","[we, enjoyed, our, stay]","[we=T-POS, enjoyed=T-POS, our=T-POS, stay=T-PO..."
5241,111,p,We are returning guests and have been for quit...,We are returning guests and have been for quit...,"[we, are, returning, guests, and, have, been, ...","[we, are, returning, guests, and, have, been, ...","[we=T-POS, are=T-POS, returning=T-POS, guests=..."
6234,18,p,Everything was perfect,Everything was perfect from the moment we ente...,"[everything, was, perfect, from, the, moment, ...","[everything, was, perfect]","[everything=T-POS, was=T-POS, perfect=T-POS, f..."


In [14]:
# randomize dataset just in case there is any internal order
df_random = df_aux.sample(frac=1).reset_index(drop=True)

df_random

Unnamed: 0,category,sentiment,match,text,tokenized_text,tokenized_match,tagged_text
0,14,n,also far from MRT station,The hotel is a bit far from clubs and bars als...,"[the, hotel, is, a, bit, far, from, clubs, and...","[also, far, from, mrt, station]","[the=O, hotel=O, is=O, a=O, bit=O, far=O, from..."
1,16,p,restaurant at the family pool was open,(The restaurant at the family pool was open fo...,"[(, the, restaurant, at, the, family, pool, wa...","[restaurant, at, the, family, pool, was, open]","[(=O, the=O, restaurant=T-POS, at=T-POS, the=T..."
2,333,n,hair on the toilet seat,"There were hair in the bathtub, someone's faci...","[there, were, hair, in, the, bathtub, ,, someo...","[hair, on, the, toilet, seat]","[there=O, were=O, hair=O, in=O, the=O, bathtub..."
3,11,n,rooms weren't all renovated,the rooms weren't all renovated.,"[the, rooms, weren, ', t, all, renovated, .]","[rooms, weren, ', t, all, renovated]","[the=O, rooms=T-NEG, weren=T-NEG, '=T-NEG, t=T..."
4,11,o,Bed sheet was not being changed often.,Bed sheet was not being changed often.,"[bed, sheet, was, not, being, changed, often, .]","[bed, sheet, was, not, being, changed, often]","[bed=T-NEU, sheet=T-NEU, was=T-NEU, not=T-NEU,..."
...,...,...,...,...,...,...,...
16224,11,n,Air conditioners in room a bit noisy,Air conditioners in room a bit noisy.,"[air, conditioners, in, room, a, bit, noisy, .]","[air, conditioners, in, room, a, bit, noisy]","[air=T-NEG, conditioners=T-NEG, in=T-NEG, room..."
16225,21,n,gym is just laughable,The fact that the hotel is charging a daily am...,"[the, fact, that, the, hotel, is, charging, a,...","[gym, is, just, laughable]","[the=O, fact=O, that=O, the=O, hotel=O, is=O, ..."
16226,16,p,exprience very comfortable and clean,Awasome exprience very comfortable and clean r...,"[awasome, exprience, very, comfortable, and, c...","[exprience, very, comfortable, and, clean]","[awasome=O, exprience=T-POS, very=T-POS, comfo..."
16227,11,n,bathroom looks very small,"D shower , toilet n d basin r separated so d b...","[d, shower, ,, toilet, n, d, basin, r, separat...","[bathroom, looks, very, small]","[d=O, shower=O, ,=O, toilet=O, n=O, d=O, basin..."


In [15]:
# write output text on another column
saveFileLine = 'output_for__extract.txt.'

def get_output(row):
    return row.text + '####'+ " ".join(row["tagged_text"])

df_random['output'] = df_random.apply(lambda row: get_output(row), axis=1)

df_random['output'].head()

0    The hotel is a bit far from clubs and bars als...
1    (The restaurant at the family pool was open fo...
2    There were hair in the bathtub, someone's faci...
3    the rooms weren't all renovated. ####the=O roo...
4    Bed sheet was not being changed often. ####bed...
Name: output, dtype: object

In [16]:
# this can be done better - adding dummy label to use train_test_split to get train and test dataset
from sklearn.model_selection import train_test_split
possible_labels = df_random.category.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

df_random['label'] = df_random.category.replace(label_dict)

In [17]:
df_random.iloc[0].tagged_text

['the=O',
 'hotel=O',
 'is=O',
 'a=O',
 'bit=O',
 'far=O',
 'from=O',
 'clubs=O',
 'and=O',
 'bars=O',
 'also=T-NEG',
 'far=T-NEG',
 'from=T-NEG',
 'mrt=T-NEG',
 'station=T-NEG',
 '.=O']

In [18]:
X_train, X_val, y_train, y_val = train_test_split(df_random.output.values, 
                                                  df_random.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42, 
                                                  stratify=df_random.label.values)


In [19]:
train = pd.DataFrame(data=X_train)
train.to_csv('hotel_train_polarity.txt', sep=' ', index=False, header=False)

In [20]:
test = pd.DataFrame(data=X_val)
test.to_csv('hotel_test_polarity.txt', sep=' ', index=False, header=False)