# IIIT Delhi

#### A jsonlines file called preds.jsonl which contains your model's predictions on the sentihood test set. Please ensure that the original sample, and its annotated aspects/sentiments are also included. The model should take the text and the target entity as inputs and predict the aspect as well as the sentiment for each predicted sentiment.

In [397]:
from __future__ import absolute_import

import json  
import pandas as pd  
from pandas.io.json import json_normalize 
import os
import numpy as np

import operator
import re
import sys
import xml.etree.ElementTree

import nltk

In [398]:
with open('./Data/sentihood-train.json', mode = 'r') as f: 
    d = json.load(f)    

data = json_normalize(d)
data.head(10)

Unnamed: 0,id,opinions,text
0,1430,"[{'sentiment': 'Negative', 'aspect': 'price', ...",LOCATION1 is transforming and the prices w...
1,2013,"[{'sentiment': 'Positive', 'aspect': 'shopping...",Along LOCATION1 there are lots of Electronic...
2,1244,"[{'sentiment': 'Positive', 'aspect': 'transit-...",And LOCATION1 is ten mins direct on the tube...
3,209,"[{'sentiment': 'Positive', 'aspect': 'nightlif...",Another option is LOCATION1 which is very ce...
4,2824,"[{'sentiment': 'Positive', 'aspect': 'general'...",Best bet is around LOCATION2 and LOCATION1 a...
5,1835,"[{'sentiment': 'Negative', 'aspect': 'transit-...",Central London based taxis mostly refuse far...
6,1429,"[{'sentiment': 'Negative', 'aspect': 'general'...",Don't go looking at places like LOCATION1 n...
7,1404,[],Down here in South London the accent [local]...
8,190,"[{'sentiment': 'Positive', 'aspect': 'multicul...",Everyone in LOCATION1 is now black or Bangla...
9,1281,"[{'sentiment': 'Negative', 'aspect': 'general'...",For gods sake don't move to LOCATION1 its ho...


In [399]:
sentiment = []
aspect = []
target_entity = []
for dic in data[['opinions']][:].items():
    
    for t in dic[1]:
        if t == []:
            t.append({'sentiment': '', 'aspect': '', 'target_entity': ''})
            
        for a in t:
            sentiment.append(a.get('sentiment'))
            aspect.append(a.get('aspect'))
            target_entity.append(a.get('target_entity'))


In [400]:
def unique_item(list1): 
    a = np.array(list1) 
    print(np.unique(a)) 

In [405]:
from collections import Counter
counts = Counter(aspect)
print(counts)


Counter({'general': 1182, '': 956, 'price': 500, 'transit-location': 431, 'safety': 352, 'live': 221, 'nightlife': 158, 'shopping': 143, 'multicultural': 123, 'green-nature': 95, 'dining': 93, 'quiet': 54, 'touristy': 49})


In [401]:
print('Unique Sentiments: ')
unique_item(sentiment)
print('Unique Aspect: ')
unique_item(aspect)
print('Unique Target Entity: ')
unique_item(target_entity)

Unique Sentiments: 
['' 'Negative' 'Positive']
Unique Aspect: 
['' 'dining' 'general' 'green-nature' 'live' 'multicultural' 'nightlife'
 'price' 'quiet' 'safety' 'shopping' 'touristy' 'transit-location']
Unique Target Entity: 
['' 'LOCATION1' 'LOCATION2']


In [408]:
# Top 5 occurences other than None
aspect_to_id = {
    'general' : 0,
    'price': 1,
    'transit-location' : 2,
    'safety' : 3, 
    'live' : 4, 
}

In [446]:
def parse_json(file):
    with open(file) as f:
        d = json.load(f)
        
#     d = json_normalize(d)
#     _df = list()

    _df = list()
    
    for i in d:
        _test = i['text']
        _id = i['id']
        opinions = list()
        
        for j in i['opinions']:
            
            _sent = j['sentiment']
            _aspe = j['aspect']
            _tare = j['target_entity']
            
#             for k in j:            
#                 _sent = j['sentiment']
#                 _aspe = j['aspect']
#                 _tare = j['target_entity']
            
            opinions.append((_tare, _aspe, _sent))
        _df.append((_id, _test, opinions))
    return _df

In [447]:
def preprocess_data(data, aspectid):
    
    preprocessed_data_1 = list()
    
    for _id, _txt, _op in data:
        for _tare, _aspe, _sent in _op:
            if _aspe not in aspectid:
                continue
            preprocessed_data_1.append((_id, _txt, _tare, _aspe, _sent))

            assert 'LOCATION1' in _txt
            _tars = set(['LOCATION1'])
        
            if 'LOCATION2' in _txt:
                _tars.add('LOCATION2')
                
            for tar in _tars:
                
                aspects = set([a for t, a, _ in _op if t == tar])
                none_aspects = [a for a in aspectid if a not in aspects]
                
                for aspect in none_aspects:
                    preprocessed_data_1.append((_id, _txt, tar, aspect, 'None'))
    
    
    # Extracting the aspect for ABSA
    preprocessed_data_2 = list()
    
    for _, _, _, aspect, _ in preprocessed_data_1:
        preprocessed_data_2.append(aspectid[aspect])
        
        
    assert len(preprocessed_data_1) == len(preprocessed_data_2)
    
    preprocessed_data_2 = np.array(preprocessed_data_2)

    
    # Creating tokens 
    preprocessed_data_final = list()
                                               
#     print(len(preprocessed_data_2))
    
    for _id, _txt, _tare, _aspe, _sent in preprocessed_data_1:
        
        _txt_1 = nltk.word_tokenize(_txt)
        _aspe_1 = _aspe.split('-')
        
        preprocessed_data_final.append((_id, _txt_1, _tare, _aspe_1, _sent))
    
    return preprocessed_data_2, preprocessed_data_final

In [448]:
train = parse_json('./Data/sentihood-train.json')
val = parse_json('./Data/sentihood-dev.json')
test = parse_json('./Data/sentihood-test.json')

In [449]:
len(train)

2977

In [450]:
print("train length = ", len(train))
print("val length = ", len(dev))
print("test length = ", len(test))

train length =  2977
val length =  747
test length =  1491


In [451]:
train_id, train = preprocess_data(train, aspect_to_id)
val_id, val = preprocess_data(val, aspect_to_id)
test_id, test = preprocess_data(test, aspect_to_id)

In [452]:
print("train length = ", len(train))
print("val length = ", len(dev))
print("test length = ", len(test))

train length =  16226
val length =  747
test length =  8052


In [453]:
train.sort(key=lambda x:x[2]+str(x[0])+x[3][0])
val.sort(key=lambda y:y[2]+str(y[0])+y[3][0])
test.sort(key=lambda z:z[2]+str(z[0])+z[3][0])

In [455]:
path = data_dir+'bert-pair/'
if not os.path.exists(path):
    os.makedirs(path)

#### Which aspects and their respective sentiments does your model most accurately detect

#### What the points of failure for the model - which aspects does it perform poorly on.

#### A quick paragraph in a 100 words or less about your favourite machine learning library and what you dislike about it.