## **Source**

In [0]:
# https://github.com/jimmyyfeng/TD-LSTM

## **Setup Environment**

In [2]:
# Every step is guarded so the cell can be re-run after a partial or complete
# earlier run without prompting or failing.
# Unpack the datasets (overwriting silently) and drop the archive only if unzip succeeded.
!test -f datasets.zip && unzip -o datasets.zip && rm datasets.zip
# Fetch the TD-LSTM code once.
!test -d TD-LSTM || git clone https://github.com/jimmyyfeng/TD-LSTM
# Scratch directory that receives the generated *.raw / *_id.txt files.
!mkdir -p tmp
!cp TD-LSTM/data/twitter/twitter_word_embedding_partial_200.txt /content/tmp/twitter_word_embedding_partial_200.txt
# Replace the repository's utils.py with the uploaded one, if it is still in the working directory.
!test -f utils.py && mv utils.py TD-LSTM/utils.py

Archive:  datasets.zip
   creating: datasets/rt-polaritydata/
  inflating: datasets/rt-polaritydata/rt-polarity.pos  
  inflating: datasets/rt-polaritydata/rt-polarity.neg  
   creating: datasets/semeval/
  inflating: datasets/semeval/Restaurants_Train_v2.xml.txt  
  inflating: datasets/semeval/Laptops_test_PhaseB.xml.txt  
  inflating: datasets/semeval/Laptops_test_PhaseA.xml.txt  
  inflating: datasets/semeval/Restuarants_test_phaseA.xml.txt  
  inflating: datasets/semeval/Restuarants_test_phaseB.xml.txt  
  inflating: datasets/semeval/Laptops_train_v2.xml.txt  
  inflating: datasets/semeval/Restaurants_Train.xml.txt  
   creating: datasets/sentihood/
  inflating: datasets/sentihood/sentihood-test.json  
  inflating: datasets/sentihood/sentihood-dev.json  
  inflating: datasets/sentihood/sentihood-train.json  
   creating: datasets/sst/
  inflating: datasets/sst/Test_SST-1.txt  
  inflating: datasets/sst/Dev_SST-2.txt  
  inflating: datasets/sst/Training_SST-1.txt  
  inflating: data

## **Imports**

In [0]:
import numpy as np
import os
from operator import itemgetter
from string import punctuation
from sklearn.feature_extraction.text import CountVectorizer

## **Loading Data from Different Datasets**
To train on a specific dataset, run only the cell for that dataset, then skip ahead to the **Training** section.\
Before that, run the first cell of this section: it defines the helper functions that the dataset cells rely on.

In [4]:
def generate_vocab(sentences, max_features=15448):
    """Fit a bag-of-words vocabulary over ``sentences``.

    Args:
        sentences: Iterable of raw text strings to scan.
        max_features: Value forwarded to ``CountVectorizer(max_features=...)``
            to cap the vocabulary size. Defaults to 15448, the value this
            notebook previously hard-coded.

    Returns:
        The fitted vectorizer's ``vocabulary_`` attribute (a mapping from
        term to feature index).
    """
    vectorizer = CountVectorizer(input = u'content',
                                analyzer = "word",
                                tokenizer = None,
                                preprocessor = None,
                                stop_words = None,
                                # A token is either a literal comma or a run of word characters.
                                token_pattern = r',|\b\w+\b',
                                max_features = max_features)
    vectorizer.fit(sentences)
    return vectorizer.vocabulary_

def load_vocab(paths):
    """Build a vocabulary and a flat token list from label-prefixed text files.

    Every line of every file is split on single spaces; the first field is
    dropped and the remaining fields are treated as the sentence.

    Args:
        paths: Iterable of file paths, each read as latin-1 text.

    Returns:
        A ``(vocab, tokens)`` pair: ``vocab`` is the result of
        ``generate_vocab`` over all sentences, and ``tokens`` lists, in reading
        order, every field that passes the punctuation filter (repeats kept).
    """
    corpus = []
    kept = []
    for source in paths:
        with open(source, 'r', encoding='latin-1', errors='ignore') as handle:
            raw = handle.read()
        for record in raw.split('\n'):
            words = record.rstrip().split(' ')[1:]
            # Substring test against string.punctuation: drops single
            # punctuation marks and the empty strings left by repeated spaces.
            kept.extend(word for word in words if word not in punctuation)
            corpus.append(' '.join(words))
    return generate_vocab(corpus), kept

/content


In [0]:
# sst-1 data
# Work from /content so the relative datasets/ and tmp/ paths used below resolve.
%cd /content
def load_data(split_path, name):
    """Convert a label-prefixed split file into the three-line ``.raw`` format.

    Each non-blank input line is read as ``<label> <tok> <tok> ...``. For every
    token that passes the punctuation filter, three lines are written to
    ``tmp/<name>.raw``: the sentence with that one token replaced by ``$T$``,
    the token itself, and the integer label.

    Args:
        split_path: Path of the split file to read (decoded as latin-1).
        name: Stem of the output file, written as ``tmp/<name>.raw``.

    Raises:
        ValueError: If a non-blank line does not start with an integer label.
    """
    data = []
    with open(split_path, 'r', encoding='latin-1', errors='ignore') as f:
        r = f.read()
    for line in r.split('\n'):
        line = line.rstrip()
        if not line:
            # A trailing newline leaves an empty final record; int('') would raise.
            continue
        fields = line.split(' ')
        label = int(fields[0])
        text = fields[1:]
        for position, token in enumerate(text):
            # Substring test against string.punctuation: skips single
            # punctuation marks and the empty strings left by repeated spaces.
            if token not in punctuation:
                # Mask only this occurrence. Calling str.replace() on the joined
                # sentence would also rewrite the token wherever it appears
                # inside other words (e.g. 'a' inside 'great').
                masked = text[:position] + ['$T$'] + text[position + 1:]
                data.append(' '.join(masked))
                data.append(token)
                data.append(label)
    with open('tmp/' + name + '.raw', 'w') as f:
        for line in data:
            f.write(str(line) + '\n')

# Vocabulary and flat token list over all three SST-1 splits.
vocab, tokens = load_vocab([
    'datasets/sst/Training_SST-1.txt',
    'datasets/sst/Dev_SST-1.txt',
    'datasets/sst/Test_SST-1.txt',
])

# word_id.txt: one "<word> <id>" line per vocabulary entry, where <id> is the
# vocabulary index plus one, written in ascending id order.
word_ids = sorted(((word, index + 1) for word, index in vocab.items()), key=itemgetter(1))
with open('tmp/word_id.txt', 'w') as out:
    out.writelines(f'{word} {word_id}\n' for word, word_id in word_ids)

# aspect_id.txt: one "<token> <position>" line per entry of `tokens`.
# NOTE(review): `tokens` keeps repeats, so a token that occurs several times is
# written several times with different numbers - confirm the downstream loader expects this.
with open('tmp/aspect_id.txt', 'w') as out:
    out.writelines(f'{token} {position}\n' for position, token in enumerate(tokens))

# split.raw: convert each split into tmp/<name>.raw.
for split_file, split_name in (
    ('datasets/sst/Training_SST-1.txt', 'train'),
    ('datasets/sst/Dev_SST-1.txt', 'validate'),
    ('datasets/sst/Test_SST-1.txt', 'test'),
):
    load_data(split_file, split_name)

In [0]:
# sst-2 data
# Work from /content so the relative datasets/ and tmp/ paths used below resolve.
%cd /content
def load_data(split_path, name):
    """Convert a label-prefixed split file into the three-line ``.raw`` format.

    Each non-blank input line is read as ``<label> <tok> <tok> ...``. For every
    token that passes the punctuation filter, three lines are written to
    ``tmp/<name>.raw``: the sentence with that one token replaced by ``$T$``,
    the token itself, and the integer label.

    Args:
        split_path: Path of the split file to read (decoded as latin-1).
        name: Stem of the output file, written as ``tmp/<name>.raw``.

    Raises:
        ValueError: If a non-blank line does not start with an integer label.
    """
    data = []
    with open(split_path, 'r', encoding='latin-1', errors='ignore') as f:
        r = f.read()
    for line in r.split('\n'):
        line = line.rstrip()
        if not line:
            # A trailing newline leaves an empty final record; int('') would raise.
            continue
        fields = line.split(' ')
        label = int(fields[0])
        text = fields[1:]
        for position, token in enumerate(text):
            # Substring test against string.punctuation: skips single
            # punctuation marks and the empty strings left by repeated spaces.
            if token not in punctuation:
                # Mask only this occurrence. Calling str.replace() on the joined
                # sentence would also rewrite the token wherever it appears
                # inside other words (e.g. 'a' inside 'great').
                masked = text[:position] + ['$T$'] + text[position + 1:]
                data.append(' '.join(masked))
                data.append(token)
                data.append(label)
    with open('tmp/' + name + '.raw', 'w') as f:
        for line in data:
            f.write(str(line) + '\n')

# Vocabulary and flat token list over all three SST-2 splits.
vocab, tokens = load_vocab([
    'datasets/sst/Training_SST-2.txt',
    'datasets/sst/Dev_SST-2.txt',
    'datasets/sst/Test_SST-2.txt',
])

# word_id.txt: one "<word> <id>" line per vocabulary entry, where <id> is the
# vocabulary index plus one, written in ascending id order.
word_ids = sorted(((word, index + 1) for word, index in vocab.items()), key=itemgetter(1))
with open('tmp/word_id.txt', 'w') as out:
    out.writelines(f'{word} {word_id}\n' for word, word_id in word_ids)

# aspect_id.txt: one "<token> <position>" line per entry of `tokens`.
# NOTE(review): `tokens` keeps repeats, so a token that occurs several times is
# written several times with different numbers - confirm the downstream loader expects this.
with open('tmp/aspect_id.txt', 'w') as out:
    out.writelines(f'{token} {position}\n' for position, token in enumerate(tokens))

# split.raw: convert each split into tmp/<name>.raw.
for split_file, split_name in (
    ('datasets/sst/Training_SST-2.txt', 'train'),
    ('datasets/sst/Dev_SST-2.txt', 'validate'),
    ('datasets/sst/Test_SST-2.txt', 'test'),
):
    load_data(split_file, split_name)

In [12]:
# sentihood data
# Work from /content so the relative datasets/ and tmp/ paths used below resolve.
%cd /content
import json
def load_vocab(paths):
    """Build a vocabulary and an aspect list from SentiHood-style JSON files.

    Each file is parsed with ``json.load`` and indexed as a sequence of records
    carrying a ``'text'`` string and an ``'opinions'`` list. For every opinion,
    the record text with the opinion's ``'target_entity'`` replaced by its
    ``'aspect'`` becomes one sentence, and the aspect is recorded.

    Args:
        paths: Iterable of JSON file paths.

    Returns:
        A ``(vocab, tokens)`` pair: ``vocab`` is the result of
        ``generate_vocab`` over the collected sentences, and ``tokens`` holds
        one aspect per opinion, in reading order (repeats kept).
    """
    sentences, tokens = [], []
    for path in paths:
        with open(path, 'r') as handle:
            records = json.load(handle)
        for record in records:
            for opinion in record['opinions']:
                sentence = record['text']
                label = opinion['aspect']
                placeholder = opinion['target_entity']
                sentences.append(sentence.replace(placeholder, label))
                tokens.append(label)
    return generate_vocab(sentences), tokens

def load_data(split_path, name):
    """Convert a SentiHood-style JSON split into the three-line ``.raw`` format.

    For every opinion of every record, three lines are written to
    ``tmp/<name>.raw``: the record text with the opinion's ``'target_entity'``
    replaced by ``$T$``, the opinion's ``'aspect'``, and a label that is ``0``
    when ``'sentiment'`` equals ``'Negative'`` and ``1`` otherwise.

    Args:
        split_path: Path of the JSON file to read.
        name: Stem of the output file, written as ``tmp/<name>.raw``.
    """
    with open(split_path, 'r') as source:
        records = json.load(source)

    rows = []
    for record in records:
        for opinion in record['opinions']:
            sentence = record['text']
            label = opinion['aspect']
            placeholder = opinion['target_entity']
            # Anything other than an exact 'Negative' is mapped to class 1.
            polarity = 0 if opinion['sentiment'] == 'Negative' else 1
            rows.extend((sentence.replace(placeholder, '$T$'), label, polarity))

    with open('tmp/' + name + '.raw', 'w') as sink:
        sink.writelines(str(row) + '\n' for row in rows)

# Vocabulary and aspect list over all three SentiHood splits.
vocab, tokens = load_vocab([
    'datasets/sentihood/sentihood-train.json',
    'datasets/sentihood/sentihood-dev.json',
    'datasets/sentihood/sentihood-test.json',
])

# word_id.txt: one "<word> <id>" line per vocabulary entry, where <id> is the
# vocabulary index plus one, written in ascending id order.
word_ids = sorted(((word, index + 1) for word, index in vocab.items()), key=itemgetter(1))
with open('tmp/word_id.txt', 'w') as out:
    out.writelines(f'{word} {word_id}\n' for word, word_id in word_ids)

# aspect_id.txt: one "<aspect> <position>" line per entry of `tokens`.
# NOTE(review): `tokens` holds one aspect per opinion, so the same aspect is
# written many times with different numbers - confirm the downstream loader expects this.
with open('tmp/aspect_id.txt', 'w') as out:
    out.writelines(f'{token} {position}\n' for position, token in enumerate(tokens))

# Convert each split into tmp/<name>.raw.
for split_file, split_name in (
    ('datasets/sentihood/sentihood-train.json', 'train'),
    ('datasets/sentihood/sentihood-dev.json', 'validate'),
    ('datasets/sentihood/sentihood-test.json', 'test'),
):
    load_data(split_file, split_name)

/content


## **Training**

In [0]:
# AT SST1
# Runs at_lstm.py with --method='AT' and --n_class=5 on the files in ../tmp.
# Those files are whatever the most recently run dataset cell wrote, so run the
# sst-1 data cell before this one.
%cd /content/TD-LSTM
!python at_lstm.py --train_file_path='../tmp/train.raw' --validate_file_path='../tmp/validate.raw' --test_file_path='../tmp/test.raw' --embedding_file_path='../tmp/twitter_word_embedding_partial_200.txt' --word_id_file_path='../tmp/word_id.txt' --aspect_id_file_path='../tmp/aspect_id.txt' --method='AT' --n_class=5

/content/TD-LSTM


load word-id mapping done!

a bad word embedding: jojo
a bad word embedding: raining
a bad word embedding: borderline
a bad word embedding: yellow
a bad word embedding: keno
a bad word embedding: four
a bad word embedding: gag
a bad word embedding: woods
a bad word embedding: olympics
a bad word embedding: spiders
a bad word embedding: verses
a bad word embedding: hanging
a bad word embedding: until
a bad word embedding: marching
a bad word embedding: foundation
a bad word embedding: twittercrush
a bad word embedding: extrapolate
a bad word embedding: archuleta
a bad word embedding: gnews
a bad word embedding: lord
a bad word embedding: sinking
a bad word embedding: muthafuckin
a bad word embedding: differently
a bad word embedding: hdtv
a bad word embedding: fur
a bad word embedding: bringing
a bad word embedding: pants
a bad word embedding: welp
a bad word embedding: engga
a bad word embedding: prize
a bad word embedding: less
a bad word embedding: wooden
a bad wor

In [0]:
# AEAT SST1
# Runs at_lstm.py with --method='AEAT' and --n_class=5 on the files in ../tmp.
# Those files are whatever the most recently run dataset cell wrote, so run the
# sst-1 data cell before this one.
%cd /content/TD-LSTM
!python at_lstm.py --train_file_path='../tmp/train.raw' --validate_file_path='../tmp/validate.raw' --test_file_path='../tmp/test.raw' --embedding_file_path='../tmp/twitter_word_embedding_partial_200.txt' --word_id_file_path='../tmp/word_id.txt' --aspect_id_file_path='../tmp/aspect_id.txt' --method='AEAT' --n_class=5

/content/TD-LSTM


load word-id mapping done!

a bad word embedding: jojo
a bad word embedding: raining
a bad word embedding: borderline
a bad word embedding: yellow
a bad word embedding: keno
a bad word embedding: four
a bad word embedding: gag
a bad word embedding: woods
a bad word embedding: olympics
a bad word embedding: spiders
a bad word embedding: verses
a bad word embedding: hanging
a bad word embedding: until
a bad word embedding: marching
a bad word embedding: foundation
a bad word embedding: twittercrush
a bad word embedding: extrapolate
a bad word embedding: archuleta
a bad word embedding: gnews
a bad word embedding: lord
a bad word embedding: sinking
a bad word embedding: muthafuckin
a bad word embedding: differently
a bad word embedding: hdtv
a bad word embedding: fur
a bad word embedding: bringing
a bad word embedding: pants
a bad word embedding: welp
a bad word embedding: engga
a bad word embedding: prize
a bad word embedding: less
a bad word embedding: wooden
a bad wor

In [6]:
# AT SST2
# Runs at_lstm.py with --method='AT' and --n_class=2 on the files in ../tmp.
# Those files are whatever the most recently run dataset cell wrote, so run the
# sst-2 data cell before this one.
%cd /content/TD-LSTM
!python at_lstm.py --train_file_path='../tmp/train.raw' --validate_file_path='../tmp/validate.raw' --test_file_path='../tmp/test.raw' --embedding_file_path='../tmp/twitter_word_embedding_partial_200.txt' --word_id_file_path='../tmp/word_id.txt' --aspect_id_file_path='../tmp/aspect_id.txt' --method='AT' --n_class=2

/content/TD-LSTM


load word-id mapping done!

a bad word embedding: jojo
a bad word embedding: raining
a bad word embedding: borderline
a bad word embedding: yellow
a bad word embedding: keno
a bad word embedding: four
a bad word embedding: gag
a bad word embedding: woods
a bad word embedding: olympics
a bad word embedding: spiders
a bad word embedding: verses
a bad word embedding: hanging
a bad word embedding: until
a bad word embedding: marching
a bad word embedding: foundation
a bad word embedding: twittercrush
a bad word embedding: extrapolate
a bad word embedding: archuleta
a bad word embedding: gnews
a bad word embedding: lord
a bad word embedding: sinking
a bad word embedding: muthafuckin
a bad word embedding: differently
a bad word embedding: hdtv
a bad word embedding: fur
a bad word embedding: bringing
a bad word embedding: pants
a bad word embedding: welp
a bad word embedding: engga
a bad word embedding: prize
a bad word embedding: less
a bad word embedding: wooden
a bad wor

In [0]:
# AEAT SST2
# Runs at_lstm.py with --method='AEAT' and --n_class=2 on the files in ../tmp.
# Those files are whatever the most recently run dataset cell wrote, so run the
# sst-2 data cell before this one.
%cd /content/TD-LSTM
!python at_lstm.py --train_file_path='../tmp/train.raw' --validate_file_path='../tmp/validate.raw' --test_file_path='../tmp/test.raw' --embedding_file_path='../tmp/twitter_word_embedding_partial_200.txt' --word_id_file_path='../tmp/word_id.txt' --aspect_id_file_path='../tmp/aspect_id.txt' --method='AEAT' --n_class=2

In [13]:
# AT SentiHood
# Runs at_lstm.py with --method='AT' and --n_class=2 on the files in ../tmp.
# Those files are whatever the most recently run dataset cell wrote, so run the
# sentihood data cell before this one.
%cd /content/TD-LSTM
!python at_lstm.py --train_file_path='../tmp/train.raw' --validate_file_path='../tmp/validate.raw' --test_file_path='../tmp/test.raw' --embedding_file_path='../tmp/twitter_word_embedding_partial_200.txt' --word_id_file_path='../tmp/word_id.txt' --aspect_id_file_path='../tmp/aspect_id.txt' --method='AT' --n_class=2

/content/TD-LSTM


load word-id mapping done!

a bad word embedding: jojo
a bad word embedding: raining
a bad word embedding: borderline
a bad word embedding: yellow
a bad word embedding: keno
a bad word embedding: four
a bad word embedding: gag
a bad word embedding: woods
a bad word embedding: olympics
a bad word embedding: spiders
a bad word embedding: verses
a bad word embedding: hanging
a bad word embedding: until
a bad word embedding: marching
a bad word embedding: foundation
a bad word embedding: twittercrush
a bad word embedding: extrapolate
a bad word embedding: archuleta
a bad word embedding: gnews
a bad word embedding: lord
a bad word embedding: sinking
a bad word embedding: muthafuckin
a bad word embedding: differently
a bad word embedding: hdtv
a bad word embedding: fur
a bad word embedding: bringing
a bad word embedding: pants
a bad word embedding: welp
a bad word embedding: engga
a bad word embedding: prize
a bad word embedding: less
a bad word embedding: wooden
a bad wor

In [0]:
# AEAT SentiHood
# Runs at_lstm.py with --method='AEAT' and --n_class=2 on the files in ../tmp.
# Those files are whatever the most recently run dataset cell wrote, so run the
# sentihood data cell before this one.
%cd /content/TD-LSTM
!python at_lstm.py --train_file_path='../tmp/train.raw' --validate_file_path='../tmp/validate.raw' --test_file_path='../tmp/test.raw' --embedding_file_path='../tmp/twitter_word_embedding_partial_200.txt' --word_id_file_path='../tmp/word_id.txt' --aspect_id_file_path='../tmp/aspect_id.txt' --method='AEAT' --n_class=2