In [4]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from pathlib import Path
from urllib.request import urlretrieve

### Loading Data

In [5]:
df_train=pd.read_csv("data/traindata.csv",sep='\t',header=None)
df_train.columns=["polarity","aspect_category","target_term","character_offset","sentence"]
df_train.head(2)

Unnamed: 0,polarity,aspect_category,target_term,character_offset,sentence
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...


In [6]:
df_dev=pd.read_csv("data/devdata.csv",sep='\t',header=None)
df_dev.columns=["polarity","aspect_category","target_term","character_offset","sentence"]
df_dev.head(2)

Unnamed: 0,polarity,aspect_category,target_term,character_offset,sentence
0,positive,LOCATION#GENERAL,neighborhood,54:66,"great food, great wine list, great service in ..."
1,negative,RESTAURANT#GENERAL,place,15:20,I thought this place was totally overrated.


In [7]:
df_train["label"]=df_train["polarity"].apply(lambda x: 1 if x=="positive" else (0 if x=="neutral" else -1))
df_dev["label"]=df_dev["polarity"].apply(lambda x: 1 if x=="positive" else (0 if x=="neutral" else -1))
df_dev.head(2)

Unnamed: 0,polarity,aspect_category,target_term,character_offset,sentence,label
0,positive,LOCATION#GENERAL,neighborhood,54:66,"great food, great wine list, great service in ...",1
1,negative,RESTAURANT#GENERAL,place,15:20,I thought this place was totally overrated.,-1


### Word2vec

In [8]:
PATH_TO_DATA = Path('/Users/antoineguiot/Documents/OMA/NLP/NLP_exo_2/NLP-OMA/data')
# Download word vectors, might take a few minutes and about ~3GB of storage space
en_embeddings_path = PATH_TO_DATA / 'cc.en.300.vec.gz'
if not en_embeddings_path.exists():
    urlretrieve('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.vec.gz', en_embeddings_path)
fr_embeddings_path = PATH_TO_DATA / 'cc.fr.300.vec.gz'
if not fr_embeddings_path.exists():
    urlretrieve('https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.fr.300.vec.gz', fr_embeddings_path)

KeyboardInterrupt: 

In [9]:
from collections import defaultdict
import gzip
import numpy as np

import difflib
import re
import numpy as n

In [10]:
class Word2Vec():

    def __init__(self, filepath, vocab_size=50000):
        self.words, self.embeddings = self.load_wordvec(filepath, vocab_size)
        # Mappings for O(1) retrieval:
        self.word2id = {word: idx for idx, word in enumerate(self.words)}
        self.id2word = {idx: word for idx, word in enumerate(self.words)}
    
    def load_wordvec(self, filepath, vocab_size):
        assert str(filepath).endswith('.gz')
        words = []
        embeddings = []
        with gzip.open(filepath, 'rt',encoding="utf8") as f:  # Read compressed file directly
            next(f)  # Skip header
            for i, line in enumerate(f):
                word, vec = line.split(' ', 1)
                words.append(word)
                embeddings.append(np.fromstring(vec, sep=' '))
                if i == (vocab_size - 1):
                    break
        print('Loaded %s pretrained word vectors' % (len(words)))
        return words, np.vstack(embeddings)
    
    def encode(self, word):
        # Returns the 1D embedding of a given word
        #return self.embeddings[self.word2id[word]]
        try:
            i = self.word2id[word]
            return self.embeddings[i]
        except:
            try:
                word = difflib.get_close_matches(word, self.words)[0]
                i = self.word2id[word]
            except:
                return np.zeros((300))
        return self.embeddings[i]
    
    def score(self, word1, word2):
        # Return the cosine similarity: use np.dot & np.linalg.norm
        code1=self.encode(word1)
        code2=self.encode(word2)
        return np.dot(code1,code2)/(np.linalg.norm(code1)*np.linalg.norm(code2))

In [13]:
class BagOfWords():
    def __init__(self, word2vec):
        self.word2vec = word2vec
    
    def build_idf(self, sentences):
        # build the idf dictionary: associate each word to its idf value
        # -> idf = {word: idf_value, ...}
        idf={}
        N=len(sentences)
        
        # get number of documents containing each word
        for sentence in sentences:
            wordsList=re.sub("[^\w]", " ",sentence).split()
            for word in set(wordsList):
                idf[word]=idf.get(word, 0)+1
                
        #transform to get idf value of each word       
        for word in idf:
            idf[word]=np.log10(N/idf[word])
        return idf
        
    
    def encode(self, sentence, ag_sentence = True, padding =25, idf=None):
        
        # Takes a sentence as input, returns the sentence embedding
        wordsList=re.sub("[^\w]", " ",sentence).split()
        wordsVectors=[self.word2vec.encode(word) for word in wordsList]
        
        if ag_sentence==False:
            wordsVectors = wordsVectors[0:padding]
            wordsVectors = np.stack(wordsVectors, axis=0)
            wordsVectors = np.pad(wordsVectors, [(0, padding-len(wordsVectors)),(0,0)], mode='constant')
            return wordsVectors
        
        if idf is None:
            # mean of word vectors
            return np.mean(wordsVectors,axis=0)
        else:
            # idf-weighted mean of word vectors
            weightedMean=0
            sumIdf=0
            for i,word in enumerate(wordsList):
                weightedMean+=idf.get(word,0)*wordsVectors[i]
                sumIdf+=idf.get(word,0)
            weightedMean=weightedMean/sumIdf
            return weightedMean
                                

    def score(self, sentence1, sentence2, idf=None):
        # cosine similarity: use np.dot & np.linalg.norm 
        code1=self.encode(sentence1,idf)
        code2=self.encode(sentence2,idf)
        return np.dot(code1,code2)/(np.linalg.norm(code1)*np.linalg.norm(code2))

In [14]:
word2vec = Word2Vec(en_embeddings_path, vocab_size=50000)
sentence2vec = BagOfWords(word2vec)

Loaded 50000 pretrained word vectors


### First Model

#### Encode full sentence with word2vec then classification (logreg)

In [138]:
a=sentence2vec.encode(df_train["sentence"][14], ag_sentence=False, padding=10)
#word2vec.encode(df_train["sentence"][1][0:4]).shape

In [34]:
train_sentences_emb=[sentence2vec.encode(df_train["sentence"][i]) for i in range(len(df_train["sentence"]))]

In [46]:
dev_sentences_emb=[sentence2vec.encode(df_dev["sentence"][i]) for i in range(len(df_dev["sentence"]))]

In [55]:
from sklearn.linear_model import LogisticRegression

train_acc,dev_acc=[],[]
pen_values = 10.0**(np.arange(-2,2,0.5))

for pen in pen_values:
    logReg = LogisticRegression(penalty="l2",C = pen, multi_class='auto',solver='newton-cg')
    logReg.fit(train_sentences_emb, df_train["label"])
    train_acc.append(logReg.score(train_sentences_emb, df_train["label"]))
    dev_acc.append(logReg.score(dev_sentences_emb, df_dev["label"]))

best_pen=pen_values[np.argmax(dev_acc)]
best_train_acc=train_acc[np.argmax(dev_acc)]
best_dev_acc=max(dev_acc)

print("Results for mean BoW: \n","Best value for the penalty:",best_pen,'\n Dev accuracy:',best_dev_acc,'\n Train accuracy:',best_train_acc)

Results for mean BoW: 
 Best value for the penalty: 10.0 
 Dev accuracy: 0.7686170212765957 
 Train accuracy: 0.8416500332667998


In [57]:
logReg = LogisticRegression(penalty="l2",C = 10, multi_class='auto',solver='newton-cg')

In [69]:
logReg.fit(train_sentences_emb, df_train["label"])
logReg.predict(dev_sentences_emb);

### 2nd Model

#### Same but remove target term and stopwords from sentence

In [196]:
sentence_red=[0]*len(df_train)
for i in range(len(df_train)):
    sentence_red[i]=df_train["sentence"][i][:int(df_train["character_offset"][i].split(":")[0])]+df_train["sentence"][i][int(df_train["character_offset"][i].split(":")[1]):]
df_train["sentence_red"]=sentence_red

sentence_red=[0]*len(df_dev)
for i in range(len(df_dev)):
    sentence_red[i]=df_dev["sentence"][i][:int(df_dev["character_offset"][i].split(":")[0])]+df_dev["sentence"][i][int(df_dev["character_offset"][i].split(":")[1]):]
df_dev["sentence_red"]=sentence_red

In [197]:
df_train.head(2)

Unnamed: 0,polarity,aspect_category,target_term,character_offset,sentence,label,sentence_red
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...,1,"short and sweet – is great:it's romantic,cozy..."
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...,1,This quaint and romantic is at the top of my ...


In [198]:
stop_words = set(stopwords.words('english')) 
stop_words.remove("not")
stop_words.remove("no")

def remove_stopwords(sentence):
    sentence = sentence.lower()
    sentence_tok = word_tokenize(sentence)
    sentence_f = ""
    for i in range(len(sentence_tok)):
        w=sentence_tok[i]
        if w not in stop_words:
            if i==len(sentence_tok)-1:
                sentence_f+=w
            else:
                sentence_f+=w+" "
    if len(sentence_f)<2:
        sentence_f = sentence
    return sentence_f

In [199]:
df_train["sentence_red"]=df_train["sentence_red"].apply(lambda x:remove_stopwords(x))
df_dev["sentence_red"]=df_dev["sentence_red"].apply(lambda x:remove_stopwords(x))
df_train.head(2)

Unnamed: 0,polarity,aspect_category,target_term,character_offset,sentence,label,sentence_red
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...,1,"short sweet – great : 's romantic , cozy priva..."
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...,1,quaint romantic top manhattan restaurant list .


In [201]:
train_sentences_emb2=[sentence2vec.encode(df_train["sentence_red"][i]) for i in range(len(df_train["sentence_red"]))]

In [202]:
dev_sentences_emb2=[sentence2vec.encode(df_dev["sentence_red"][i]) for i in range(len(df_dev["sentence_red"]))]

In [203]:
from sklearn.linear_model import LogisticRegression

train_acc,dev_acc=[],[]
pen_values = 10.0**(np.arange(-2,2,0.5))

for pen in pen_values:
    logReg = LogisticRegression(penalty="l2",C = pen, multi_class='auto',solver='newton-cg')
    logReg.fit(train_sentences_emb2, df_train["label"])
    train_acc.append(logReg.score(train_sentences_emb2, df_train["label"]))
    dev_acc.append(logReg.score(dev_sentences_emb2, df_dev["label"]))

best_pen=pen_values[np.argmax(dev_acc)]
best_train_acc=train_acc[np.argmax(dev_acc)]
best_dev_acc=max(dev_acc)

print("Results for mean BoW: \n","Best value for the penalty:",best_pen,'\n Dev accuracy:',best_dev_acc,'\n Train accuracy:',best_train_acc)

Results for mean BoW: 
 Best value for the penalty: 10.0 
 Dev accuracy: 0.7978723404255319 
 Train accuracy: 0.8642714570858283


3 rd model 

In [34]:
df_train['cat_'] = df_train['aspect_category'].astype("category").cat.codes

In [44]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')

enc.fit(df_train['aspect_category'].values.reshape(-1, 1))
categories_train = enc.transform(df_train['aspect_category'].values.reshape(-1, 1))
categories_dev = enc.transform(df_dev['aspect_category'].values.reshape(-1, 1))

In [64]:
df_train

Unnamed: 0,polarity,aspect_category,target_term,character_offset,sentence,label,cat_
0,positive,AMBIENCE#GENERAL,seating,18:25,short and sweet – seating is great:it's romant...,1,0
1,positive,AMBIENCE#GENERAL,trattoria,25:34,This quaint and romantic trattoria is at the t...,1,0
2,positive,FOOD#QUALITY,food,98:102,The have over 100 different beers to offer thi...,1,5
3,negative,SERVICE#GENERAL,STAFF,5:10,THIS STAFF SHOULD BE FIRED.,-1,11
4,positive,FOOD#STYLE_OPTIONS,menu,4:8,"The menu looked great, and the waiter was very...",1,6
...,...,...,...,...,...,...,...
1498,positive,DRINKS#QUALITY,expresso,29:37,One of us actually liked the expresso - that's...,1,2
1499,negative,SERVICE#GENERAL,waitress,20:28,The hostess and the waitress were incredibly r...,-1,11
1500,positive,RESTAURANT#PRICES,place,12:17,this little place has a cute interior decor an...,1,10
1501,positive,RESTAURANT#GENERAL,restaurant,30:40,Nice Family owned traditional restaurant.,1,8


In [15]:
a=sentence2vec.encode(df_train["sentence"][14], ag_sentence=False, padding=100)

In [16]:
train_sentences_emb3=[sentence2vec.encode(df_train["sentence"][i], ag_sentence=False, padding=50) for i in range(len(df_train["sentence"]))]

In [17]:
dev_sentences_emb3=[sentence2vec.encode(df_dev["sentence"][i], ag_sentence=False, padding=50) for i in range(len(df_dev["sentence"]))]

In [18]:
train_sentences_emb3 = np.stack(train_sentences_emb3)
dev_sentences_emb3 = np.stack(dev_sentences_emb3)

In [74]:
import tensorflow
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Activation, LSTM, Input, concatenate, Flatten, Reshape
from sklearn.metrics import mean_squared_error
inputA = Input(shape=(50, 300))
inputB = Input(shape=(12,))
y = Dense((16), activation='relu')(inputB)

x = Dense((16), activation="relu")(inputA)
x = LSTM(16, return_sequences=False, input_shape=(50, 300), go_backwards=True)(x)
z = concatenate([x,y])
output = Dense(3, activation='sigmoid')(z)

model = Model(inputs=[inputA, inputB], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam')
model.summary()  # show the summary of this model in logs

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_16 (InputLayer)           [(None, 50, 300)]    0                                            
__________________________________________________________________________________________________
dense_21 (Dense)                (None, 50, 16)       4816        input_16[0][0]                   
__________________________________________________________________________________________________
input_17 (InputLayer)           [(None, 12)]         0                                            
__________________________________________________________________________________________________
unified_lstm_7 (UnifiedLSTM)    (None, 16)           2112        dense_21[0][0]                   
____________________________________________________________________________________________

In [75]:
# trainig : 
label_train = to_categorical(df_train['label']+1)
label_dev = to_categorical(df_dev['label']+1)

history_signal = model.fit((train_sentences_emb3,categories_train) ,
                                          label_train,
                                          epochs=70,
                                          batch_size=124,
                                          validation_data=((dev_sentences_emb3,categories_dev),label_dev),
                                          verbose=1)

Train on 1503 samples, validate on 376 samples
Epoch 1/70
Epoch 2/70
Epoch 3/70
Epoch 4/70
Epoch 5/70
Epoch 6/70
Epoch 7/70
Epoch 8/70
Epoch 9/70
Epoch 10/70
Epoch 11/70
Epoch 12/70
Epoch 13/70
Epoch 14/70
Epoch 15/70
Epoch 16/70
Epoch 17/70
Epoch 18/70
Epoch 19/70
Epoch 20/70
Epoch 21/70
Epoch 22/70
Epoch 23/70
Epoch 24/70
Epoch 25/70
Epoch 26/70
Epoch 27/70
Epoch 28/70
Epoch 29/70
Epoch 30/70
Epoch 31/70
Epoch 32/70
Epoch 33/70
Epoch 34/70
Epoch 35/70
Epoch 36/70
Epoch 37/70
Epoch 38/70
Epoch 39/70
Epoch 40/70
Epoch 41/70
Epoch 42/70
Epoch 43/70
Epoch 44/70
Epoch 45/70
Epoch 46/70
Epoch 47/70
Epoch 48/70
Epoch 49/70
Epoch 50/70
Epoch 51/70
Epoch 52/70
Epoch 53/70
Epoch 54/70
Epoch 55/70
Epoch 56/70
Epoch 57/70
Epoch 58/70
Epoch 59/70
Epoch 60/70
Epoch 61/70
Epoch 62/70
Epoch 63/70
Epoch 64/70
Epoch 65/70
Epoch 66/70
Epoch 67/70
Epoch 68/70
Epoch 69/70
Epoch 70/70


In [76]:
y_pred = model.predict((dev_sentences_emb3,categories_dev))
from sklearn.metrics import accuracy_score
y_pred = np.argmax(y_pred, axis=1)-1
from sklearn.metrics import f1_score

In [77]:
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score
accuracy_score(y_true=df_dev['label'],y_pred=y_pred)

0.776595744680851

In [230]:
to_categorical(df_dev['label']+1)

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.]], dtype=float32)