In [31]:
import numpy as np
import pandas as pd 
import keras
import pickle
import json
import os
import random
import tensorflow as tf

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras import optimizers
import re

In [32]:
print(keras.__version__)

2.0.6


In [33]:
#forcing keras to use the gpu
config = tf.ConfigProto( device_count = {'GPU': 1 , 'CPU': 4} ) 
sess = tf.Session(config=config) 
keras.backend.set_session(sess)

In [34]:
#inizialization of the dataset
data = pd.DataFrame([['tweet molto negativo', 0]], columns=['text', 'sent'])

In [35]:
#loading the data from the KaggleTranslated dataset
data_kaggle = pd.read_csv('training_data/KaggleTranslated.csv', encoding='latin_1', index_col = 0)
data_kaggle = data_kaggle.rename(index=str, columns={"SentimentText": "text"})

print(data_kaggle['sent'].value_counts())

diff = abs(data_kaggle['sent'].value_counts()[0] - data_kaggle['sent'].value_counts()[1])
print (diff)

data_kaggle_pos = data_kaggle[data_kaggle['sent'] == 1]

drop_indices = np.random.choice(data_kaggle_pos.index, diff, replace=False)

data_kaggle = data_kaggle.drop(drop_indices) #balancing the dataset |pos| = |neg|

data_kaggle = data_kaggle.sample(frac=1).reset_index(drop=True) #shuffling the dataset

data_kaggle_test = data_kaggle[data_kaggle.index < 20000] #test-train split
data_kaggle_train = data_kaggle[data_kaggle.index >= 20000]
    
print(data_kaggle_train['sent'].value_counts())
print(data_kaggle_test['sent'].value_counts())

1    39822
0    31435
Name: sent, dtype: int64
8387
1    21480
0    21390
Name: sent, dtype: int64
0    10045
1     9955
Name: sent, dtype: int64


In [36]:
#loading data from sentipol16, data from an italian political sentiment competion
data_pol = pd.read_csv('training_data/training_set_sentipolc16.csv', encoding = "latin_1")

#deleting the unused collumns 
del data_pol['idtwitter'] 
del data_pol['iro']
del data_pol['top']
del data_pol['subj']

#dropping mixed sentiment tweets
data_pol = data_pol.drop(data_pol[(data_pol['opos'] == 1) & (data_pol['oneg'] == 1)].index)
data_pol = data_pol.drop(data_pol[(data_pol['lpos'] == 1) & (data_pol['lneg'] == 1)].index)

data_pol.loc[data_pol['lpos'] == 1, 'sent'] = 1
data_pol.loc[data_pol['lneg'] == 1, 'sent'] = 0

data_pol.loc[data_pol['opos'] == 1, 'sent'] = 1
data_pol.loc[data_pol['oneg'] == 1, 'sent'] = 0


data_pol = data_pol[['text','sent']]

print(data_pol['sent'].value_counts())

diff = data_pol['sent'].value_counts()[0] - data_pol['sent'].value_counts()[1]
print (diff)
d_neg = data_pol[data_pol['sent'] == 0]

drop_indices = np.random.choice(d_neg.index, diff-150, replace=False)

data_pol = data_pol.drop(drop_indices)#balancing the dataset 
    
print(data_pol['sent'].value_counts())

0.0    2472
1.0    1601
Name: sent, dtype: int64
871
0.0    1751
1.0    1601
Name: sent, dtype: int64


In [37]:
data_pol = data_pol.sample(frac=1).reset_index(drop=True)

data_null = data_pol[np.isnan(data_pol['sent'])]
drop_indices = np.random.choice(data_null.index, data_null.shape[0], replace=False)

data_pol = data_pol.drop(drop_indices)

data_pol = data_pol.sample(frac=1).reset_index(drop=True)

#train-test split
data_pol_test = data_pol[data_pol.index < 500] 
data_pol_train = data_pol[data_pol.index >= 500]


print(data_pol_test['sent'].value_counts())
print(data_pol_train['sent'].value_counts())

0.0    266
1.0    234
Name: sent, dtype: int64
0.0    1485
1.0    1367
Name: sent, dtype: int64


In [38]:
#Combining the dataset,  
data_pol = data_pol.sample(frac=1).reset_index(drop=True)
data = data.append(data_kaggle_train,ignore_index=True)
data = data.append(data_pol_train,  ignore_index=True)
data = data.append(data_pol_train,  ignore_index=True)

print(data['sent'].value_counts())

0.0    24361
1.0    24214
Name: sent, dtype: int64


In [39]:
#Removing sensible words such as the names of candidates and parties
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))
data['text'] = data['text'].apply((lambda x: re.sub(':\)','',x)))
data['text'] = data['text'].apply((lambda x: re.sub(':\(','',x)))
data['text'] = data['text'].apply((lambda x: re.sub('#','',x)))
data['text'] = data['text'].apply((lambda x: re.sub('Berlusconi','akfha',x)))
data['text'] = data['text'].apply((lambda x: re.sub('Renzi','lafh',x)))
data['text'] = data['text'].apply((lambda x: re.sub('Salvini','kjahfka',x)))
data['text'] = data['text'].apply((lambda x: re.sub('Grillo','lhalahfl',x)))
data['text'] = data['text'].apply((lambda x: re.sub('berlusconi','akfha',x)))
data['text'] = data['text'].apply((lambda x: re.sub('renzi','lafh',x)))
data['text'] = data['text'].apply((lambda x: re.sub('salvini','kjahfka',x)))
data['text'] = data['text'].apply((lambda x: re.sub('grillo','lhalahfl',x)))
data['text'] = data['text'].apply((lambda x: re.sub('Lega','sds',x)))
data['text'] = data['text'].apply((lambda x: re.sub('PD ','fsfsfsf',x)))
data['text'] = data['text'].apply((lambda x: re.sub('FI ','dsgsgsgs',x)))

print(data['sent'].value_counts())

data = data.sample(frac=1).reset_index(drop=True)

#initializing the tokenizer
num_words = 25000
maxlen = 30
tokenizer = Tokenizer(num_words=num_words, split=' ') 
tokenizer.fit_on_texts(data['text'].values)

#tokening the dataset
tweets = tokenizer.texts_to_sequences(data['text'].values)
tweets = pad_sequences(tweets, maxlen=maxlen)

#saving the tokenizer
tokenizer_file = open("model_files/tokenizer.pickle", "wb")
pickle.dump(tokenizer, tokenizer_file)
tokenizer_file.close()

0.0    24361
1.0    24214
Name: sent, dtype: int64


In [40]:
#building the model
emb_dim = 128 #128
lstm_size = 196 #196
lstm2_size = 98

model = Sequential()
model.add(Embedding(num_words, emb_dim,input_length = tweets.shape[1]))
model.add(Dropout(0.5, noise_shape=None, seed=None))
model.add(LSTM(lstm_size, dropout=0.5, recurrent_dropout=0.5,return_sequences=True))
model.add(LSTM(lstm2_size, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(2,activation='softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer= 'adam' ,metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 128)           3200000   
_________________________________________________________________
dropout_2 (Dropout)          (None, 30, 128)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 30, 196)           254800    
_________________________________________________________________
lstm_4 (LSTM)                (None, 98)                115640    
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 198       
Total params: 3,570,638
Trainable params: 3,570,638
Non-trainable params: 0
_________________________________________________________________
None


In [41]:
labels = pd.get_dummies(data['sent']).values
tweets_train, tweets_test, labels_train, labels_test = train_test_split(tweets,labels, test_size = 0.1, random_state = 42)
print(tweets_train.shape,labels_train.shape)
print(tweets_test.shape,labels_test.shape)

(43717, 30) (43717, 2)
(4858, 30) (4858, 2)


In [42]:
batch_size = 512
model.fit(tweets_train, labels_train, epochs = 7, batch_size=batch_size, verbose = 2)

Epoch 1/7
13s - loss: 0.6375 - acc: 0.6245
Epoch 2/7
13s - loss: 0.5188 - acc: 0.7477
Epoch 3/7
13s - loss: 0.4567 - acc: 0.7887
Epoch 4/7
13s - loss: 0.4151 - acc: 0.8157
Epoch 5/7
13s - loss: 0.3825 - acc: 0.8335
Epoch 6/7
13s - loss: 0.3608 - acc: 0.8452
Epoch 7/7
13s - loss: 0.3376 - acc: 0.8566


<keras.callbacks.History at 0x7efcfc4cf438>

In [43]:
validation_size = 500

tweets_validate = tweets_test[0:validation_size]
labels_validate = labels_test[0:validation_size]
tweets_test = tweets_test[validation_size:]
labels_test = labels_test[validation_size:]
score,acc = model.evaluate(tweets_test, labels_test, verbose = 1, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

acc: 0.75


In [44]:
score,acc = model.evaluate(tweets_validate, labels_validate, verbose = 1, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.59
acc: 0.73


In [45]:
model.save('model_files/ItalianSentCls.h5')

In [46]:
tweets_kaggle = data_kaggle_test['text'].values
tweets_kaggle = tokenizer.texts_to_sequences(data_kaggle_test['text'].values)
tweets_kaggle = pad_sequences(tweets_kaggle, maxlen=maxlen)
labels_kaggle = pd.get_dummies(data_kaggle_test['sent']).values
score,acc = model.evaluate(tweets_kaggle, labels_kaggle, verbose = 1, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

acc: 0.73


In [170]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(tweets_kaggle)):    
    result = model.predict(tweets_kaggle[x].reshape(1,tweets_test.shape[1]),batch_size=1,verbose = 2)[0]   
    if random.uniform(0, 1) < 0.0003:
        print(result)
        print(x)   
    if result[0]>result[1]:
        conf = result[0]
    else:
        conf = result[1]   
    if conf > 0.80:
        if np.argmax(result) == np.argmax(labels_kaggle[x]):
            if np.argmax(labels_kaggle[x]) == 0:
                neg_correct += 1
            else:
                pos_correct += 1

        if np.argmax(labels_kaggle[x]) == 0:
            neg_cnt += 1
        else:
            pos_cnt += 1

print("pos_acc", pos_correct/pos_cnt*100, "%", pos_cnt)
print("neg_acc", neg_correct/neg_cnt*100, "%", neg_cnt)

KeyboardInterrupt: 

In [47]:
tweets_data_pol_test = data_pol_test['text'].values
tweets_data_pol_test = tokenizer.texts_to_sequences(data_pol_test['text'].values)
tweets_data_pol_test = pad_sequences(tweets_data_pol_test, maxlen=maxlen)
labels_data_pol_test = pd.get_dummies(data_pol_test['sent']).values
score,acc = model.evaluate(tweets_data_pol_test, labels_data_pol_test, verbose = 1, batch_size = batch_size)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

score: 0.61
acc: 0.72


In [48]:
tweets_data_pol_train = data_pol_train['text'].values
tweets_data_pol_train = tokenizer.texts_to_sequences(data_pol_train['text'].values)
tweets_data_pol_train = pad_sequences(tweets_data_pol_train, maxlen=maxlen)
labels_data_pol_train = pd.get_dummies(data_pol_train['sent']).values
diff = data_pol['sent'].value_counts()[0] - data_pol['sent'].value_counts()[1]
print (diff)
    
print(data_pol_train['sent'].value_counts())

150
0.0    1485
1.0    1367
Name: sent, dtype: int64


In [49]:
batch_size = 512
model.fit(tweets_data_pol_train, labels_data_pol_train, epochs = 2, batch_size=batch_size, verbose = 2)

Epoch 1/2
0s - loss: 0.1965 - acc: 0.9299
Epoch 2/2
0s - loss: 0.1714 - acc: 0.9355


<keras.callbacks.History at 0x7efcdc7d5940>

In [50]:
pos_cnt, neg_cnt, pos_correct, neg_correct = 0, 0, 0, 0
for x in range(len(tweets_data_pol_test)):
    
    result = model.predict(tweets_data_pol_test[x].reshape(1,tweets_test.shape[1]),batch_size=1,verbose = 2)[0]
    
    if random.uniform(0, 1) < 0.001:
        print(result)
        print(x)
    
    if result[0]>result[1]:
        conf = result[0]
    else:
        conf = result[1]
   
    if conf > 0.80:
        if np.argmax(result) == np.argmax(labels_data_pol_test[x]):
            if np.argmax(labels_data_pol_test[x]) == 0:
                neg_correct += 1
            else:
                pos_correct += 1

        if np.argmax(labels_kaggle[x]) == 0:
            neg_cnt += 1
        else:
            pos_cnt += 1



print("pos_acc", pos_correct/pos_cnt*100, "%", pos_cnt)
print("neg_acc", neg_correct/neg_cnt*100, "%", neg_cnt)

[0.220595 0.779405]
100
[0.9884454  0.01155459]
223
pos_acc 74.61928934010153 % 197
neg_acc 85.94594594594595 % 185


In [58]:
text = 'Questa cosa non non non  mi piace particolarmente Renzi'
text = [text]
text = tokenizer.texts_to_sequences(text)
text = pad_sequences(text, maxlen = 30)

print(model.predict(text))

[[0.80771905 0.19228093]]
