In [1]:
# Part 1: Extract Transform and Load
# Assumptions: tweets are stored in a tsv file

from __future__ import absolute_import, division, print_function
import pandas as pd
import ETL

# Tab-separated tweet file handed to the ETL step.
fpath = r"/Users/Bart/Desktop/AITeam/semeval1.tsv"
# Forwarded unchanged to ETL.main; its exact meaning is defined in the ETL module.
percentageOfAnnotated = 0.2

# Run the ETL step, which loads both annotated and neutral tweets.
(
    x_train,
    y_train,
    x_test,
    y_test,
    vocabulary_inv,
    neutral_tweets,
) = ETL.main(fpath, percentageOfAnnotated)

Parsed 0/315725 tweets with 0 exceptions
Parsed 31572/315725 tweets with 33 exceptions
Parsed 63144/315725 tweets with 33 exceptions
Parsed 94716/315725 tweets with 33 exceptions
Parsed 126288/315725 tweets with 33 exceptions
Parsed 157860/315725 tweets with 33 exceptions
Parsed 189432/315725 tweets with 33 exceptions
Parsed 221004/315725 tweets with 34 exceptions
Parsed 252576/315725 tweets with 34 exceptions
Parsed 284148/315725 tweets with 34 exceptions
Parsed 315720/315725 tweets with 34 exceptions
Job finished. Go home and drink a beer.
Load data...
Nog even, en het internetbankieren bij ING ligt helemaal plat. #tragesite #backupvanhetjaaraanhetdraaien
Step 1: Neutral loading done
Step 2: Annotated loading done
Padding length determined as 136
Step 3: Padding done with 315691 annotated tweets and 89416 neutral tweets.
3 done
4 done
[122030   3624      1      1      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0  

In [2]:
### Part 2A: Hyperparameters
"""
Train convolutional network for sentiment analysis (originally written for the IMDB corpus; this notebook applies it to tweets loaded by the ETL step). Based on
"Convolutional Neural Networks for Sentence Classification" by Yoon Kim
http://arxiv.org/pdf/1408.5882v2.pdf

For "CNN-rand" and "CNN-non-static" gets to 88-90%, and "CNN-static" - 85% after 2-5 epochs with following settings:
embedding_dim = 50          
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50

Differences from original article:
- larger IMDB corpus, longer sentences; sentence length is very important, just like data size
- smaller embedding dimension, 50 instead of 300
- 2 filter sizes instead of original 3
- fewer filters; original work uses 100, experiments show that 3-10 is enough;
- random initialization is no worse than word2vec init on IMDB corpus
- sliding Max Pooling instead of original Global Pooling
"""

import numpy as np


# Seed NumPy's global random number generator.
np.random.seed(0)

# Network variant to build. The network-definition cell accepts "CNN-rand",
# "CNN-non-static" and "CNN-static" and raises ValueError for anything else.
model_type = "CNN-non-static"  

# Data source
# NOTE(review): not referenced by any other cell visible in this notebook.
data_source = "local_dir"

# Model Hyperparameters
# embedding_dim: width of each word vector (Embedding layer and word2vec num_features).
embedding_dim = 50
# filter_sizes: one convolution branch is built per kernel size listed here.
filter_sizes = (3, 8)
# num_filters: number of filters in each convolution branch.
num_filters = 20 #10
# dropout_prob: [0] is applied to the embedded input, [1] to the merged convolution output.
dropout_prob = (0.5, 0.8)
# hidden_dims: units in the dense layer that precedes the sigmoid output.
hidden_dims = 50

# Training parameters
# Both values are passed straight to model.fit in the training cell.
batch_size = 64
num_epochs = 1 #10

# Preprocessing parameters
# sequence_length is only a nominal value: the network-definition cell replaces it
# with x_test.shape[1] whenever the two differ.
sequence_length = 140 #400
# NOTE(review): max_words is not referenced by any other cell visible in this notebook.
max_words = 5000

# Word2Vec parameters (see train_word2vec)
# Both are forwarded as keyword arguments to train_word2vec.
min_word_count = 1
context = 10
# ---------------------- Parameters end -----------------------
print("Completed.")


Completed.


In [3]:
# Sanity check: is the word "tragesite" among the vocabulary entries?
print(any(word == "tragesite" for word in vocabulary_inv.values()))

True


In [4]:
# Show the shapes of the training inputs and labels, then the example at index 1 of each.
print([split.shape for split in (x_train, y_train)])
print([split[1] for split in (x_train, y_train)])

[(284121, 136), (284121,)]
[array([136527,      2,    126,      4,     54,    659,     23,    230,
           93,      4,     53,    114,    185,      7,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,   

In [5]:
### Part 2B: Network definition & word2vec training
### make sure to delete the existing word2vec model if you want to update it
from w2v import train_word2vec

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate
from keras.datasets import imdb
from keras.preprocessing import sequence

# The padded width of the data wins over the configured sequence length.
if x_test.shape[1] != sequence_length:
    print("Adjusting sequence length for actual size")
    sequence_length = x_test.shape[1]

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

# Prepare embedding layer weights and convert inputs for static model
print("Model type is", model_type)
if model_type == "CNN-rand":
    # No pre-computed vectors are prepared for this variant.
    embedding_weights = None
elif model_type in ("CNN-non-static", "CNN-static"):
    print('Initiating word2vec.')
    # word2vec is given every available tweet: train, test and neutral.
    embedding_weights = train_word2vec(
        np.vstack((x_train, x_test, neutral_tweets)),
        vocabulary_inv,
        num_features=embedding_dim,
        min_word_count=min_word_count,
        context=context,
    )
    print('Word2vec done.')
    if model_type == "CNN-static":
        # Static variant: replace each word id with its vector up front,
        # so the network input becomes (samples, sequence, embedding).
        x_train = np.stack([np.stack([embedding_weights[token] for token in tweet]) for tweet in x_train])
        x_test = np.stack([np.stack([embedding_weights[token] for token in tweet]) for tweet in x_test])
        print("x_train static shape:", x_train.shape)
        print("x_test static shape:", x_test.shape)
else:
    raise ValueError("Unknown model type")

# Build model
# The static variant is fed pre-embedded vectors, so every time step is already
# embedding_dim wide; the other variants are fed sequences of word ids.
input_shape = (sequence_length, embedding_dim) if model_type == "CNN-static" else (sequence_length,)

model_input = Input(shape=input_shape)

# Static model does not have embedding layer
if model_type != "CNN-static":
    z = Embedding(len(vocabulary_inv), embedding_dim, input_length=sequence_length, name="embedding")(model_input)
else:
    z = model_input

z = Dropout(dropout_prob[0])(z)

# Convolutional block: one conv -> max-pool -> flatten branch per filter size
conv_blocks = []
for kernel_width in filter_sizes:
    branch = Convolution1D(
        filters=num_filters,
        kernel_size=kernel_width,
        padding="valid",
        activation="relu",
        strides=1,
    )(z)
    branch = MaxPooling1D(pool_size=2)(branch)
    conv_blocks.append(Flatten()(branch))

# A single branch needs no merge layer.
if len(conv_blocks) > 1:
    z = Concatenate()(conv_blocks)
else:
    z = conv_blocks[0]

z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

# Initialize weights with word2vec
if model_type == "CNN-non-static":
    # The embedding layer looks vectors up by word id, so row i of the matrix must
    # be the vector stored under the i-th word id. Order the rows by key explicitly
    # instead of relying on whatever iteration order the mapping happens to have.
    weights = np.array([embedding_weights[word_id] for word_id in sorted(embedding_weights)])
    print("Initializing embedding layer with word2vec weights, shape", weights.shape)
    embedding_layer = model.get_layer("embedding")
    # The layer's weight list holds a single array: the lookup matrix itself.
    embedding_layer.set_weights([weights])



Using TensorFlow backend.


Adjusting sequence length for actual size
x_train shape: (284121, 136)
x_test shape: (31570, 136)
Vocabulary Size: 408513
Model type is CNN-non-static
Initiating word2vec.
Load existing Word2Vec model '50features_1minwords_10context'
Word2vec done.
Initializing embedding layer with word2vec weights, shape (408513, 50)


In [6]:
# Part 3: Training the model
# A single epoch was found to perform better on new data.
# Timing note (16 Oct): training with ~350k annotated and ~130k neutral tweets took about 260 s.

# Left as a bare expression so the notebook shows the returned History object.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(x_test, y_test),
    verbose=1,
)

Train on 284121 samples, validate on 31570 samples
Epoch 1/1


<keras.callbacks.History at 0x112d85fd0>

In [None]:
# Saving is disabled; uncomment the next line to write the current `model` to disk.
#model.save("/Users/Bart/Desktop/AITeam/AI_team_CNN_v0.2/models/theperfectmodel.h5")
from keras.models import load_model
# Rebinds `model` to the network stored on disk, replacing whatever `model`
# referred to before this cell ran (e.g. the network trained in this session).
model = load_model("/Users/Bart/Desktop/AITeam/AI_team_CNN_v0.2/models/theperfectmodel.h5")

In [7]:
import assignSentiment

# NOTE: this rebinds `fpath`, which the first cell set to the training .tsv file.
# Presumably a directory of per-company tweet files (judging by the path) — confirm.
fpath = r'/Users/Bart/Desktop/AITeam/TweetsSplitByCompany/'

#This is very slow, 600 lines per minute for me, shows progress every 10k lines
# Needs `vocabulary_inv`, `model` and `sequence_length` from the earlier cells.
assignSentiment.test(fpath, vocabulary_inv, model, sequence_length)

b'Skipping line 485: expected 2 fields, saw 6\nSkipping line 7898: expected 2 fields, saw 6\nSkipping line 8039: expected 2 fields, saw 6\nSkipping line 16598: expected 2 fields, saw 6\nSkipping line 17199: expected 2 fields, saw 6\n'
b'Skipping line 1889: expected 2 fields, saw 6\nSkipping line 3489: expected 2 fields, saw 6\nSkipping line 5346: expected 2 fields, saw 6\nSkipping line 5560: expected 2 fields, saw 6\nSkipping line 7894: expected 2 fields, saw 6\nSkipping line 9838: expected 2 fields, saw 6\nSkipping line 15582: expected 2 fields, saw 6\nSkipping line 15637: expected 2 fields, saw 6\nSkipping line 16868: expected 2 fields, saw 6\nSkipping line 22888: expected 2 fields, saw 6\nSkipping line 24481: expected 2 fields, saw 6\nSkipping line 26265: expected 2 fields, saw 6\nSkipping line 31100: expected 2 fields, saw 6\nSkipping line 31788: expected 2 fields, saw 6\nSkipping line 32804: expected 2 fields, saw 6\nSkipping line 33690: expected 2 fields, saw 6\nSkipping line 339

In [None]:
# Repeats the sentiment-assignment call. It relies on `fpath`, `vocabulary_inv`,
# `model` and `sequence_length` already being defined by earlier cells.
import assignSentiment
assignSentiment.test(fpath, vocabulary_inv, model, sequence_length)

In [None]:
### Optional writing in a command line:
import classify_sentiment

# Interactive session: every line typed by the user is classified, printed and
# appended to save_examples.tsv until the user enters 'quit'.
# The `with` block closes the file even if classification raises part-way through.
with open('save_examples.tsv', "w") as file:
    while True:
        command = input()
        if command == 'quit':
            # The sentinel only ends the session; it is not a tweet, so it is
            # neither classified nor written to the examples file.
            break
        y = classify_sentiment.checkTweet(command, vocabulary_inv, model, sequence_length)
        output = str(y)+' '+str(classify_sentiment.val2sen(y))+r' Tweet: "'+str(command)+'"'
        print(output)
        file.write(output+"\n")

In [None]:
from keras.models import load_model
# Rebinds `model` to the network stored in amodel.h5; any model trained or
# loaded earlier in the session is no longer reachable through `model`.
model = load_model("/Users/Bart/Desktop/AITeam/AI_team_CNN_v0.2/models/amodel.h5")

In [None]:
# Import only the helper this cell uses. A wildcard import would copy every public
# name of the module into the notebook namespace and could silently overwrite
# variables defined by earlier cells.
from data_helpers_neutrals import load_neutral_data

# Print whatever the neutral-data loader returns.
print(load_neutral_data())