In [None]:
from copy import deepcopy
from string import punctuation
from random import shuffle
import numpy as np

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.tokenize import TweetTokenizer
import gc

from keras.models import Sequential
from keras.layers import Activation
from keras.callbacks import EarlyStopping 
from keras.layers.convolutional import Conv1D
from keras.optimizers import Adam
from keras.layers.core import Dense, Dropout, Flatten,Reshape
import keras.backend as K


In [None]:
# Raw training tweet files; loadData() is called with a '../' prefix for these.
neg_twit_path='data/train_neg_full.txt'
pos_twit_path='data/train_pos_full.txt'
# Raw test file (lines are split on the first ',' into id and tweet) and the
# tweet-only file that read_write_test_data() derives from it.
test_tweet_path='data/test_data.txt'
test_tweet_only_path='data/test_tweet_data.txt'
# Output locations for the cleaned tweet files (used without the '../' prefix).
neg_twit_cleaned='cleaned twits/train_neg.txt'
pos_twit_cleaned='cleaned twits/train_pos.txt'
test_tweet_cleaned='cleaned twits/test_data.txt'
# Embedding file opened by create_glove_model().
gloveFile='Glove embeddings/glove.twitter.27B.50d.txt'

# Shared tokenizer instance used by tokenize().
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)


In [None]:
def cleanTweets(tweets):
    """Remove numbers, hashtags, markup tokens and punctuation from tweet text.

    The text is cleaned in two regex passes: the first drops tokens that
    contain digits, hashtags, underscore-prefixed words, double quotes and
    ``<...>`` delimited words; the second drops punctuation and math symbols.

    Args:
        tweets: A string holding one or more tweets (possibly a whole file).

    Returns:
        The cleaned string. Whitespace left behind by removed tokens is kept.
    """
    remove_numbers_start = r'\d+(\w*|\d*)'
    remove_numbers_end = r'(\w*|\d*)\d+'
    remove_hashtags = r'#(\w*|#*|\d*)'
    remove_underscores = r'_(\w*)'
    # The hyphen must be escaped: an unescaped ':-_' is the character RANGE
    # from ':' to '_', which also deletes every uppercase letter.
    remove_punctuation = r"[.,;'?():\-_!$&%{}~/|]"
    remove_quotation = r'"'
    remove_math_ops = r'[-+.^:,*]'
    # '\n' is excluded so that a stray '<' in one tweet cannot pair with a '>'
    # in a later tweet and delete every line in between.
    remove_delimeted_words = r'<[^>\n]+>'

    token_pattern = r'|'.join((remove_numbers_start, remove_numbers_end,
                               remove_hashtags, remove_underscores,
                               remove_quotation, remove_delimeted_words))
    symbol_pattern = r'|'.join((remove_punctuation, remove_math_ops))

    without_tokens = re.sub(token_pattern, '', tweets)
    cleaned = re.sub(symbol_pattern, '', without_tokens)
    print("clean tweets:  %d" % len(cleaned))
    return cleaned

In [None]:
# Substitution callback for re.sub: rewrites a matched run of a repeated
# character as exactly two copies of that character.
def repl(matchObj):
    repeated = matchObj.group(1)
    return repeated + repeated

In [None]:
def replaceRepeatingCharacters(tweets):
    """Collapse every run of a repeated word character to two occurrences.

    Args:
        tweets: Text to normalise.

    Returns:
        The text with each run matched by ``(\\w)\\1+`` rewritten by ``repl``.
    """
    return re.sub(r"(\w)\1+", repl, tweets)

In [None]:
def loadData(path,inputF, outputTweets):
    """Read a raw tweet file, clean it and write the cleaned text to disk.

    Args:
        path: Prefix that is concatenated in front of ``inputF``.
        inputF: Path of the raw tweet file, relative to ``path``.
        outputTweets: Destination path for the cleaned tweets.
    """
    print('Loading raw data, cleaning and writing cleaned data...')

    inputSet = path + inputF

    # Context manager so the handle is released even if reading fails.
    with open(inputSet) as raw_file:
        raw_data = raw_file.read()
    print("raw_data: %d" % len(raw_data))

    cleaned_data = cleanTweets(raw_data)
    strip_repeated_chars = replaceRepeatingCharacters(cleaned_data)

    writeToFile(strip_repeated_chars, outputTweets)


In [None]:
def writeToFile(data,outputSet):
    """Write cleaned data to ``outputSet``.

    Args:
        data: Either a list/tuple (each item is written on its own line) or a
            string (leading whitespace is stripped and runs of spaces are
            collapsed to a single space before writing).
        outputSet: Path of the file to (over)write.

    Raises:
        TypeError: If ``data`` is neither a list/tuple nor a string. The
            check runs before the file is opened, so an unsupported value
            does not truncate an existing file.
    """
    print("writing to file...")
    is_sequence = isinstance(data, (list, tuple))
    if not is_sequence and not isinstance(data, str):
        raise TypeError("writeToFile expects a list or a str, got %s"
                        % type(data).__name__)

    # 'with' guarantees the buffer is flushed and the handle closed.
    with open(outputSet, 'w') as result:
        if is_sequence:
            result.writelines("%s\n" % item for item in data)
        else:
            # remove whitespaces generated by data cleaning
            result.write(re.sub(' +', ' ', data.lstrip()))
        


In [None]:
def create_glove_model():
    """Load the GloVe embedding file named by ``gloveFile`` into a dict.

    Each non-empty line is split on single spaces: the first field is the
    word, the remaining fields are parsed as floats.

    Returns:
        dict mapping each word to a 1-D numpy array of its embedding values.
    """
    import io  # io.open gives the same decoded-text behaviour on Python 2 and 3

    print("Loading Glove Model...")
    glove_model = {}
    # Decoding as UTF-8 makes the keys text, so they compare equal to the
    # decoded tokens produced by tokenize() for non-ASCII words as well.
    with io.open(gloveFile, 'r', encoding='utf-8', newline='\n') as f:
        for line in tqdm(f):
            splitLine = line.rstrip('\r\n').split(' ')
            if len(splitLine) < 2:
                # blank or malformed line: nothing to load
                continue
            word = splitLine[0]
            glove_model[word] = np.array([float(val) for val in splitLine[1:]])
    print("Done. %d  words loaded!" % len(glove_model))
    return glove_model

In [None]:
def tokenize(tweet):
    """Lower-case a tweet and split it into tokens with the shared tokenizer.

    Args:
        tweet: The tweet as UTF-8 encoded bytes or as already-decoded text.

    Returns:
        The list of tokens, or the string ``'NC'`` when the tweet cannot be
        decoded or tokenized (best-effort behaviour kept for callers).
    """
    try:
        # bytes is str on Python 2, so raw file lines are decoded on both
        # major versions without relying on the Python-2-only `unicode` name.
        if isinstance(tweet, bytes):
            tweet = tweet.decode('utf-8')
        return tknzr.tokenize(tweet.lower())
    except Exception:
        # Deliberately broad, but no longer swallows KeyboardInterrupt or
        # SystemExit as the bare except did.
        return 'NC'

In [None]:
def file_to_tokenized_twits(filename):
    """Tokenize every line of ``filename``.

    Args:
        filename: Path of a file that is read in binary mode, line by line.

    Returns:
        A 1-D numpy object array with one entry per line, each entry being
        whatever ``tokenize`` returned for that line.
    """
    print('File reading and tokenizing clean tweets...')
    with open(filename, 'rb') as data:
        tokenized_tweet_all = [tokenize(tweet) for tweet in tqdm(data)]

    # Fill an object array element by element: np.array() on ragged lists is
    # rejected by current NumPy and would build a 2-D array if every tweet
    # happened to have the same length.
    tweets = np.empty(len(tokenized_tweet_all), dtype=object)
    for position, tokens in enumerate(tokenized_tweet_all):
        tweets[position] = tokens
    return tweets

In [None]:
def trim_dataset(X,max_tweet_lenght,tfidf,glove_model):
    """Shorten every tweet that has more than ``max_tweet_lenght`` tokens.

    Args:
        X: Iterable of tokenized tweets.
        max_tweet_lenght: Maximum number of tokens kept per tweet.
        tfidf: Mapping handed to ``select_most_important_words``.
        glove_model: Mapping handed to ``select_most_important_words``.

    Returns:
        A 1-D numpy object array; tweets within the limit are passed through
        unchanged, longer ones are replaced by the result of
        ``select_most_important_words``.
    """
    print('Trim dataset with tweet lenght...')
    X_trim = []
    for tweet in X:
        # Use the parameter; the original body read a same-sounding global
        # (max_tweet_length) and ignored the argument.
        if len(tweet) <= max_tweet_lenght:
            X_trim.append(tweet)
        else:
            X_trim.append(select_most_important_words(
                tweet, max_tweet_lenght, tfidf, glove_model))

    # Element-wise fill keeps the result 1-D even when all tweets share a
    # length, and avoids NumPy's ragged-sequence error.
    trimmed = np.empty(len(X_trim), dtype=object)
    for position, tweet in enumerate(X_trim):
        trimmed[position] = tweet
    return trimmed

In [None]:
def define_tf_idf(corpus):
    """Fit a TfidfVectorizer on pre-tokenized tweets and return the idf table.

    Args:
        corpus: Iterable of tokenized tweets; the identity analyzer makes the
            vectorizer use each tweet's items as its terms.

    Returns:
        dict mapping each term that passed ``min_df=5`` to its idf weight.
    """
    print('building tf-idf matrix ...')
    vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=5)
    # Only the fitted idf_ values are needed, so the transformed matrix is
    # not kept.
    vectorizer.fit(list(corpus))

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back for older releases.
    get_names = (getattr(vectorizer, 'get_feature_names_out', None)
                 or vectorizer.get_feature_names)
    tfidf = dict(zip(get_names(), vectorizer.idf_))
    print('vocab size : %d' % len(tfidf))
    return tfidf

In [None]:
def calculate_max_avarage_lenght(corpus):
    """Print the average and the maximum tweet length of ``corpus``.

    Args:
        corpus: Iterable of tweets; ``len()`` of each item is its length.

    An empty corpus reports 0.0 / 0 instead of raising ZeroDivisionError.
    """
    print('Calculating avarage and max tweet lenght...')
    lengths = [len(tweet) for tweet in corpus]
    tweet_count = len(lengths)
    avg_length = float(sum(lengths)) / tweet_count if tweet_count else 0.0
    max_length = max(lengths) if lengths else 0

    print('Average tweet length: {}'.format(avg_length))
    print('Max tweet length: {}'.format(max_length))


In [None]:
def merge_dataset(pos_data, neg_data):
    """Concatenate positive and negative samples and build their labels.

    Args:
        pos_data: Array of positive samples.
        neg_data: Array of negative samples.

    Returns:
        Tuple ``(X, y)``: ``X`` is ``pos_data`` followed by ``neg_data``;
        ``y`` holds 1 for every positive sample and 0 for every negative one,
        in the same order.
    """
    print('Merging positive and negative dataset...')
    n_pos, n_neg = len(pos_data), len(neg_data)
    X = np.concatenate((pos_data, neg_data))
    # Build the labels directly as arrays rather than via a Python list with
    # one element per sample.
    y = np.concatenate((np.ones(n_pos, dtype=int), np.zeros(n_neg, dtype=int)))
    assert len(y) == X.shape[0]
    assert X.shape[0] == n_pos + n_neg
    return X, y

In [None]:
def define_embeddings(X,y,max_tweet_length,n_dim,glove_model):
    """Turn tokenized tweets and labels into dense arrays.

    Args:
        X: Sequence of tokenized tweets.
        y: Sequence of labels; 0 selects the first output column, any other
            value the second.
        max_tweet_length: Number of token slots per tweet.
        n_dim: Size of one embedding vector.
        glove_model: Mapping from token to embedding vector.

    Returns:
        Tuple ``(X_embedded, Y_embedded)``. ``X_embedded`` has shape
        ``(len(X), max_tweet_length, n_dim)``; slots of tokens missing from
        ``glove_model`` and unused slots stay zero. ``Y_embedded`` has shape
        ``(len(y), 2)`` with a single 1 per row.
    """
    print("Embedding dataset...")
    X_embedded = np.zeros((len(X), max_tweet_length, n_dim), dtype=K.floatx())
    Y_embedded = np.zeros((len(y), 2), dtype=np.int32)
    deficit = 0
    # Wrapping X (not enumerate) lets tqdm see the total length.
    for k, tweet in enumerate(tqdm(X)):
        # Slice so an over-long tweet cannot index past the last slot
        # (IndexError was not handled by the KeyError clause).
        for i, token in enumerate(tweet[:max_tweet_length]):
            try:
                X_embedded[k, i, :] = glove_model[token]
            except KeyError:
                deficit += 1
        Y_embedded[k, 0 if y[k] == 0 else 1] = 1

    print("deficit:  %d" % deficit)
    return X_embedded, Y_embedded

In [None]:
# Select the n (max_tweet_lenght) most important words of a tweet, ranked by
# tf-idf weight, in order to truncate tweets that are too long.
def select_most_important_words(tweet, max_tweet_lenght,tfidf,glove_model):
    """Keep the ``max_tweet_lenght`` highest-scoring tokens of ``tweet``.

    Args:
        tweet: Sequence of tokens.
        max_tweet_lenght: Maximum number of tokens to keep.
        tfidf: Mapping from token to its tf-idf weight.
        glove_model: Container of tokens that have an embedding.

    Returns:
        A numpy array of tokens. Tweets within the limit are returned whole
        and in their original order; longer tweets are returned as the
        selected tokens ordered from highest to lowest score. Tokens missing
        from ``glove_model`` or from ``tfidf`` score -1.
    """
    tweet = np.array(tweet)

    if len(tweet) <= max_tweet_lenght:
        return tweet
    if max_tweet_lenght <= 0:
        # [-0:] would select the whole array instead of nothing
        return tweet[:0]

    scores = np.array(
        [tfidf.get(token, -1) if token in glove_model else -1
         for token in tweet],
        # Rank on the real-valued weights; casting to int64 truncated them
        # and turned most of the ranking into ties.
        dtype=np.float64)

    # indices of the n largest scores, largest first
    idx = scores.argsort()[-max_tweet_lenght:][::-1]
    return tweet[idx]
    

In [None]:
# Not used at the moment, but useful when a single vector per tweet is needed.
def find_vector_representation_of_tweets(tokenized_tweets,dimension,tfidf,embeddings=None):
    """Build one tf-idf weighted average embedding per tweet.

    Args:
        tokenized_tweets: Sequence of tokenized tweets.
        dimension: Size of one embedding vector.
        tfidf: Mapping from token to its tf-idf weight.
        embeddings: Optional mapping from token to embedding vector. When
            omitted, the module-level ``glove_model`` is used, as before.

    Returns:
        Array of shape ``(len(tokenized_tweets), dimension)``. Row i is the
        sum of ``embedding * tfidf`` over the tokens of tweet i that are
        present in both mappings, divided by the number of such tokens; it is
        all zeros when no token qualifies.
    """
    if embeddings is None:
        embeddings = glove_model

    tweets_embeddings = np.zeros((len(tokenized_tweets), dimension))
    deficit = 0
    for i, tokenized_tweet in enumerate(tokenized_tweets):
        # Fresh accumulator per tweet; previously the sum and the count
        # carried over from one tweet to the next.
        vec = np.zeros(dimension)
        count = 0
        for word in tokenized_tweet:
            try:
                vec += np.asarray(embeddings[word]).reshape(dimension) * tfidf[word]
                count += 1
            except KeyError:
                deficit += 1
        if count:
            vec /= count

        tweets_embeddings[i, :] = vec

    print("deficit:  %d" % deficit)
    return tweets_embeddings


In [None]:
# Hyper-parameters shared by the cells below.
max_tweet_length = 50
n_dim=50
batch_size = 32
nb_epochs = 10

# Clean the raw training files and write the cleaned versions to disk.
loadData('../',neg_twit_path,neg_twit_cleaned)
loadData('../',pos_twit_path,pos_twit_cleaned)

# Tokenize the cleaned tweets.
pos_tokenized_twits=file_to_tokenized_twits(pos_twit_cleaned)
neg_tokenized_twits=file_to_tokenized_twits(neg_twit_cleaned)

glove_model=create_glove_model()

# Positive tweets come first in X and are labelled 1 in y.
X, y = merge_dataset(pos_tokenized_twits,neg_tokenized_twits)

tfidf=define_tf_idf(X)

# Truncate tweets longer than max_tweet_length, then report length statistics.
X_trim=trim_dataset(X,max_tweet_length,tfidf,glove_model)

calculate_max_avarage_lenght(X_trim)

X_train, X_test, Y_train, Y_test = train_test_split(X_trim, y, test_size=0.2, random_state=42)

# Call form of print: valid on Python 2 and 3, matching the print(...) calls
# already used in calculate_max_avarage_lenght.
print(X_train.shape)



In [None]:
# The real data is too large for my memory, so the train and test sets are
# each cut into split_count index chunks that are handled one at a time.
split_count=3
train_idx = list(range(X_train.shape[0]))
test_idx = list(range(X_test.shape[0]))

s_train_idx=np.array_split(train_idx, split_count)
s_test_idx=np.array_split(test_idx, split_count)

# Same layer stack as before, handed to Sequential as one list.
model = Sequential([
    # four convolutions with kernel size 3; the first one declares the input shape
    Conv1D(32, kernel_size=3, activation='elu', padding='same',
           input_shape=(max_tweet_length,n_dim)),
    Conv1D(32, kernel_size=3, activation='elu', padding='same'),
    Conv1D(32, kernel_size=3, activation='elu', padding='same'),
    Conv1D(32, kernel_size=3, activation='elu', padding='same'),
    Dropout(0.25),

    # four convolutions with kernel size 2
    Conv1D(32, kernel_size=2, activation='elu', padding='same'),
    Conv1D(32, kernel_size=2, activation='elu', padding='same'),
    Conv1D(32, kernel_size=2, activation='elu', padding='same'),
    Conv1D(32, kernel_size=2, activation='elu', padding='same'),
    Dropout(0.25),

    Flatten(),

    Dense(256, activation='tanh'),
    Dense(256, activation='tanh'),
    Dropout(0.5),

    # two-way softmax output
    Dense(2, activation='softmax'),
])

# Compile the model
optimizer = Adam(lr=0.0001, decay=1e-6)
model.compile(loss='categorical_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])


In [None]:
# Train and validate split_count times, one chunk of the data per iteration,
# so that only one chunk has to be embedded in memory at a time.
for i in range(split_count):
    print("iteration of data:  %d" % i)
    current_X_train=X_train[s_train_idx[i]]
    current_Y_train=Y_train[s_train_idx[i]]
    # Validation samples come from the held-out split. They were previously
    # sliced from X_train/Y_train, so validation overlapped the training data.
    current_X_test=X_test[s_test_idx[i]]
    current_Y_test=Y_test[s_test_idx[i]]
    embedded_X_train,embedded_Y_train=define_embeddings(current_X_train,current_Y_train,max_tweet_length,n_dim,glove_model)
    embedded_X_test,embedded_Y_test=define_embeddings(current_X_test,current_Y_test,max_tweet_length,n_dim,glove_model)
    # Fit the model
    model.fit(embedded_X_train, embedded_Y_train,
              batch_size=batch_size,
              shuffle=True,
              epochs=nb_epochs,
              validation_data=(embedded_X_test, embedded_Y_test),
              callbacks=[EarlyStopping(min_delta=0.00025, patience=2)])

    # Drop every per-chunk array (labels included) before the next chunk is
    # embedded, and ask the collector to reclaim the memory right away.
    del current_X_train,current_Y_train,current_X_test,current_Y_test
    del embedded_X_train,embedded_Y_train,embedded_X_test,embedded_Y_test
    gc.collect()

In [None]:
# Read "id,tweet" lines from the test file and write only the tweets.
def read_write_test_data(base_dir,input_path,output_path):
    """Split the test file into a list of ids and a tweet-only file.

    Args:
        base_dir: Prefix concatenated in front of both paths.
        input_path: File whose lines look like ``<id>,<tweet>``; only the
            first comma separates the id from the tweet.
        output_path: File that receives one tweet per line.

    Returns:
        List of the ids, in file order. Blank lines are skipped for both the
        ids and the tweets, so the two stay aligned.

    Raises:
        ValueError: If a non-blank line contains no comma.
    """
    import io  # io.open handles UTF-8 text identically on Python 2 and 3

    print("reading test data, seperating id and write only tweets")
    idxes = []
    # newline='\n' keeps '\n' as the only line separator, as in binary mode.
    with io.open(base_dir+input_path, 'r', encoding='utf-8', newline='\n') as data, \
            io.open(base_dir+output_path, 'w', encoding='utf-8', newline='\n') as result:
        for line_number, line in enumerate(tqdm(data), 1):
            line = line.strip()
            if not line:
                continue
            idx, separator, tweet = line.partition(',')
            if not separator:
                raise ValueError("line %d of %s has no ',' between id and tweet"
                                 % (line_number, base_dir+input_path))
            idxes.append(idx)
            result.write(u"%s\n" % tweet)
    return idxes
    

In [None]:
# Create the embeddings of the test tweets and truncate tweets longer than
# max_tweet_length. The CNN input has the format [sample, max_length, emb_vec].
def test_embeddings(corpus,max_tweet_length,n_dim,glove_model,tfidf):
    """Embed tokenized test tweets into a dense array.

    Args:
        corpus: Sequence of tokenized tweets.
        max_tweet_length: Number of token slots per tweet.
        n_dim: Size of one embedding vector.
        glove_model: Mapping from token to embedding vector.
        tfidf: Mapping handed to ``select_most_important_words``.

    Returns:
        Array of shape ``(len(corpus), max_tweet_length, n_dim)``; slots of
        tokens missing from ``glove_model`` and unused slots stay zero.
    """
    print("Embedding test data...")
    test_embedded = np.zeros((len(corpus), max_tweet_length, n_dim), dtype=K.floatx())

    deficit = 0
    # Wrapping corpus (not enumerate) lets tqdm see the total length.
    for k, tweet in enumerate(tqdm(corpus)):
        # list() turns the 'NC' failure sentinel of tokenize() into a token
        # sequence; passed as a bare string it becomes a 0-d array on which
        # len() raises TypeError.
        trimmed_tweet = select_most_important_words(list(tweet), max_tweet_length, tfidf, glove_model)
        for i, token in enumerate(trimmed_tweet):
            try:
                test_embedded[k, i, :] = glove_model[token]
            except KeyError:
                deficit += 1

    print("deficit:  %d" % deficit)
    return test_embedded

In [None]:
# Load the tweet-only test file and write it to the output path.
# Note: the cleaning steps used for the training data are intentionally not
# applied here, because they removed some test tweets.
def loadTestData(path,inputF, outputTweets):
    """Copy the raw test tweets to ``outputTweets`` without cleaning them.

    Args:
        path: Prefix that is concatenated in front of ``inputF``.
        inputF: Path of the tweet-only test file, relative to ``path``.
        outputTweets: Destination path; it receives the input bytes
            unchanged so the number of lines is preserved.
    """
    print('Loading raw data, cleaning and writing cleaned data...')

    inputSet = path + inputF

    # Binary mode: the content is copied byte for byte.
    with open(inputSet, 'rb') as raw_file:
        raw_data = raw_file.read()
    print("raw_data: %d" % len(raw_data))

    # The file must actually be written: handle_test_data() tokenizes
    # outputTweets right after this call.
    with open(outputTweets, 'wb') as out_file:
        out_file.write(raw_data)


In [None]:
# Load the "id,tweet" file --> separate the tweets and write them back -->
# load only the tweets --> tokenize --> embed --> predict all tweets at once.
def handle_test_data():
    """Predict the test tweets and write ``submission_cnn.txt``.

    The submission file has the header ``Id,Prediction`` followed by one
    ``<id>,<label>`` line per tweet, where the label is -1 when the first
    model output is larger than the second and 1 otherwise.

    Raises:
        ValueError: If the number of ids differs from the number of
            predictions, since the rows could then not be matched up.
    """
    print("handling test data and prediction...")
    idx=read_write_test_data("../",test_tweet_path,test_tweet_only_path)
    loadTestData('../',test_tweet_only_path,test_tweet_cleaned)
    test_tokenized_twits=file_to_tokenized_twits(test_tweet_cleaned)
    # Report the number of tweets (previously the length of the path string).
    print("tokenized: %d" % len(test_tokenized_twits))

    embedded_test_tweets=test_embeddings(test_tokenized_twits,max_tweet_length,n_dim,glove_model,tfidf)
    prediction=model.predict(embedded_test_tweets)
    if len(idx) != len(prediction):
        raise ValueError("got %d ids but %d predictions"
                         % (len(idx), len(prediction)))

    # Opened only once the predictions exist, and closed even on error.
    with open('submission_cnn.txt', 'w') as f:
        f.write("Id,Prediction\n")
        for tweet_id, scores in tqdm(zip(idx, prediction)):
            results = -1 if scores[0] > scores[1] else 1
            f.write("%s,%s\n" % (tweet_id, results))

In [None]:
# Run the test-set pipeline; handle_test_data() writes submission_cnn.txt.
handle_test_data()