In [1]:
import glob
import os
import random as rd
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import regex as re
from keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TweetTokenizer
from textblob import TextBlob

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Seed Python's and NumPy's global random generators with one shared value
# so that repeated runs give reproducible results.
seed = 7
rd.seed(seed)
np.random.seed(seed)

In [3]:
# Directory the input files are read from, and directory the generated files are written to.
path, output = 'data/', 'output_trainTestData/'

In [4]:
# Locations of the training file and of the gold-labelled test file inside `path`.
TRAIN_SAMEVAL = ''.join((path, 'SemEval2018-T3-train-taskA_emoji.txt'))
GOLD_TEST_SAMEVAL = ''.join((path, 'SemEval2018-T3_gold_test_taskA_emoji.txt'))

In [5]:
# Build the word index and the padded train / test matrices
def create_train_test_wordindex(vocab, tweets, corpustrain):
    """Encode tokenised tweets as padded index sequences and build the word index.

    Args:
        vocab: Counter holding the frequency of every token.
        tweets: List of token lists. The first ``len(corpustrain)`` entries
            are encoded as training data, the remaining ones as test data.
        corpustrain: Training corpus; only its length is used, to find the
            split point inside ``tweets``.

    Returns:
        Tuple ``(train_data, test_data, word_index)``. ``word_index`` maps
        each token to its frequency rank starting at 1; both matrices are
        padded at the front and truncated at the end to the length of the
        longest tweet.
    """
    longest = max((len(tokens) for tokens in tweets), default=0)
    print('Maximal Sequence Length: '+str(longest))

    # Rank 1 is the most frequent token; 0 is the fallback for tokens that
    # are missing from the index.
    word_index = {}
    for rank, (token, _count) in enumerate(vocab.most_common(len(vocab)), start=1):
        word_index[token] = rank

    def encode(batch):
        return [[word_index.get(token, 0) for token in tokens] for tokens in batch]

    n_train = len(corpustrain)
    train_data = pad_sequences(encode(tweets[:n_train]), maxlen=longest,
                               padding="pre", truncating="post")
    test_data = pad_sequences(encode(tweets[n_train:]), maxlen=longest,
                              padding="pre", truncating="post")

    return train_data, test_data, word_index

In [6]:
# Store data, labels and word_index for reproducibility
def write_file(filename, data):
    """Write ``data`` to ``filename`` as a header-less, index-less CSV.

    An existing file is never overwritten; a warning is printed instead.

    Args:
        filename: Destination path.
        data: Anything ``pandas.DataFrame`` accepts (list, array, ...).
    """
    # os.path.exists checks the literal path. glob.glob would interpret
    # characters such as '[', '*' or '?' as wildcards and could miss an
    # existing file with such a name.
    if os.path.exists(filename):
        print('Warining: File '+str(filename)+' already exists.')
        return
    pd.DataFrame(data).to_csv(filename, header=None, index=None)
        
def write_index(filename, index):
    """Write a mapping to ``filename`` as tab-separated ``key<TAB>value`` lines.

    Lines are separated by a newline and no trailing newline is written.
    An existing file is never overwritten; a warning is printed instead.

    Args:
        filename: Destination path.
        index: Mapping whose items are written in iteration order.
    """
    # Literal existence check; glob.glob would treat '[', '*', '?' as wildcards.
    if os.path.exists(filename):
        print('Warining: File '+str(filename)+' already exists.')
        return
    # The context manager closes the handle even if a write fails.
    with open(filename, 'w', encoding="utf-8") as handle:
        handle.write('\n'.join(str(key)+'\t'+str(value)
                               for key, value in index.items()))

In [7]:
def reconstruct(index_file, data_file):
    """Turn a stored matrix of word ids back into lists of tokens.

    Args:
        index_file: UTF-8 file with one ``token<TAB>id`` pair per line.
        data_file: UTF-8 CSV file with one comma-separated row of ids per line.

    Returns:
        One token list per line of ``data_file``. Cells equal to ``'0'`` and
        ids that do not occur in the index are dropped.

    Raises:
        IndexError: If a line of ``index_file`` has no tab-separated second field.
    """
    loaded_word_index = {}
    with open(index_file, 'r', encoding='utf-8') as word_index: #Decoding utf-8, else exception
        for line in word_index:
            fields = line.rstrip().split("\t")
            loaded_word_index[fields[0]] = fields[1]

    # Invert once (id -> tokens) so each cell is a dictionary lookup rather
    # than a scan over the whole index. A list keeps every token should
    # several tokens share an id.
    tokens_by_id = {}
    for token, token_id in loaded_word_index.items():
        tokens_by_id.setdefault(token_id, []).append(token)

    recunstructed_data = []
    with open(data_file, 'r', encoding='utf-8') as data: #Decoding utf-8, else exception
        for line in data:
            tweet = []
            for item in line.rstrip().split(','):
                # Compare by value; 'is not' only tests object identity.
                if item != '0':
                    tweet.extend(tokens_by_id.get(item, []))
            recunstructed_data.append(tweet)

    return recunstructed_data

In [8]:
def create_pos_sequence(index_file, data_file, MAXLEN, pos_dict=None):
    """Build padded sequences of part-of-speech ids for a stored data file.

    The tweets are first rebuilt with ``reconstruct`` and tagged with
    ``nltk.pos_tag``. Every distinct tag gets an integer id in order of first
    appearance, starting at 1 when no mapping is supplied.

    Args:
        index_file: Word-index file passed on to ``reconstruct``.
        data_file: Encoded data file passed on to ``reconstruct``.
        MAXLEN: Length every sequence is padded (front) or truncated (end) to.
        pos_dict: Optional tag -> id mapping that is reused and extended in
            place. Pass the same dict for the train and the test file to give
            both the same tag ids. Without it each call numbers its tags
            independently, so ids are not comparable between calls.

    Returns:
        The array produced by ``pad_sequences``, one row per tweet.
    """
    if pos_dict is None:
        pos_dict = {}

    tagged_tweets = [nltk.pos_tag(entry)
                     for entry in reconstruct(index_file, data_file)]

    # Continue numbering after the highest id already present.
    next_id = max(pos_dict.values(), default=0) + 1
    for tagged in tagged_tweets:
        for _word, tag in tagged:
            if tag not in pos_dict:
                pos_dict[tag] = next_id
                next_id += 1

    decoded_pos = [[pos_dict[tag] for _word, tag in tagged]
                   for tagged in tagged_tweets]

    pos_sequences = pad_sequences(decoded_pos, maxlen=MAXLEN,padding="pre", truncating="post")

    return pos_sequences

In [9]:
def write_pos_file(filename, data):
    """Write POS sequences to ``filename`` as a header-less, index-less CSV.

    An existing file is never overwritten; a warning is printed instead.

    Args:
        filename: Destination path.
        data: Anything ``pandas.DataFrame`` accepts.
    """
    # Literal existence check; glob.glob would treat '[', '*', '?' as wildcards.
    if os.path.exists(filename):
        print('Warining: File '+str(filename)+' already exists.')
        return
    pd.DataFrame(data).to_csv(filename, header=None, index=None)

In [10]:
def blob_sentiment(data):
    """Score every token of every tweet with TextBlob's polarity.

    Args:
        data: Iterable of tweets, each an iterable of token strings.

    Returns:
        A list with one inner list per tweet, holding the value of
        ``TextBlob(token).sentiment.polarity`` for each token in order.
    """
    return [
        [TextBlob(token).sentiment.polarity for token in tokens]
        for tokens in data
    ]

In [11]:
def write_sentiment_file(filename, data, maxlength):
    """Pad per-token sentiment scores and store them as a header-less CSV.

    An existing file is never overwritten; a warning is printed instead.

    Args:
        filename: Destination path.
        data: List of per-tweet score lists.
        maxlength: Length every row is padded (front) or truncated (end) to.
    """
    # Literal existence check; glob.glob would treat '[', '*', '?' as wildcards.
    if os.path.exists(filename):
        print('Warining: File '+str(filename)+' already exists.')
        return
    # Padding only happens when the result is actually written.
    # float32 keeps the fractional scores.
    pad_data = pad_sequences(data, maxlen=maxlength, padding="pre",
                             truncating="post", dtype='float32')
    pd.DataFrame(pad_data).to_csv(filename, header=None, index=None)

# Load data

In [12]:
def read_data_samEval(FILENAME):
    """Read a tab-separated SemEval file into parallel label and tweet lists.

    Lines that start (case-insensitively) with "tweet index" are treated as
    the header and skipped, as are blank lines. For every other line the
    second column is parsed as the integer label and the third column is kept
    as the tweet text.

    Args:
        FILENAME: Path of the UTF-8 encoded input file.

    Returns:
        A two-element list ``[labels, corpus]``.

    Raises:
        IndexError: If a data line has fewer than three columns.
        ValueError: If a label is not an integer.
    """
    labels = []
    corpus = []
    with open(FILENAME, 'r', encoding='utf-8') as train: #Decoding utf-8, else exception
        for raw_line in train:
            stripped = raw_line.rstrip()
            # Skip the header and empty lines, e.g. a trailing newline at the end of the file.
            if not stripped or stripped.lower().startswith("tweet index"):
                continue
            columns = stripped.split("\t")
            labels.append(int(columns[1]))  # second column: label
            corpus.append(columns[2])       # third column: tweet text
    return [labels, corpus]

In [13]:
# Load both splits; each result is a [labels, corpus] pair.
resulttrain = read_data_samEval(TRAIN_SAMEVAL)
labelstrain_sameval, corpustrain_sameval = resulttrain

resulttest = read_data_samEval(GOLD_TEST_SAMEVAL)
labelstest_sameval, corpustest_sameval = resulttest

In [14]:
# Sanity check: every split must have as many labels as tweets.
print('Length train corpus: ', len(corpustrain_sameval), sep='')
print('Lenght train labels: ', len(labelstrain_sameval), sep='')
print('Lenght test corpus: ', len(corpustest_sameval), sep='')
print('lenght test labels: ', len(labelstest_sameval), sep='')

Length train corpus: 3834
Lenght train labels: 3834
Lenght test corpus: 784
lenght test labels: 784


# With lowercasing

In [15]:
# Output files of the lowercasing variant (the word index is written tab-separated despite the .csv suffix).
file_sameval_lower_train_data = ''.join((output, 'samEval_lower_train_data_preprocessed.csv'))
file_sameval_lower_train_labels = ''.join((output, 'samEval_lower_train_labels_preprocessed.csv'))
file_sameval_lower_test_data = ''.join((output, 'samEval_lower_test_data_preprocessed.csv'))
file_sameval_lower_test_labels = ''.join((output, 'samEval_lower_test_labels_preprocessed.csv'))
file_sameval_lower_word_index = ''.join((output, 'samEval_lower_word_index_preprocessed.csv'))

In [16]:
# Preprocessing variant: lowercasing
def text_to_wordlist_lower(tweet, vocab):
    """Clean, lowercase and tokenise one tweet and count its tokens in ``vocab``.

    Steps, in order: strip '#' characters; remove '@name' mentions together
    with any word characters directly in front of the '@'; replace http/https
    links by the placeholder 'URL'; lowercase everything (so the placeholder
    becomes 'url'); tokenise with ``TweetTokenizer``.

    Args:
        tweet: Raw tweet text.
        vocab: Counter that is updated in place with the resulting tokens.

    Returns:
        The list of tokens.
    """
    substitutions = (
        ('#', ''),
        (r'(\w+|^|)@\w+', ''),
        (r'((http|https)://)(\w|[.]|/)+', 'URL'),
    )
    text = tweet
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    tokens = TweetTokenizer().tokenize(text.lower())
    vocab.update(tokens)
    return tokens

def process_tweets_lower(list_sentences, vocab):
    """Apply ``text_to_wordlist_lower`` to every tweet.

    Args:
        list_sentences: Iterable of raw tweet strings.
        vocab: Counter passed to every call and updated in place.

    Returns:
        One token list per input tweet, in input order.
    """
    return [text_to_wordlist_lower(sentence, vocab) for sentence in list_sentences]

In [17]:
# Preprocess train and test tweets in one pass so they share a single vocabulary.
vocab_lower = Counter()
tweets_lower = process_tweets_lower(
    list_sentences=corpustrain_sameval + corpustest_sameval,
    vocab=vocab_lower,
)

In [18]:
# Encode the tokens as ids and split back into train and test matrices.
train_data_lower, test_data_lower, word_index_lower = create_train_test_wordindex(
    vocab=vocab_lower,
    tweets=tweets_lower,
    corpustrain=corpustrain_sameval,
)

Maximal Sequence Length: 40


In [19]:
# Persist matrices, labels and word index; files that already exist are left untouched.
write_file(filename=file_sameval_lower_train_data, data=train_data_lower)
write_file(filename=file_sameval_lower_train_labels, data=labelstrain_sameval)
write_file(filename=file_sameval_lower_test_data, data=test_data_lower)
write_file(filename=file_sameval_lower_test_labels, data=labelstest_sameval)
write_index(filename=file_sameval_lower_word_index, index=word_index_lower)

### create pos

##### If POS is created directly after creating the train and test data, then tweets_lower could be used instead of recreating the data

In [20]:
# Pad to the width of the encoded training matrix instead of a hard-coded 40.
sameval_lower_pos = create_pos_sequence(
    file_sameval_lower_word_index,
    file_sameval_lower_train_data,
    train_data_lower.shape[1],
)

In [21]:
# Store the POS sequences of the training split.
write_pos_file(filename=output+'samEval_lower_train_pos.csv', data=sameval_lower_pos)

In [22]:
# Pad to the width of the encoded test matrix instead of a hard-coded 40.
sameval_lower_test_pos = create_pos_sequence(
    file_sameval_lower_word_index,
    file_sameval_lower_test_data,
    test_data_lower.shape[1],
)

In [23]:
# Store the POS sequences of the test split.
write_pos_file(filename=output+'samEval_lower_test_pos.csv', data=sameval_lower_test_pos)

### create sentiment

##### If Sentiment is created directly after creating the train and test data, then tweets_lower could be used instead of recreating the data

In [24]:
# Rebuild the training tweets as token lists from the stored files.
rec_data_lower = reconstruct(
    index_file=file_sameval_lower_word_index,
    data_file=file_sameval_lower_train_data,
)

In [25]:
# One polarity score per token of every training tweet.
sentiment_blob_lower = blob_sentiment(data=rec_data_lower)

In [26]:
# Pad to the width of the encoded training matrix instead of a hard-coded 40.
write_sentiment_file(
    output+'samEval_lower_train_data_sentiment_blob.csv',
    sentiment_blob_lower,
    train_data_lower.shape[1],
)

In [27]:
# Sentiment pipeline for the test split; the pad width comes from the
# encoded test matrix instead of a hard-coded 40.
rec_data_test_lower = reconstruct(file_sameval_lower_word_index,file_sameval_lower_test_data)

sentiment_blob_lower_test = blob_sentiment(rec_data_test_lower)

write_sentiment_file(
    output+'samEval_lower_test_data_sentiment_blob.csv',
    sentiment_blob_lower_test,
    test_data_lower.shape[1],
)

## With stopword removal

In [15]:
# Output files of the stop-word-removal variant (the word index is written tab-separated despite the .csv suffix).
file_sameval_stopword_train_data = ''.join((output, 'sameval_stopword_train_data_preprocessed.csv'))
file_sameval_stopword_train_labels = ''.join((output, 'sameval_stopword_train_labels_preprocessed.csv'))
file_sameval_stopword_test_data = ''.join((output, 'sameval_stopword_test_data_preprocessed.csv'))
file_sameval_stopword_test_labels = ''.join((output, 'sameval_stopword_test_labels_preprocessed.csv'))
file_sameval_stopword_word_index = ''.join((output, 'sameval_stopword_word_index_preprocessed.csv'))

In [18]:
# Preprocess including stop word removal
def text_to_wordlist_stopword(tweet, vocab):
    """Clean and tokenise one tweet, drop English stop words, update ``vocab``.

    Steps, in order: strip '#' characters; remove '@name' mentions together
    with any word characters directly in front of the '@'; replace http/https
    links by the placeholder 'URL'; tokenise with ``TweetTokenizer``; remove
    tokens found in NLTK's English stop word list.

    Args:
        tweet: Raw tweet text.
        vocab: Counter that is updated in place with the kept tokens.

    Returns:
        The list of kept tokens.
    """
    #Remove hashtags
    tweet = re.sub('#','', tweet)

    #Remove usermentions
    tweet = re.sub(r'(\w+|^|)@\w+','', tweet)

    #Treats url's as special tokens (actually twitter specific)
    tweet = re.sub(r'((http|https)://)(\w|[.]|/)+', 'URL', tweet)

    #Tokenize
    tokens = TweetTokenizer().tokenize(tweet)

    #Remove stopwords
    # A frozenset gives constant-time membership tests instead of scanning a
    # list for every token.
    # NOTE(review): the comparison is case-sensitive and no lowercasing is
    # done in this variant, so capitalised stop words presumably survive -
    # confirm this is intended.
    stop_words = frozenset(stopwords.words('english'))
    wordsFiltered = [token for token in tokens if token not in stop_words]

    vocab.update(wordsFiltered)
    return wordsFiltered

def process_tweets_stopword(list_sentences, vocab):
    """Apply ``text_to_wordlist_stopword`` to every tweet.

    Args:
        list_sentences: Iterable of raw tweet strings.
        vocab: Counter passed to every call and updated in place.

    Returns:
        One token list per input tweet, in input order.
    """
    return [text_to_wordlist_stopword(sentence, vocab) for sentence in list_sentences]

In [21]:
# Preprocess train and test tweets in one pass so they share a single vocabulary.
vocab_stopword = Counter()
tweets_stopword = process_tweets_stopword(
    list_sentences=corpustrain_sameval + corpustest_sameval,
    vocab=vocab_stopword,
)

In [24]:
# Encode the tokens as ids and split back into train and test matrices.
train_data_stopword, test_data_stopword, word_index_stopword = create_train_test_wordindex(
    vocab=vocab_stopword,
    tweets=tweets_stopword,
    corpustrain=corpustrain_sameval,
)

Maximal Sequence Length: 33


In [25]:
# Persist matrices, labels and word index; files that already exist are left untouched.
write_file(filename=file_sameval_stopword_train_data, data=train_data_stopword)
write_file(filename=file_sameval_stopword_train_labels, data=labelstrain_sameval)
write_file(filename=file_sameval_stopword_test_data, data=test_data_stopword)
write_file(filename=file_sameval_stopword_test_labels, data=labelstest_sameval)
write_index(filename=file_sameval_stopword_word_index, index=word_index_stopword)

### create pos

##### If POS is created directly after creating the train and test data, then tweets_stopword could be used instead of recreating the data

In [34]:
# Pad to the width of the encoded training matrix instead of a hard-coded 33.
sameval_stopword_pos = create_pos_sequence(
    file_sameval_stopword_word_index,
    file_sameval_stopword_train_data,
    train_data_stopword.shape[1],
)

In [35]:
# Store the POS sequences of the training split.
write_pos_file(filename=output+'samEval_stopword_train_pos.csv', data=sameval_stopword_pos)

In [36]:
# Pad to the width of the encoded test matrix instead of a hard-coded 33.
sameval_stopword_test_pos = create_pos_sequence(
    file_sameval_stopword_word_index,
    file_sameval_stopword_test_data,
    test_data_stopword.shape[1],
)

In [37]:
# Store the POS sequences of the test split.
write_pos_file(filename=output+'samEval_stopword_test_pos.csv', data=sameval_stopword_test_pos)

### create sentiment

##### If Sentiment is created directly after creating the train and test data, then tweets_stopword could be used instead of recreating the data

In [38]:
# Rebuild the training tweets as token lists from the stored files.
rec_data_stopword = reconstruct(
    index_file=file_sameval_stopword_word_index,
    data_file=file_sameval_stopword_train_data,
)

In [39]:
# One polarity score per token of every training tweet.
sentiment_blob_stopword = blob_sentiment(data=rec_data_stopword)

In [40]:
# Pad to the width of the encoded training matrix instead of a hard-coded 33.
write_sentiment_file(
    output+'samEval_stopword_train_data_sentiment_blob.csv',
    sentiment_blob_stopword,
    train_data_stopword.shape[1],
)

In [41]:
# Sentiment pipeline for the test split; the pad width comes from the
# encoded test matrix instead of a hard-coded 33.
rec_data_test_stopword = reconstruct(file_sameval_stopword_word_index,file_sameval_stopword_test_data)

sentiment_blob_stopword_test = blob_sentiment(rec_data_test_stopword)

write_sentiment_file(
    output+'samEval_stopword_test_data_sentiment_blob.csv',
    sentiment_blob_stopword_test,
    test_data_stopword.shape[1],
)

## Remove punctuation

In [15]:
# Output files of the punctuation-removal variant (the word index is written tab-separated despite the .csv suffix).
file_sameval_punctuation_train_data = ''.join((output, 'sameval_punctuation_train_data_preprocessed.csv'))
file_sameval_punctuation_train_labels = ''.join((output, 'sameval_punctuation_train_labels_preprocessed.csv'))
file_sameval_punctuation_test_data = ''.join((output, 'sameval_punctuation_test_data_preprocessed.csv'))
file_sameval_punctuation_test_labels = ''.join((output, 'sameval_punctuation_test_labels_preprocessed.csv'))
file_sameval_punctuation_word_index = ''.join((output, 'sameval_punctuation_word_index_preprocessed.csv'))

In [24]:
# Preprocess including punctuation removal
import string

def text_to_wordlist_punctuation(tweet, vocab):
    """Clean one tweet, strip punctuation, tokenise it and update ``vocab``.

    Steps, in order: strip '#' characters; remove '@name' mentions together
    with any word characters directly in front of the '@'; replace http/https
    links by the placeholder 'URL'; delete every character contained in
    ``string.punctuation``; tokenise with ``TweetTokenizer``.

    Args:
        tweet: Raw tweet text.
        vocab: Counter that is updated in place with the resulting tokens.

    Returns:
        The list of tokens.
    """
    #Remove hashtags
    tweet = re.sub('#','', tweet)

    #Remove usermentions
    tweet = re.sub(r'(\w+|^|)@\w+','', tweet)

    #Treats url's as special tokens (actually twitter specific)
    tweet = re.sub(r'((http|https)://)(\w|[.]|/)+', 'URL', tweet)

    #Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))

    #Tokenize
    tokens = TweetTokenizer().tokenize(tweet)

    # The vocabulary must be updated with the token list. Updating a Counter
    # with the untokenised string counts single characters, which leaves
    # almost every token without an entry in the word index.
    vocab.update(tokens)

    return tokens

def process_tweets_punctuation(list_sentences, vocab):
    """Apply ``text_to_wordlist_punctuation`` to every tweet.

    Args:
        list_sentences: Iterable of raw tweet strings.
        vocab: Counter passed to every call and updated in place.

    Returns:
        One token list per input tweet, in input order.
    """
    return [text_to_wordlist_punctuation(sentence, vocab) for sentence in list_sentences]

In [25]:
# Preprocess train and test tweets in one pass so they share a single vocabulary.
vocab_punctuation = Counter()
tweets_punctuation = process_tweets_punctuation(
    list_sentences=corpustrain_sameval + corpustest_sameval,
    vocab=vocab_punctuation,
)

In [27]:
# Encode the tokens as ids and split back into train and test matrices.
train_data_punctuation, test_data_punctuation, word_index_punctuation = create_train_test_wordindex(
    vocab=vocab_punctuation,
    tweets=tweets_punctuation,
    corpustrain=corpustrain_sameval,
)

Maximal Sequence Length: 32


In [28]:
# Persist matrices, labels and word index; files that already exist are left untouched.
write_file(filename=file_sameval_punctuation_train_data, data=train_data_punctuation)
write_file(filename=file_sameval_punctuation_train_labels, data=labelstrain_sameval)
write_file(filename=file_sameval_punctuation_test_data, data=test_data_punctuation)
write_file(filename=file_sameval_punctuation_test_labels, data=labelstest_sameval)
write_index(filename=file_sameval_punctuation_word_index, index=word_index_punctuation)

### create pos

##### If POS is created directly after creating the train and test data, then tweets_punctuation could be used instead of recreating the data

In [30]:
# POS sequences for both splits; the pad width comes from the encoded
# matrices instead of a hard-coded 32.
sameval_punctuation_pos = create_pos_sequence(
    file_sameval_punctuation_word_index,
    file_sameval_punctuation_train_data,
    train_data_punctuation.shape[1],
)
write_pos_file(output+'samEval_punctuation_train_pos.csv', sameval_punctuation_pos)

sameval_punctuation_test_pos = create_pos_sequence(
    file_sameval_punctuation_word_index,
    file_sameval_punctuation_test_data,
    test_data_punctuation.shape[1],
)
write_pos_file(output+'samEval_punctuation_test_pos.csv', sameval_punctuation_test_pos)

### create sentiment

##### If Sentiment is created directly after creating the train and test data, then tweets_punctuation could be used instead of recreating the data

In [32]:
# Sentiment scores for both splits; the pad width comes from the encoded
# matrices instead of a hard-coded 32.
rec_data_punctuation = reconstruct(file_sameval_punctuation_word_index, file_sameval_punctuation_train_data)
sentiment_blob_punctuation = blob_sentiment(rec_data_punctuation)
write_sentiment_file(
    output+'samEval_punctuation_train_data_sentiment_blob.csv',
    sentiment_blob_punctuation,
    train_data_punctuation.shape[1],
)

rec_data_test_punctuation = reconstruct(file_sameval_punctuation_word_index,file_sameval_punctuation_test_data)
sentiment_blob_punctuation_test = blob_sentiment(rec_data_test_punctuation)
write_sentiment_file(
    output+'samEval_punctuation_test_data_sentiment_blob.csv',
    sentiment_blob_punctuation_test,
    test_data_punctuation.shape[1],
)

## Lemmatization

In [18]:
# Output files of the lemmatization variant (the word index is written tab-separated despite the .csv suffix).
file_sameval_lemma_train_data = ''.join((output, 'sameval_lemma_train_data_preprocessed.csv'))
file_sameval_lemma_train_labels = ''.join((output, 'sameval_lemma_train_labels_preprocessed.csv'))
file_sameval_lemma_test_data = ''.join((output, 'sameval_lemma_test_data_preprocessed.csv'))
file_sameval_lemma_test_labels = ''.join((output, 'sameval_lemma_test_labels_preprocessed.csv'))
file_sameval_lemma_word_index = ''.join((output, 'sameval_lemma_word_index_preprocessed.csv'))

In [44]:
# Preprocess including lemmatization
import string
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``. The upper-cased
    first letter of the tag selects adjective (J), noun (N), verb (V) or
    adverb (R); any other tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    first_letter = tagged[0][1][0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV
    # "N" and every unmapped tag are treated as nouns.
    return wordnet.NOUN

def text_to_wordlist_lemma(tweet, vocab):
    """Clean, tokenise and lemmatise one tweet and count its lemmas in ``vocab``.

    Steps, in order: strip '#' characters; remove '@name' mentions together
    with any word characters directly in front of the '@'; replace http/https
    links by the placeholder 'URL'; tokenise with ``TweetTokenizer``;
    lemmatise each token with ``WordNetLemmatizer`` using the POS returned by
    ``get_wordnet_pos``.

    Args:
        tweet: Raw tweet text.
        vocab: Counter that is updated in place with the resulting lemmas.

    Returns:
        The list of lemmas.
    """
    #Remove hashtags
    tweet = re.sub('#','', tweet)

    #Remove usermentions
    tweet = re.sub(r'(\w+|^|)@\w+','', tweet)

    #Treats url's as special tokens (actually twitter specific)
    tweet = re.sub(r'((http|https)://)(\w|[.]|/)+', 'URL', tweet)

    #Tokenize
    tokens = TweetTokenizer().tokenize(tweet)

    #Lemmatization
    lemmatizer = WordNetLemmatizer()
    tweet_lemma = [lemmatizer.lemmatize(token, get_wordnet_pos(token))
                   for token in tokens]

    # Without this update the vocabulary stays empty, the word index built
    # from it is empty and every lemma is encoded as 0.
    vocab.update(tweet_lemma)

    return tweet_lemma

def process_tweets_lemma(list_sentences, vocab):
    """Apply ``text_to_wordlist_lemma`` to every tweet.

    Args:
        list_sentences: Iterable of raw tweet strings.
        vocab: Counter passed on to every call.

    Returns:
        One lemma list per input tweet, in input order.
    """
    return [text_to_wordlist_lemma(sentence, vocab) for sentence in list_sentences]

In [45]:
# Preprocess train and test tweets in one pass so they share a single vocabulary.
vocab_lemma = Counter()
tweets_lemma = process_tweets_lemma(
    list_sentences=corpustrain_sameval + corpustest_sameval,
    vocab=vocab_lemma,
)

In [54]:
# Encode the lemmas as ids and split back into train and test matrices.
train_data_lemma, test_data_lemma, word_index_lemma = create_train_test_wordindex(
    vocab=vocab_lemma,
    tweets=tweets_lemma,
    corpustrain=corpustrain_sameval,
)

Maximal Sequence Length: 40


In [55]:
# Persist matrices, labels and word index; files that already exist are left untouched.
write_file(filename=file_sameval_lemma_train_data, data=train_data_lemma)
write_file(filename=file_sameval_lemma_train_labels, data=labelstrain_sameval)
write_file(filename=file_sameval_lemma_test_data, data=test_data_lemma)
write_file(filename=file_sameval_lemma_test_labels, data=labelstest_sameval)
write_index(filename=file_sameval_lemma_word_index, index=word_index_lemma)

### create pos

##### If POS is created directly after creating the train and test data, then tweets_lemma could be used instead of recreating the data

In [58]:
# POS sequences for both splits; the pad width comes from the encoded
# matrices instead of a hard-coded 40.
sameval_lemma_pos = create_pos_sequence(
    file_sameval_lemma_word_index,
    file_sameval_lemma_train_data,
    train_data_lemma.shape[1],
)
write_pos_file(output+'samEval_lemma_train_pos.csv', sameval_lemma_pos)

sameval_lemma_test_pos = create_pos_sequence(
    file_sameval_lemma_word_index,
    file_sameval_lemma_test_data,
    test_data_lemma.shape[1],
)
write_pos_file(output+'samEval_lemma_test_pos.csv', sameval_lemma_test_pos)

### create sentiment

##### If Sentiment is created directly after creating the train and test data, then tweets_lemma could be used instead of recreating the data

In [59]:
# Sentiment scores for both splits; the pad width comes from the encoded
# matrices instead of a hard-coded 40.
rec_data_lemma = reconstruct(file_sameval_lemma_word_index, file_sameval_lemma_train_data)
sentiment_blob_lemma = blob_sentiment(rec_data_lemma)
write_sentiment_file(
    output+'samEval_lemma_train_data_sentiment_blob.csv',
    sentiment_blob_lemma,
    train_data_lemma.shape[1],
)

rec_data_test_lemma = reconstruct(file_sameval_lemma_word_index,file_sameval_lemma_test_data)
sentiment_blob_lemma_test = blob_sentiment(rec_data_test_lemma)
write_sentiment_file(
    output+'samEval_lemma_test_data_sentiment_blob.csv',
    sentiment_blob_lemma_test,
    test_data_lemma.shape[1],
)

## Combination best preprocessing

In [15]:
# Output files of the combined variant (the word index is written tab-separated despite the .csv suffix).
file_sameval_bestpp_train_data = ''.join((output, 'sameval_bestpp_train_data_preprocessed.csv'))
file_sameval_bestpp_train_labels = ''.join((output, 'sameval_bestpp_train_labels_preprocessed.csv'))
file_sameval_bestpp_test_data = ''.join((output, 'sameval_bestpp_test_data_preprocessed.csv'))
file_sameval_bestpp_test_labels = ''.join((output, 'sameval_bestpp_test_labels_preprocessed.csv'))
file_sameval_bestpp_word_index = ''.join((output, 'sameval_bestpp_word_index_preprocessed.csv'))

In [16]:
# Preprocess with the combination of lowercasing and stop word removal
def text_to_wordlist_bestpp(tweet, vocab):
    """Clean, lowercase and tokenise one tweet, drop stop words, update ``vocab``.

    Steps, in order: strip '#' characters; remove '@name' mentions together
    with any word characters directly in front of the '@'; replace http/https
    links by the placeholder 'URL'; lowercase everything (so the placeholder
    becomes 'url'); tokenise with ``TweetTokenizer``; remove tokens found in
    NLTK's English stop word list.

    Args:
        tweet: Raw tweet text.
        vocab: Counter that is updated in place with the kept tokens.

    Returns:
        The list of kept tokens.
    """
    #Remove hashtags
    tweet = re.sub('#','', tweet)

    #Remove usermentions
    tweet = re.sub(r'(\w+|^|)@\w+','', tweet)

    #Treats url's as special tokens (actually twitter specific)
    tweet = re.sub(r'((http|https)://)(\w|[.]|/)+', 'URL', tweet)

    #Lowercasing
    tweet = tweet.lower()

    #Tokenize
    tokens = TweetTokenizer().tokenize(tweet)

    #Remove stopwords
    # A frozenset gives constant-time membership tests instead of scanning a
    # list for every token.
    stop_words = frozenset(stopwords.words('english'))
    wordsFiltered = [token for token in tokens if token not in stop_words]

    vocab.update(wordsFiltered)
    return wordsFiltered

def process_tweets_bestpp(list_sentences, vocab):
    """Apply ``text_to_wordlist_bestpp`` to every tweet.

    Args:
        list_sentences: Iterable of raw tweet strings.
        vocab: Counter passed to every call and updated in place.

    Returns:
        One token list per input tweet, in input order.
    """
    return [text_to_wordlist_bestpp(sentence, vocab) for sentence in list_sentences]

In [17]:
# Preprocess train and test tweets in one pass so they share a single vocabulary.
vocab_bestpp = Counter()
tweets_bestpp = process_tweets_bestpp(
    list_sentences=corpustrain_sameval + corpustest_sameval,
    vocab=vocab_bestpp,
)

In [19]:
# Encode the tokens as ids and split back into train and test matrices.
train_data_bestpp, test_data_bestpp, word_index_bestpp = create_train_test_wordindex(
    vocab=vocab_bestpp,
    tweets=tweets_bestpp,
    corpustrain=corpustrain_sameval,
)

Maximal Sequence Length: 33


In [20]:
# Persist matrices, labels and word index; files that already exist are left untouched.
write_file(filename=file_sameval_bestpp_train_data, data=train_data_bestpp)
write_file(filename=file_sameval_bestpp_train_labels, data=labelstrain_sameval)
write_file(filename=file_sameval_bestpp_test_data, data=test_data_bestpp)
write_file(filename=file_sameval_bestpp_test_labels, data=labelstest_sameval)
write_index(filename=file_sameval_bestpp_word_index, index=word_index_bestpp)

### create pos

##### If POS is created directly after creating the train and test data, then tweets_bestpp could be used instead of recreating the data

In [21]:
# POS sequences for both splits; the pad width comes from the encoded
# matrices instead of a hard-coded 33.
sameval_bestpp_pos = create_pos_sequence(
    file_sameval_bestpp_word_index,
    file_sameval_bestpp_train_data,
    train_data_bestpp.shape[1],
)
write_pos_file(output+'samEval_bestpp_train_pos.csv', sameval_bestpp_pos)

sameval_bestpp_test_pos = create_pos_sequence(
    file_sameval_bestpp_word_index,
    file_sameval_bestpp_test_data,
    test_data_bestpp.shape[1],
)
write_pos_file(output+'samEval_bestpp_test_pos.csv', sameval_bestpp_test_pos)

### create sentiment

##### If Sentiment is created directly after creating the train and test data, then tweets_bestpp could be used instead of recreating the data

In [22]:
# Sentiment scores for both splits; the pad width comes from the encoded
# matrices instead of a hard-coded 33.
rec_data_bestpp = reconstruct(file_sameval_bestpp_word_index, file_sameval_bestpp_train_data)
sentiment_blob_bestpp = blob_sentiment(rec_data_bestpp)
write_sentiment_file(
    output+'samEval_bestpp_train_data_sentiment_blob.csv',
    sentiment_blob_bestpp,
    train_data_bestpp.shape[1],
)

rec_data_test_bestpp = reconstruct(file_sameval_bestpp_word_index,file_sameval_bestpp_test_data)
sentiment_blob_bestpp_test = blob_sentiment(rec_data_test_bestpp)
write_sentiment_file(
    output+'samEval_bestpp_test_data_sentiment_blob.csv',
    sentiment_blob_bestpp_test,
    test_data_bestpp.shape[1],
)

## With username

In [15]:
# Output files of the user-name-token variant (the word index is written tab-separated despite the .csv suffix).
file_sameval_username_train_data = ''.join((output, 'sameval_username_train_data_preprocessed.csv'))
file_sameval_username_train_labels = ''.join((output, 'sameval_username_train_labels_preprocessed.csv'))
file_sameval_username_test_data = ''.join((output, 'sameval_username_test_data_preprocessed.csv'))
file_sameval_username_test_labels = ''.join((output, 'sameval_username_test_labels_preprocessed.csv'))
file_sameval_username_word_index = ''.join((output, 'sameval_username_word_index_preprocessed.csv'))

In [16]:
# Preprocessing variant: user mentions are kept as a special token
def text_to_wordlist_username(tweet, vocab):
    """Clean and tokenise one tweet, keeping mentions as 'USERNAME'.

    Steps, in order: strip '#' characters; replace '@name' mentions, together
    with any word characters directly in front of the '@', by the placeholder
    'USERNAME'; replace http/https links by the placeholder 'URL'; tokenise
    with ``TweetTokenizer``. No lowercasing and no stop word removal is done.

    Args:
        tweet: Raw tweet text.
        vocab: Counter that is updated in place with the resulting tokens.

    Returns:
        The list of tokens.
    """
    rewrites = (
        ('#', ''),
        (r'(\w+|^|)@\w+', 'USERNAME'),
        (r'((http|https)://)(\w|[.]|/)+', 'URL'),
    )
    text = tweet
    for pattern, replacement in rewrites:
        text = re.sub(pattern, replacement, text)

    tokens = TweetTokenizer().tokenize(text)
    vocab.update(tokens)
    return tokens

def process_tweets_username(list_sentences, vocab):
    """Apply ``text_to_wordlist_username`` to every tweet.

    Args:
        list_sentences: Iterable of raw tweet strings.
        vocab: Counter passed to every call and updated in place.

    Returns:
        One token list per input tweet, in input order.
    """
    return [text_to_wordlist_username(sentence, vocab) for sentence in list_sentences]

In [17]:
# Preprocess train and test tweets in one pass so they share a single vocabulary.
vocab_username = Counter()
tweets_username = process_tweets_username(
    list_sentences=corpustrain_sameval + corpustest_sameval,
    vocab=vocab_username,
)

In [18]:
# Encode the tokens as ids and split back into train and test matrices.
train_data_username, test_data_username, word_index_username = create_train_test_wordindex(
    vocab=vocab_username,
    tweets=tweets_username,
    corpustrain=corpustrain_sameval,
)

Maximal Sequence Length: 40


In [20]:
# Persist matrices, labels and word index; files that already exist are left untouched.
write_file(filename=file_sameval_username_train_data, data=train_data_username)
write_file(filename=file_sameval_username_train_labels, data=labelstrain_sameval)
write_file(filename=file_sameval_username_test_data, data=test_data_username)
write_file(filename=file_sameval_username_test_labels, data=labelstest_sameval)
write_index(filename=file_sameval_username_word_index, index=word_index_username)

### create pos

##### If POS is created directly after creating the train and test data, then tweets_username could be used instead of recreating the data

In [23]:
# POS sequences for both splits; the pad width comes from the encoded
# matrices instead of a hard-coded 40.
sameval_username_pos = create_pos_sequence(
    file_sameval_username_word_index,
    file_sameval_username_train_data,
    train_data_username.shape[1],
)
write_pos_file(output+'samEval_username_train_pos.csv', sameval_username_pos)

sameval_username_test_pos = create_pos_sequence(
    file_sameval_username_word_index,
    file_sameval_username_test_data,
    test_data_username.shape[1],
)
write_pos_file(output+'samEval_username_test_pos.csv', sameval_username_test_pos)

### create sentiment

##### If Sentiment is created directly after creating the train and test data, then tweets_username could be used instead of recreating the data

In [24]:
# Sentiment scores for both splits; the pad width comes from the encoded
# matrices instead of a hard-coded 40.
rec_data_username = reconstruct(file_sameval_username_word_index, file_sameval_username_train_data)
sentiment_blob_username = blob_sentiment(rec_data_username)
write_sentiment_file(
    output+'samEval_username_train_data_sentiment_blob.csv',
    sentiment_blob_username,
    train_data_username.shape[1],
)

rec_data_test_username = reconstruct(file_sameval_username_word_index,file_sameval_username_test_data)
sentiment_blob_username_test = blob_sentiment(rec_data_test_username)
write_sentiment_file(
    output+'samEval_username_test_data_sentiment_blob.csv',
    sentiment_blob_username_test,
    test_data_username.shape[1],
)