In [1]:
import glob
import os
import random
from collections import Counter

import numpy as np
import pandas as pd
import regex as re
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from nltk.tokenize import TweetTokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
path = 'data/'
output= 'output_trainTestData/'

In [3]:
#Preprocessing
def text_to_wordlist(tweet, vocab):
    """Clean one raw tweet, tokenize it and register its tokens in ``vocab``.

    Two substitutions are applied before tokenizing:

    * user mentions (``@name`` together with any word characters written
      directly in front of the ``@``) are deleted;
    * ``http://`` and ``https://`` links are replaced by the literal ``URL``.

    No hashtag stripping is performed here.

    Args:
        tweet: The raw tweet text.
        vocab: Object whose ``update`` method is called with the token list
            (the callers in this notebook pass a ``collections.Counter``).

    Returns:
        The list of tokens produced by ``TweetTokenizer``.
    """
    # Drop user mentions first so that they never reach the tokenizer.
    without_mentions = re.sub(r'(\w+|^|)@\w+', '', tweet)

    # Collapse every link into one special token (Twitter specific).
    with_url_token = re.sub(r'((http|https)://)(\w|[.]|/)+', 'URL', without_mentions)

    tokens = TweetTokenizer().tokenize(with_url_token)
    vocab.update(tokens)
    return tokens

def process_tweets(list_sentences, vocab):
    """Tokenize every text in ``list_sentences`` with ``text_to_wordlist``.

    Args:
        list_sentences: Iterable of raw tweet strings.
        vocab: Passed unchanged to ``text_to_wordlist`` for every tweet, so it
            accumulates the tokens of the whole collection.

    Returns:
        A list with one token list per input text, in input order.
    """
    return [text_to_wordlist(sentence, vocab) for sentence in list_sentences]

In [4]:
#Create train data, test data and word_index
def create_train_test_wordindex(vocab, tweets, corpustrain):
    """Turn tokenized tweets into padded id matrices plus the word index.

    Args:
        vocab: ``Counter`` of tokens; its frequency ranking defines the ids.
        tweets: List of token lists. The first ``len(corpustrain)`` entries
            are treated as training tweets, the remainder as test tweets.
        corpustrain: The training corpus; only its length is used.

    Returns:
        ``(train_data, test_data, word_index)`` where the two data objects are
        the results of ``pad_sequences`` and ``word_index`` maps each token to
        its 1-based frequency rank.
    """
    vocab_size = len(vocab)

    # Every sequence is padded to the length of the longest tweet.
    longest = 0
    for tokens in tweets:
        longest = max(longest, len(tokens))
    print('Maximal Sequence Length: '+str(longest))

    # Ids start at 1 (most frequent token); 0 is what unknown tokens map to.
    word_index = {}
    for rank, (token, _count) in enumerate(vocab.most_common(vocab_size), start=1):
        word_index[token] = rank

    def encode(tokenized_tweets):
        """Map every token of every tweet to its id (0 if it has none)."""
        return [[word_index.get(token, 0) for token in tokens]
                for tokens in tokenized_tweets]

    n_train = len(corpustrain)
    train_sequences = encode(tweets[:n_train])
    test_sequences = encode(tweets[n_train:])

    train_data = pad_sequences(train_sequences, maxlen=longest,
                               padding="pre", truncating="post")
    test_data = pad_sequences(test_sequences, maxlen=longest,
                              padding="pre", truncating="post")

    return train_data, test_data, word_index

In [5]:
#Store data, labels and word_index for reproducability
def write_file(filename, data):
    """Store ``data`` as a CSV file without header and index, never overwriting.

    If ``filename`` already exists nothing is written and a warning is printed
    instead, so that a stored data set is not replaced by accident.

    Args:
        filename: Path of the CSV file to create.
        data: Anything ``pandas.DataFrame`` accepts (lists and arrays are
            passed in this notebook).
    """
    # os.path.exists checks the literal path. The former glob.glob() check
    # interpreted characters such as [ ] * ? as a pattern, so a file whose
    # name contains them was not recognised as existing and got overwritten.
    if os.path.exists(filename):
        print('Warining: File '+str(filename)+' already exists.')
        return
    pd.DataFrame(data).to_csv(filename, header=None, index=None)
        
def write_index(filename, index):
    """Store a mapping as ``key<TAB>value`` lines in a UTF-8 file, never overwriting.

    The lines are separated by a newline and the file has no trailing newline.
    If ``filename`` already exists nothing is written and a warning is printed.

    Args:
        filename: Path of the file to create.
        index: Mapping whose items are written in iteration order; keys and
            values are converted with ``str``.
    """
    # Literal existence check; glob.glob() would treat [ ] * ? in the name as
    # a pattern and miss an existing file.
    if os.path.exists(filename):
        print('Warining: File '+str(filename)+' already exists.')
        return
    rows = [str(key)+'\t'+str(value) for key, value in index.items()]
    # The context manager closes the file even if writing fails.
    with open(filename, 'w', encoding="utf-8") as f:
        f.write('\n'.join(rows))
    

## SemEval2018

In [8]:
def read_data_samEval(FILENAME):
    """Read a tab-separated SemEval file into a label list and a tweet list.

    Lines starting with ``tweet index`` (case-insensitive) are treated as the
    header and skipped, as are blank lines. On every other line the field at
    index 1 is the label and the field at index 2 is the tweet text.

    Args:
        FILENAME: Path of the UTF-8 encoded input file.

    Returns:
        A two-element list ``[labels, corpus]`` with the labels as ``int`` and
        the tweets as ``str``, both in file order.

    Raises:
        ValueError: If a label field is not an integer.
        IndexError: If a non-blank data line has fewer than three fields.
    """
    labels = []
    corpus = []
    # The file is decoded as utf-8 explicitly; the original author notes that
    # reading it otherwise raised an exception.
    with open(FILENAME, 'r', encoding='utf-8') as train:
        for line in train:
            if line.lower().startswith("tweet index"): #skip header
                continue
            stripped = line.rstrip()
            if not stripped:
                # A blank (e.g. trailing) line has no fields to index into.
                continue
            fields = stripped.split("\t")
            labels.append(int(fields[1])) #second column - label
            corpus.append(fields[2]) #third column - tweet
    return [labels, corpus]

In [5]:
TRAIN_SAMEVAL = path + 'SemEval2018-T3-train-taskA_emoji.txt'
GOLD_TEST_SAMEVAL  = path + 'SemEval2018-T3_gold_test_taskA_emoji.txt'

file_sameval_train_data = output + 'SamEval_train_data.csv'
file_sameval_train_labels = output + 'SamEval_train_labels.csv'
file_sameval_test_data = output + 'SamEval_test_data.csv'
file_sameval_test_labels = output + 'SamEval_test_labels.csv'
file_sameval_word_index = output + 'SamEval_word_index.csv'

In [6]:
resulttrain = read_data_samEval(TRAIN_SAMEVAL)
labelstrain_sameval = resulttrain[0]
corpustrain_sameval = resulttrain[1]

resulttest = read_data_samEval(GOLD_TEST_SAMEVAL)
labelstest_sameval = resulttest[0]
corpustest_sameval = resulttest[1]

In [7]:
print('Length train corpus: '+str(len(corpustrain_sameval)))
print('Lenght train labels: '+str(len(labelstrain_sameval)))
print('Lenght test corpus: '+str(len(corpustest_sameval)))
print('lenght test labels: '+str(len(labelstest_sameval)))

Length train corpus: 3834
Lenght train labels: 3834
Lenght test corpus: 784
lenght test labels: 784


In [30]:
print('Number labels train data: '+str(Counter(labelstrain_sameval)))
print('Number labels test data: '+str(Counter(labelstest_sameval)))

Number labels train data: Counter({0: 1923, 1: 1911})
Number labels test data: Counter({0: 473, 1: 311})


In [10]:
#Prepocess the data
vocab_sameval = Counter()
tweets_sameval = process_tweets(corpustrain_sameval + corpustest_sameval, vocab_sameval)

In [12]:
train_data_sameval, test_data_sameval, word_index_sameval = create_train_test_wordindex(vocab_sameval, tweets_sameval, corpustrain_sameval)

Maximal Sequence Length: 40


In [13]:
write_file(file_sameval_train_data, train_data_sameval)
write_file(file_sameval_train_labels, labelstrain_sameval)
write_file(file_sameval_test_data, test_data_sameval)
write_file(file_sameval_test_labels, labelstest_sameval)
write_index(file_sameval_word_index, word_index_sameval)

### Task B

In [10]:
TRAIN_SAMEVAL_B = path + 'SemEval2018-T3-train-taskB_emoji_ironyHashtags.txt'
GOLD_TEST_SAMEVAL_B = path + 'SemEval2018-T3_gold_test_taskB_emoji.txt'

file_sameval_b_train_data = output + 'SamEval_B_train_data.csv'
file_sameval_b_train_labels = output + 'SamEval_B_train_labels.csv'
file_sameval_b_test_data = output + 'SamEval_B_test_data.csv'
file_sameval_b_test_labels = output + 'SamEval_B_test_labels.csv'
file_sameval_b_word_index = output + 'SamEval_B_word_index.csv'

In [11]:
resulttrain_b = read_data_samEval(TRAIN_SAMEVAL_B)
labelstrain_sameval_b = resulttrain_b[0]
corpustrain_sameval_b = resulttrain_b[1]

resulttest_b = read_data_samEval(GOLD_TEST_SAMEVAL_B)
labelstest_sameval_b = resulttest_b[0]
corpustest_sameval_b = resulttest_b[1]

In [12]:
print('Data overall: '+str(len(corpustrain_sameval_b+corpustest_sameval_b)))
print('Length train corpus: '+str(len(corpustrain_sameval_b)))
print('Lenght train labels: '+str(len(labelstrain_sameval_b)))
print('Lenght test corpus: '+str(len(corpustest_sameval_b)))
print('lenght test labels: '+str(len(labelstest_sameval_b)))

Data overall: 4618
Length train corpus: 3834
Lenght train labels: 3834
Lenght test corpus: 784
lenght test labels: 784


In [13]:
print('Number labels train data: '+str(Counter(labelstrain_sameval_b)))
print('Number labels test data: '+str(Counter(labelstest_sameval_b)))

Number labels train data: Counter({0: 1923, 1: 1390, 2: 316, 3: 205})
Number labels test data: Counter({0: 473, 1: 164, 2: 85, 3: 62})


In [14]:
#Prepocess the data
vocab_sameval_b = Counter()
tweets_sameval_b = process_tweets(corpustrain_sameval_b + corpustest_sameval_b, vocab_sameval_b)

In [15]:
train_data_sameval_b, test_data_sameval_b, word_index_sameval_b = create_train_test_wordindex(vocab_sameval_b, tweets_sameval_b, corpustrain_sameval_b)

Maximal Sequence Length: 40


In [29]:
write_file(file_sameval_b_train_data, train_data_sameval_b)
write_file(file_sameval_b_train_labels, labelstrain_sameval_b)
write_file(file_sameval_b_test_data, test_data_sameval_b)
write_file(file_sameval_b_test_labels, labelstest_sameval_b)
write_index(file_sameval_b_word_index, word_index_sameval_b)

## with #irony

In [8]:
TRAIN_SAMEVAL = path + 'SemEval2018-T3-train-taskA_emoji_ironyHashtags.txt'
GOLD_TEST_SAMEVAL  = path + 'SemEval2018-T3_gold_test_taskA_emoji.txt'

file_sameval_train_data = output + 'SamEval_train_data_hash.csv'
file_sameval_train_labels = output + 'SamEval_train_labels_hash.csv'
file_sameval_test_data = output + 'SamEval_test_data_hash.csv'
file_sameval_test_labels = output + 'SamEval_test_labels_hash.csv'
file_sameval_word_index = output + 'SamEval_word_index_hash.csv'

In [9]:
resulttrain = read_data_samEval(TRAIN_SAMEVAL)
labelstrain_sameval = resulttrain[0]
corpustrain_sameval = resulttrain[1]

resulttest = read_data_samEval(GOLD_TEST_SAMEVAL)
labelstest_sameval = resulttest[0]
corpustest_sameval = resulttest[1]

In [11]:
print('Length train corpus: '+str(len(corpustrain_sameval)))
print('Lenght train labels: '+str(len(labelstrain_sameval)))
print('Lenght test corpus: '+str(len(corpustest_sameval)))
print('lenght test labels: '+str(len(labelstest_sameval)))

Length train corpus: 3834
Lenght train labels: 3834
Lenght test corpus: 784
lenght test labels: 784


In [12]:
print('Number labels train data: '+str(Counter(labelstrain_sameval)))
print('Number labels test data: '+str(Counter(labelstest_sameval)))

Number labels train data: Counter({0: 1923, 1: 1911})
Number labels test data: Counter({0: 473, 1: 311})


In [13]:
#Prepocess the data
vocab_sameval = Counter()
tweets_sameval = process_tweets(corpustrain_sameval + corpustest_sameval, vocab_sameval)

In [15]:
train_data_sameval, test_data_sameval, word_index_sameval = create_train_test_wordindex(vocab_sameval, tweets_sameval, corpustrain_sameval)

Maximal Sequence Length: 40


In [18]:
write_file(file_sameval_train_data, train_data_sameval)
write_file(file_sameval_train_labels, labelstrain_sameval)
write_file(file_sameval_test_data, test_data_sameval)
write_file(file_sameval_test_labels, labelstest_sameval)
write_index(file_sameval_word_index, word_index_sameval)

## Reyes, Rosso and Veale (2013)

In [24]:
IRONY = path + 'Irony.txt'
EDUCATION = path + 'Education.txt'
HUMOR = path + 'Humor.txt'
POLITICS = path + 'Politics.txt'

file_reyesall_train_data = output + 'reyes_train_data.csv'
file_reyesall_train_labelsbinary = output + 'reyes_train_labelsbinary.csv'
file_reyesall_train_labelsmulty = output + 'reyes_train_labelsmulty.csv'
file_reyesall_test_data = output + 'reyes_test_data.csv'
file_reyesall_test_labelsbinary = output + 'reyes_test_labelsbinary.csv'
file_reyesall_test_labelsmulty = output + 'reyes_test_labelmulty.csv'
file_reyesall_word_index = output + 'reyes_word_index.csv'

file_reyesivse_train_data = output + 'reyes_i_e_train_data.csv'
file_reyesivse_train_labels = output + 'reyes_i_e_train_labels.csv'
file_reyesivse_test_data = output + 'reyes_i_e_test_data.csv'
file_reyesivse_test_labels = output + 'reyes_i_e_test_labels.csv'
file_reyesivse_word_index = output + 'reyes_i_e_word_index.csv'

file_reyesivsh_train_data = output + 'reyes_i_h_train_data.csv'
file_reyesivsh_train_labels = output + 'reyes_i_h_train_labels.csv'
file_reyesivsh_test_data = output + 'reyes_i_h_test_data.csv'
file_reyesivsh_test_labels = output + 'reyes_i_h_test_labels.csv'
file_reyesivsh_word_index = output + 'reyes_i_h_word_index.csv'

file_reyesivsp_train_data = output + 'reyes_i_p_train_data.csv'
file_reyesivsp_train_labels = output + 'reyes_i_p_train_labels.csv'
file_reyesivsp_test_data = output + 'reyes_i_p_test_data.csv'
file_reyesivsp_test_labels = output + 'reyes_i_p_test_labels.csv'
file_reyesivsp_word_index = output + 'reyes_i_p_word_index.csv'

In [26]:
#Read the data from file
def read_data(FILENAME, labelbinary, labelmulty):
    """Read one text per line and attach the same two labels to each of them.

    Args:
        FILENAME: Path of the UTF-8 encoded file with one text per line.
        labelbinary: Binary label stored with every line of this file.
        labelmulty: Multi-class label stored with every line of this file.

    Returns:
        A list with one ``[[text], labelbinary, labelmulty]`` entry per line.
        The text has its trailing whitespace removed and is wrapped in a
        one-element list.
    """
    with open(FILENAME, 'r', encoding='utf-8') as handle:
        return [[raw.rstrip().split("\n"), labelbinary, labelmulty]
                for raw in handle]

#define Labels
IRONYLABEL = 1
NONIRONYLABEL = 0
MULTY = 0

ironycorpus = read_data(IRONY, IRONYLABEL, MULTY)
educationcorpus = read_data(EDUCATION, NONIRONYLABEL, MULTY+1)
humorcorpus = read_data(HUMOR, NONIRONYLABEL, MULTY+2)
politicscorpus = read_data(POLITICS, NONIRONYLABEL, MULTY+3)

print("Number data per file")
print("Number data Irony: " + str(len(ironycorpus)))
print("Number data Education: " + str(len(educationcorpus)))
print("Number data Humor: " + str(len(humorcorpus)))
print("Number data Politics: " + str(len(politicscorpus)))
print("-------------------------")

#Concatenate the individual corpora to create a single data set and the vs-datasets.
#Plain list concatenation is used instead of np.concatenate: every entry is
#[[text], label, label], i.e. a ragged nested list that newer NumPy versions
#refuse to convert into an array. The entries themselves stay unchanged.
corpus = ironycorpus + educationcorpus + humorcorpus + politicscorpus
ironyvseducation = ironycorpus + educationcorpus
ironyvshumor = ironycorpus + humorcorpus
ironyvspolitics = ironycorpus + politicscorpus

print("Number data altogether: " + str(len(corpus)))
print("Number data Irony vs. Education: " + str(len(ironyvseducation)))
print("Number data Irony vs Humor: " + str(len(ironyvshumor)))
print("Number data Irony vs. Politics: " + str(len(ironyvspolitics)))
print("------------------------------")

#Create random train and test data sets

#Randomize the data (random.sample returns a new, shuffled list).
#NOTE(review): no seed is set, so every run produces a different split.
corpus = random.sample(corpus, len(corpus))
ironyvseducation = random.sample(ironyvseducation, len(ironyvseducation))
ironyvshumor = random.sample(ironyvshumor, len(ironyvshumor))
ironyvspolitics = random.sample(ironyvspolitics, len(ironyvspolitics))

numbertraindata = int((len(corpus))*0.8)
numbertrainironyvseducation = int((len(ironyvseducation))*0.8)
numbertrainironyvshumor = int((len(ironyvshumor))*0.8)
numbertrainironyvspolitics = int((len(ironyvspolitics))*0.8)

train = corpus[:numbertraindata]
test = corpus[numbertraindata:]

trainironyvseducation = ironyvseducation[:numbertrainironyvseducation]
testironyvseducation = ironyvseducation[numbertrainironyvseducation:]

trainironyvshumor = ironyvshumor[:numbertrainironyvshumor]
testironyvshumor = ironyvshumor[numbertrainironyvshumor:]

trainironyvspolitics = ironyvspolitics[:numbertrainironyvspolitics]
testironyvspolitics = ironyvspolitics[numbertrainironyvspolitics:]

print("Number train data overall: " + str(len(train)))
print("Number test data overall: " + str(len(test)))
print("-------------------------")
print("Number train data Irony vs Education: " + str(len(trainironyvseducation)))
print("Number test data Irony vs. Education: " + str(len(testironyvseducation)))
print("-------------------------")
print("Number train data Irony vs. Humor: " + str(len(trainironyvshumor)))
print("Number test data Irony vs. Humor: " + str(len(testironyvshumor)))
print("-------------------------")
print("Number train data Irony vs. Politics: " + str(len(trainironyvspolitics)))
print("Number test data Irony vs. Politics: " + str(len(testironyvspolitics)))


#Split the data into text and labels
def split_data(corpus):
    """Separate corpus entries into texts, binary labels and multi-class labels.

    Args:
        corpus: Iterable of entries ``[[text], labelbinary, labelmulty]``.

    Returns:
        A tuple ``(data, labelsbinary, labelsmulty)`` of three lists in corpus
        order; ``data`` holds the plain text strings.
    """
    # entry[0] is a one-element list, so entry[0][0] is the text itself.
    rows = [(entry[0][0], entry[1], entry[2]) for entry in corpus]

    data = [row[0] for row in rows]
    labelsbinary = [row[1] for row in rows]
    labelsmulty = [row[2] for row in rows]
    return data, labelsbinary, labelsmulty

corpustrain_reyes, labelstrainbinary, labelstrainmulty = split_data(train)
corpustest_reyes, labelstestbinary, labelstestmulty = split_data(test)  

ironyvseducation_train, ironyvseducation_labelstrainbinary, ironyvseducation_labelstrainmulty = split_data(trainironyvseducation)
ironyvseducation_test, ironyvseducation_labelstestbinary, ironyvseducation_labelstestmulty = split_data(testironyvseducation) 

ironyvshumor_train, ironyvshumor_labelstrainbinary, ironyvshumor_labelstrainmulty = split_data(trainironyvshumor)
ironyvshumor_test, ironyvshumor_labelstestbinary, ironyvshumor_labelstestmulty = split_data(testironyvshumor) 

ironyvspolitics_train, ironyvspolitics_labelstrainbinary, ironyvspolitics_labelstrainmulty = split_data(trainironyvspolitics)
ironyvspolitics_test, ironyvspolitics_labelstestbinary, ironyvspolitics_labelstestmulty = split_data(testironyvspolitics) 

Number data per file
Number data Irony: 9964
Number data Education: 9832
Number data Humor: 9938
Number data Politics: 8461
-------------------------
Number data altogether: 38195
Number data Irony vs. Education: 19796
Number data Irony vs Humor: 19902
Number data Irony vs. Politics: 18425
------------------------------
Number train data overall: 30556
Number test data overall: 7639
-------------------------
Number train data Irony vs Education: 15836
Number test data Irony vs. Education: 3960
-------------------------
Number train data Irony vs. Humor: 15921
Number test data Irony vs. Humor: 3981
-------------------------
Number train data Irony vs. Politics: 14740
Number test data Irony vs. Politics: 3685


In [16]:
#Preprocess the data

#all data
vocab_reyes_all = Counter()
tweets_reyes = process_tweets(corpustrain_reyes + corpustest_reyes, vocab_reyes_all)

#Irony vs Education
vocab_ironyvseducation = Counter()
tweetsironyvseducation = process_tweets(ironyvseducation_train + ironyvseducation_test, vocab_ironyvseducation)

#Irony vs Humor
vocab_ironyvshumor = Counter()
tweetsironyvshumor = process_tweets(ironyvshumor_train + ironyvshumor_test, vocab_ironyvshumor)

#Irony vs Politics
vocab_ironyvspolitics = Counter()
tweetsironyvspolitics = process_tweets(ironyvspolitics_train + ironyvspolitics_test, vocab_ironyvspolitics)

In [18]:
#Create train data, test data and word index for Reyes, Rosso and Veale

#all Data
train_data_reyes, test_data_reyes, word_index_reyes = create_train_test_wordindex(vocab_reyes_all, tweets_reyes, corpustrain_reyes)

#Irony vs education
ironyvseducation_train_data, ironyvseducation_test_data, ironyvseducation_word_index = create_train_test_wordindex(vocab_ironyvseducation, tweetsironyvseducation, ironyvseducation_train)

#Irony vs Humor
ironyvshumor_train_data, ironyvshumor_test_data, ironyvshumor_word_index = create_train_test_wordindex(vocab_ironyvshumor, tweetsironyvshumor, ironyvshumor_train)

#Irony vs Politics
ironyvspolitics_train_data, ironyvspolitics_test_data, ironyvspolitics_word_index = create_train_test_wordindex(vocab_ironyvspolitics, tweetsironyvspolitics, ironyvspolitics_train)

Maximal Sequence Length: 167
Maximal Sequence Length: 99
Maximal Sequence Length: 90
Maximal Sequence Length: 167


In [21]:
#Store train data, test data and word index for Reyes, Rosso and Veale for reproducibility

#all data
write_file(file_reyesall_train_data, train_data_reyes)
write_file(file_reyesall_train_labelsbinary, labelstrainbinary)
write_file(file_reyesall_train_labelsmulty, labelstrainmulty)
write_file(file_reyesall_test_data, test_data_reyes)
write_file(file_reyesall_test_labelsbinary, labelstestbinary)
write_file(file_reyesall_test_labelsmulty, labelstestmulty)
write_index(file_reyesall_word_index, word_index_reyes)

#Irony vs Education
write_file(file_reyesivse_train_data, ironyvseducation_train_data)
write_file(file_reyesivse_train_labels, ironyvseducation_labelstrainbinary)
write_file(file_reyesivse_test_data, ironyvseducation_test_data)
write_file(file_reyesivse_test_labels, ironyvseducation_labelstestbinary)
write_index(file_reyesivse_word_index, ironyvseducation_word_index)

#Irony vs Humor
write_file(file_reyesivsh_train_data, ironyvshumor_train_data)
write_file(file_reyesivsh_train_labels, ironyvshumor_labelstrainbinary)
write_file(file_reyesivsh_test_data, ironyvshumor_test_data)
write_file(file_reyesivsh_test_labels, ironyvshumor_labelstestbinary)
write_index(file_reyesivsh_word_index, ironyvshumor_word_index)

#Irony vs Politics
write_file(file_reyesivsp_train_data, ironyvspolitics_train_data)
write_file(file_reyesivsp_train_labels, ironyvspolitics_labelstrainbinary)
write_file(file_reyesivsp_test_data, ironyvspolitics_test_data)
write_file(file_reyesivsp_test_labels, ironyvspolitics_labelstestbinary)
write_index(file_reyesivsp_word_index, ironyvspolitics_word_index)

# Ghosh, Fabbri and Muresan

In [7]:
#Input files of the Ghosh data set
Ghosh_train = path + 'Ghosh_sarcasm_v2_tab_train.txt'
Ghosh_test  = path + 'Ghosh_sarcasm_v2_tab_test.txt'
Ghosh_validate  = path + 'Ghosh_sarcasm_v2_tab_valid.txt'

#Output files. Because of cross-validation, train and validation data are
#concatenated, so there is no separate validation output.
file_ghosh_train_data = output + 'Ghosh_train_data.csv'
file_ghosh_train_labels = output + 'Ghosh_train_labels.csv'
file_ghosh_test_data = output + 'Ghosh_test_data.csv'
file_ghosh_test_labels = output + 'Ghosh_test_labels.csv'
file_ghosh_word_index = output + 'Ghosh_word_index.csv'

In [13]:
def read_data_ghosh(FILENAME):
    """Read a Ghosh sarcasm file into a list of ``[[tweet], label]`` entries.

    The label is the part before the first ``.`` of the first
    whitespace-separated token of a line, converted to ``int``. The tweet is
    everything after the first four characters of the left-stripped line,
    with trailing whitespace removed, wrapped in a one-element list. Blank
    lines are skipped.

    Args:
        FILENAME: Path of the UTF-8 encoded input file.

    Returns:
        A list with one ``[[tweet], label]`` entry per non-blank line.

    Raises:
        ValueError: If the leading token does not start with an integer.
    """
    corpus = []
    with open(FILENAME, 'r', encoding='utf-8') as train: #Decoding utf-8, else exception
        for line in train:
            if not line.strip():
                # A blank line has no label token to parse.
                continue
            label = int(line.split()[0].split('.')[0]) # label
            # Assumes the label prefix is three characters plus one separator
            # (e.g. "1.0<TAB>") - TODO confirm against the data files.
            tweet = line.lstrip()[4:].rstrip() #tweet
            # Wrap the text in a list directly. The former split('/n') (a typo
            # for '\n') cut every tweet containing the characters "/n" into
            # pieces, and only the first piece survived split_data.
            corpus.append([[tweet], label])
    return corpus

resulttrain_ghosh = read_data_ghosh(Ghosh_train)
resultvalid_ghosh = read_data_ghosh(Ghosh_validate)
#Concatenate train and validation data. Plain list concatenation is used
#instead of np.concatenate because the entries are ragged nested lists
#([[tweet], label]), which newer NumPy versions refuse to turn into an array.
resulttrain_all_ghosh = resulttrain_ghosh + resultvalid_ghosh
#randomize train (random.sample returns a new, shuffled list)
resulttrain_all_ghosh = random.sample(resulttrain_all_ghosh, len(resulttrain_all_ghosh))

resulttest_ghosh = read_data_ghosh(Ghosh_test)
#randomize test
resulttest_ghosh = random.sample(resulttest_ghosh, len(resulttest_ghosh))

def split_data(corpus):
    """Separate corpus entries into texts and binary labels.

    Args:
        corpus: Iterable of entries whose first item is a list holding the
            text and whose second item is the label.

    Returns:
        A tuple ``(data, labelsbinary)`` of two lists in corpus order;
        ``data`` holds the first element of each text list.
    """
    # entry[0] is itself a list; its first element is the text string.
    pairs = [(entry[0][0], entry[1]) for entry in corpus]

    data = [text for text, _label in pairs]
    labelsbinary = [label for _text, label in pairs]
    return data, labelsbinary

train_ghosh, trainlabels_ghosh = split_data(resulttrain_all_ghosh)
test_ghosh, testlabels_ghosh = split_data(resulttest_ghosh)

print("Number train data overall: " + str(len(train_ghosh)))
print("Number test data overall: " + str(len(resulttest_ghosh)))

Number train data overall: 4224
Number test data overall: 468


In [9]:
print('Number labels train data: '+str(Counter(trainlabels_ghosh)))
print('Number labels test data: '+str(Counter(testlabels_ghosh)))

Number labels train data: Counter({0: 2112, 1: 2112})
Number labels test data: Counter({1: 234, 0: 234})


In [123]:
#Preprocess the data
vocab_ghosh = Counter()
tweets_ghosh = process_tweets(train_ghosh+test_ghosh, vocab_ghosh)

In [127]:
#Create train data, test data and word index for Ghosh, Fabbri and Muresan
train_data_ghosh, test_data_ghosh, word_index_ghosh = create_train_test_wordindex(vocab_ghosh, tweets_ghosh, train_ghosh)

Maximal Sequence Length: 1894


In [155]:
# Store the train data, test data and word index for Ghosh, Fabbri and Muresan
# The label lists are named trainlabels_ghosh / testlabels_ghosh where they are
# created by split_data; the previous names with an extra underscore were undefined.
#train
write_file(file_ghosh_train_data, train_data_ghosh)
write_file(file_ghosh_train_labels, trainlabels_ghosh)
#test
write_file(file_ghosh_test_data, test_data_ghosh)
write_file(file_ghosh_test_labels, testlabels_ghosh)
#index
write_index(file_ghosh_word_index, word_index_ghosh)

# IronITA 2018

In [14]:
TRAIN_IRONITA = path + 'training_ironita2018.csv'
TEST_IRONITA  = path + 'test_gold_ironita2018.csv'

file_ironita_train_data = output + 'ironita_train_data.csv'
file_ironita_train_labels = output + 'ironita_train_labels.csv'
file_ironita_train_labels_sarcasm = output + 'ironita_train_labels_sarcasm.csv'
file_ironita_test_data = output + 'ironita_test_data.csv'
file_ironita_test_labels = output + 'ironita_test_labels.csv'
file_ironita_test_labels_sarcasm = output + 'ironita_test_labels_sarcasm.csv'
file_ironita_word_index = output + 'ironita_word_index.csv'

In [56]:
def read_data_ironita(FILENAME):
    """Read a tab-separated IronITA file into irony labels, sarcasm labels and texts.

    The header line (``id``, ``text``, ``irony``, ``sarcasm``, ``topic``
    separated by tabs, matched case-insensitively) and blank lines are
    skipped. On every other line the field at index 1 is the text, the field
    at index 2 the irony label and the field at index 3 the sarcasm label.

    Args:
        FILENAME: Path of the UTF-8 encoded input file.

    Returns:
        A three-element list ``[labels, labels_sarcasm, corpus]``. The irony
        labels are ``int``; the sarcasm labels are kept as the strings read
        from the file.

    Raises:
        ValueError: If an irony label is not an integer.
        IndexError: If a non-blank data line has fewer than four fields.
    """
    labels = []
    labels_sarcasm = []
    corpus = []
    with open(FILENAME, 'r', encoding='utf-8') as train: #Decoding utf-8, else exception
        for line in train:
            # The tabs of the header are written as escapes so that they
            # cannot be turned into spaces by an editor.
            if line.lower().startswith("id\ttext\tirony\tsarcasm\ttopic"): #skip header
                continue
            stripped = line.rstrip()
            if not stripped:
                # A blank (e.g. trailing) line has no fields to index into.
                continue
            fields = stripped.split("\t")
            labels.append(int(fields[2]))
            # Deliberately left as a string: later cells compare it with '0' / '1'.
            labels_sarcasm.append(fields[3])
            corpus.append(fields[1])
    return [labels, labels_sarcasm, corpus]

resulttrain = read_data_ironita(TRAIN_IRONITA)
labelstrain_ironita = resulttrain[0]
labelstrain_sarcasm_ironita = resulttrain[1]
corpustrain_ironita = resulttrain[2]

resulttest = read_data_ironita(TEST_IRONITA)
labelstest_ironita = resulttest[0]
labelstest_sarcasm_ironita = resulttest[1]
corpustest_ironita = resulttest[2]

In [16]:
print('Length train corpus: '+str(len(corpustrain_ironita)))
print('Lenght train labels: '+str(len(labelstrain_ironita)))
print('Lenght test corpus: '+str(len(corpustest_ironita)))
print('lenght test labels: '+str(len(labelstest_ironita)))

Length train corpus: 3977
Lenght train labels: 3977
Lenght test corpus: 872
lenght test labels: 872


In [95]:
print('Number labels train data: '+str(Counter(labelstrain_ironita)))
print('Number labels test data: '+str(Counter(labelstest_ironita)))

Number labels train data: Counter({1: 2023, 0: 1954})
Number labels test data: Counter({0: 437, 1: 435})


In [102]:
count_ironitab = []
leng = len(resulttrain[0])
for i in range(leng):
    if resulttrain[0][i] == 0 and resulttrain[1][i] == '0':
        count_ironitab.append('not-ironic')
    if resulttrain[0][i] == 1 and resulttrain[1][i] == '0':
        count_ironitab.append('irony_not_sarcasm')
    if resulttrain[0][i] == 1 and resulttrain[1][i] == '1':
        count_ironitab.append('sarcasm')

In [94]:
Counter(count_ironitab)

Counter({'not-ironic': 1954, 'irony_not_sarcasm': 1110, 'sarcasm': 913})

In [109]:
count_ironitab_test = []
leng = len(resulttest[0])
for i in range(leng):
    if resulttest[0][i] == 0 and resulttest[1][i] == '0':
        count_ironitab_test.append('not-ironic')
    if resulttest[0][i] == 1 and resulttest[1][i] == '0':
        count_ironitab_test.append('irony_not_sarcasm')
    if resulttest[0][i] == 1 and resulttest[1][i] == '1':
        count_ironitab_test.append('sarcasm')

In [110]:
Counter(count_ironitab_test)

Counter({'irony_not_sarcasm': 219, 'sarcasm': 216, 'not-ironic': 437})

In [187]:
#Prepocess the data
vocab_ironita = Counter()
tweets_ironita = process_tweets(corpustrain_ironita + corpustest_ironita, vocab_ironita)

In [188]:
train_data_ironita, test_data_ironita, word_index_ironita = create_train_test_wordindex(vocab_ironita, tweets_ironita, corpustrain_ironita)

Maximal Sequence Length: 39


In [203]:
write_file(file_ironita_train_data, train_data_ironita)
write_file(file_ironita_train_labels, labelstrain_ironita)
write_file(file_ironita_train_labels_sarcasm, labelstrain_sarcasm_ironita)
write_file(file_ironita_test_data, test_data_ironita)
write_file(file_ironita_test_labels, labelstest_ironita)
write_file(file_ironita_test_labels_sarcasm, labelstest_sarcasm_ironita)
write_index(file_ironita_word_index, word_index_ironita)

# Ravi and Ravi (2017)

In [20]:
IRONY = path + 'Ravi_finalpos499.txt'
NON_IRONY = path + 'Ravi_finalNeg2498.csv'

file_ravi_train_data = output + 'ravi_train_data.csv'
file_ravi_train_labels = output + 'ravi_train_labels.csv'
file_ravi_test_data = output + 'ravi_test_data.csv'
file_ravi_test_labels = output + 'ravi_test_labelsb.csv'
file_ravi_word_index = output + 'ravi_word_index.csv'

In [21]:
def read_data_ravi(FILENAME, label):
    """Read one text per line and attach the same label to each of them.

    Args:
        FILENAME: Path of the UTF-8 encoded file with one text per line.
        label: Label stored with every line of this file.

    Returns:
        A list with one ``[[text], label]`` entry per line. The text has its
        trailing whitespace removed and is wrapped in a one-element list.
    """
    with open(FILENAME, 'r', encoding='utf-8') as handle:
        return [[raw.rstrip().split("\n"), label] for raw in handle]

#define Labels
IRONYLABEL = 1
NONIRONYLABEL= 0

ironycorpus = read_data_ravi(IRONY, IRONYLABEL)
nonironycorpus = read_data_ravi(NON_IRONY, NONIRONYLABEL)

#Concatenate irony and non-irony. Plain list concatenation is used instead of
#np.concatenate because the entries are ragged nested lists ([[text], label]),
#which newer NumPy versions refuse to turn into an array.
corpus = ironycorpus + nonironycorpus

#Randomize the data (random.sample returns a new, shuffled list)
corpus = random.sample(corpus, len(corpus))

#Split data in train and test data
numbertraindata = int((len(corpus))*0.8)
train = corpus[:numbertraindata]
test = corpus[numbertraindata:]

#Split the data into text and labels
def split_data(corpus):
    """Separate corpus entries into texts and labels.

    Args:
        corpus: Iterable of entries ``[[text], label]``.

    Returns:
        A tuple ``(data, labels)`` of two lists in corpus order; ``data``
        holds the plain text strings.
    """
    # entry[0] is a one-element list, so entry[0][0] is the text itself.
    pairs = [(entry[0][0], entry[1]) for entry in corpus]

    data = [text for text, _label in pairs]
    labels = [label for _text, label in pairs]
    return data, labels

corpustrain_ravi, labelstrain= split_data(train)
corpustest_ravi, labelstest= split_data(test)

print("Number train: " + str(len(train)))
print("Number test: " + str(len(test)))

Number train: 2397
Number test: 600


In [22]:
print('Number labels train data: '+str(Counter(labelstrain)))
print('Number labels test data: '+str(Counter(labelstest)))

Number labels train data: Counter({0: 2003, 1: 394})
Number labels test data: Counter({0: 495, 1: 105})


In [216]:
#Preprocess the data
vocab_ravi = Counter()
tweets_ravi = process_tweets(corpustrain_ravi + corpustest_ravi, vocab_ravi)

In [221]:
#Create train data, test data and word index for Ravi and Ravi
train_data_ravi, test_data_ravi, word_index_ravi = create_train_test_wordindex(vocab_ravi, tweets_ravi, corpustrain_ravi)

Maximal Sequence Length: 1233


In [232]:
#Store train data, test data and word index for Ravi and Ravi for reproducability
write_file(file_ravi_train_data, train_data_ravi)
write_file(file_ravi_train_labels, labelstrain)
write_file(file_ravi_test_data, test_data_ravi)
write_file(file_ravi_test_labels, labelstest)
write_index(file_ravi_word_index, word_index_ravi)

# SentiPOLC 2016

In [32]:
TRAIN_SENTIPOLC = path + 'training_set_sentipolc16.csv'
TEST_SENTIPOLC = path + 'test_set_sentipolc16_gold2000.csv'

file_sentipolc_train_data = output + 'sentipolc_train_data.csv'
file_sentipolc_train_label_irony = output + 'sentipolc_train_labelirony.csv'
file_sentipolc_train_label_opos = output + 'sentipolc_train_labelopos.csv'
file_sentipolc_train_label_oneg = output + 'sentipolc_train_labeloneg.csv'
file_sentipolc_train_label_lpos = output + 'sentipolc_train_labellpos.csv'
file_sentipolc_train_label_lneg = output + 'sentipolc_train_labellneg.csv'
file_sentipolc_test_data = output + 'sentipolc_test_data.csv'
file_sentipolc_test_label_irony = output + 'sentipolc_test_labelirony.csv'
file_sentipolc_test_label_opos = output + 'sentipolc_test_labelopos.csv'
file_sentipolc_test_label_oneg = output + 'sentipolc_test_labeloneg.csv'
file_sentipolc_test_label_lpos = output + 'sentipolc_test_labellpos.csv'
file_sentipolc_test_label_lneg = output + 'sentipolc_test_labelneg.csv'
file_sentipolc_word_index = output + 'sentipolc_word_index.csv'

In [33]:
#Read data
sentipolc_train = pd.read_csv(TRAIN_SENTIPOLC)
#A separator longer than one character is handled as a regular expression,
#which only the python parser engine supports. Naming the engine explicitly
#avoids the ParserWarning pandas emits when it has to fall back to it.
#NOTE(review): splitting on '","' presumably leaves a leading quote in the
#first column and a trailing quote in the last one - verify against the file.
sentipolc_test = pd.read_csv(TEST_SENTIPOLC, sep = '","', header = None, encoding = 'utf-8', engine = 'python')

  This is separate from the ipykernel package so we can avoid doing imports until


In [34]:
#Split the dataframe in text and labels for training
train_labelsirony = sentipolc_train['iro']
train_labelsopos = sentipolc_train['opos']
train_labelsoneg = sentipolc_train['oneg']
train_labelslpos = sentipolc_train['lpos']
train_labelslneg = sentipolc_train['lneg']
train_corpus_sentipolc = sentipolc_train['text']

In [35]:
#Split the dataframe in text and labels for testing
test_labelsirony = sentipolc_test[4]
test_labelsopos = sentipolc_test[2]
test_labelsoneg = sentipolc_test[3]
test_labelslpos = sentipolc_test[5]
test_labelslneg = sentipolc_test[6]
test_corpus_sentipolc = sentipolc_test[8]

In [36]:
print('Length train corpus: '+str(len(train_corpus_sentipolc)))
print('Lenght train labels: '+str(len(train_labelsirony)))
print('Lenght test corpus: '+str(len(test_corpus_sentipolc)))
print('lenght test labels: '+str(len(test_labelsirony)))

Length train corpus: 7410
Lenght train labels: 7410
Lenght test corpus: 2000
lenght test labels: 2000


In [37]:
print('Number labels train data: '+str(Counter(train_labelsirony)))
print('Number labels test data: '+str(Counter(test_labelsirony)))

Number labels train data: Counter({0: 6542, 1: 868})
Number labels test data: Counter({0: 1765, 1: 235})


In [308]:
#Preprocess the data
traincorpus_sentipolc = train_corpus_sentipolc.tolist()
testcorpus_sentipolc = test_corpus_sentipolc.tolist()

vocab_sentipolc = Counter()
tweets_sentipolc = process_tweets( traincorpus_sentipolc + testcorpus_sentipolc, vocab_sentipolc)

In [309]:
train_data_sentipolc, test_data_sentipolc, word_index_sentipolc = create_train_test_wordindex(vocab_sentipolc, tweets_sentipolc, traincorpus_sentipolc)

Maximal Sequence Length: 43


In [322]:
#Store the train data, test data and dictionary; the pandas Series objects have to be converted to lists first
labelstrain_irony_sentipolc = train_labelsirony.tolist()
labelstrain_opos_sentipolc = train_labelsopos.tolist()
labelstrain_oneg_sentipolc = train_labelsoneg.tolist()
labelstrain_lpos_sentipolc = train_labelslpos.tolist()
labelstrain_lneg_sentipolc = train_labelslneg.tolist()

labelstest_irony_sentipolc = test_labelsirony.tolist()
labelstest_opos_sentipolc = test_labelsopos.tolist()
labelstest_oneg_sentipolc = test_labelsoneg.tolist()
labelstest_lpos_sentipolc = test_labelslpos.tolist()
labelstest_lneg_sentipolc = test_labelslneg.tolist()

write_file(file_sentipolc_train_data, train_data_sentipolc)
write_file(file_sentipolc_train_label_irony, labelstrain_irony_sentipolc)
write_file(file_sentipolc_train_label_opos, labelstrain_opos_sentipolc)
write_file(file_sentipolc_train_label_oneg, labelstrain_oneg_sentipolc)
write_file(file_sentipolc_train_label_lpos, labelstrain_lpos_sentipolc)
write_file(file_sentipolc_train_label_lneg, labelstrain_lneg_sentipolc)

write_file(file_sentipolc_test_data, test_data_sentipolc)
write_file(file_sentipolc_test_label_irony, labelstest_irony_sentipolc)
write_file(file_sentipolc_test_label_opos, labelstest_opos_sentipolc)
write_file(file_sentipolc_test_label_oneg, labelstest_oneg_sentipolc)
write_file(file_sentipolc_test_label_lpos, labelstest_lpos_sentipolc)
write_file(file_sentipolc_test_label_lneg, labelstest_lneg_sentipolc)

write_index(file_sentipolc_word_index, word_index_sentipolc)

# Wallace, Choe, Kertz and Charniak (2014)

In [39]:
DATA_WALLACE = path + 'Wallace_irony-labeled.csv'

file_wallace_train_data = output + 'wallace_train_data.csv'
file_wallace_train_labels = output + 'wallace_train_labels.csv'
file_wallace_test_data = output + 'wallace_test_data.csv'
file_wallace_test_labels = output + 'wallace_test_labels.csv'
file_wallace_word_index = output + 'wallace_word_index.csv'

In [40]:
corpus_wallace = pd.read_csv(DATA_WALLACE)

In [41]:
# Sequential 80/20 split: the first 80% of the rows are used for training and
# the remaining rows for testing (the rows are not shuffled here)
numbertraindata = int(0.8 * len(corpus_wallace))
train_wallace, test_wallace = corpus_wallace[:numbertraindata], corpus_wallace[numbertraindata:]

# Comment texts and labels as plain Python lists
train_corpus_wallace, train_labels_wallace = (
    train_wallace[column].tolist() for column in ('comment_text', 'label')
)
test_corpus_wallace, test_labels_wallace = (
    test_wallace[column].tolist() for column in ('comment_text', 'label')
)

In [42]:
print('Number labels train data: '+str(Counter(train_labels_wallace)))
print('Number labels test data: '+str(Counter(test_labels_wallace)))

Number labels train data: Counter({-1: 1122, 1: 437})
Number labels test data: Counter({-1: 290, 1: 100})


In [339]:
#Preprocess the data
vocab_wallace = Counter()
tweets_wallace = process_tweets(train_corpus_wallace + test_corpus_wallace, vocab_wallace)

In [341]:
#Create train data, test data and word index for Wallace et al.
train_data_wallace, test_data_wallace, word_index_wallace = create_train_test_wordindex(vocab_wallace, tweets_wallace, train_corpus_wallace)

Maximal Sequence Length: 891


In [348]:
# Store the train data, test data and word index for Wallace et al.
#train
write_file(file_wallace_train_data, train_data_wallace)
write_file(file_wallace_train_labels, train_labels_wallace)
#test
write_file(file_wallace_test_data, test_data_wallace)
write_file(file_wallace_test_labels, test_labels_wallace)
#index
write_index(file_wallace_word_index, word_index_wallace)

# Own (German) data set

In [46]:
DATA_TETZNER = path + 'GermanIronyCorpus_cleaned.txt'

file_tetzner_train_data = output + 'tetzner_train_data.csv'
file_tetzner_train_labels_polarity = output + 'tetzner_train_labels_polarity.csv'
file_tetzner_train_labels_ironyform = output + 'tetzner_train_labels_ironyform.csv'
file_tetzner_train_labels_change = output + 'tetzner_train_labels_change.csv'
file_tetzner_train_labels_ironybinary = output + 'tetzner_train_labels_ironybinary.csv'
file_tetzner_train_labels_ironymulty = output + 'tetzner_train_labels_ironymulty.csv'
file_tetzner_test_data = output + 'tetzner_test_data.csv'
file_tetzner_test_labels_polarity = output + 'tetzner_test_labels_polarity.csv'
file_tetzner_test_labels_ironyform = output + 'tetzner_test_labels_ironyform.csv'
file_tetzner_test_labels_change = output + 'tetzner_test_labels_change.csv'
file_tetzner_test_labels_ironybinary = output + 'tetzner_test_labels_ironybinary.csv'
file_tetzner_test_labels_ironymulty = output + 'tetzner_test_labels_ironymulty.csv'
file_tetzner_word_index = output + 'tetzner_word_index.csv'

In [47]:
result = []

# Every line is tab-separated; the first six fields are kept in this order:
# text, polarity, irony form, change, binary label, multi label.
# The file is decoded as 'utf-8-sig' (the original note: otherwise an exception is raised).
with open(DATA_TETZNER, 'r', encoding='utf-8-sig') as corpus_file:
    for raw_line in corpus_file:
        fields = raw_line.rstrip().split("\t")
        result.append([fields[position] for position in range(6)])

In [48]:
# Shuffle the corpus (sampling the full length returns a shuffled copy; no seed is
# set in this cell), then use the first 80% for training and the rest for testing
result = random.sample(list(result), k=len(result))
numbertraindata = int(0.8 * len(result))
train_tetzner, test_tetzner = result[:numbertraindata], result[numbertraindata:]

#Split the data into text and labels
def split_data(corpus):
    """Turn a list of six-field entries into six parallel lists.

    Each entry is indexed at positions 0..5 (text, polarity, irony form,
    change, binary irony label, multi-class irony label); any further
    fields are ignored.

    Returns a tuple ``(data, labels_polarity, labels_ironyform,
    labels_change, labels_ironybinary, labels_ironymulty)``.
    Raises IndexError if an entry has fewer than six fields.
    """
    columns = tuple([] for _ in range(6))
    for entry in corpus:
        for position, column in enumerate(columns):
            column.append(entry[position])
    return columns

corpustrain_tetzner, labelstrainpolarity, labelstrainironyform, labelstrainchange, labelstrainironybinary, labelstrainironymulty = split_data(train_tetzner)
corpustest_tetzner, labelstestpolarity, labelstestironyform, labelstestchange, labelstestironybinary, labelstestironymulty = split_data(test_tetzner)

In [397]:
#Preprocess data
vocab_tetzner = Counter()
tweets_tetzner = process_tweets(corpustrain_tetzner + corpustest_tetzner, vocab_tetzner)

In [398]:
# Create train data, test data and dictionary
train_data_tetzner, test_data_tetzner, word_index_tetzner = create_train_test_wordindex(vocab_tetzner, tweets_tetzner, corpustrain_tetzner)

Maximal Sequence Length: 73


In [400]:
#Store the train data, test data and word index for the German Irony Corpus
write_file(file_tetzner_train_data, train_data_tetzner)
write_file(file_tetzner_train_labels_polarity, labelstrainpolarity)
write_file(file_tetzner_train_labels_ironyform, labelstrainironyform)
write_file(file_tetzner_train_labels_change, labelstrainchange)
write_file(file_tetzner_train_labels_ironybinary, labelstrainironybinary)
write_file(file_tetzner_train_labels_ironymulty, labelstrainironymulty)

write_file(file_tetzner_test_data, test_data_tetzner)
write_file(file_tetzner_test_labels_polarity, labelstestpolarity)
write_file(file_tetzner_test_labels_ironyform, labelstestironyform)
write_file(file_tetzner_test_labels_change, labelstestchange)
write_file(file_tetzner_test_labels_ironybinary, labelstestironybinary)
write_file(file_tetzner_test_labels_ironymulty, labelstestironymulty)

write_index(file_tetzner_word_index, word_index_tetzner)

# Own (German)  balanced data set

In [46]:
DATA_TETZNER = path + 'GermanIronyCorpus_cleaned.txt'

file_tetzner_train_data = output + 'tetzner_train_data_balanced.csv'
file_tetzner_train_labels_polarity = output + 'tetzner_train_labels_polarity_balanced.csv'
file_tetzner_train_labels_ironyform = output + 'tetzner_train_labels_ironyform_balanced.csv'
file_tetzner_train_labels_change = output + 'tetzner_train_labels_change_balanced.csv'
file_tetzner_train_labels_ironybinary = output + 'tetzner_train_labels_ironybinary_balanced.csv'
file_tetzner_train_labels_ironymulty = output + 'tetzner_train_labels_ironymulty_balanced.csv'
file_tetzner_test_data = output + 'tetzner_test_data_balanced.csv'
file_tetzner_test_labels_polarity = output + 'tetzner_test_labels_polarity_balanced.csv'
file_tetzner_test_labels_ironyform = output + 'tetzner_test_labels_ironyform_balanced.csv'
file_tetzner_test_labels_change = output + 'tetzner_test_labels_change_balanced.csv'
file_tetzner_test_labels_ironybinary = output + 'tetzner_test_labels_ironybinary_balanced.csv'
file_tetzner_test_labels_ironymulty = output + 'tetzner_test_labels_ironymulty_balanced.csv'
file_tetzner_word_index = output + 'tetzner_word_index_balanced.csv'

In [47]:
result = []

# Decode as 'utf-8-sig' (the original note: plain decoding raised an exception)
with open(DATA_TETZNER, 'r', encoding='utf-8-sig') as handle:
    for record in handle:
        columns = record.rstrip().split("\t")
        # text, polarity, irony form, change, binary label, multi label
        result.append([columns[0], columns[1], columns[2],
                       columns[3], columns[4], columns[5]])

In [48]:
len(result)

4300

In [49]:
# Shuffle the corpus and cut it 80/20: the first 80% train, the remainder test
result = random.sample(list(result), k=len(result))
numbertraindata = int(0.8 * len(result))
train_tetzner, test_tetzner = result[:numbertraindata], result[numbertraindata:]

In [50]:
# Undersampling: keep every entry whose binary label (index 4) is '1' and at most
# the same number of the remaining entries. The '1' entries come first in the
# resulting training list; it is not reshuffled here.
train_tetzner_irony = [entry for entry in train_tetzner if entry[4] == '1']
train_tetzner_noirony = [entry for entry in train_tetzner if not entry[4] == '1']
count_irony_entries = len(train_tetzner_irony)
train_tetzner = train_tetzner_irony + train_tetzner_noirony[:count_irony_entries]

In [54]:
len(train_tetzner_irony)

867

In [55]:
#Split the data into text and labels
def split_data(corpus):
    """Split six-field corpus entries into one list per field.

    Positions 0..5 of every entry are read as text, polarity, irony form,
    change, binary irony label and multi-class irony label.

    Returns the six lists as a tuple, in that order. An entry with fewer
    than six fields raises IndexError.
    """
    rows = [(entry[0], entry[1], entry[2], entry[3], entry[4], entry[5])
            for entry in corpus]

    data = [row[0] for row in rows]
    labels_polarity = [row[1] for row in rows]
    labels_ironyform = [row[2] for row in rows]
    labels_change = [row[3] for row in rows]
    labels_ironybinary = [row[4] for row in rows]
    labels_ironymulty = [row[5] for row in rows]

    return data, labels_polarity, labels_ironyform, labels_change, labels_ironybinary, labels_ironymulty

corpustrain_tetzner, labelstrainpolarity, labelstrainironyform, labelstrainchange, labelstrainironybinary, labelstrainironymulty = split_data(train_tetzner)
corpustest_tetzner, labelstestpolarity, labelstestironyform, labelstestchange, labelstestironybinary, labelstestironymulty = split_data(test_tetzner)

In [40]:
Counter(labelstrainironybinary+labelstestironybinary)

Counter({'1': 1087, '0': 1509})

In [56]:
#Preprocess data
vocab_tetzner = Counter()
tweets_tetzner = process_tweets(corpustrain_tetzner + corpustest_tetzner, vocab_tetzner)

In [57]:
# Create train data, test data and dictionary
train_data_tetzner, test_data_tetzner, word_index_tetzner = create_train_test_wordindex(vocab_tetzner, tweets_tetzner, corpustrain_tetzner)

Maximal Sequence Length: 69


In [58]:
#Store the train data, test data and word index for the German Irony Corpus
write_file(file_tetzner_train_data, train_data_tetzner)
write_file(file_tetzner_train_labels_polarity, labelstrainpolarity)
write_file(file_tetzner_train_labels_ironyform, labelstrainironyform)
write_file(file_tetzner_train_labels_change, labelstrainchange)
write_file(file_tetzner_train_labels_ironybinary, labelstrainironybinary)
write_file(file_tetzner_train_labels_ironymulty, labelstrainironymulty)

write_file(file_tetzner_test_data, test_data_tetzner)
write_file(file_tetzner_test_labels_polarity, labelstestpolarity)
write_file(file_tetzner_test_labels_ironyform, labelstestironyform)
write_file(file_tetzner_test_labels_change, labelstestchange)
write_file(file_tetzner_test_labels_ironybinary, labelstestironybinary)
write_file(file_tetzner_test_labels_ironymulty, labelstestironymulty)

write_index(file_tetzner_word_index, word_index_tetzner)

Warining: File output_trainTestData/tetzner_train_data_balanced.csv already exists.
Warining: File output_trainTestData/tetzner_train_labels_polarity_balanced.csv already exists.
Warining: File output_trainTestData/tetzner_train_labels_ironyform_balanced.csv already exists.
Warining: File output_trainTestData/tetzner_train_labels_change_balanced.csv already exists.
Warining: File output_trainTestData/tetzner_train_labels_ironybinary_balanced.csv already exists.
Warining: File output_trainTestData/tetzner_train_labels_ironymulty_balanced.csv already exists.
Warining: File output_trainTestData/tetzner_test_data_balanced.csv already exists.
Warining: File output_trainTestData/tetzner_test_labels_polarity_balanced.csv already exists.
Warining: File output_trainTestData/tetzner_test_labels_ironyform_balanced.csv already exists.
Warining: File output_trainTestData/tetzner_test_labels_change_balanced.csv already exists.
Warining: File output_trainTestData/tetzner_test_labels_ironybinary_balan

# Combine data sets

In [7]:
def read_data_for_combination(filename, textposition, labelindicator, header, splitsign, labelexist):
    """Read a line-oriented corpus file into a list of ``[text, label]`` entries.

    Parameters
    ----------
    filename : path of the file to read (decoded as UTF-8).
    textposition : index of the text field after splitting a line on
        ``splitsign``; ``-1`` means the whole right-stripped line is the text.
    labelindicator : if ``labelexist`` is true, the index of the label field
        in the split line; otherwise the constant label given to every entry.
    header : prefix that identifies a header line. Lines starting with it are
        skipped; the comparison ignores case.
    splitsign : field separator used to split each line.
    labelexist : whether the label is stored in the file.

    Returns
    -------
    list of ``[text, label]`` with ``text`` a string and ``label`` an int.

    Raises
    ------
    IndexError if a line has too few fields, ValueError if a label is not an
    integer literal.
    """
    # The line is lower-cased before the comparison, so the prefix has to be too;
    # otherwise a header argument containing upper-case letters never matches.
    header = header.lower()

    result = []
    with open(filename, 'r', encoding='utf-8') as corpus_file:
        for raw_line in corpus_file:
            if raw_line.lower().startswith(header):  # skip header, if it exists
                continue
            stripped = raw_line.rstrip()
            fields = stripped.split(splitsign)
            # For textposition == -1 keep the line itself (a string), not the list
            # produced by split(): the preprocessing downstream works on strings.
            text = stripped if textposition == -1 else fields[textposition]
            label = int(fields[labelindicator]) if labelexist else int(labelindicator)
            result.append([text, label])
    return result

def concatenate_train_test(corpuslist, train_fraction=0.8):
    """Merge several corpora, shuffle them and split into train and test parts.

    Parameters
    ----------
    corpuslist : non-empty list of corpora, each a list of ``[text, label]``
        entries.
    train_fraction : share of the shuffled entries that goes into the training
        split (default 0.8, i.e. an 80/20 split).

    Returns
    -------
    ``(trainsplit, testsplit)`` as two lists. When more than one corpus is
    given, the corpora are joined with ``np.concatenate`` and the entries come
    back as NumPy rows.
    NOTE(review): in that case the labels are presumably stored as strings
    (mixed str/int rows); callers should convert them with ``int()``.

    Raises
    ------
    IndexError if ``corpuslist`` is empty.
    """
    corpus = corpuslist[0]
    for further_corpus in corpuslist[1:]:
        corpus = np.concatenate((corpus, further_corpus))

    # Sampling the full length yields a shuffled copy of all entries
    corpus = random.sample(list(corpus), len(corpus))

    numbertraindata = int(len(corpus) * train_fraction)
    trainsplit = corpus[:numbertraindata]
    testsplit = corpus[numbertraindata:]

    return trainsplit, testsplit

def concatenate_train(corpuslist):
    """Merge all corpora in ``corpuslist`` and return their entries shuffled.

    The first corpus is taken as is; every further corpus is appended with
    ``np.concatenate``. The merged entries are returned as a list in random
    order. Raises IndexError if ``corpuslist`` is empty.
    """
    merged = corpuslist[0]
    for following in corpuslist[1:]:
        merged = np.concatenate((merged, following))

    # Sampling the full length yields a shuffled copy of all entries
    return random.sample(list(merged), len(merged))

def split_text_label(corpus):
    """Separate ``[text, label]`` entries into a text list and a label list.

    ``entry[0]`` is taken over unchanged and ``entry[1]`` is converted with
    ``int()``.
    NOTE(review): the original comment says entry[0] can itself be a list while
    a plain string is needed; no unwrapping happens here -- verify the callers.

    Returns ``(texts, labels)``.
    """
    texts, labels = [], []
    for text, label in ((entry[0], entry[1]) for entry in corpus):
        texts.append(text)
        labels.append(int(label))
    return texts, labels


# Combine all english data sets

In [6]:
ENGLISH_ALL = path + 'english_datasets_all.csv'

file_all_train_data = output + 'english_all_train_data_balanced.csv'
file_all_train_labels = output +'english_all_train_labels_balanced.csv'
file_all_test_data = output +'english_all_test_data_balanced.csv'
file_all_test_labels = output +'english_all_test_labels_balanced.csv'
file_all_word_index = output +'english_all_word_index_balanced.csv'

In [9]:
corpus_all = read_data_for_combination(ENGLISH_ALL,2 ,1 ,'﻿filename	' , '\t', True)

In [10]:
# Shuffle all entries, then use the first 80% for training and the rest for testing
result = random.sample(list(corpus_all), k=len(corpus_all))
numbertraindata = int(0.8 * len(result))
train_corpus_all, test_corpus_all = result[:numbertraindata], result[numbertraindata:]

In [11]:
# Undersampling: keep every entry labelled 1 and at most the same number of the
# other entries. The label-1 entries come first in the resulting training list.
train_corpus_all_irony = [entry for entry in train_corpus_all if entry[1] == 1]
train_corpus_all_noirony = [entry for entry in train_corpus_all if not entry[1] == 1]
count_irony_entries = len(train_corpus_all_irony)
train_corpus_all = train_corpus_all_irony + train_corpus_all_noirony[:count_irony_entries]

In [12]:
len(train_corpus_all)

24844

In [13]:
#Split the data into text and labels
def split_data(corpus):
    """Split ``[text, label]`` entries into a list of texts and a list of labels.

    Only positions 0 and 1 of each entry are read; the values are taken over
    unchanged. Returns ``(data, label)``.
    """
    pairs = [(entry[0], entry[1]) for entry in corpus]
    data = [text for text, _ in pairs]
    label = [tag for _, tag in pairs]
    return data, label

corpustrain_all,corpustrain_label  = split_data(train_corpus_all)
corpustest_all, corpustest_label = split_data(test_corpus_all)

In [14]:
#Preprocess data
vocab_all = Counter()
tweets_all = process_tweets(corpustrain_all + corpustest_all, vocab_all)

In [15]:
# Create train data, test data and dictionary
train_data_all, test_data_all, word_index_all = create_train_test_wordindex(vocab_all, tweets_all, corpustrain_all)

Maximal Sequence Length: 1513


In [17]:
#Store the train data, test data and word index for the combination of all English data sets
write_file(file_all_test_data,test_data_all)
write_file(file_all_test_labels,corpustest_label)
write_file(file_all_train_data,train_data_all)
write_file(file_all_train_labels,corpustrain_label)

write_index(file_all_word_index,word_index_all)

# Combine all English Twitter data sets

In [6]:
ENGLISH_ALL = path + 'english_datasets_all_without_Ravi_and_Wallace.csv'

file_all_train_data = output + 'english_twitter_train_data_balanced.csv'
file_all_train_labels = output +'english_twitter_train_labels_balanced.csv'
file_all_test_data = output +'english_twitter_test_data_balanced.csv'
file_all_test_labels = output +'english_twitter_test_labels_balanced.csv'
file_all_word_index = output +'english_twitter_word_index_balanced.csv'

In [9]:
corpus_all = read_data_for_combination(ENGLISH_ALL,2 ,1 ,'﻿filename	' , '\t', True)

In [10]:
# Random 80/20 split: shuffle every entry, train on the first 80%, test on the rest
result = random.sample(list(corpus_all), k=len(corpus_all))
numbertraindata = int(0.8 * len(result))
train_corpus_all, test_corpus_all = result[:numbertraindata], result[numbertraindata:]

In [11]:
# Undersampling: sort the training entries into label 1 and everything else, then
# keep all label-1 entries plus at most equally many of the others (label-1 first)
train_corpus_all_irony, train_corpus_all_noirony = [], []
for entry in train_corpus_all:
    bucket = train_corpus_all_irony if entry[1] == 1 else train_corpus_all_noirony
    bucket.append(entry)
count_irony_entries = len(train_corpus_all_irony)
train_corpus_all = train_corpus_all_irony + train_corpus_all_noirony[:count_irony_entries]

In [12]:
len(train_corpus_all)

23246

In [13]:
#Split the data into text and labels
def split_data(corpus):
    """Return the texts (position 0) and labels (position 1) of all entries.

    The two lists are parallel to ``corpus``; the values are not converted.
    """
    data, label = [], []
    for entry in corpus:
        text, tag = entry[0], entry[1]
        data.append(text)
        label.append(tag)
    return data, label

corpustrain_all,corpustrain_label  = split_data(train_corpus_all)
corpustest_all, corpustest_label = split_data(test_corpus_all)

In [14]:
#Preprocess data
vocab_all = Counter()
tweets_all = process_tweets(corpustrain_all + corpustest_all, vocab_all)

In [15]:
# Create train data, test data and dictionary
train_data_all, test_data_all, word_index_all = create_train_test_wordindex(vocab_all, tweets_all, corpustrain_all)

Maximal Sequence Length: 1747


In [19]:
#Store the train data, test data and word index for the combination of the English Twitter data sets
write_file(file_all_test_data,test_data_all)
write_file(file_all_test_labels,corpustest_label)
write_file(file_all_train_data,train_data_all)
write_file(file_all_train_labels,corpustrain_label)

write_index(file_all_word_index,word_index_all)

# Combine all english Comment data sets

In [191]:
ENGLISH_ALL = path + 'english_datasets_only_wallace_and_ravi.csv'

file_all_train_data = output + 'english_comment_train_data_balanced.csv'
file_all_train_labels = output +'english_comment_train_labels_balanced.csv'
file_all_test_data = output +'english_comment_test_data_balanced.csv'
file_all_test_labels = output +'english_comment_test_labels_balanced.csv'
file_all_word_index = output +'english_comment_word_index_balanced.csv'

In [192]:
corpus_all = read_data_for_combination(ENGLISH_ALL,2 ,1 ,'﻿filename	' , '\t', True)

In [193]:
# Shuffle the combined corpus; the first 80% become training data, the rest test data
result = random.sample(list(corpus_all), k=len(corpus_all))
numbertraindata = int(0.8 * len(result))
train_corpus_all, test_corpus_all = result[:numbertraindata], result[numbertraindata:]

In [194]:
# Undersampling: all entries labelled 1 are kept, followed by at most the same
# number of entries with any other label
train_corpus_all_irony = [entry for entry in train_corpus_all if entry[1] == 1]
train_corpus_all_noirony = [entry for entry in train_corpus_all if not entry[1] == 1]
count_irony_entries = len(train_corpus_all_irony)
train_corpus_all = train_corpus_all_irony + train_corpus_all_noirony[:count_irony_entries]

In [195]:
len(train_corpus_all)

1694

In [196]:
#Split the data into text and labels
def split_data(corpus):
    """Split two-field entries into ``(data, label)``.

    ``data`` collects position 0 and ``label`` position 1 of every entry, in
    corpus order and without any conversion.
    """
    columns = ([], [])
    for entry in corpus:
        for position, column in enumerate(columns):
            column.append(entry[position])
    data, label = columns
    return data, label

corpustrain_all,corpustrain_label  = split_data(train_corpus_all)
corpustest_all, corpustest_label = split_data(test_corpus_all)

In [197]:
#Preprocess data
vocab_all = Counter()
tweets_all = process_tweets(corpustrain_all + corpustest_all, vocab_all)

In [198]:
# Create train data, test data and dictionary
train_data_all, test_data_all, word_index_all = create_train_test_wordindex(vocab_all, tweets_all, corpustrain_all)

Maximal Sequence Length: 1233


In [199]:
#Store the train data, test data and word index for the combination of the English comment data sets
# Write the padded index sequences (train_data_all / test_data_all) rather than the
# raw texts, so the stored data matches the stored word index
write_file(file_all_test_data,test_data_all)
write_file(file_all_test_labels,corpustest_label)
write_file(file_all_train_data,train_data_all)
write_file(file_all_train_labels,corpustrain_label)

write_index(file_all_word_index,word_index_all)

## Combine SemEval and Reyes et al.

In [7]:
TRAIN_SAMEVAL = path + 'SemEval2018-T3-train-taskA_emoji.txt'
TEST_SAMEVAL = path + 'SemEval2018-T3_gold_test_taskA_emoji.txt' 
IRONY = path + 'Irony.txt'
EDUCATION = path + 'Education.txt'

samevalcorpus = read_data_for_combination(TRAIN_SAMEVAL, 2, 1, 'tweet index', '\t', True)
samevalcorpustest = read_data_for_combination (TEST_SAMEVAL, 2, 1, 'tweet index', '\t', True )
reyes_irony_corpus = read_data_for_combination(IRONY, -1, 1,'#+no header+#',  '\n', False)
reyes_education_corpus = read_data_for_combination(EDUCATION, -1, 0,'#+no header+#','\n', False  )

##### Combine the train and test data from SemEval with 10,000 entries from the irony corpus and 10,000 from the education corpus of Reyes et al. The test data will be 20% of the combined data from all corpora.

In [72]:
corpuslist = []
corpuslist.append(samevalcorpus)
corpuslist.append(samevalcorpustest)
corpuslist.append(reyes_irony_corpus)
corpuslist.append(reyes_education_corpus)

train, test = concatenate_train_test(corpuslist)

print('Number data allthogether: '+str(len(train+test)))
print('Number train data: '+str(len(train)))
print('Number test data: '+str(len(test)))

#next split
traindata, trainlabels = split_text_label(train)
testdata, testlabels = split_text_label(test)

print('-----------')
print('Trainlabel distribution: '+str(Counter(trainlabels)))
print('Testlabel distribution: '+str(Counter(testlabels)))

Number data allthogether: 24414
Number train data: 19531
Number test data: 4883
-----------
Trainlabel distribution: Counter({1: 9848, 0: 9683})
Testlabel distribution: Counter({0: 2545, 1: 2338})


In [73]:
#Preprocess data
vocab_sam_reyes_all = Counter()
tweets_sam_reyes_all = process_tweets(traindata + testdata, vocab_sam_reyes_all)

In [74]:
# Create train data, test data and dictionary
# The third argument marks the train/test boundary by its length; pass the list of
# training texts, as the other sections do (same length as `train`)
train_data_sam_reyes_all, test_data_sam_reyes_all, word_index_sam_reyes_all = create_train_test_wordindex(vocab_sam_reyes_all, tweets_sam_reyes_all, traindata)

Maximal Sequence Length: 99


In [78]:
# Store train data, test data and word index for the combination of samEval and Reyes et al.
write_file(output + 'sam_reyes_all_train_data.csv', train_data_sam_reyes_all)
write_file(output + 'sam_reyes_all_train_labels.csv', trainlabels)

write_file(output + 'sam_reyes_all_test_data.csv', test_data_sam_reyes_all)
write_file(output + 'sam_reyes_all_test_labels.csv', testlabels)

write_index(output + 'sam_reyes_all_word_index.csv', word_index_sam_reyes_all)

## Combination of SemEval and Ravi

In [8]:
TRAIN_SAMEVAL = path + 'SemEval2018-T3-train-taskA_emoji_ironyHashtags.txt'
TEST_SAMEVAL = path + 'SemEval2018-T3_gold_test_taskA_emoji.txt' 
RAVIIRONY = path + 'Ravi_finalpos499.txt'
RAVINONIRONY = path + 'Ravi_finalNeg2498.csv'

samevalcorpustrain = read_data_for_combination(TRAIN_SAMEVAL, 2, 1, 'tweet index', '\t', True)
samevalcorpustest = read_data_for_combination (TEST_SAMEVAL, 2, 1, 'tweet index', '\t', True )
ravi_irony = read_data_for_combination(RAVIIRONY,0, 1, '#+no header+#', '\n', False )
ravi_non_irony = read_data_for_combination(RAVINONIRONY,0, 0, '#+no header+#', '\n', False )

In [9]:
corpuslist = []
corpuslist.append(samevalcorpustrain)
corpuslist.append(samevalcorpustest)
corpuslist.append(ravi_irony)
corpuslist.append(ravi_non_irony)

train, test = concatenate_train_test(corpuslist)

print('Number train data: '+str(len(train)))
print('Number test data: '+str(len(test)))

traindata, trainlabels = split_text_label(train)
testdata, testlabels = split_text_label(test)

print('Label distribution train data: '+str(Counter(trainlabels)))
print('Label distribution test data: '+str(Counter(testlabels)))

Number train data: 6092
Number test data: 1523
Label distribution train data: Counter({0: 3957, 1: 2135})
Label distribution test data: Counter({0: 937, 1: 586})


In [10]:
#Preprocess data
vocab_sam_ravi = Counter()
tweets_sam_ravi = process_tweets(traindata + testdata, vocab_sam_ravi)

In [11]:
# Create train data, test data and dictionary
train_data_sam_ravi, test_data_sam_ravi, word_index_sam_ravi = create_train_test_wordindex(vocab_sam_ravi, tweets_sam_ravi, traindata)

Maximal Sequence Length: 1233


In [12]:
# Store train data, test data and word index for the combination of samEval and ravi corpus
write_file(output + 'sam_ravi_train_data.csv', train_data_sam_ravi)
write_file(output + 'sam_ravi_train_labels.csv', trainlabels)

write_file(output + 'sam_ravi_test_data.csv', test_data_sam_ravi)
write_file(output + 'sam_ravi_test_labels.csv', testlabels)

write_index(output + 'sam_ravi_word_index.csv', word_index_sam_ravi)

## Combine SemEval and IronITA

In [7]:
TRAIN_SAMEVAL = path + 'SemEval2018-T3-train-taskA_emoji_ironyHashtags.txt'
TEST_SAMEVAL = path + 'SemEval2018-T3_gold_test_taskA_emoji.txt' 

TRAIN_IRONITA = path + 'training_ironita2018.csv'
TEST_IRONITA  = path + 'test_gold_ironita2018.csv'

samevalcorpustrain = read_data_for_combination(TRAIN_SAMEVAL, 2, 1, 'tweet index', '\t', True)
samevalcorpustest = read_data_for_combination (TEST_SAMEVAL, 2, 1, 'tweet index', '\t', True )
ironitacorpustrain = read_data_for_combination(TRAIN_IRONITA,1, 2, 'id', '\t', True )
ironitacorpustest = read_data_for_combination(TEST_IRONITA,1, 2, 'id','\t', True  )

In [8]:
corpuslist = []
corpuslist.append(samevalcorpustrain)
corpuslist.append(samevalcorpustest)
corpuslist.append(ironitacorpustrain)
corpuslist.append(ironitacorpustest)

train, test = concatenate_train_test(corpuslist)

print('Number train data: '+str(len(train)))
print('Number test data: '+str(len(test)))

traindata, trainlabels = split_text_label(train)
testdata, testlabels = split_text_label(test)

print('Label distribution train data: '+str(Counter(trainlabels)))
print('Label distribution test data: '+str(Counter(testlabels)))

Number train data: 7573
Number test data: 1894
Label distribution train data: Counter({0: 3791, 1: 3782})
Label distribution test data: Counter({0: 996, 1: 898})


In [9]:
#Preprocess data
vocab_sam_ironita = Counter()
tweets_sam_ironita = process_tweets(traindata + testdata, vocab_sam_ironita)

In [10]:
# Create train data, test data and dictionary
train_data_sam_ironita, test_data_sam_ironita, word_index_sam_ironita = create_train_test_wordindex(vocab_sam_ironita, tweets_sam_ironita, traindata)

Maximal Sequence Length: 40


In [11]:
# Store train data, test data and word index for the combination of samEval and IronITA
write_file(output + 'sam_ironita_train_data.csv', train_data_sam_ironita)
write_file(output + 'sam_ironita_train_labels.csv', trainlabels)

write_file(output + 'sam_ironita_test_data.csv', test_data_sam_ironita)
write_file(output + 'sam_ironita_test_labels.csv', testlabels)

write_index(output + 'sam_ironita_word_index.csv', word_index_sam_ironita)

## Combine SemEval and German data set

###### The test data are a combination of the SemEval and German corpora

In [30]:
TRAIN_SAMEVAL = path + 'SemEval2018-T3-train-taskA_emoji.txt'
TEST_SAMEVAL = path + 'SemEval2018-T3_gold_test_taskA_emoji.txt' 
DATA_TETZNER = path + 'GermanIronyCorpus_cleaned.txt'

samevalcorpustrain = read_data_for_combination(TRAIN_SAMEVAL, 2, 1, 'tweet index', '\t', True)
samevalcorpustest = read_data_for_combination (TEST_SAMEVAL, 2, 1, 'tweet index', '\t', True )
germancorpus = read_data_for_combination(DATA_TETZNER,0, 4, '#+no header+#', '\t', True )

In [31]:
corpuslist = []
corpuslist.append(samevalcorpustrain)
corpuslist.append(samevalcorpustest)
corpuslist.append(germancorpus)

train, test = concatenate_train_test(corpuslist)

print('Number train data: '+str(len(train)))
print('Number test data: '+str(len(test)))

traindata, trainlabels = split_text_label(train)
testdata, testlabels = split_text_label(test)

print('Label distribution train data: '+str(Counter(trainlabels)))
print('Label distribution test data: '+str(Counter(testlabels)))

Number train data: 7134
Number test data: 1784
Label distribution train data: Counter({0: 4518, 1: 2616})
Label distribution test data: Counter({0: 1091, 1: 693})


In [32]:
#Preprocess data
vocab_sam_german = Counter()
tweets_sam_german = process_tweets(traindata + testdata, vocab_sam_german)

In [33]:
# Create train data, test data and dictionary
train_data_sam_german, test_data_sam_german, word_index_sam_german = create_train_test_wordindex(vocab_sam_german, tweets_sam_german, traindata)

Maximal Sequence Length: 73


In [48]:
# Store train data, test data and word index for the combination of SemEval and the German corpus
write_file(output + 'sam_german_train_data.csv', train_data_sam_german)
write_file(output + 'sam_german_train_labels.csv', trainlabels)

write_file(output + 'sam_german_test_data.csv', test_data_sam_german)
write_file(output + 'sam_german_test_labels.csv', testlabels)

write_index(output + 'sam_german_word_index.csv', word_index_sam_german)