In [1]:
import pandas as pd
import numpy as np
from keras.models import Model
from keras.layers import Dense, Embedding, Input
from keras.layers import LSTM, Bidirectional, GlobalMaxPool1D, Dropout
from keras.preprocessing import text, sequence
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to /Users/mobby/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import re

def  clean_text(df, text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    return df

In [3]:
def remove_emoji(df, text_field):
    
    #processed = sentence.decode('utf-8')
    
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)
    df[text_field] = df[text_field].apply(lambda elem: emoji_pattern.sub(r'', elem))  

    
    return df

def replace_users(df, text_field):
    df[text_field] = df[text_field].apply(lambda elem: re.sub('(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+)','@USER', elem))  
    return df

def replace_url(df, text_field):
    df[text_field] = df[text_field].apply(lambda elem: re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','website', elem))  
    return df

def remove_white(df, text_field):
    
    df[text_field] = df[text_field].apply(lambda elem: elem.replace(r'[^\w\d\s]',' '))
    df[text_field] = df[text_field].apply(lambda elem: elem.replace(r'\s+', ' '))
    df[text_field] = df[text_field].apply(lambda elem: elem.replace(r'^\s+|\s+?$',''))
    df[text_field] = df[text_field].apply(lambda elem: elem.replace('\n',''))
    return df

def remove_stop(df, text_field): #remove stop words
    
    stop_words = set(stopwords.words('english'))
    df[text_field] = df[text_field].apply(lambda elem: elem.apply(lambda x: ' '.join(term for term in x.split() is term not in stop_words)))    
    return df

def remove_stems(sentence):
    
    ps = nltk.PorterStemmer()
    processed = processed.apply(lambda x: ' '.join(ps.stem(term) for term in x.split()))    
    return(processed)


def lower_df(df, text_field):
    df[text_field] = df[text_field].str.lower()
    return df

def preprocessing(df, text_field):
    df = remove_emoji(df, text_field)
    df = replace_users(df, text_field)
    df = replace_url(df, text_field)
    df = remove_white(df, text_field)
    df = lower_df(df, text_field)
    return df

In [4]:
dataset = pd.read_csv('offenseval-training-v1.tsv', sep='\t')
df_test = pd.read_csv('offenseval-trial.txt',sep="\t", header=0, names = ['tweet','subtask_a','subtask_b','subtask_c'])


In [5]:
df_test['subtask_b'].value_counts()

UNT    39
TIN    34
TTH     4
Name: subtask_b, dtype: int64

In [6]:
dataoff = dataset[dataset.subtask_a=='OFF']
dataoff_test = df_test[df_test.subtask_a == 'OFF']

In [7]:
dataoff = preprocessing(dataoff, 'tweet')
dataoff_test = preprocessing(dataoff_test, 'tweet')
dataoff = clean_text(dataoff, 'tweet')
dataoff_test = clean_text(dataoff_test, 'tweet')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stabl

In [8]:
dataoff_test = dataoff_test[dataoff_test.subtask_b != 'TTH']

In [9]:
datalength = dataoff.shape[0]
datatest_length = dataoff_test.shape[0]

In [10]:
dataoff.reset_index(inplace=True)
dataoff_test.reset_index(inplace=True)

In [11]:
dico_target = {'UNT': 0, 'TIN': 1}
dataoff['target'] = dataoff['tweet'].copy()
for i in range(datalength):
    dataoff.at[i, 'target'] = dico_target[dataoff.at[i, 'subtask_b']]
    
dataoff_test['target'] = dataoff_test['tweet'].copy()
for i in range(datatest_length):
    dataoff_test.at[i, 'target'] = dico_target[dataoff_test.at[i, 'subtask_b']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
def get_tokenized_corpus(corpus):
    tokenized_corpus = []
    for sentence in corpus:
        tokenized_sentence = []
        for token in sentence.split(' '): 
            tokenized_sentence.append(token)
        tokenized_corpus.append(tokenized_sentence)
 
    return tokenized_corpus


In [13]:
dataset = dataoff
df_test = dataoff_test

In [14]:
corpus_train = dataset['tweet'].tolist()
corpus_test = df_test['tweet'].tolist()
stop_words = set(stopwords.words('english'))

tokenized_corpus_train = get_tokenized_corpus(corpus_train)
filtered_corpus_train = []

for sentence in tokenized_corpus_train :
    new_sentence = []
    for word in sentence: 
        if word not in stop_words and word!='': #remove stop words and empty words
            new_sentence.append(word)       
    filtered_corpus_train.append(new_sentence)
    
tokenized_corpus_test = get_tokenized_corpus(corpus_test)
filtered_corpus_test = []

for sentence in tokenized_corpus_test :
    new_sentence = []
    for word in sentence: 
        if word not in stop_words and word!='': #remove stop words and empty words
            new_sentence.append(word)       
    filtered_corpus_test.append(new_sentence)
    
ps = nltk.PorterStemmer()
stemmed_corpus_train = []
for sentence in filtered_corpus_train :
    new_sentence = []
    for word in sentence: 
        new_sentence.append(ps.stem(word)) #stem words      
    stemmed_corpus_train.append(new_sentence)
    
stemmed_corpus_test = []
for sentence in filtered_corpus_test:
    new_sentence = []
    for word in sentence: 
        new_sentence.append(ps.stem(word)) #stem words      
    stemmed_corpus_test.append(new_sentence)
    
    


In [15]:
for i in range(len(stemmed_corpus_train)):
    sentence = ''
    sentence_list = stemmed_corpus_train[i]
    for word in sentence_list:
        sentence += word
        sentence += ' '
    dataset.at[i, 'tweet'] = sentence
    
for i in range(len(stemmed_corpus_test)):
    sentence = ''
    sentence_list = stemmed_corpus_test[i]
    for word in sentence_list:
        sentence += word
        sentence += ' '
    df_test.at[i, 'tweet'] = sentence

In [17]:
X_train = dataset['tweet']
y_train = dataset['target']
X_test = df_test['tweet']
y_test = df_test['target']

In [18]:
#Handle imbalanced dataset
from imblearn.over_sampling import RandomOverSampler

print(np.shape(X_train))

ros = RandomOverSampler(random_state=0)
X_train = X_train.reshape(-1,1)
X_train, y_train = ros.fit_resample(X_train, y_train)

(4400,)


AttributeError: 'Series' object has no attribute 'reshape'

In [18]:
X_train = X_train.reshape(X_train.shape[0])
X_train = pd.Series(X_train)

In [19]:
max_features = 11000
maxlen = 100
list_train = X_train.values
list_test = X_test.values
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [20]:
def compute_recall_precision_f1(confusion_matrix):
    true_pred = 0
    total_pred = 0
    total_f1 = 0
    for i in range(len(confusion_matrix)):
        true_pos = confusion_matrix[i][i]
        true_pred += true_pos
        total_pred += sum(confusion_matrix[i])
        false_neg = 0
        false_pos = 0
        for j in range(len(confusion_matrix)):
            if j!=i:
                false_neg += confusion_matrix[i][j]
                false_pos += confusion_matrix[j][i]
        recall = true_pos / (true_pos + false_neg)
        precision = true_pos / (true_pos + false_pos)
        f1 = 2*(precision * recall)/(precision + recall)
        total_f1 += f1
        print("==========================")
        print("For class ", i, " : ")
        print()
        print("Precision : ", precision)
        print("Recall : ", recall)
        print("F1 : ", f1)
        print("==========================")
    print("Classification Rate : ", true_pred/total_pred)
    print("Macro Average Classification Rate : ", total_f1/len(confusion_matrix))


In [21]:
def confusion_matrix(y_true, y_pred):
    conf_matrix = np.zeros((2,2))
    for i in range(len(y_true)):
        conf_matrix[y_true[i],y_pred[i]] += 1
    return conf_matrix

In [22]:
def get_model():
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.05)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.05)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    adam_optim = Adam(lr=0.00009, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.001, amsgrad=False)
    model.compile(loss='binary_crossentropy',
                  optimizer=adam_optim,
                  metrics=['accuracy'])

    return model

In [23]:
model = get_model()
batch_size = 64
epochs = 4

In [24]:
model.fit(X_t, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

Train on 6976 samples, validate on 776 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f7ca314ceb8>

In [25]:
y_pred = model.predict(X_te)
y_pred = pd.Series(y_pred.round().astype(int).reshape(y_pred.shape[0]))


In [26]:
y_test = y_test.reset_index(drop=True)

In [27]:
conf_mat = confusion_matrix(y_test, y_pred)

In [28]:
conf_mat

array([[12., 27.],
       [ 3., 31.]])

In [29]:
compute_recall_precision_f1(conf_mat)

For class  0  : 

Precision :  0.8
Recall :  0.3076923076923077
F1 :  0.4444444444444444
For class  1  : 

Precision :  0.5344827586206896
Recall :  0.9117647058823529
F1 :  0.6739130434782609
Classification Rate :  0.589041095890411
Macro Average Classification Rate :  0.5591787439613527


# Compute prediction for submission

In [30]:
data_submission = pd.read_csv('testset-taskb.tsv', sep='\t')
datalength = data_submission.shape[0]

In [31]:
data_submission = preprocessing(data_submission, 'tweet')
data_submission = clean_text(data_submission, 'tweet')

In [32]:
corpus_submission = data_submission['tweet'].tolist()


In [33]:
tokenized_corpus_submission = get_tokenized_corpus(corpus_submission)
filtered_corpus_submission = []

for sentence in tokenized_corpus_submission :
    new_sentence = []
    for word in sentence: 
        if word not in stop_words and word!='': #remove stop words and empty words
            new_sentence.append(word)       
    filtered_corpus_submission.append(new_sentence)
    
ps = nltk.PorterStemmer()
stemmed_corpus_submission = []
for sentence in filtered_corpus_submission :
    new_sentence = []
    for word in sentence: 
        new_sentence.append(ps.stem(word)) #stem words      
    stemmed_corpus_submission.append(new_sentence)
    

In [34]:
for i in range(len(stemmed_corpus_submission)):
    sentence = ''
    sentence_list = stemmed_corpus_submission[i]
    for word in sentence_list:
        sentence += word
        sentence += ' '
    data_submission.at[i, 'tweet'] = sentence
    


In [35]:
X_test = data_submission['tweet']
list_test = X_test.values
list_tokenized_test = tokenizer.texts_to_sequences(list_test)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [36]:
y_pred = model.predict(X_te)
y_pred = pd.Series(y_pred.round().astype(int).reshape(y_pred.shape[0]))

In [37]:
data_submission['prediction'] = y_pred

In [38]:
data_submission = data_submission.drop(['tweet'], axis=1)

In [39]:
dico_target = {0:'UNT', 1:'TIN'}
for i in range(datalength):
    data_submission.at[i, 'target'] = dico_target[data_submission.at[i, 'prediction']]

In [40]:
data_submission = data_submission.drop(['prediction'], axis=1)

In [41]:
data_submission.to_csv('subpart_b.csv', index=False)

# Part 3

In [42]:
dataset = dataset[dataset.subtask_b == 'TIN']
df_test = df_test[df_test.subtask_b == 'TIN']

In [43]:
df_test = df_test[df_test.subtask_c != 'ORG']

In [44]:
X_train = dataset['tweet']
y_train = dataset['subtask_c']
X_test = df_test['tweet']
y_test = df_test['subtask_c']

In [45]:
X_train_ = X_train[:3300]
y_train_ = y_train[:3300]
X_test = X_train[3300:]
y_test = y_train[3300:]

In [46]:
pd.Series(y_test).value_counts()

IND    357
GRP    163
OTH     56
Name: subtask_c, dtype: int64

In [47]:
X_train = X_train_
y_train = y_train_

In [48]:
#Handle imbalanced dataset

from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)
X_train = X_train.reshape(-1,1)
X_train, y_train = ros.fit_resample(X_train, y_train)

  


In [49]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [50]:
n_values = np.max(y_train) + 1
y_train = np.eye(n_values)[y_train]


In [51]:
X_train = X_train.reshape(X_train.shape[0])
X_train = pd.Series(X_train)

In [52]:
max_features = 11000
maxlen = 100
list_train = X_train.values
list_test = X_test.values
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [53]:
def get_model():
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.05)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.05)(x)
    x = Dense(3, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    adam_optim = Adam(lr=0.00029, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0012, amsgrad=False)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam_optim,
                  metrics=['accuracy'])

    return model

In [54]:
model = get_model()
batch_size = 128
epochs = 6

In [55]:
model.fit(X_t, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

Train on 5535 samples, validate on 615 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f7ca0078860>

In [56]:
y_pred = model.predict(X_te)
y_pred = [np.argmax(y_pred[i]) for i in range(len(y_pred))]

In [57]:
from sklearn.metrics import confusion_matrix

conf_mat = confusion_matrix(y_test, y_pred)

In [58]:
conf_mat

array([[ 80,  56,  27],
       [ 70, 259,  28],
       [ 23,  22,  11]])

In [59]:
compute_recall_precision_f1(conf_mat)

For class  0  : 

Precision :  0.4624277456647399
Recall :  0.49079754601226994
F1 :  0.47619047619047616
For class  1  : 

Precision :  0.7685459940652819
Recall :  0.7254901960784313
F1 :  0.7463976945244958
For class  2  : 

Precision :  0.16666666666666666
Recall :  0.19642857142857142
F1 :  0.180327868852459
Classification Rate :  0.6076388888888888
Macro Average Classification Rate :  0.4676386798558103


# Compute prediction for submission

In [79]:
# First we will train on the whole dataset

X_train = dataset['tweet']
y_train = dataset['subtask_c']

In [102]:
def preprocessing(df, text_field):
    df = remove_emoji(df, text_field)
    df = replace_users(df, text_field)
    df = replace_url(df, text_field)
    df = remove_white(df, text_field)
    df = lower_df(df, text_field)
    return df

In [96]:
data_submission = pd.read_csv('test_set_taskc.tsv', sep='\t')


In [112]:
data_submission = pd.read_csv('test_set_taskc.tsv', sep='\t')
datalength = data_submission.shape[0]
data_submission = preprocessing(data_submission, 'tweet')
data_submission = clean_text(data_submission, 'tweet')
corpus_submission = data_submission['tweet'].tolist()

tokenized_corpus_submission = get_tokenized_corpus(corpus_submission)
filtered_corpus_submission = []

for sentence in tokenized_corpus_submission :
    new_sentence = []
    for word in sentence: 
        if word not in stop_words and word!='': #remove stop words and empty words
            new_sentence.append(word)       
    filtered_corpus_submission.append(new_sentence)
    
ps = nltk.PorterStemmer()
stemmed_corpus_submission = []
for sentence in filtered_corpus_submission :
    new_sentence = []
    for word in sentence: 
        new_sentence.append(ps.stem(word)) #stem words      
    stemmed_corpus_submission.append(new_sentence)
    
for i in range(len(stemmed_corpus_submission)):
    sentence = ''
    sentence_list = stemmed_corpus_submission[i]
    for word in sentence_list:
        sentence += word
        sentence += ' '
    data_submission.at[i, 'tweet'] = sentence
    
    
X_test = data_submission['tweet']

    

In [82]:
len(stemmed_corpus_submission)

213

In [83]:
#Handle imbalanced dataset

from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)
X_train = X_train.reshape(-1,1)
X_train, y_train = ros.fit_resample(X_train, y_train)

  


In [84]:
from sklearn import preprocessing


le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train = le.transform(y_train)
n_values = np.max(y_train) + 1
y_train = np.eye(n_values)[y_train]
X_train = X_train.reshape(X_train.shape[0])
X_train = pd.Series(X_train)

In [85]:
max_features = 11000
maxlen = 100
list_train = X_train.values
list_test = X_test.values
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_test)
X_t = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [86]:
def get_model():
    embed_size = 128
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size)(inp)
    x = Bidirectional(LSTM(50, return_sequences=True))(x)
    x = GlobalMaxPool1D()(x)
    x = Dropout(0.05)(x)
    x = Dense(50, activation="relu")(x)
    x = Dropout(0.05)(x)
    x = Dense(3, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    adam_optim = Adam(lr=0.00029, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0012, amsgrad=False)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam_optim,
                  metrics=['accuracy'])

    return model

In [87]:
model = get_model()
batch_size = 128
epochs = 6

In [88]:
model.fit(X_t, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

Train on 6498 samples, validate on 723 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f7c5d747e80>

In [106]:
y_pred = model.predict(X_te)
y_pred = [np.argmax(y_pred[i]) for i in range(len(y_pred))]

In [107]:
data_submission['prediction'] = y_pred
data_submission = data_submission.drop(['tweet'], axis=1)
dico_target = {0:'GRP', 1:'IND', 2:'OTH'}
for i in range(datalength):
    data_submission.at[i, 'target'] = dico_target[data_submission.at[i, 'prediction']]

In [108]:
data_submission = data_submission.drop(['prediction'], axis=1)
data_submission.to_csv('subpart_c.csv', index=False)

In [109]:
dataset['subtask_c'].value_counts()

IND    2407
GRP    1074
OTH     395
Name: subtask_c, dtype: int64

In [114]:
type(data_submission['id'][0])

numpy.int64

In [100]:
datatest

Unnamed: 0,id,target
0,15923,GRP
1,60133,GRP
2,83681,IND
3,65507,IND
4,34263,IND
5,49139,IND
6,58995,IND
7,88490,GRP
8,46444,IND
9,60587,IND


In [99]:
datatest = pd.read_csv('subpart_c.csv')