In [1]:
# !pip install pandas
# !pip install scikit-learn
# !pip install emoji

In [20]:
import os
import re
import numpy as np
import pandas as pd
import sklearn
from sklearn.utils import shuffle

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report

from nltk.tokenize import word_tokenize
from scipy.sparse import csr_matrix
from collections import Counter

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import Embedding,Dense,Dropout,Bidirectional,GlobalMaxPool1D,GlobalAveragePooling1D, SpatialDropout1D,Input,Conv1D,MaxPooling1D,Flatten
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.initializers import Constant
from sklearn.utils import class_weight
import tensorflow as tf

import emoji

import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import string
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
df = pd.read_csv("data/OLIDv1/olid-training-v1.0.tsv", sep='\t')
df.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
1,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
2,16820,Amazon is investigating Chinese employees who ...,NOT,,
3,62688,"@USER Someone should'veTaken"" this piece of sh...",OFF,UNT,
4,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,


In [4]:
# Reference: https://stackoverflow.com/questions/9662346/python-code-to-remove-html-tags-from-a-string
CLEANR = re.compile('<.*?>') 
def cleanhtml(raw_html):
    cleantext = re.sub(CLEANR, '', raw_html)
    return cleantext

def preprocess(sent):
    alphabet = "abcdefghijklmnopqrstuvwxyz 0123456789',."
    sent = sent.lower() 
    sent = cleanhtml(sent) # remove html tags 
    cleaned_sent_list = [char if char in alphabet else ' ' for char in sent] # remove all tags not in the alphabet
    cleaned_sent = ''.join(cleaned_sent_list)
    cleaned_sent = cleaned_sent.replace("n't",' not') # replace words like "isn't" with "is not"
    cleaned_sent = ' . '.join([x for x in cleaned_sent.split('.') if len(x)>0]) # remove multiple periods, and add spaces before and after a period
    cleaned_sent = ' , '.join([x for x in cleaned_sent.split(',') if len(x)>0]) # add spaces before and after a comma
    cleaned_sent = ' '.join(cleaned_sent.split()) # remove multiple spaces
    return cleaned_sent

def print_metrics(y,y_p):
    print('Accuracy:',accuracy_score(y,y_p))
    print('Precision:',precision_score(y,y_p))
    print('Recall:',recall_score(y,y_p))
    print('F1 score:',f1_score(y,y_p))

In [33]:
## Run this block only for additional preprocessing steps

def emoji_to_text(s):
    s = emoji.demojize(s)
    s = s.replace(':',' ')
    s = s.replace('_',' ')    
    s = ' '.join(s.split())
    return s

# loading twitter slang data
slang_df = pd.read_csv('data/twitterSlang.csv')
slang_dict = dict(zip(slang_df.slang, slang_df.formal_translation))

def fix_slang(s):
    s_list = s.split()
    new_s_list = []
    for word in s_list:
        if word in slang_dict.keys():
            new_s_list.append(slang_dict[word])
        else:
            new_s_list.append(word)
            
    return ' '.join(new_s_list)

def preprocess(sent):
    alphabet = "abcdefghijklmnopqrstuvwxyz 0123456789',."
    
    sent = emoji_to_text(sent)
    sent = fix_slang(sent)
    sent = sent.lower() 
    sent = cleanhtml(sent) # remove html tags 
    cleaned_sent_list = [char if char in alphabet else ' ' for char in sent] # remove all tags not in the alphabet
    cleaned_sent = ''.join(cleaned_sent_list)
    cleaned_sent = cleaned_sent.replace("n't",' not') # replace words like "isn't" with "is not"
    cleaned_sent = ' . '.join([x for x in cleaned_sent.split('.') if len(x)>0]) # remove multiple periods, and add spaces before and after a period
    cleaned_sent = ' , '.join([x for x in cleaned_sent.split(',') if len(x)>0]) # add spaces before and after a comma
    cleaned_sent = ' '.join(cleaned_sent.split()) # remove multiple spaces
    return cleaned_sent

## Training

In [34]:
x = df.tweet
y = df.subtask_a.apply(lambda x: 1 if x=='OFF' else 0)

x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.3, random_state=42)

In [35]:
data_train = [preprocess(tweet) for tweet in x_train]
data_val = [preprocess(tweet) for tweet in x_val]

n_features = 500
vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1,3),max_features=n_features)
X_train = vectorizer.fit_transform(data_train)
X_val = vectorizer.transform(data_val)

In [36]:
X_train = X_train.toarray().reshape([-1,n_features,1])
X_val = X_val.toarray().reshape([-1,n_features,1])

In [37]:
def model_cnn(n_features=500):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Conv1D(filters=128, kernel_size=5, strides=5, activation='relu', input_shape=(n_features,1)))
    model.add(tf.keras.layers.Conv1D(filters=128, kernel_size=5, strides=5, activation='relu'))
    model.add(tf.keras.layers.Conv1D(filters=128, kernel_size=5, strides=5, activation='relu'))
    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(128, activation='relu'))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [38]:
model = model_cnn(500)
history = model.fit(X_train.reshape([-1,n_features,1]),y_train,
                    validation_data=(X_val.reshape([-1,n_features,1]),y_val),batch_size=256,epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [39]:
y_val_p = model.predict(X_val)
y_val_p = np.round(y_val_p).flatten()
print(classification_report(y_val, y_val_p))

              precision    recall  f1-score   support

           0       0.74      0.77      0.75      2639
           1       0.50      0.45      0.48      1333

    accuracy                           0.66      3972
   macro avg       0.62      0.61      0.61      3972
weighted avg       0.66      0.66      0.66      3972



In [40]:
print_metrics(y_val, y_val_p)

Accuracy: 0.6644008056394763
Precision: 0.5
Recall: 0.45461365341335336
F1 score: 0.4762278978388998


In [27]:
y_val_p = model.predict(X_val)
y_val_p = np.round(y_val_p).flatten()
print(classification_report(y_val, y_val_p))

              precision    recall  f1-score   support

           0       0.73      0.80      0.77      2639
           1       0.52      0.42      0.46      1333

    accuracy                           0.67      3972
   macro avg       0.63      0.61      0.61      3972
weighted avg       0.66      0.67      0.66      3972



In [28]:
print_metrics(y_val, y_val_p)

Accuracy: 0.6739677744209466
Precision: 0.5175276752767528
Recall: 0.42085521380345087
F1 score: 0.4642118328506413


## Testing 

In [41]:
df_test = pd.read_csv("data/OLIDv1/testset-levela.tsv", sep='\t').set_index('id')
y_test = pd.read_csv("data/OLIDv1/labels-levela.csv", names = ['id','subtask_a']).set_index('id')
df_test = df_test.join(y_test).reset_index()

In [42]:
x_test = df_test.tweet
y_test = df_test.subtask_a.apply(lambda x: 1 if x=='OFF' else 0)

In [43]:
data_test = [preprocess(tweet) for tweet in x_test]
X_test = vectorizer.transform(data_test)
X_test = X_test.toarray().reshape([-1,n_features,1])
y_test_p = model.predict(X_test)
y_test_p = np.round(y_test_p).flatten()

In [44]:
print(classification_report(y_test, y_test_p))

              precision    recall  f1-score   support

           0       0.80      0.82      0.81       620
           1       0.50      0.46      0.48       240

    accuracy                           0.72       860
   macro avg       0.65      0.64      0.64       860
weighted avg       0.71      0.72      0.72       860



In [32]:
print(classification_report(y_test, y_test_p))

              precision    recall  f1-score   support

           0       0.81      0.86      0.83       620
           1       0.56      0.47      0.51       240

    accuracy                           0.75       860
   macro avg       0.69      0.67      0.67       860
weighted avg       0.74      0.75      0.74       860

