## environment prep

In [194]:
!pip install transformers



In [195]:
import torch
# import torch_xla
# import torch_xla.core.xla_model as xm
import string
import pandas as pd
import re
from torch.utils.data import TensorDataset, random_split, DataLoader
from transformers import AutoTokenizer, BertForSequenceClassification, AdamW, BertConfig, get_linear_schedule_with_warmup
import numpy as np
import random

In [196]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data prep

In [197]:
text = u'This is a smiley face \U0001f602'
print(text) # with emoji

def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

print(deEmojify(text))

This is a smiley face üòÇ
This is a smiley face 


https://stackoverflow.com/questions/33404752/removing-emojis-from-a-string-in-python

In [198]:
def text_preprocess(line):
  line = line.translate(str.maketrans('', '', string.punctuation))
  line = line.replace('anonymizedaccount', '')
  line = line.replace('\n', '')
  line = deEmojify(line)
  return line.lower()
  

In [199]:
def create_dataset(txt_path, labels_path):
  dict = {}
  dict['texts'] = []
  dict['labels'] = []
  with open(txt_path) as txt_file:
    for line in txt_file.readlines():
      line = text_preprocess(line)
      
      dict['texts'].append(line)
  with open(labels_path) as labels_files:
    for label in labels_files.readlines():
    
      dict['labels'].append(int(label))
  df = pd.DataFrame(dict)
  return df


In [202]:
df = create_dataset('/content/drive/MyDrive/CyberbullyingDetection/task1/training_set_clean_only_text.txt', '/content/drive/MyDrive/CyberbullyingDetection/task1/training_set_clean_only_tags.txt')

In [203]:
df.head()

Unnamed: 0,texts,labels
0,dla mnie faworytem do tytu≈Çu bƒôdzie cracovia z...,0
1,brawo ty daria kibic ma byƒá na dobre i z≈Çe,0
2,super polski premier sk≈Çada kwiaty na grobac...,0
3,musi innej drogi nie mamy,0
4,odrzut natychmiastowy kwa≈õna mina mam problem,0


In [204]:
df = df.sample(frac = 1)

## model bert for sequence classification

In [None]:
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")

def tensor_dataset(sentences, labels):

  input_ids = []
  attention_masks = []


  for sent in sentences:
   
      encoded_dict = tokenizer.encode_plus(
                          sent,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = 64,           # Pad & truncate all sentences.
                          pad_to_max_length = True,
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'pt',     # Return pytorch tensors.
                     )

         
      input_ids.append(encoded_dict['input_ids'])

      attention_masks.append(encoded_dict['attention_mask'])

  # Convert the lists into tensors.
  input_ids = torch.cat(input_ids, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  labels = torch.tensor(labels)
  dataset = TensorDataset(input_ids, attention_masks, labels)
  return dataset

In [None]:
dataset = tensor_dataset(df['texts'], df['labels'])

In [14]:
val_size = int(0.2 * len(dataset))
test_size = val_size
train_size = (len(dataset) - val_size - test_size)
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased').to(device)

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [None]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

In [None]:
epochs = 2
train_dataloader = DataLoader(train_dataset)
validation_dataloader = DataLoader(val_dataset)
test_dataloader = DataLoader(test_dataset)
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, 
                                            num_training_steps = total_steps)

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# We'll store a number of quantities such as training and validation loss, 
# validation accuracy, and timings.
training_stats = []


for epoch_i in range(0, epochs):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    total_train_loss = 0


    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)


        model.zero_grad()        

        result = model(b_input_ids, 
                       token_type_ids=None, 
                       attention_mask=b_input_mask, 
                       labels=b_labels,
                       return_dict=True)

        loss = result.loss
        logits = result.logits

        total_train_loss += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)            


    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))

      

    print("")
    print("Running Validation...")

    model.eval()

    total_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0

    for batch in validation_dataloader:
        
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():        

            result = model(b_input_ids, 
                           token_type_ids=None, 
                           attention_mask=b_input_mask,
                           labels=b_labels,
                           return_dict=True)

        loss = result.loss
        logits = result.logits
            
        total_eval_loss += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        total_eval_accuracy += flat_accuracy(logits, label_ids)
        

    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    
    print("  Validation Loss: {0:.2f}".format(avg_val_loss))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            
        }
    )

print("")
print("Training complete!")



Training...

  Average training loss: 0.48

Running Validation...
  Accuracy: 0.91
  Validation Loss: 0.53

Training...

  Average training loss: 0.47

Running Validation...
  Accuracy: 0.91
  Validation Loss: 0.49

Training complete!


## nn model in keras for task 1 

In [205]:
import tensorflow as tf
import numpy as np

In [206]:
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased", 
                                          
                                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                                          max_length = 64,           # Pad & truncate all sentences.
                                          pad_to_max_length = True,
                                          return_attention_mask = True,   # Construct attn. masks.
                                          return_tensors = 'tf',
                                          )


In [207]:
tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")

def input_bert(sentences):

  input_ids = []


  for sent in sentences:
   
      encoded_dict = tokenizer.encode_plus(
                          sent,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          max_length = 64,           # Pad & truncate all sentences.
                          pad_to_max_length = True,
                          return_attention_mask = True,   # Construct attn. masks.
                          return_tensors = 'tf',     # Return pytorch tensors.
                     )
      
      i = np.array(encoded_dict['input_ids'])
      print(i.shape)
      i = i.flatten()
      input_ids.append(i)

  
  return input_ids

In [208]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
    char_level=False,
    oov_token=None,
    document_count=0,
)

tokenizer.fit_on_texts(df['texts'])
# encoded = tokenizer.texts_to_sequences(df['texts'])

In [209]:
def input_keras(sentences):
  input_ids = []
  for lst in tokenizer.texts_to_sequences(sentences):
    lst = np.array(lst)
    lst = lst.flatten()

    input_ids.append(lst)
  input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids)
   
  return input_ids 

In [214]:
# x = input_bert(df['texts'])  
x = input_keras(df['texts'])
y = df['labels']
x_array = np.array(x)
print(x_array.shape)
y_array = np.array(y)
# print(y_array.shape)

(10041, 29)


In [215]:
from sklearn.model_selection import train_test_split

x_train, x_rem, y_train, y_rem = train_test_split(x_array, y_array, train_size=0.8)
x_valid, x_test, y_valid, y_test = train_test_split(x_rem, y_rem, test_size=0.5)

In [216]:
train_true = list(y_train).count(1)
train_false = list(y_train).count(0)
train_count = train_true/train_false
valid_true = list(y_valid).count(1)
valid_false = list(y_valid).count(0)
valid_count = valid_true/valid_false
test_true = list(y_test).count(1)
test_false = list(y_test).count(0)
test_count = test_true/test_false
print(train_count, ' ', valid_count, ' ', test_count)


0.09353301565690947   0.09368191721132897   0.08414239482200647


In [217]:
gpus = tf.config.list_logical_devices('GPU')
print(gpus)
print(gpus[0].name)
device = tf.device(gpus[0].name)

[LogicalDevice(name='/device:GPU:0', device_type='GPU')]
/device:GPU:0


In [218]:
vocab_len = len(tokenizer.word_index)  # for keras tokenizer
# vocab_len = len(tokenizer.get_vocab())     #for bert tokenizer

model = tf.keras.Sequential([
    # encoder,
    tf.keras.layers.Embedding(
        input_dim=vocab_len,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

In [219]:
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.BinaryCrossentropy()
# metrics = tf.keras.metrics.BinaryAccuracy()
metrics = [tf.keras.metrics.FalseNegatives(), tf.keras.metrics.BinaryAccuracy()]

model.compile(optimizer=opt, loss=loss, metrics=metrics)
history = model.fit(x_train, y_train, epochs=3, validation_data=(x_valid, y_valid))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [223]:
print("Evaluate on test data")
results = model.evaluate(x_test, y_test, batch_size=128)
print("test loss, test acc:", results)

Evaluate on test data
test loss, test acc: [0.3729604482650757, 48.0, 0.902487576007843]


In [224]:
predictions = model.predict(x_test)
print(predictions[:10])

[[2.4709408e-04]
 [7.2311186e-06]
 [1.9983359e-02]
 [3.3694605e-04]
 [2.8943452e-03]
 [3.1133395e-06]
 [6.9879908e-03]
 [3.9171995e-04]
 [9.4164526e-01]
 [2.0697504e-05]]


In [225]:
from sklearn.metrics import precision_recall_fscore_support

predictions = [int(x) for x in predictions]
stats = precision_recall_fscore_support(y_test, predictions, average='macro')
print('precision ', stats[0])
print('recall ', stats[1])
print('fscore ', stats[2])

precision  0.46119402985074626
recall  0.5
fscore  0.47981366459627334


  _warn_prf(average, modifier, msg_start, len(result))


## keras model for task 2

In [227]:
df2 = create_dataset('/content/drive/MyDrive/CyberbullyingDetection/task2/training_set_clean_only_text.txt', '/content/drive/MyDrive/CyberbullyingDetection/task2/training_set_clean_only_tags.txt')

In [228]:
df2.head()

Unnamed: 0,texts,labels
0,dla mnie faworytem do tytu≈Çu bƒôdzie cracovia z...,0
1,brawo ty daria kibic ma byƒá na dobre i z≈Çe,0
2,super polski premier sk≈Çada kwiaty na grobac...,0
3,musi innej drogi nie mamy,0
4,odrzut natychmiastowy kwa≈õna mina mam problem,0


In [231]:
!pip install keras.utils

Collecting keras.utils
  Downloading keras-utils-1.0.13.tar.gz (2.4 kB)
Building wheels for collected packages: keras.utils
  Building wheel for keras.utils (setup.py) ... [?25l[?25hdone
  Created wheel for keras.utils: filename=keras_utils-1.0.13-py3-none-any.whl size=2656 sha256=dc26ccc0b3c0dce6809055cd53ab9527c8fa6c2ac3b74eeb234b04b47406affb
  Stored in directory: /root/.cache/pip/wheels/d0/dd/3b/493952a5240d486a83805d65360dedadbadeae71d25e2c877f
Successfully built keras.utils
Installing collected packages: keras.utils
Successfully installed keras.utils-1.0.13


In [234]:
from tensorflow.keras.utils import to_categorical
# define example
data = df2['labels']
data = array(data)
# print(data)
# one hot encode
encoded = to_categorical(data)
print(encoded)


[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


In [237]:
tokenizer.fit_on_texts(df2['texts'])
x = input_keras(df2['texts'])

x_array = np.array(x)
print(x_array.shape)
y_array = np.array(encoded)

(10041, 29)


In [239]:
x_train, x_rem, y_train, y_rem = train_test_split(x_array, y_array, train_size=0.8)
x_valid, x_test, y_valid, y_test = train_test_split(x_rem, y_rem, test_size=0.5)

In [255]:
vocab_len = len(tokenizer.word_index)  # for keras tokenizer
# vocab_len = len(tokenizer.get_vocab())     #for bert tokenizer

model = tf.keras.Sequential([
    # encoder,
    tf.keras.layers.Embedding(
        input_dim=vocab_len,
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(16),

    tf.keras.layers.Dense(3, activation='sigmoid')
])

In [260]:
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
loss = tf.keras.losses.CategoricalCrossentropy()
metrics = tf.keras.metrics.CategoricalAccuracy()

model.compile(optimizer=opt, loss=loss, metrics=metrics)
history = model.fit(x_train, y_train, epochs=5, validation_data=(x_valid, y_valid))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [261]:
print("Evaluate on test data")
results = model.evaluate(x_test, y_test, batch_size=128)
print("test loss, test acc:", results)

Evaluate on test data
test loss, test acc: [1.9136103391647339, 0.8716418147087097]


In [264]:
predictions = model.predict(x_test)
predictions = [[int(x) for x in pred] for pred in predictions]

In [266]:
stats = precision_recall_fscore_support(y_test, predictions, average='macro')

print('macro')
print('fscore ', stats[2])

stats = precision_recall_fscore_support(y_test, predictions, average='micro')

print('micro')
print('fscore ', stats[2])

macro
fscore  0.4879788339176731
micro
fscore  0.729064039408867
