In [1]:
import os
import re
import numpy as np
import pandas as pd
import sklearn
from sklearn.utils import shuffle

from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

from tensorboard.plugins.hparams import api as hp

from scipy.sparse import csr_matrix
from collections import Counter
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.utils import class_weight

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/saumyamehta/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/saumyamehta/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
olid_data = pd.read_csv("data/OLIDv1/olid-training-v1.0.tsv", sep='\t')


In [3]:
def preprocess(sent, lemmatizer, stop_words):
    
    alphabet = "abcdefghijklmnopqrstuvwxyz 0123456789',."
    sent = sent.lower()
    sent = sent.replace('@user','')
    sent = sent.replace('@[\w\-]+','')

    cleaned_sent_list = [char if char in alphabet else ' ' for char in sent] # remove all tags not in the alphabet

    cleaned_sent = ''.join(cleaned_sent_list)
    cleaned_sent_list = [lemmatizer.lemmatize(token) for token in cleaned_sent.split(" ")]
    cleaned_sent_list = [word for word in cleaned_sent_list if not word in stop_words]
    cleaned_sent = ' '.join(cleaned_sent_list)
    cleaned_sent = cleaned_sent.replace("n't",' not') # replace words like "isn't" with "is not"
    cleaned_sent = ' . '.join([x for x in cleaned_sent.split('.') if len(x)>0]) # remove multiple periods, and add spaces before and after a period
    cleaned_sent = ' , '.join([x for x in cleaned_sent.split(',') if len(x)>0]) # add spaces before and after a comma
    cleaned_sent = ' '.join(cleaned_sent.split()) # remove multiple spaces
    return cleaned_sent

In [None]:
# emoji preprocess
def emoji_to_text(s):
    s = emoji.demojize(s)
    s = s.replace(':',' ')
    s = s.replace('_',' ')    
    s = ' '.join(s.split())
    return s



In [None]:
# loading twitter slang data
slang_df = pd.read_csv('data/twitterSlang.csv')
slang_dict = dict(zip(slang_df.slang, slang_df.formal_translation))

def fix_slang(s):
    s_list = s.split()
    new_s_list = []
    for word in s_list:
        if word in slang_dict.keys():
            new_s_list.append(slang_dict[word])
        else:
            new_s_list.append(word)
            
    return ' '.join(new_s_list)

In [None]:
def preprocess(sent, lemmatizer, stop_words):
    
    alphabet = "abcdefghijklmnopqrstuvwxyz 0123456789',."
    sent = emoji_to_text(sent)
    sent = fix_slang(sent)
    sent = sent.lower() 
    sent = sent.replace('@user','')
    sent = sent.replace('@[\w\-]+','')

    cleaned_sent_list = [char if char in alphabet else ' ' for char in sent] # remove all tags not in the alphabet

    cleaned_sent = ''.join(cleaned_sent_list)
    cleaned_sent_list = [lemmatizer.lemmatize(token) for token in cleaned_sent.split(" ")]
    cleaned_sent_list = [word for word in cleaned_sent_list if not word in stop_words]
    cleaned_sent = ' '.join(cleaned_sent_list)
    cleaned_sent = cleaned_sent.replace("n't",' not') # replace words like "isn't" with "is not"
    cleaned_sent = ' . '.join([x for x in cleaned_sent.split('.') if len(x)>0]) # remove multiple periods, and add spaces before and after a period
    cleaned_sent = ' , '.join([x for x in cleaned_sent.split(',') if len(x)>0]) # add spaces before and after a comma
    cleaned_sent = ' '.join(cleaned_sent.split()) # remove multiple spaces
    return cleaned_sent

In [4]:
#### Task A

X_train = olid_data.tweet
y_train = pd.factorize(olid_data.subtask_a)[0]

X_test=pd.read_csv('data/OLIDv1/testset-levela.tsv',sep="\t").tweet
y_test=pd.read_csv( 'data/OLIDv1/labels-levela.csv',header=None).iloc[:,-1]
y_test = pd.factorize(y_test)[0]
import collections
collections.Counter(y_train)
print(f'X train shape: {X_train.shape}, y train shape: {y_train.shape}')


X train shape: (13240,), y train shape: (13240,)


In [43]:
##### Task b
# X_train = olid_data.tweet
# y_train = olid_data.subtask_b

# X_train = X_train[y_train.notna()]
# y_train = y_train[y_train.notna()]
# X_test=pd.read_csv('data/OLIDv1/testset-levelb.tsv',sep="\t").tweet
# y_test=pd.read_csv( 'data/OLIDv1/labels-levelb.csv',header=None).iloc[:,-1]
# y_test = pd.factorize(y_test)[0]
# y_train = pd.factorize(y_train)[0]

# import collections
# collections.Counter(y_train)
# print(f'X train shape: {X_train.shape}, y train shape: {y_train.shape}')


X train shape: (4400,), y train shape: (4400,)


In [44]:
# from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
# from imblearn.under_sampling import NearMiss, RandomUnderSampler

# from sklearn.feature_extraction.text import CountVectorizer

# class_weights = class_weight.compute_class_weight(
#     class_weight = 'balanced',
#     classes = np.unique(y_train),
#     y = y_train)

# weights={}
# for index, weight in enumerate(class_weights) :
#   weights[index]=weight
# print(weights)
# #smt = SMOTE(random_state=777, k_neighbors=1)
# #rus = RandomUnderSampler(random_state=777)


{0: 4.198473282442748, 1: 0.5675954592363261}


In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train,y_train,test_size=0.3, random_state=42)
# check shapes of train, test and validation data
print(f'X train shape: {X_train.shape}, y train shape: {y_train.shape}')
print(f'X valid shape: {X_valid.shape}, y valid shape: {y_valid.shape}')
print(f'X valid shape: {X_test.shape}, y test shape: {y_test.shape}')


X train shape: (9268,), y train shape: (9268,)
X valid shape: (3972,), y valid shape: (3972,)
X valid shape: (860,), y test shape: (860,)


In [6]:
stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()
data_train = [preprocess(tweet,lemmatizer,stop_words) for tweet in X_train]
data_valid = [preprocess(tweet,lemmatizer,stop_words) for tweet in X_valid]
data_test = [preprocess(tweet,lemmatizer,stop_words) for tweet in X_test]

In [7]:
vocab_size = 10000
embedding_size = 100
lstm_output_dim = 32
max_length = 280
trunc_type='post'
padding_type='post'
oov_tok = "<UNK>"

In [8]:
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(data_train)

In [9]:
train_sequences = tokenizer.texts_to_sequences(data_train)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

valid_sequences = tokenizer.texts_to_sequences(data_valid)
valid_padded = pad_sequences(valid_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)


In [10]:
test_sequences = tokenizer.texts_to_sequences(data_test)
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [11]:
# compute class weights : "https://scikit-learn.org/stable/modules/generated/sklearn.utils.class_weight.compute_class_weight.html"
class_weights = class_weight.compute_class_weight(
    class_weight = 'balanced',
    classes = np.unique(y_train),
    y = y_train)
class_weights = dict(zip(np.unique(y_train), class_weights))
class_weights


{0: 1.510922725790675, 1: 0.7472988227705208}

In [12]:
# print(np.bincount(y_train))
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)

# X_valid = scaler.transform(X_valid)

# X_train = np.clip(X_train, -5, 5)
# X_valid = np.clip(X_valid, -5, 5)
# print(X_train[0])

In [13]:
# Load the TensorBoard notebook extension
%load_ext tensorboard
!rm -rf ./logs/

In [14]:
HP_NUM_UNITS = hp.HParam('num_units', hp.Discrete([100,150,200]))
HP_DROPOUT = hp.HParam('dropout', hp.Discrete([0.05,0.2,0.5]))
HP_OPTIMIZER = hp.HParam('optimizer', hp.Discrete(['adam']))
METRIC_ACCURACY = 'accuracy'

with tf.summary.create_file_writer('logs/hparam_tuning').as_default():
  hp.hparams_config(
    hparams=[HP_NUM_UNITS, HP_DROPOUT, HP_OPTIMIZER],
    metrics=[hp.Metric(METRIC_ACCURACY, display_name='Accuracy')],
  )

In [18]:
def train_test_model(vocab_size, embedding_size, max_length, hparams):
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(vocab_size, embedding_size, input_length=max_length))
    model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32, return_sequences = True)))
    model.add(tf.keras.layers.Dense(hparams[HP_NUM_UNITS], activation="relu"))
    model.add(tf.keras.layers.Dropout(hparams[HP_DROPOUT]))
    model.add(tf.keras.layers.Dense(1, activation="sigmoid"))
    model.compile(
    optimizer=hparams[HP_OPTIMIZER],
    loss='binary_crossentropy',
    metrics=['accuracy'],
    )

    model.fit(train_padded, y_train, batch_size=256,epochs=25,class_weight=class_weights,validation_data=(valid_padded, y_valid),
    callbacks=[
        tf.keras.callbacks.TensorBoard('logs/hparam_tuning'),  # log metrics
        hp.KerasCallback('logs/hparam_tuning', hparams),  # log hparams
    ]) # Run with 1 epoch to speed things up for demo purposes
    _, accuracy = model.evaluate(test_padded, y_test)
    return accuracy

In [19]:
def run(run_dir, hparams):
  with tf.summary.create_file_writer(run_dir).as_default():
    hp.hparams(hparams)  # record the values used in this trial
    accuracy = train_test_model(10000, 100, 280,hparams)
    tf.summary.scalar(METRIC_ACCURACY, accuracy, step=1)

In [20]:
session_num = 0

for num_units in HP_NUM_UNITS.domain.values:
  for dropout_rate in HP_DROPOUT.domain.values:
    for optimizer in HP_OPTIMIZER.domain.values:
      hparams = {
          HP_NUM_UNITS: num_units,
          HP_DROPOUT: dropout_rate,
          HP_OPTIMIZER: optimizer,
      }
      run_name = "run-%d" % session_num
      print('--- Starting trial: %s' % run_name)
      print({h.name: hparams[h] for h in hparams})
      run('logs/hparam_tuning/' + run_name, hparams)
      session_num += 1

--- Starting trial: run-0
{'num_units': 100, 'dropout': 0.05, 'optimizer': 'adam'}
Epoch 1/25
2021-12-08 11:25:18.188473: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2021-12-08 11:25:18.188485: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2021-12-08 11:25:18.188498: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
 1/37 [..............................] - ETA: 52s - loss: 0.6978 - accuracy: 0.35572021-12-08 11:25:19.697605: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2021-12-08 11:25:19.697616: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
 2/37 [>.............................] - ETA: 10s - loss: 0.7015 - accuracy: 0.35562021-12-08 11:25:19.982113: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2021-12-08 11:25:19.983281: I tensorflow/core/profiler/lib/pr

InternalError:  Output 6 of type float does not match declared output type variant for node node sequential_3/bidirectional_3/forward_lstm_3/PartitionedCall (defined at var/folders/pl/26nmhlbj75j4gr5q76d2vg3h0000gn/T/ipykernel_8690/1114116650.py:14)  [Op:__inference_train_function_37524]

Function call stack:
train_function


In [1]:
%tensorboard --logdir logs/hparam_tuning

UsageError: Line magic function `%tensorboard` not found.
