# ICPR 2024 Competition on Multilingual Claim-Span Identification

## Installing Dependencies

In [None]:
#!pip install tensorflow
#!pip install keras

### Importing Libraries

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, GRU,SimpleRNN
from keras.layers import Dense, Activation, Dropout, Embedding, BatchNormalization
from keras.utils import to_categorical
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

from sklearn.metrics import jaccard_score, f1_score
from sklearn.svm import SVC


import re
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff

### Configuring TPUs

For this version of the notebook we will be using TPUs, as we have to build a BERT model.

In [None]:
import tensorflow as tf

# Choose a tf.distribute strategy: a TPUStrategy when the TPU setup below
# succeeds, otherwise whatever tf.distribute.get_strategy() returns.
try:
    # Detect TPU and create TPU cluster resolver
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())

    # Connect to the TPU cluster
    tf.config.experimental_connect_to_cluster(tpu)

    # Initialize the TPU system
    tf.tpu.experimental.initialize_tpu_system(tpu)

    # Create a TPUStrategy
    strategy = tf.distribute.TPUStrategy(tpu)
except (ValueError, tf.errors.NotFoundError):
    # Reached when any step above raises one of these two errors (presumably
    # because no TPU is attached -- TODO confirm); record that no TPU is in use.
    tpu = None
    # Default distribution strategy for CPU/GPU
    strategy = tf.distribute.get_strategy()

# The replica count is reused later to scale the training batch size.
print("REPLICAS: ", strategy.num_replicas_in_sync)

### Loading Datasets

In [None]:
# Paths of the English + Hindi JSON files: training split, then validation split
train_path = "/content/drive/MyDrive/Multilingual Datasets/ML Data/train_en_hi_encoded_labels.json"
val_path = "/content/drive/MyDrive/Multilingual Datasets/ML Data/val_en_hi_encoded_labels.json"

# Parse each JSON file into its own DataFrame
train_data = pd.read_json(train_path)
val_data = pd.read_json(val_path)

In [None]:
# Preview the first rows of the training frame
train_data.head()

In [None]:
# Summary of the training frame's columns, dtypes and non-null counts
train_data.info()

In [None]:
# Summary of the validation frame's columns, dtypes and non-null counts
val_data.info()

We first restrict the training data to the rows up to index label 12000. We then check the maximum number of words present in a comment; this will help us with padding later.

In [None]:
# Label-based slice: .loc includes the end label, so with a sorted integer
# index (assumed -- TODO confirm) rows labelled 0..12000 are kept.
train_data = train_data.loc[:12000]
train_data.shape

In [None]:
# Size of every entry, counted as whitespace-separated pieces of its string form;
# the cell displays the largest of those counts.
piece_counts = train_data['text_tokens'].map(lambda entry: len(str(entry).split()))
piece_counts.max()

In [None]:
# Validation inputs and their labels, taken column-wise from the validation frame
X_val, y_val = val_data["text_tokens"], val_data["claims"]

Writing a function that computes the ROC AUC score (and plots the ROC curve) for validation

In [None]:
import matplotlib.pyplot as plt
from sklearn import metrics

def roc_auc(predictions, target):
    '''
    Draw the ROC curve for the given scores and return the area under it.

    predictions: scores handed to metrics.roc_curve as the predicted values
    target:      true labels handed to metrics.roc_curve

    Returns the value of metrics.auc over the false/true positive rates.
    '''
    # The thresholds returned by roc_curve are not needed for the plot or the score
    false_pos_rate, true_pos_rate, _ = metrics.roc_curve(target, predictions)
    auc_score = metrics.auc(false_pos_rate, true_pos_rate)

    # ROC curve in blue, with the chance-level diagonal as a dashed gray line
    plt.figure()
    plt.plot(
        false_pos_rate,
        true_pos_rate,
        color='blue',
        lw=2,
        label='ROC curve (AUC Score = %0.2f)' % auc_score,
    )
    plt.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')

    # Axis limits, labels, title and legend
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic(ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    return auc_score

### Data Preparation

In [None]:
# Hold out 10% of the training rows; the split is shuffled with a fixed seed
# and stratified on the labels.
all_texts = train_data.text_tokens.values
all_labels = train_data.claims.values
X_train, X_test, y_train, y_test = train_test_split(
    all_texts,
    all_labels,
    test_size=0.10,
    shuffle=True,
    random_state=42,
    stratify=all_labels,
)

## Model Training & Development

In [None]:
from keras.preprocessing import text, sequence


# Every sequence is padded/truncated to this many token ids
max_len = 1500

# Tokenizer without a vocabulary cap, fitted on the training texts only
token = text.Tokenizer(num_words=None)
token.fit_on_texts(X_train)

# Texts -> lists of integer ids, in the order train / test / validation
X_train_seq, X_test_seq, X_val_seq = (
    token.texts_to_sequences(split_texts)
    for split_texts in (X_train, X_test, X_val)
)

# Zero-pad each split to the common length
X_train_pad, X_test_pad, X_val_pad = (
    sequence.pad_sequences(split_seqs, maxlen=max_len)
    for split_seqs in (X_train_seq, X_test_seq, X_val_seq)
)

# word -> integer id mapping learned by the tokenizer
word_index = token.word_index

### Word Embeddings

A recent approach to obtaining word embeddings is to use pretrained GloVe or fastText vectors. Without going into too much detail, I will explain how to create sentence vectors and how we can build a machine learning model on top of them. I am a fan of GloVe, word2vec and fastText alike; in this notebook I'll be using the GloVe vectors. To get them, search for GloVe in the datasets on Kaggle and add the file.

In [None]:
# Load the GloVe vectors into a dictionary mapping word -> coefficient array.
# Each non-blank line is parsed as "<word> <v1> <v2> ..." separated by single spaces.
# NOTE(review): the folder name in the path starts with a space (' GLoVe') --
# confirm that this matches the actual Drive folder.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open('/content/drive/MyDrive/Multilingual Datasets/ GLoVe/glove.6B.300d.txt', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        # Drop the line ending before splitting so the last field is a clean number
        values = line.rstrip().split(' ')
        if len(values) < 2:
            # Blank line or a word without a vector: nothing to store
            continue
        word = values[0]
        coefs = np.asarray([float(val) for val in values[1:]])
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

We have already tokenized and padded our text for input to the LSTMs

In [None]:
# One 300-wide row per tokenizer id (plus row 0); rows stay all-zero for
# words that have no entry in embeddings_index.
embedding_matrix = np.zeros((len(word_index) + 1, 300))
for vocab_word, row in tqdm(word_index.items()):
    pretrained = embeddings_index.get(vocab_word)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

### Bi-Directional LSTM

In [None]:
%%time
with strategy.scope():
    # Non-trainable embedding initialised from embedding_matrix, followed by one
    # bidirectional LSTM and a single sigmoid output unit.
    model = Sequential([
        Embedding(
            len(word_index) + 1,
            300,
            weights=[embedding_matrix],
            input_length=max_len,
            trainable=False,
        ),
        Bidirectional(LSTM(300, dropout=0.3, recurrent_dropout=0.3)),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

# Layer-by-layer overview, printed outside the strategy scope
model.summary()

In [None]:
# Batch size is 64 multiplied by the number of replicas in the strategy
global_batch_size = 64 * strategy.num_replicas_in_sync
model.fit(X_train_pad, y_train, epochs=5, batch_size=global_batch_size)

In [None]:
# Model outputs for the padded validation inputs (the final layer is a sigmoid unit)
scores = model.predict(X_val_pad)

In [None]:
# One result dict per model; calling roc_auc also plots the ROC curve
scores_model = [
    {
        'Model': 'Bi-directional LSTM',
        'ROC Curve and AUC_Score': roc_auc(scores, y_val),
    },
]
print(scores_model)

In [None]:
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score

# Predict on the validation set and threshold the outputs at 0.5 to get 0/1 labels
val_outputs = model.predict(X_val_pad)
predictions = (val_outputs > 0.5).astype(int)

# Jaccard similarity and macro-averaged F1 against the validation labels
jaccard = jaccard_score(y_val, predictions)
macro_f1 = f1_score(y_val, predictions, average='macro')

print("Jaccard Similarity Score of Bi-Directional LSTM:", jaccard)
print("Macro-F1 Score of Bi-Directional LSTM:", macro_f1)

In [None]:
# Display the collected per-model results
scores_model