In [None]:
# gdown is used by the download cell below to fetch files from Google Drive by id.
%pip install gdown

In [None]:
# Google Drive file ids handed to `gdown` in the download cell.
# NOTE(review): `train`, `val` and `test` are rebound to the unpickled datasets in
# the "Loading data" cell, so these id strings are lost once that cell has run.
train = '1-DTIOsUZVbmiGJZMMmLbcSN_NWuqSbql'
val = '1-8B6g2l8D9U_O370fv6a0O7o_pozyrx0'
test = '1-FAL4G--bLerOPdtHWoUb2FelHAc-cRN'

# File ids of the embedding pickles fetched by the same download cell.
cleaned_train_embedding = '1--R-xfUHNI4XifRJlqOFQ_VUqlJ59rk6'
cleaned_val_embedding = '1-192wuFcIa3Gu1uHl4HP_Itg7It4_8lC'
cleaned_test_embedding = '1_gGH6CyYS0QXr2pAjd_nehu3WW3Bdpgu'

# NOTE(review): this id is never passed to `gdown` in the visible notebook; the
# saved model is loaded from a /kaggle/input path instead.
model_path = '1XfwX7_t0_W1vxFizbpzYsnaDn6S900QU'

In [None]:
# Download the three dataset files; `{name}` interpolates the Python variable
# holding the Drive file id into the shell command.
!gdown {train}
!gdown {val}
!gdown {test}

# Download the three embedding files.
!gdown {cleaned_train_embedding}
!gdown {cleaned_val_embedding}
!gdown {cleaned_test_embedding}



In [None]:
import pandas as pd
import numpy as np
import pickle
import torch
import shap
import seaborn as sns
import tensorflow as tf
import keras
import keras_tuner
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GRU
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from lime.lime_text import LimeTextExplainer
from transformers import BertModel, AutoTokenizer, TFAutoModel

## Classes

In [None]:
# Emotion names ordered by integer label id (position i is the name of label i).
classes = [
    'none',
    'anger',
    'joy',
    'sadness',
    'love',
    'sympathy',
    'surprise',
    'fear',
]

# Same mapping in dict form: label id -> emotion name.
label_to_class = dict(enumerate(classes))

## Loading data

In [None]:
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`.

    Only use on trusted files: unpickling can execute arbitrary code.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Dataset splits (these rebind the `train`/`val`/`test` names used for Drive ids above).
train = _load_pickle('/kaggle/working/train.pkl')
val = _load_pickle('/kaggle/working/val.pkl')
test = _load_pickle('/kaggle/working/test.pkl')

# Precomputed embeddings for each split.
train_embeddings = _load_pickle('/kaggle/working/ls_train_embeddings.pkl')
val_embeddings = _load_pickle('/kaggle/working/ls_val_embeddings.pkl')
test_embeddings = _load_pickle('/kaggle/working/ls_test_embeddings.pkl')

In [None]:
def _label_column(frame):
    """Return the 'label' values of `frame` as an (n, 1) column array."""
    return frame['label'].values.reshape(-1, 1)


# Fit the encoder on the training labels only, then reuse it for the other splits.
encoder = OneHotEncoder()
y_train = encoder.fit_transform(_label_column(train)).toarray()
y_val = encoder.transform(_label_column(val)).toarray()
y_test = encoder.transform(_label_column(test)).toarray()

## Model inputs (precomputed embeddings)

In [None]:
# The model inputs are the precomputed embeddings loaded above; no tokenizing or
# padding happens in this cell.
X_train = train_embeddings
X_val = val_embeddings
X_test = test_embeddings

In [None]:
# Training hyperparameters.
BATCH_SIZE = 64
EPOCHS = 50
EMBED_SIZE = 86  # only read by build_gru's optional Embedding branch (the original note listed 768 and 86 as alternatives)
LEARNING_RATE =  0.006535000000000002

# Stop training once validation accuracy has not improved for 10 epochs and
# restore the weights from the best epoch.
early_stopping_monitor = EarlyStopping(
    monitor='val_accuracy',
    min_delta=0,
    patience=10,
    verbose=0,
    mode='max',
    baseline=None,
    restore_best_weights=True)

In [None]:
def build_gru(embedding_layer=False, vocab_size=None, sequence_length=None, input_shape=None):
    """Build and compile the stacked-GRU emotion classifier.

    Args:
        embedding_layer: When True the network starts with a trainable
            ``Embedding`` layer fed with token ids; otherwise it consumes
            precomputed embedding sequences directly.
        vocab_size: Vocabulary size for the ``Embedding`` layer. Falls back to
            the global ``VOCAB_SIZE_MARBERT`` when omitted.
        sequence_length: Token sequence length for the ``Embedding`` layer.
            Falls back to the global ``max_length`` when omitted.
        input_shape: ``(timesteps, features)`` of the precomputed embeddings.
            Falls back to ``(X_train.shape[1], X_train.shape[2])`` when omitted.

    Returns:
        A compiled ``Sequential`` model ending in an 8-unit softmax layer.
    """
    model = Sequential()
    if embedding_layer:
        # NOTE(review): neither fallback global is defined in the visible notebook,
        # so pass vocab_size / sequence_length explicitly when using this branch.
        if vocab_size is None:
            vocab_size = VOCAB_SIZE_MARBERT
        if sequence_length is None:
            sequence_length = max_length
        model.add(Embedding(vocab_size, EMBED_SIZE, input_length=sequence_length, trainable=True))
        model.add(GRU(128, return_sequences=True))
    else:
        if input_shape is None:
            input_shape = (X_train.shape[1], X_train.shape[2])
        model.add(GRU(128, return_sequences=True, input_shape=input_shape))

    model.add(Dropout(0.2))

    model.add(GRU(64, return_sequences=True))
    model.add(Dropout(0.2))

    # The last recurrent layer returns only its final state, which feeds the classifier.
    model.add(GRU(32, return_sequences=False))
    model.add(Dropout(0.2))

    model.add(Dense(8, activation='softmax'))

    model.compile(optimizer=Adam(LEARNING_RATE),
                loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Train the GRU on the precomputed embeddings, validating on the validation split.
model = build_gru(embedding_layer=False)

history = model.fit(
    X_train,
    np.asarray(y_train),
    validation_data=(X_val, np.asarray(y_val)),
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    callbacks=[early_stopping_monitor],
)

In [None]:
# Replace the freshly trained network with a previously saved one.
# NOTE(review): `history` still refers to the training run above, so the curves
# plotted below describe that run, not the model loaded here.
# Alternative (local) path: keras.models.load_model("GRRU")
model = keras.models.load_model("/kaggle/input/modelgru/kaggle/working/GRRU")

In [None]:
# Training vs. validation accuracy per epoch, labelled so the figure stands alone.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='train')
ax.plot(history.history['val_accuracy'], label='validation')
ax.set(title='Accuracy per epoch', xlabel='Epoch', ylabel='Accuracy')
ax.legend()

plt.show()

In [None]:
# Training vs. validation loss per epoch, labelled so the figure stands alone.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='validation')
ax.set(title='Loss per epoch', xlabel='Epoch', ylabel='Loss')
ax.legend()

plt.show()

In [None]:
# Class probabilities for every test sample.
predictions = model.predict(X_test)
# Passing `labels` pins the report to all class ids, so `target_names` stays
# aligned even if some class never occurs in the test labels or predictions.
print(classification_report(
    y_test.argmax(axis=1),
    predictions.argmax(axis=1),
    labels=list(range(len(classes))),
    target_names=classes,
))

In [None]:
# Passing `labels` fixes the matrix to one row/column per class id; without it a
# class absent from both arrays would shrink the matrix and misalign the tick labels.
cm = confusion_matrix(
    y_test.argmax(axis=1),
    predictions.argmax(axis=1),
    labels=list(range(len(classes))),
)

plt.figure(figsize=(10, 8))
sns.set(font_scale=1.2)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=classes, yticklabels=classes)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
def map_label_to_class(indices, classes):
  """Look up every element of `indices` in `classes` and return the results as a list.

  `classes` may be any subscriptable container (a list or a dict keyed by label id).
  """
  lookup = classes.__getitem__
  return list(map(lookup, indices))

def create_csv(y_pred_idx, y_true_idx, file_path="examples.csv"):
  """Write one CSV row per sample with its position, true label name and predicted label name.

  Label ids are translated to names through the module-level `label_to_class` dict.
  """
  y_pred = map_label_to_class(y_pred_idx, label_to_class)
  y_true = map_label_to_class(y_true_idx, label_to_class)
  rows = [f'{i},{y_true[i]},{y_pred[i]}\n' for i in range(len(y_pred))]
  with open(file_path, 'w') as file:
    file.write('Index,True Label,Predicted Label\n')
    file.writelines(rows)
  print("Content has been written to the file.")

In [None]:
# Export the predicted and true label names of every test sample to gru.csv.
create_csv(predictions.argmax(axis=1), y_test.argmax(axis=1), file_path="gru.csv")

In [None]:
# Test-set positions where the argmax prediction disagrees / agrees with the argmax label.
true_idx = y_test.argmax(axis=1)
pred_idx = predictions.argmax(axis=1)
misclassified = np.flatnonzero(true_idx != pred_idx)
correctly_classified = np.flatnonzero(true_idx == pred_idx)

In [None]:
# (number misclassified, number correct, total predictions)
len(misclassified), len(correctly_classified), len(predictions)

In [None]:
# Confusion pairs to inspect: element i of 'predicted' is paired with element i of 'actual'.
problematic_samples = {'predicted':['none', 'none', 'joy', 'anger', 'love', 'none'], 'actual':['joy', 'surprise', 'love', 'sadness', 'joy', 'sadness']}
# Filled by the next cell with one list of test indices per pair.
problematic_indices = []

In [None]:
# Class names of every prediction / true label, so each pair is matched with one
# vectorised comparison instead of a Python loop over all test samples.
pred_names = np.array(classes)[predictions.argmax(axis=1)]
true_names = np.array(classes)[y_test.argmax(axis=1)]

# Rebuilt from scratch on every run, so re-executing this cell cannot append
# duplicate groups to a list initialised in another cell.
problematic_indices = [
    np.flatnonzero((pred_names == predicted) & (true_names == actual)).tolist()
    for predicted, actual in zip(problematic_samples['predicted'], problematic_samples['actual'])
]

In [None]:
# Total number of test samples across all inspected confusion pairs.
sum(map(len, problematic_indices))

In [None]:
# Checkpoint id used to embed raw text inside the LIME/SHAP prediction wrappers.
marbert_model_path = 'UBC-NLP/MARBERT'
tokenizer = AutoTokenizer.from_pretrained(marbert_model_path, from_tf=True)
# NOTE(review): output_hidden_states=True is requested, but get_embeddings only
# reads element 0 of the model output.
marbert_model = TFAutoModel.from_pretrained(marbert_model_path, output_hidden_states=True)

# 0 keeps the tokenizer output untouched; 1 makes bert_tokenize zero out token
# ids 0-4 and mask them in the attention mask.
remove_special_tokens=0

# Views of the test set shared by the explanation helpers below.
test_df = test.reset_index()
test_true = y_test.argmax(axis=1)
test_pred = predictions.argmax(axis=1)

def bert_tokenize(text):
    """Tokenize text with the MARBERT tokenizer, padded/truncated to 50 tokens.

    Args:
        text: A single string or a list of strings (the callers in this
            notebook pass lists).

    Returns:
        The tokenizer output. When the global ``remove_special_tokens`` is 1,
        ``input_ids`` and ``attention_mask`` are replaced by int32 arrays in
        which every token id in 0-4 is set to 0 and masked out.
    """
    tokens = tokenizer(text, padding='max_length', truncation=True, max_length=50)
    if remove_special_tokens == 1:
        input_ids = np.array(tokens['input_ids'])
        # Ids 0-4 are the ones the original code treated as special tokens.
        is_special = np.isin(input_ids, [0, 1, 2, 3, 4])
        tokens['input_ids'] = np.where(is_special, 0, input_ids).astype(np.int32)
        tokens['attention_mask'] = (~is_special).astype(np.int32)
    return tokens


def get_embeddings(ids, mask, type_ids):
    """Run MARBERT on one batch and return its per-token output as a NumPy array.

    Args:
        ids: Token ids for the batch.
        mask: Attention mask for the batch.
        type_ids: Token type ids for the batch.

    Returns:
        Element 0 of the model output converted with ``.numpy()``; the caller
        stores it in a ``(batch, 50, 768)`` buffer.
    """
    outputs = marbert_model(
        input_ids=tf.convert_to_tensor(ids),
        attention_mask=tf.convert_to_tensor(mask),
        token_type_ids=tf.convert_to_tensor(type_ids),
    )
    # The token-level states are returned as-is; no pooling over the sequence is applied.
    return outputs[0].numpy()

def embedd(text, batch_size=100):
    """Embed texts with MARBERT in fixed-size batches.

    Args:
        text: A list of strings or a pandas Series (converted to a list of str).
        batch_size: Number of texts sent through MARBERT per forward pass.

    Returns:
        A float array of shape ``(len(text), 50, 768)`` holding one embedding
        per token position; 50 matches the ``max_length`` used by ``bert_tokenize``.
    """
    if isinstance(text, pd.Series):
        text = text.values.astype(str).tolist()
    tokens = bert_tokenize(text)
    n_texts = np.array(tokens['input_ids']).shape[0]
    x_emb = np.zeros((n_texts, 50, 768))
    for start in range(0, n_texts, batch_size):
        # Slices clamp at the end of the data, so the last (short) batch needs no special case.
        stop = start + batch_size
        x_emb[start:stop] = get_embeddings(
            tokens['input_ids'][start:stop],
            tokens['attention_mask'][start:stop],
            tokens['token_type_ids'][start:stop],
        )
    return x_emb

def model_predict(texts):
    """Embed `texts` with MARBERT and return the classifier's silent predictions for them."""
    return model.predict(embedd(texts), verbose=0)

# LIME text explainer; class_names lets explanations show emotion names instead of label ids.
lime_explainer = LimeTextExplainer(class_names=classes)

def lime_explain(idx):
    """Show a LIME explanation for the stored 'Light Stemming' text of test sample `idx`.

    The true label is always explained; the predicted label is added when it
    differs from the true one. Texts without any word are skipped.
    """
    text = test_df['Light Stemming'][idx]
    if len(text.split()) < 1:
        # The guard only rejects empty text, so the message says exactly that.
        print("Text contains no words. Cannot explain.")
        return
    # Explain the specific prediction
    if test_true[idx]==test_pred[idx]:
        labels_to_explain = (test_true[idx],)
    else:
        labels_to_explain = (test_true[idx], test_pred[idx])
    explanation = lime_explainer.explain_instance(text, model_predict, labels=labels_to_explain)
    # Show the explanation
    explanation.show_in_notebook()


def shap_model_predict(text):
  """Prediction function handed to the SHAP explainer.

  Args:
      text: Array of (masked) texts; it must support ``.astype(str)``.

  Returns:
      The classifier's predictions for the texts.
  """
  # Same embed-then-predict pipeline as LIME uses, so both explainers query an
  # identical model wrapper instead of two copies of the batching loop.
  return model_predict(text.astype(str).tolist())


# Text masker configured with the regex r"\W+" as its tokenizer
# (presumably the pattern the text is split on - confirm against the shap docs).
masker = shap.maskers.Text(tokenizer=r"\W+")
# SHAP explainer around shap_model_predict; output_names labels the outputs with emotion names.
shap_explainer = shap.Explainer(shap_model_predict, output_names=classes, masker=masker)


def deep_shap_explain(idx):
    """Plot SHAP token attributions for the predicted class of test sample `idx`.

    Uses the stored 'Light Stemming' text; texts with fewer than two words are skipped.
    """
    texts = test_df['Light Stemming'][idx:idx+1].values.astype(str).tolist()
    if len(texts[0].split()) >= 2:
        explanation = shap_explainer(texts)
        # Keep the single instance, all tokens, and only the predicted class column.
        shap.text_plot(explanation[0,:,test_pred[idx]])
        # Visual separator between consecutive explanations.
        print("-------------------------------------------------------------------------------------")
    else:
        print("Text contains less than 2 words. Cannot explain.")
    
def print_sample(idx):
    """Print the position, true/predicted emotion names and both text versions of test sample `idx`."""
    true_name = classes[test_true[idx]]
    predicted_name = classes[test_pred[idx]]
    print('Index: ', idx)
    print('True label: ', true_name)
    print('Predicted label: ', predicted_name)
    for caption, column in (("Original tweet:", 'tweet'), ("Cleaned tweet:", 'Light Stemming')):
        print(caption, test_df[column][idx])

In [None]:
# Convert both text columns to lists once instead of on every printed sample.
tweets = test['tweet'].values.tolist()
light_stemmed = test['Light Stemming'].values.tolist()

# For every inspected confusion pair, list the matching test samples.
for pair_idx, indices in enumerate(problematic_indices):
    print()
    print()
    print('--------------------------------------------------------------------------')
    print('Predicted: ', problematic_samples['predicted'][pair_idx])
    print('Actual: ', problematic_samples['actual'][pair_idx])
    for idx in indices:
        print('Index: ', idx)
        print('Tweet: ', tweets[idx])
        print('Light stemming: ', light_stemmed[idx])
        print()

In [None]:
def lime_explain_with_text(idx,text):
    """Show a LIME explanation for `text`, using test sample `idx` only to pick the labels.

    The true label of `idx` is always explained; its predicted label is added
    when the two differ.
    """
    true_label, predicted_label = test_true[idx], test_pred[idx]
    labels_to_explain = (true_label,) if true_label == predicted_label else (true_label, predicted_label)
    lime_explainer.explain_instance(text, model_predict, labels=labels_to_explain).show_in_notebook()


def deep_shap_explain_with_text(idx,text):
    """Plot SHAP token attributions for `text` across all output classes.

    `idx` is accepted for symmetry with `lime_explain_with_text` but is not
    used: the explanation is not narrowed to the class predicted for `idx`.
    """
    shap.text_plot(shap_explainer(text))
    # Visual separator between consecutive explanations.
    print("-------------------------------------------------------------------------------------")
    

In [None]:
# Whitespace-separated word count of every cleaned tweet, computed once.
word_counts = test_df['Light Stemming'].apply(lambda x: len(x.split()))

# Short: fewer than 4 words. Long: more than 20 words.
short_sentence_indices = test_df[word_counts < 4].index.to_numpy()
long_sentence_indices = test_df[word_counts > 20].index.to_numpy()

print("Indices of short sentences:", short_sentence_indices)
print("Indices of long sentences:", long_sentence_indices)


In [None]:
# Short sentences with correct classification
print("Correctly Classified Short Sentences:")
correct_short_indices = [idx for idx in short_sentence_indices if test_true[idx] == test_pred[idx]][:]  # Print only 5 samples
print(len(correct_short_indices))
for idx in correct_short_indices:
    print_sample(idx)
    print()

# Long sentences with correct classification
print("Correctly Classified Long Sentences:")
correct_long_indices = [idx for idx in long_sentence_indices if test_true[idx] == test_pred[idx]][:]  # Print only 5 samples

print(len(correct_long_indices))
for idx in correct_long_indices:
    print_sample(idx)
    print()

# Short sentences with misclassification
print("Misclassified Short Sentences:")

misclassified_short_indices = [idx for idx in short_sentence_indices if test_true[idx] != test_pred[idx]][:]  # Print only 5 samples
print(len(misclassified_short_indices))

for idx in misclassified_short_indices:
    print_sample(idx)
    print()

# Long sentences with misclassification
print("Misclassified Long Sentences:")
misclassified_long_indices = [idx for idx in long_sentence_indices if test_true[idx] != test_pred[idx]][:]  # Print only 5 samples
print(len(misclassified_long_indices))

for idx in misclassified_long_indices:
    print_sample(idx)
    print()


In [None]:
# Sizes of the four buckets built in the previous cell.
num_correct_short = len(correct_short_indices)
num_correct_long = len(correct_long_indices)
num_misclassified_short = len(misclassified_short_indices)
num_misclassified_long = len(misclassified_long_indices)

# Fixed-width text table; `{:<12}` left-aligns each count in a 12-character column.
print("-------------------------------------------------------------------------------------")
print("Category                                        Correct    Misclassified")
print("-------------------------------------------------------------------------------------")
print("Short Sentences less than 4 words/sentence:     {:<12}     {:<12}".format(num_correct_short, num_misclassified_short))
print("Long Sentences more than 20 words/sentence:     {:<12}     {:<12}".format(num_correct_long, num_misclassified_long))
print("------------------------------------------------------------------------------------")


In [None]:
file ='/kaggle/working/newWords.txt'
def searchInFile(searchWord, filename=file):
  """Look `searchWord` up in the second space-separated column of `filename`.

  Prints the first matching line and returns. If no line matches, prints
  "Word exists in training data".

  Args:
      searchWord: Word to look for; spaces inside it are removed before comparing.
      filename: Path of the text file to scan, read as UTF-8.
  """
  target = searchWord.replace(" ", "")
  with open(filename, "r", encoding="utf-8") as handle:
    for line in handle:
      lineWords = line.split(" ")
      # Skip lines without a second column, and strip the newline that stays
      # attached to that column when it is the last one on the line.
      if len(lineWords) > 1 and lineWords[1].strip() == target:
        print(line)
        return
    print("Word exists in training data")

In [None]:
# Look one word up in the default word file (/kaggle/working/newWords.txt).
searchInFile('شو')

# Trends 


## poetry

In [None]:
# Hand-picked test indices for the "poetry" trend.
poetry_indices = [40,108,117,175,225,390,467,807,838,872,920,975,989,1065,1088,1415,1439,1472]
for index in poetry_indices:
    print_sample(index)
    lime_explain(index)
    deep_shap_explain(index)

## religious

In [None]:
# Hand-picked test indices for the religious trend.
religious_indices = [102,156,286,295,372,407,656,756,782,854,957,981,1279,1293,1387,1508]
for index in religious_indices:
    print_sample(index)
    lime_explain(index)
    deep_shap_explain(index)

## praying

In [None]:
# Hand-picked test indices for the "praying" trend.
praying_indices = [150,172,173,176,186,615,677,699,705,725,1207,1247,1252,1503]
for index in praying_indices:
    print_sample(index)
    lime_explain(index)
    deep_shap_explain(index)

## non-romantic love

In [None]:
# Hand-picked test indices for the "non-romantic love" trend.
non_romantic_love_indices = [116,125,141,146,174,178,227,281,307,324,325]
for index in non_romantic_love_indices:
    print_sample(index)
    lime_explain(index)
    deep_shap_explain(index)

## Politics

In [None]:
# Hand-picked test indices for the "Politics" trend.
politics_indices = [136,139,167,184,201,879,881,901,908,1423,1447,1480,149]
for index in politics_indices:
    print_sample(index)
    lime_explain(index)
    deep_shap_explain(index)

 ## repeated tries

In [None]:
# Baseline explanations for the samples that the following cells re-explain with edited text.
repeated_tries_indices = [23,260,932]
for index in repeated_tries_indices:
    print_sample(index)
    lime_explain(index)
    deep_shap_explain(index)

In [None]:
# Probe: explain a hand-written text in place of sample 260's stored text.
# print_sample still shows the stored sample; `idx` only selects the labels LIME explains.
idx = 260
text = 'هنجيب ميداليه الاوليمبياد ياعماد  حزن حزن حزن حزن حزن حزن حزن حزن'
print_sample(idx)
lime_explain_with_text(idx,text)
deep_shap_explain_with_text(idx,[text])

In [None]:
# Probe: explain a hand-written text in place of sample 11's stored text.
idx = 11
text = 'سخريه سخريه سخريه سخريه كنت يدخل العشق قلبه يبصر جفونك يعشق تصميمي تصميم رمزي جده الخير الخير سخريه سخريه سخريه  سخريه  سخريه'
print_sample(idx)
lime_explain_with_text(idx,text)
deep_shap_explain_with_text(idx,[text])

In [None]:
# Probe: a second hand-written text variant explained against sample 260's labels.
idx = 260
text = 'هنجيب ميداليه الاوليمبياد ياعماد  فرح'
print_sample(idx)
lime_explain_with_text(idx,text)
deep_shap_explain_with_text(idx,[text])

In [None]:
# Probe: explain a hand-written text in place of sample 932's stored text.
idx = 932
text = 'شو مشتاقه تهدي باقه ورد صغيره  حب '
print_sample(idx)
lime_explain_with_text(idx,text)
deep_shap_explain_with_text(idx,[text])

In [None]:
# Probe: a second hand-written text variant explained against sample 932's labels.
idx = 932
text = 'شو مشتاقه تهدي باقه ورد صغيره حب حب حب حب حب حب '
print_sample(idx)
lime_explain_with_text(idx,text)
deep_shap_explain_with_text(idx,[text])

In [None]:
# Probe: explain a hand-written text in place of sample 23's stored text.
idx = 23
text = 'جايز ومم بالعوامه البطه  الجاي سخريه'
print_sample(idx)
lime_explain_with_text(idx,text)
deep_shap_explain_with_text(idx,[text])

In [None]:
# Explain a hand-picked batch of test samples with both LIME and SHAP.
picked_indices = [75, 76, 100, 108, 130]
for index in picked_indices:
    print_sample(index)
    lime_explain(index)
    deep_shap_explain(index)

In [None]:
# NOTE(review): these three indices are also covered by the loop over [75, 76, 100, 108, 130].
picked_indices = [100, 108, 130]
for index in picked_indices:
    print_sample(index)
    lime_explain(index)
    deep_shap_explain(index)

In [None]:
# Explain a hand-picked batch of test samples with both LIME and SHAP.
picked_indices = [656,1508,169,782]
for index in picked_indices:
    print_sample(index)
    lime_explain(index)
    deep_shap_explain(index)

In [None]:
# Probe: explain a hand-written text in place of sample 727's stored text.
idx = 727
text = 'عرف يبقي خير معرفناش يبقي لا يكلف الله نفسا الا وسع فين ايام كرم حابر ميداليه جيب'
print_sample(idx)
lime_explain_with_text(idx,text)
deep_shap_explain_with_text(idx,[text])

In [None]:
# First pass: print every misclassified short sentence as a compact overview.
for index in misclassified_short_indices:
     print_sample(index)
# Second pass: print each one again, this time followed by its LIME and SHAP explanations.
for index in misclassified_short_indices:
     print_sample(index)
     lime_explain(index)
     deep_shap_explain(index)       

In [None]:
# NOTE(review): this cell is a verbatim repeat of the earlier idx = 727 probe.
idx = 727
text = 'عرف يبقي خير معرفناش يبقي لا يكلف الله نفسا الا وسع فين ايام كرم حابر ميداليه جيب'
print_sample(idx)
lime_explain_with_text(idx,text)
deep_shap_explain_with_text(idx,[text])

In [None]:
# Probe: explain a hand-written text in place of sample 469's stored text.
idx = 469
text = 'ولقيت مايكل فيليبس بيكسب ذهبي ربع ساعه وحاجه معلش هكتب التاريخ وراجع تاني'
# Prints the text both wrapped in a list (the form passed to SHAP) and bare (the form passed to LIME).
print([text])
print(text)
print_sample(idx)
lime_explain_with_text(idx,text)
deep_shap_explain_with_text(idx,[text])

## Explaining manually selected samples for each problematic block using LIME

In [None]:
# Explain the raw 'tweet' column of sample 1198 rather than its 'Light Stemming' text.
idx =1198 
# One-element list of str, the form the SHAP helper is called with elsewhere.
text = test_df['tweet'][idx:idx+1].values.astype(str).tolist()
print_sample(idx)
lime_explain_with_text(idx,test_df['tweet'][idx])
deep_shap_explain_with_text(idx,text)

In [None]:
# Explain test sample 307 with both LIME and SHAP.
sample = 307
print_sample(sample)
lime_explain(sample)
deep_shap_explain(sample)

In [None]:
# Explain test sample 1228 with both LIME and SHAP.
sample = 1228
print_sample(sample)
lime_explain(sample)
deep_shap_explain(sample)

In [None]:
# Explain test sample 5 with both LIME and SHAP.
sample = 5
print_sample(sample)
lime_explain(sample)
deep_shap_explain(sample)

## Issue in preprocessing

In [None]:
# Hand-picked test samples listed under the preprocessing-issue heading.
preprocessing_issue_samples = [1317,768]
for sample in preprocessing_issue_samples:
    print_sample(sample)
    lime_explain(sample)
    deep_shap_explain(sample)

## Mixed feelings

In [None]:
# Explain test sample 157 with both LIME and SHAP.
sample = 157
print_sample(sample)
lime_explain(sample)
deep_shap_explain(sample)

In [None]:
# Explain test sample 1211 with both LIME and SHAP.
sample = 1211
print_sample(sample)
lime_explain(sample)
deep_shap_explain(sample)

In [None]:
# Explain test sample 214 with both LIME and SHAP.
sample = 214
print_sample(sample)
lime_explain(sample)
deep_shap_explain(sample)

In [None]:
# One list of hand-picked test indices is appended per section below.
# NOTE(review): re-running an append cell adds its list again and shifts later positions.
manually_selected_samples = []

## Actual: joy, Predicted: none

In [None]:
# Group 0: hand-picked test indices for this section.
manually_selected_samples.append([2,23,189,534,1402,1200,294,298,181])

In [None]:
# NOTE(review): this loop explains its own literal list, not manually_selected_samples[0].
picked_samples = [1338,904,1038,335,1228]
for sample in picked_samples:
    print_sample(sample)
    lime_explain(sample)
    deep_shap_explain(sample)

In [None]:
# Display the raw tweet column of the test set.
test["tweet"]

## Actual: surprise, Predicted: none

In [None]:
# Group 1: hand-picked test indices for this section.
manually_selected_samples.append([154,1158,1014,848,811])

In [None]:
# Explain every sample of group 1 with both LIME and SHAP.
group_samples = manually_selected_samples[1]
for sample in group_samples:
    print_sample(sample)
    lime_explain(sample)
    deep_shap_explain(sample)

## Actual: love, Predicted: joy

In [None]:
# Group 2: hand-picked test indices for this section.
manually_selected_samples.append([298,320,932,985,276])

In [None]:
# Explain every sample of group 2 with both LIME and SHAP.
group_samples = manually_selected_samples[2]
for sample in group_samples:
    print_sample(sample)
    lime_explain(sample)
    deep_shap_explain(sample)

## Actual: anger, Predicted: sadness

In [None]:
# Group 3: hand-picked test indices for this section.
manually_selected_samples.append([114,1211,1309,1444,1001])

In [None]:
# Same indices as the list appended as group 3, written out literally.
picked_samples = [114,1211,1309,1444,1001]
for sample in picked_samples:
    print_sample(sample)
    lime_explain(sample)
    deep_shap_explain(sample)

## Actual: joy, Predicted: love

In [None]:
# Group 4: hand-picked test indices for this section.
manually_selected_samples.append([327,378,689,335,214])

In [None]:
# Same indices as the list appended as group 4, written out literally.
picked_samples = [327,378,689,335,214]
for sample in picked_samples:
    print_sample(sample)
    lime_explain(sample)
    deep_shap_explain(sample)

## Actual: sadness, Predicted: none

In [None]:
# Group 5: hand-picked test indices for this section.
manually_selected_samples.append([254,742,1255,820,862])

In [None]:
# Explain every sample of group 5 with both LIME and SHAP.
group_samples = manually_selected_samples[5]
for sample in group_samples:
    print_sample(sample)
    lime_explain(sample)
    deep_shap_explain(sample)

## Samples agreed on by all models

In [None]:
# Hand-picked test indices for the "Samples agreed on by all models" section.
samples_agreed_on = [3,2,1,429,43,354,56,391,44,263,32,172,17,1496,22]

In [None]:
# Explain every agreed-on sample with both LIME and SHAP.
for sample in list(samples_agreed_on):
    print_sample(sample)
    lime_explain(sample)
    deep_shap_explain(sample)