In [1]:
import re
import emoji
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from cleanlab import Datalab
from datasets import Dataset, DatasetDict
from sentence_transformers import SentenceTransformer

In [2]:
from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split

In [4]:
# import fasttext

# encoder = fasttext.load_model("./models/cc.ar.300.bin")

In [5]:
import torch

device = f"cuda:{torch.cuda.current_device()}" if torch.cuda.is_available() else "cpu"
encoder = SentenceTransformer("UBC-NLP/MARBERT", device=device)

No sentence-transformers model found with name C:\Users\ehhho/.cache\torch\sentence_transformers\UBC-NLP_MARBERT. Creating a new one with MEAN pooling.


In [6]:
seed = 42
clean_data_path = "./data/irony_clean_data"
processed_data_path = "./data/irony_processed_data"
train_data_path = "./data/irony_training_data.csv"
test_data_path = "./data/irony_testing_data.csv"

In [7]:
train_data = pd.read_csv(train_data_path)[["tweet", "sarcasm"]]
train_data["sarcasm"] = train_data.sarcasm.astype(int)
train_data.head()

Unnamed: 0,tweet,sarcasm
0,"""د. #محمود_العلايلي:أرى أن الفريق #أحمد_شفيق ر...",0
1,"""مع فيدرر يا آجا والكبار 😍 https://t.co/hrBeHb...",0
2,“الداعون لمبدأ الاختلاط بين الجنسين؛ كالداعين ...,1
3,"""@ihe_94 @ya78m @amooo5 @badiajnikhar @Oukasaf...",1
4,"""قل شرق حلب ولا تقل حلب الشرقية ....وقل غرب حل...",0


In [8]:
test_data = pd.read_csv(test_data_path)[["tweet", "sarcasm"]]
test_data["sarcasm"] = test_data.sarcasm.astype(int)
test_data.head()

Unnamed: 0,tweet,sarcasm
0,اخوي حانق يالغلا وشفيك معصب؟ عادي تراهم بشر يف...,0
1,اف مو متعوده عليهم سته https://t.co/8igFPx1i26,1
2,اللهم اشفِ مرضانا ومرضى المسلمين . . ♥️,0
3,ابشركم طلقت السات 😘.,0
4,مؤشر خطير: ٩٠٪ من الشخصيات البرلمانية في الكوي...,1


In [9]:
data = pd.concat([train_data, test_data], ignore_index=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15548 entries, 0 to 15547
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   tweet    15548 non-null  object
 1   sarcasm  15548 non-null  int32 
dtypes: int32(1), object(1)
memory usage: 182.3+ KB


In [10]:
# dataset = DatasetDict(
#     {
#         "train": Dataset.from_pandas(train_data),
#         "test": Dataset.from_pandas(test_data),
#     }
# )

# dataset

In [11]:
dataset = Dataset.from_pandas(data)

dataset

Dataset({
    features: ['tweet', 'sarcasm'],
    num_rows: 15548
})

In [12]:
def clean_text(text):
    clean_text = re.sub(r"http\S+|t\.co/\S+", "", text)
    clean_text = re.sub(r"@\w+", "", clean_text)
    clean_text = re.sub(r"#", "", clean_text)
    clean_text = re.sub(r"_", " ", clean_text)
    # tashqeel - from @bakriano
    clean_text = re.sub(r"[\u0617-\u061A\u064B-\u0652]", "", clean_text)
    clean_text = emoji.replace_emoji(clean_text, replace="")
    clean_text = re.sub(r"\s+", " ", clean_text)
    return clean_text.replace("RT :", "").strip()

In [13]:
dataset = dataset.rename_column("sarcasm", "labels")
clean_data = dataset.map(
    lambda x: {"text": [clean_text(t) for t in x["tweet"]]},
    batched=True,
    remove_columns=["tweet"],
)

clean_data

Map:   0%|          | 0/15548 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'text'],
    num_rows: 15548
})

In [14]:
clean_data.save_to_disk(processed_data_path)

Saving the dataset (0/1 shards):   0%|          | 0/15548 [00:00<?, ? examples/s]

In [15]:
def vectorize(batch):
    # return {"features": [encoder.get_sentence_vector(t) for t in batch["text"]]}
    return {
        "features": encoder.encode(batch["text"], convert_to_numpy=True, device=device)
    }

In [16]:
encoded_data = clean_data.map(vectorize, batched=True)

Map:   0%|          | 0/15548 [00:00<?, ? examples/s]

In [108]:
w = encoded_data.to_pandas().labels.value_counts(normalize=True).to_list()
w = {i: w[i] for i in range(len(w))}
w

{0: 0.884174692793092, 1: 0.11582530720690801}

In [109]:
train_params = {"penalty": "l1", "solver": "liblinear", "C": 1000.0}

In [110]:
model = LogisticRegression(
    class_weight=w, random_state=seed, max_iter=int(1e5), verbose=1, **train_params
)
pred_probs = cross_val_predict(
    estimator=model,
    X=encoded_data[:]["features"],
    y=encoded_data[:]["labels"],
    cv=10,
    method="predict_proba",
)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

In [111]:
data = encoded_data.to_dict()

In [112]:
lab = Datalab(data, label_name="labels")
lab.find_issues(pred_probs=pred_probs, features=np.array(data["features"]))

Finding label issues ...
Finding outlier issues ...
Fitting OOD estimator based on provided features ...
Finding near_duplicate issues ...
Audit complete. 579 issues found in the dataset.


In [113]:
lab.report()

Here is a summary of the different kinds of issues found in the data:

    issue_type  num_issues
near_duplicate         572
         label           5
       outlier           2

Dataset Information: num_examples: 12044, num_classes: 2


------------------ near_duplicate issues -------------------

About this issue:
	A (near) duplicate issue refers to two or more examples in
    a dataset that are extremely similar to each other, relative
    to the rest of the dataset.  The examples flagged with this issue
    may be exactly duplicated, or lie atypically close together when
    represented as vectors (i.e. feature embeddings).
    

Number of examples with this issue: 572
Overall dataset quality in terms of this issue: 0.0030

Examples representing most severe instances of this issue:
      is_near_duplicate_issue  near_duplicate_score                                          near_duplicate_sets  distance_to_nearest_neighbor
7117                     True                   0.0  [4568,

In [114]:
issues = lab.get_issues()

In [115]:
issues.head()

Unnamed: 0,is_label_issue,label_score,is_outlier_issue,outlier_score,is_near_duplicate_issue,near_duplicate_score
0,False,1.0,False,0.997248,False,0.002209
1,False,1.0,False,0.996134,False,0.000612
2,False,1.0,False,0.995298,False,0.004305
3,False,1.0,False,0.99517,False,0.003874
4,False,0.998774,False,0.995505,False,0.004138


In [116]:
issues_flag = np.any(issues[["is_label_issue", "is_outlier_issue"]], axis=1).to_list()

In [117]:
encoded_data = encoded_data.add_column("is_issued", issues_flag)
encoded_data

Flattening the indices:   0%|          | 0/12044 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'text', 'features', 'is_issued'],
    num_rows: 12044
})

In [118]:
encoded_data = encoded_data.filter(lambda x: not x["is_issued"])

Filter:   0%|          | 0/12044 [00:00<?, ? examples/s]

In [119]:
encoded_data = encoded_data.remove_columns("is_issued")

In [120]:
encoded_data

Dataset({
    features: ['labels', 'text', 'features'],
    num_rows: 12037
})

In [121]:
encoded_data = encoded_data.remove_columns("features")

In [122]:
encoded_data

Dataset({
    features: ['labels', 'text'],
    num_rows: 12037
})

In [123]:
encoded_data.save_to_disk(clean_data_path)

Saving the dataset (0/1 shards):   0%|          | 0/12037 [00:00<?, ? examples/s]