# Text Classification with Noisy Labels


## 1. Import required dependencies

In [2]:
import random
import re
import string

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sentence_transformers import SentenceTransformer

from cleanlab.classification import CleanLearning

In [3]:
# Notebook-wide configuration: display options and random seeds.
# (`random` and `numpy` are imported with the other dependencies in the
# imports cell at the top of the notebook.)

# Show the full text of every example instead of truncating long strings.
pd.set_option("display.max_colwidth", None)

# Single seed shared by the stochastic steps of this notebook.
SEED = 123456

# Seed both NumPy's global RNG and Python's built-in RNG.
np.random.seed(SEED)
random.seed(SEED)

## 2. Load and format the text dataset

In [7]:
# Download the banking intent-classification dataset: one customer utterance
# (`text`) and its intent name (`label`) per row.
df = pd.read_csv(
    filepath_or_buffer="https://s.cleanlab.ai/banking-intent-classification.csv"
)

# Preview the first few rows.
df.head()

Unnamed: 0,text,label
0,i accidentally made a payment to a wrong account. what should i do?,cancel_transfer
1,"i no longer want to transfer funds, can we cancel that transaction?",cancel_transfer
2,"cancel my transfer, please.",cancel_transfer
3,i want to revert this mornings transaction.,cancel_transfer
4,i just realised i made the wrong payment yesterday. can you please change it to the right account? it's my rent payment and really really needs to be in the right account by tomorrow,cancel_transfer


In [8]:
# Pull the utterances and their intent labels out of the DataFrame as arrays.
raw_texts, raw_labels = df["text"].to_numpy(), df["label"].to_numpy()

# Hold out 10% of the examples for testing. Passing `random_state` pins the
# split to SEED, so re-running this cell (or running cells out of order) always
# yields the same partition instead of depending on the global NumPy RNG state.
raw_train_texts, raw_test_texts, raw_train_labels, raw_test_labels = train_test_split(
    raw_texts, raw_labels, test_size=0.1, random_state=SEED
)

In [13]:
# Collect the distinct intent names once. Sorting gives a stable display order;
# iterating a set of strings can yield a different order in every kernel session.
class_names = sorted(set(raw_train_labels))
num_classes = len(class_names)

print(f"This dataset has {num_classes} classes.")
print(f"Classes: {class_names}")

This dataset has 10 classes.
Classes: {'beneficiary_not_allowed', 'card_payment_fee_charged', 'lost_or_stolen_phone', 'supported_cards_and_currencies', 'change_pin', 'apple_pay_or_google_pay', 'cancel_transfer', 'card_about_to_expire', 'getting_spare_card', 'visa_or_mastercard'}


In [14]:
# Index of the training example to inspect; change it to look at another one.
i = 0

# Show the label and the utterance of that example, one per line.
print(
    f"Example Label: {raw_train_labels[i]}",
    f"Example Text: {raw_train_texts[i]}",
    sep="\n",
)

Example Label: cancel_transfer
Example Text: i want to revert this mornings transaction.


In [16]:
# Map the string intent names to integer class ids. The mapping is learned from
# the training labels only and then applied unchanged to both splits.
encoder = LabelEncoder().fit(raw_train_labels)

train_labels, test_labels = (
    encoder.transform(split) for split in (raw_train_labels, raw_test_labels)
)

In [18]:
# Load the pretrained ELECTRA checkpoint through sentence-transformers. It is not
# a native sentence-transformers model, so the library wraps it with mean pooling
# (see the notice printed when this cell runs).
transformer = SentenceTransformer(
    model_name_or_path="google/electra-small-discriminator"
)

# Embed every training utterance. Despite its name, `train_texts` holds the
# numeric embedding vectors (a float32 array), not the raw strings.
train_texts = transformer.encode(sentences=raw_train_texts)
train_texts

No sentence-transformers model found with name google/electra-small-discriminator. Creating a new one with MEAN pooling.


array([[ 0.09906125,  0.10903942, -0.3578913 , ..., -0.00198934,
        -0.32687584, -0.1739869 ],
       [ 0.09361263,  0.1858116 ,  0.07459928, ...,  0.14751613,
         0.08302285,  0.20097497],
       [-0.12223459,  0.15505424, -0.07183969, ...,  0.00379977,
        -0.28981966,  0.35612258],
       ...,
       [-0.3994856 , -0.1298121 , -0.00607546, ..., -0.20033723,
        -0.66783416,  0.8242751 ],
       [ 0.38461944,  0.08989605, -0.01956453, ..., -0.05059751,
        -0.2893063 ,  0.291096  ],
       [-0.2754149 ,  0.2643168 ,  0.11367966, ..., -0.04313809,
        -0.19180089,  0.42906156]], dtype=float32)

## 3. Define a classification model and use cleanlab to find potential label errors

<a id="section3"></a>

In [None]:
# run all above cells