In [54]:
# !pip install imbalanced-learn==0.8.1 -q


In [28]:
# !pip install nlpaug==1.1.0 transformers==3.0.2 -q
# !pip install snorkel==0.9.8 -q


In [34]:
# !pip install nlpaug transformers==4.11.3 -q

In [1]:
from sklearn import metrics
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import log_loss, precision_recall_fscore_support


In [2]:
import random
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import json
from collections import Counter
from sklearn.model_selection import train_test_split
import nlpaug.augmenter.word as naw
from sklearn.feature_extraction.text import TfidfVectorizer
# from imblearn.over_sampling import RandomOverSampler



In [3]:
def set_seeds(seed=42):
    """Seed the global RNGs of `random` and NumPy with the same value.

    Args:
        seed: value handed to both `random.seed` and `np.random.seed`.
    """
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


In [4]:
nltk.download("stopwords")
# Stopwords are common words that are usually filtered out because they carry little information.
STOPWORDS = stopwords.words("english")
# Stemming normalizes a word by reducing it to its stem (PorterStemmer is a stemmer, not a lemmatizer).
stemmer = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/princychahal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def clean_text(text, lower=True, stem=False, stopwords=STOPWORDS):
    """Clean raw text.

    Steps, in order: optional lowercasing, link removal, stopword removal,
    punctuation / non-alphanumeric filtering, whitespace normalization and
    optional stemming.

    Args:
        text: raw input string.
        lower: lowercase the text first when True.
        stem: apply the module-level `stemmer` to every token when True.
        stopwords: words to drop (matched as whole words); pass an empty
            sequence or None to keep all words.

    Returns:
        The cleaned string.
    """
    # Lower
    if lower:
        text = text.lower()

    # Remove links. This has to run before the stopword and punctuation
    # steps: once "://", "." and "/" are replaced by spaces the pattern can
    # no longer see the URL, and its fragments would stay in the text.
    text = re.sub(r"http\S+", "", text)

    # Remove stopwords (escaped so a word is always matched literally)
    if stopwords is not None and len(stopwords):
        pattern = re.compile(
            r'\b(' + r"|".join(re.escape(word) for word in stopwords) + r")\b\s*"
        )
        text = pattern.sub('', text)

    # Spacing and filters
    text = re.sub(
        r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text
    )  # add spacing between objects to be filtered
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()  # strip white space at the ends

    # Stemming
    if stem:
        text = " ".join([stemmer.stem(word, to_lowercase=lower) for word in text.split(" ")])

    return text

In [6]:
# Accepted tags (external constraint); `preprocess` maps every tag outside this list to "other".
ACCEPTED_TAGS = ["natural-language-processing", "computer-vision", "mlops", "graph-learning"]


def preprocess(df, lower, stem, min_freq):
    """Build the cleaned `text` column and normalize the `tag` column.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with `title`, `description` and `tag` columns.
        lower: forwarded to `clean_text`.
        stem: forwarded to `clean_text`.
        min_freq: tags occurring fewer than this many times become "other".

    Returns:
        The same DataFrame, with `text` added and `tag` rewritten.
    """
    # Feature engineering: title and description joined, then cleaned
    combined = df.title + " " + df.description
    df["text"] = combined.apply(clean_text, lower=lower, stem=stem)

    # Tags outside ACCEPTED_TAGS collapse into `other`
    out_of_scope = [t for t in df.tag.unique() if t not in ACCEPTED_TAGS]
    df["tag"] = df.tag.apply(lambda t: "other" if t in out_of_scope else t)

    # Tags seen fewer than min_freq times collapse into `other` as well
    tag_counts = Counter(df.tag.values)
    frequent_tags = {t for t, count in tag_counts.items() if count >= min_freq}
    kept = df.tag.apply(lambda t: t if t in frequent_tags else None)
    df["tag"] = kept.fillna("other")

    return df


In [7]:
def get_data_splits(X, y, train_size=0.7):
    """Split the data into stratified train / validation / test sets.

    Args:
        X: features.
        y: labels, also used for stratification.
        train_size: fraction given to the training split; the remainder is
            divided evenly between validation and test.

    Returns:
        Tuple `(X_train, X_val, X_test, y_train, y_val, y_test)`.
    """
    # First cut: train vs. everything else
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        X, y, train_size=train_size, stratify=y
    )
    # Second cut: the holdout is halved into validation and test
    X_val, X_test, y_val, y_test = train_test_split(
        X_holdout, y_holdout, train_size=0.5, stratify=y_holdout
    )
    splits = (X_train, X_val, X_test, y_train, y_val, y_test)
    return splits

In [8]:
class LabelEncoder(object):
    """Encode labels into unique indices.

    Attributes:
        class_to_index: dict mapping each class label to its integer index.
        index_to_class: inverse mapping (index -> class label).
        classes: list of class labels, in the order of `class_to_index`.
    """

    def __init__(self, class_to_index=None):
        # Copy so the encoder never shares state with the caller's dict
        # (and never with a mutable default argument).
        self.class_to_index = dict(class_to_index) if class_to_index else {}
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())

    def __len__(self):
        return len(self.class_to_index)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def fit(self, y):
        """Learn the label -> index mapping from `y` and return `self`.

        The mapping is rebuilt from scratch on every call, so refitting on
        different labels cannot leave stale classes or duplicate indices.
        """
        # .tolist() converts NumPy scalars to native Python objects, which
        # keeps the mapping serializable by `save`.
        classes = np.unique(y).tolist()
        self.class_to_index = {class_: i for i, class_ in enumerate(classes)}
        self.index_to_class = {v: k for k, v in self.class_to_index.items()}
        self.classes = list(self.class_to_index.keys())
        return self

    def encode(self, y):
        """Return an integer array with the index of every label in `y`.

        Raises:
            KeyError: if a label was not seen by `fit`.
        """
        return np.array([self.class_to_index[item] for item in y], dtype=int)

    def decode(self, y):
        """Return the list of class labels for the indices in `y`.

        Raises:
            KeyError: if an index is not part of the mapping.
        """
        return [self.index_to_class[item] for item in y]

    def save(self, fp):
        """Write `class_to_index` as JSON to the file path `fp`."""
        contents = {"class_to_index": self.class_to_index}
        with open(fp, "w") as file:
            json.dump(contents, file, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        """Create an encoder from a JSON file written by `save`."""
        with open(fp, "r") as file:
            kwargs = json.load(fp=file)
        return cls(**kwargs)


In [9]:
# Set up
# Tags that occur fewer than min_freq times are folded into "other" by `preprocess`.
min_freq = 75
# Seed the RNGs before any shuffling or splitting happens.
set_seeds()
df = pd.read_csv("labeled_projects.csv")
# Shuffle the rows (frac=1 keeps every row) and rebuild a clean 0..n-1 index.
df = df.sample(frac=1).reset_index(drop=True)
df = preprocess(df, lower=True, stem=False, min_freq=min_freq)
# The encoder is fit on the tags after preprocessing, i.e. including "other".
label_encoder = LabelEncoder().fit(df.tag)
# Features are the cleaned texts, targets are the integer-encoded tags.
X_train, X_val, X_test, y_train, y_val, y_test = \
    get_data_splits(X=df.text.to_numpy(), y=label_encoder.encode(df.tag))


In [10]:
# Keep a reference to the raw test texts; X_test itself is replaced by its TF-IDF matrix in the next cell.
X_test_raw = X_test

In [11]:
# Tf-idf
# The vectorizer is fit on the training split only; validation and test are
# transformed with what was learned from training.
# NOTE(review): this cell rebinds X_train / X_val / X_test from raw texts to
# their vectorized form, so it likely cannot be re-run without re-running the
# set-up cell first.
vectorizer = TfidfVectorizer(analyzer="char", ngram_range=(2,7))  # char n-grams
# print (X_train[0])
X_train = vectorizer.fit_transform(X_train)
X_val = vectorizer.transform(X_val)
X_test = vectorizer.transform(X_test)
# print(X_train[0])
# print (X_train.shape)  # scipy.sparse.csr_matrix


In [12]:
# Class weights
# counts[i] is the number of training samples with encoded label i;
# each class is weighted by the inverse of its count.
# NOTE(review): `class_weights` is only printed here; none of the visible
# cells pass it to the model.
counts = np.bincount(y_train)
class_weights = {i: 1.0/count for i, count in enumerate(counts)}
print (f"class counts: {counts},\nclass weights: {class_weights}")


class counts: [249  55 272  92],
class weights: {0: 0.004016064257028112, 1: 0.01818181818181818, 2: 0.003676470588235294, 3: 0.010869565217391304}


In [13]:
# Initialize model
# Logistic-loss linear classifier with L2 regularization trained by SGD.
# max_iter=1 together with warm_start=True is presumably meant to make every
# fit() call in the training loop below run one more epoch starting from the
# previous solution -- confirm against the scikit-learn version in use.
model = SGDClassifier(
    loss="log", penalty="l2", alpha=1e-4, max_iter=1,
    learning_rate="constant", eta0=1e-1, power_t=0.1,
    warm_start=True)


In [14]:
# Train model: one fit() call per epoch, logging the losses every 10th epoch
num_epochs = 100
for epoch in range(num_epochs):
    # One more pass over the training data
    model.fit(X_train, y_train)

    # Log-loss on the training and validation splits
    train_loss = log_loss(y_train, model.predict_proba(X_train))
    val_loss = log_loss(y_val, model.predict_proba(X_val))

    if epoch % 10 == 0:
        message = (
            f"Epoch: {epoch:02d} | "
            f"train_loss: {train_loss:.5f}, "
            f"val_loss: {val_loss:.5f}"
        )
        print(message)




Epoch: 00 | train_loss: 1.18299, val_loss: 1.20148
Epoch: 10 | train_loss: 0.54027, val_loss: 0.67864




Epoch: 20 | train_loss: 0.37319, val_loss: 0.55959
Epoch: 30 | train_loss: 0.29271, val_loss: 0.50606
Epoch: 40 | train_loss: 0.24549, val_loss: 0.47539




Epoch: 50 | train_loss: 0.21513, val_loss: 0.45581
Epoch: 60 | train_loss: 0.19445, val_loss: 0.44276
Epoch: 70 | train_loss: 0.17978, val_loss: 0.43350




Epoch: 80 | train_loss: 0.16901, val_loss: 0.42670
Epoch: 90 | train_loss: 0.16055, val_loss: 0.42135




In [15]:
# Evaluate
# Weighted precision / recall / F1 of the plain model predictions on the test split.
# NOTE(review): `metrics` rebinds the name imported by `from sklearn import metrics`
# at the top of the notebook.
y_pred = model.predict(X_test)
metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
print(metrics)
performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}
print (json.dumps(performance, indent=2))


(0.8804728087816324, 0.875, 0.8738912212360173, None)
{
  "precision": 0.8804728087816324,
  "recall": 0.875,
  "f1": 0.8738912212360173
}


In [16]:
# Inference (with tokens similar to training data)
# The text is vectorized with the fitted TF-IDF vectorizer, classified, and the
# predicted index is mapped back to its tag name.
text = "Transfer learning with transformers for text classification."
y_pred = model.predict(vectorizer.transform([text]))
label_encoder.decode(y_pred)


['natural-language-processing']

In [17]:
# Probabilities: print the raw row, then display it keyed by tag name
y_prob = model.predict_proba(vectorizer.transform([text]))
print(y_prob)
dict(zip(label_encoder.classes, y_prob[0]))


[[0.02063355 0.00259423 0.96928256 0.00748966]]


{'computer-vision': 0.020633551628573863,
 'mlops': 0.002594230407916429,
 'natural-language-processing': 0.9692825609694882,
 'other': 0.007489656994021604}

In [18]:
# Determine the first quartile (Q1) of the probability the model assigns to
# its own predicted class (on the validation split)
y_pred = model.predict(X_val)
y_prob = model.predict_proba(X_val)
predicted_class_probs = [probs[label] for probs, label in zip(y_prob, y_pred)]
threshold = np.quantile(predicted_class_probs, q=0.25)  # Q1
threshold


0.6890960095210326

In [19]:
# Custom predict function
def custom_predict(y_prob, threshold, index):
    """Predict the most likely class, or `index` when the model is unsure.

    Args:
        y_prob: iterable of per-sample class-probability rows.
        threshold: a row's highest probability must be strictly greater than
            this for its argmax to be used.
        index: class index used for every row that does not pass the threshold.

    Returns:
        NumPy array with one predicted class index per row.
    """
    predictions = []
    for p in y_prob:
        if max(p) > threshold:
            predictions.append(np.argmax(p))
        else:
            predictions.append(index)
    return np.array(predictions)


In [20]:
def predict_tag(texts):
    """Predict a tag name for each text, falling back to "other".

    Uses the notebook-level `vectorizer`, `model`, `label_encoder` and
    `threshold`; a text whose top probability does not exceed `threshold`
    is tagged "other".

    Args:
        texts: iterable of input strings.

    Returns:
        List of tag names, one per text.
    """
    features = vectorizer.transform(texts)
    probabilities = model.predict_proba(features)
    fallback_index = label_encoder.class_to_index["other"]
    predictions = custom_predict(
        y_prob=probabilities, threshold=threshold, index=fallback_index
    )
    return label_encoder.decode(predictions)


In [21]:
# Inference (with tokens not similar to training data)
# predict_tag returns "other" when the top class probability does not exceed `threshold`.
text = "Interpretability methods for explaining model behavior."
predict_tag(texts=[text])


['natural-language-processing']

In [22]:
# Evaluate
# Same weighted metrics as before, but with the thresholded predictions:
# samples whose top probability does not exceed `threshold` are predicted as "other".
y_prob = model.predict_proba(X_test)
other_index = label_encoder.class_to_index["other"]
y_pred = custom_predict(y_prob=y_prob, threshold=threshold, index=other_index)
metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
performance = {"precision": metrics[0], "recall": metrics[1], "f1": metrics[2]}
print (json.dumps(performance, indent=2))


{
  "precision": 0.9116161616161617,
  "recall": 0.7569444444444444,
  "f1": 0.7917810016494227
}


In [23]:
# Overall metrics (weighted over classes); "class" is filled in by a later cell
overall_metrics = precision_recall_fscore_support(y_test, y_pred, average="weighted")
metrics = {
    "overall": {
        "precision": overall_metrics[0],
        "recall": overall_metrics[1],
        "f1": overall_metrics[2],
        "num_samples": np.float64(len(y_test)),
    },
    "class": {},
}
print (json.dumps(metrics["overall"], indent=4))


{
    "precision": 0.9116161616161617,
    "recall": 0.7569444444444444,
    "f1": 0.7917810016494227,
    "num_samples": 144.0
}


In [24]:
# average=None returns one value per class instead of a single aggregate;
# the tuple holds precision, recall, f-score and support arrays, in that order.
class_metrics = precision_recall_fscore_support(y_test, y_pred, average=None)
class_metrics

(array([1.        , 1.        , 1.        , 0.36363636]),
 array([0.66666667, 0.58333333, 0.79310345, 1.        ]),
 array([0.8       , 0.73684211, 0.88461538, 0.53333333]),
 array([54, 12, 58, 20]))

In [25]:
# Per-class metrics, stored under the tag name of each class
class_metrics = precision_recall_fscore_support(y_test, y_pred, average=None)
precisions, recalls, f1_scores, supports = class_metrics
for i, _class in enumerate(label_encoder.classes):
    per_class = {}
    per_class["precision"] = precisions[i]
    per_class["recall"] = recalls[i]
    per_class["f1"] = f1_scores[i]
    per_class["num_samples"] = np.float64(supports[i])
    metrics["class"][_class] = per_class


In [26]:
# TP, FP, FN samples: test-set positions grouped by outcome for one tag
tag = "mlops"
index = label_encoder.class_to_index[tag]
tp, fp, fn = [], [], []
for i, (true, pred) in enumerate(zip(y_test, y_pred)):
    is_actual = true == index
    is_predicted = pred == index
    if is_actual and is_predicted:
        tp.append(i)  # labelled `tag` and predicted `tag`
    elif is_predicted:
        fp.append(i)  # predicted `tag` but labelled something else
    elif is_actual:
        fn.append(i)  # labelled `tag` but predicted something else


In [27]:
# Test-set positions of the true positives, false positives and false negatives for `tag`
print (tp)
print (fp)
print (fn)


[5, 47, 52, 96, 111, 123, 129]
[]
[0, 38, 130, 136, 141]


In [28]:
# y
# Recompute the test probabilities and compare shapes: one label per sample
# in y_test, one row of class probabilities per sample in y_prob.
y_prob = model.predict_proba(X_test)
print (np.shape(y_test))
print (np.shape(y_prob))


(144,)
(144, 4)


In [29]:
# Used to show raw text
# Pairs every raw (un-vectorized) test text with its decoded true tag.
test_df = pd.DataFrame({"text": X_test_raw, "tag": label_encoder.decode(y_test)})


In [30]:
# Tag to inspect
tag = "mlops"
index = label_encoder.class_to_index[tag]
# Positions of the test samples whose true label is `tag`.
indices = np.where(y_test==index)[0]


In [31]:
# np.where with a single condition returns a tuple of index arrays, hence the [0] used above.
np.where(y_test==index)

(array([  0,   5,  38,  47,  52,  96, 111, 123, 129, 130, 136, 141]),)

In [32]:
# Confidence score for the correct class is below a threshold
# For every test sample whose true label is the inspected tag, keep the ones
# where the model gave that tag a probability of at most `min_threshold`.
low_confidence = []
min_threshold = 0.5
for i in indices:
    prob = y_prob[i][index]  # probability assigned to the true (inspected) class
    if prob <= min_threshold:
        low_confidence.append({"text": test_df.text[i],
                               "true": label_encoder.index_to_class[y_test[i]],
                               "pred": label_encoder.index_to_class[y_pred[i]],
                               "prob": prob})


In [33]:
# Display the collected low-confidence samples.
low_confidence

[{'text': 'pytest pytest framework makes easy write small tests yet scales support complex functional testing',
  'true': 'mlops',
  'pred': 'other',
  'prob': 0.48004704907313744},
 {'text': 'hidden technical debt machine learning systems using software engineering framework technical debt find common incur massive ongoing maintenance costs real world ml systems',
  'true': 'mlops',
  'pred': 'other',
  'prob': 0.3382731457651776},
 {'text': 'docker help become effective data scientist look docker perspective data scientist',
  'true': 'mlops',
  'pred': 'other',
  'prob': 0.38298721421694343},
 {'text': 'neptune ai lightweight experiment management tool fits workflow',
  'true': 'mlops',
  'pred': 'other',
  'prob': 0.253380937421862}]

In [34]:
#!pip install cleanlab==1.0.1 -q
import cleanlab
from cleanlab.pruning import get_noise_indices



In [35]:
# Determine potential labeling errors
# The given test labels (s) and the predicted class probabilities (psx) are
# handed to cleanlab; the result is stored as the candidate error positions.
# NOTE(review): these keyword names follow the cleanlab 1.x API pinned in the
# install comment above -- confirm before upgrading cleanlab.
label_error_indices = get_noise_indices(
            s=y_test,
            psx=y_prob,
            sorted_index_method="self_confidence",
            verbose=0)


In [37]:
# Test-set positions flagged as potential labeling errors.
label_error_indices

array([ 41, 140])

In [36]:
# Show up to `num_samples` of the flagged samples: raw text, given label and
# the (thresholded) prediction.
num_samples = 10
# A dedicated loop variable is used so that the global `index` (the class
# index of the inspected tag) is not overwritten by a row position.
for error_index in label_error_indices[:num_samples]:
    sample = test_df.iloc[error_index]
    print ("text:", sample.text)
    print ("true:", sample.tag)
    print ("pred:", label_encoder.decode([y_pred[error_index]])[0])
    print ()


text: extracting structured data templatic documents automatically extract data structured documents invoices receipts etc potential streamline many business workflows
true: computer-vision
pred: other

text: module 2 convolutional neural networks cs231n lecture 5 move fully connected neural networks convolutional neural networks
true: computer-vision
pred: other

