In [13]:
from datasets import load_dataset
import pandas as pd
# Load 2000 training and 500 test examples for speed
imdb = load_dataset("imdb")
train = pd.DataFrame(imdb["train"].select(range(2000)))
test = pd.DataFrame(imdb["test"].select(range(500)))
print("Train size:", len(train), "Test size:", len(test))
train.head()

Train size: 2000 Test size: 500


Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


In [14]:
import re
def clean_text(text):
    text = re.sub(r"<br\s*/?>", " ", text)
    text = re.sub(r"[^\w\s']", "", text)
    return text.lower()
train["text"] = train["text"].apply(clean_text)
test["text"] = test["text"].apply(clean_text)

In [15]:
from snorkel.labeling import labeling_function, LFAnalysis
from snorkel.labeling.model import LabelModel
ABSTAIN, NEG, POS = -1, 0, 1
positive_words = {"great","excellent","amazing","wonderful","best","fantastic"}
negative_words = {"bad","terrible","awful","worst","boring","poor"}
@labeling_function()
def lf_positive(x):
    return POS if any(w in x.text.split() for w in positive_words) else ABSTAIN
@labeling_function()
def lf_negative(x):
    return NEG if any(w in x.text.split() for w in negative_words) else ABSTAIN
@labeling_function()
def lf_exclaim(x):
    return POS if x.text.count("!") > 2 else ABSTAIN
lfs = [lf_positive, lf_negative, lf_exclaim]

In [16]:
from snorkel.labeling import PandasLFApplier, LFAnalysis

# Apply LFs
applier = PandasLFApplier(lfs)
L_train = applier.apply(train)

# Analyze LFs
analysis = LFAnalysis(L=L_train, lfs=lfs)
print(analysis.lf_summary())


100%|████████████████████████████████████| 2000/2000 [00:00<00:00, 10201.22it/s]


             j Polarity  Coverage  Overlaps  Conflicts
lf_positive  0      [1]    0.3210    0.1795     0.1795
lf_negative  1      [0]    0.5635    0.1795     0.1795
lf_exclaim   2       []    0.0000    0.0000     0.0000


In [17]:

label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=42)
# Get probabilistic labels
train_probs = label_model.predict_proba(L_train)
train_preds = label_model.predict(L_train)

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|                                                | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.293]
INFO:root:[100 epochs]: TRAIN:[loss=0.000]
 32%|███████████▋                        | 162/500 [00:00<00:00, 1618.56epoch/s]INFO:root:[200 epochs]: TRAIN:[loss=0.000]
INFO:root:[300 epochs]: TRAIN:[loss=0.000]
 67%|███████████████████████▉            | 333/500 [00:00<00:00, 1669.51epoch/s]INFO:root:[400 epochs]: TRAIN:[loss=0.000]
100%|████████████████████████████████████| 500/500 [00:00<00:00, 1657.27epoch/s]
INFO:root:Finished Training


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Vectorize
vectorizer = TfidfVectorizer(max_features=5_000)
X_train = vectorizer.fit_transform(train["text"])
y_train = train_preds
# Fit classifier
clf = LogisticRegression(max_iter=200)
clf.fit(X_train, y_train)
# Evaluate on test set
X_test = vectorizer.transform(test["text"])
y_test = test["label"]
preds = clf.predict(X_test)
print(classification_report(y_test, preds, target_names=["abstain","neg","pos"]))

              precision    recall  f1-score   support

     abstain       0.00      0.00      0.00         0
         neg       1.00      0.81      0.90       500
         pos       0.00      0.00      0.00         0

    accuracy                           0.81       500
   macro avg       0.33      0.27      0.30       500
weighted avg       1.00      0.81      0.90       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [23]:
import pandas as pd

# Take a random sample so both classes appear
train = pd.DataFrame(imdb["train"]).sample(2000, random_state=42)
test = pd.DataFrame(imdb["test"]).sample(500, random_state=42)

print(train["label"].value_counts())


label
0    1040
1     960
Name: count, dtype: int64


In [24]:
clf_fs = LogisticRegression(max_iter=200)
clf_fs.fit(X_train, train["label"])
fs_preds = clf_fs.predict(X_test)
print("Fully supervised performance:")
print(classification_report(y_test, fs_preds, target_names=["neg","pos"]))

Fully supervised performance:
              precision    recall  f1-score   support

         neg       1.00      0.59      0.74       500
         pos       0.00      0.00      0.00         0

    accuracy                           0.59       500
   macro avg       0.50      0.29      0.37       500
weighted avg       1.00      0.59      0.74       500



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
