In [7]:


import re
import shutil
from pathlib import Path
import pandas as pd
from datasets import load_dataset, Dataset
from itertools import islice

from snorkel.labeling import labeling_function, LFAnalysis, PandasLFApplier
# Corrected import for LabelModel
from snorkel.labeling.model import LabelModel

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

#   1. Load and Explore the IMDb Dataset

# IMPORTANT: Clear corrupted cache first
print("Clearing IMDb dataset cache...")
cache_dir = Path.home() / ".cache" / "huggingface" / "datasets" / "imdb"
if cache_dir.exists():
    try:
        shutil.rmtree(cache_dir)
        print("Cache cleared successfully!")
    except Exception as e:
        print(f"Could not clear cache: {e}")
else:
    print("No cache found.")

# Try alternative: Load without split parameter first
print("\nDownloading IMDb dataset (this may take a minute)...")
try:
    # Method 1: Load entire dataset, no split parameter
    imdb_full = load_dataset("imdb")
    print("Dataset loaded successfully!")

    # Now extract the splits we need
    print("Extracting subsets...")

    #   FIX APPLIED HERE
    # Add .shuffle(seed=42) to get a random mix of labels
    train_data = imdb_full["train"].shuffle(seed=42).select(range(2000))
    test_data = imdb_full["test"].shuffle(seed=42).select(range(500))

except Exception as e:
    print(f"Method 1 failed: {e}")
    print("\nTrying alternative method...")

    # Method 2: Use streaming mode (doesn't cache)
    print("Using streaming mode...")

    #   FIX APPLIED HERE
    # Add .shuffle() to the stream
    train_stream = load_dataset("imdb", split="train", streaming=True).shuffle(seed=42, buffer_size=10000)
    test_stream = load_dataset("imdb", split="test", streaming=True).shuffle(seed=42, buffer_size=10000)

    # Convert to list with limit
    train_list = list(islice(train_stream, 2000))
    test_list = list(islice(test_stream, 500))

    # Convert to dataset format
    train_data = Dataset.from_dict({
        'text': [x['text'] for x in train_list],
        'label': [x['label'] for x in train_list]
    })
    test_data = Dataset.from_dict({
        'text': [x['text'] for x in test_list],
        'label': [x['label'] for x in test_list]
    })

train = pd.DataFrame(train_data)
test = pd.DataFrame(test_data)

print("Train size:", len(train), "Test size:", len(test))
print("\nFirst few training examples:")
print(train.head())
print("\nLabel distribution in training data:")
print(train['label'].value_counts())


#   2. Preprocess Text
def clean_text(text):
    text = re.sub(r"<br\s*/?>", " ", text)
    text = re.sub(r"[^\w\s']", "", text)
    return text.lower()

print("\nCleaning text...")
train["text"] = train["text"].apply(clean_text)
test["text"] = test["text"].apply(clean_text)
print("Text cleaning complete!")


#   3. Define Labeling Functions (LFs)
ABSTAIN, NEG, POS = -1, 0, 1

positive_words = {"great", "excellent", "amazing", "wonderful", "best", "fantastic"}
negative_words = {"bad", "terrible", "awful", "worst", "boring", "poor"}

@labeling_function()
def lf_positive(x):
    return POS if any(w in x.text.split() for w in positive_words) else ABSTAIN

@labeling_function()
def lf_negative(x):
    return NEG if any(w in x.text.split() for w in negative_words) else ABSTAIN

@labeling_function()
def lf_exclaim(x):
    return POS if x.text.count("!") > 2 else ABSTAIN

lfs = [lf_positive, lf_negative, lf_exclaim]

# Analyze LF Coverage & Conflicts
print("\nApplying labeling functions...")
applier = PandasLFApplier(lfs)
L_train = applier.apply(train)

print("\nLabeling Function Analysis:")
print(LFAnalysis(L_train, lfs).lf_summary())


#   4. Train the LabelModel
print("\nTraining Label Model...")
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, log_freq=100, seed=42)

# Get probabilistic labels
train_probs = label_model.predict_proba(L_train)
train_preds = label_model.predict(L_train)

print(f"\nLabeled {len([p for p in train_preds if p != -1])} out of {len(train_preds)} training examples")


#   5. Train an End-to-End Classifier
# Filter out abstained predictions for training
train_filtered_idx = train_preds != -1
X_train_filtered = train[train_filtered_idx]["text"]
y_train_filtered = train_preds[train_filtered_idx]

print(f"\nTraining classifier on {len(y_train_filtered)} labeled examples...")

# Vectorize
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train_filtered)

# Fit classifier
clf = LogisticRegression(max_iter=200, random_state=42)
clf.fit(X_train_vec, y_train_filtered)

# Evaluate on test set
X_test_vec = vectorizer.transform(test["text"])
y_test = test["label"]
preds = clf.predict(X_test_vec)


print(classification_report(y_test, preds, target_names=["neg", "pos"], labels=[0, 1]))
print(f"Accuracy: {accuracy_score(y_test, preds):.4f}")


#   6. Evaluate Weak Supervision vs. Fully Supervised
print("\nTraining fully supervised baseline...")
# Use all training data with true labels
# We must use the *same* vectorizer, but fit_transform on the *full* train["text"]
X_train_full = vectorizer.fit_transform(train["text"])
y_train_full = train["label"]

clf_fs = LogisticRegression(max_iter=200, random_state=42)
clf_fs.fit(X_train_full, y_train_full)

# We must also re-transform the test set with this new vectorizer
X_test_full = vectorizer.transform(test["text"])
fs_preds = clf_fs.predict(X_test_full)


print(classification_report(y_test, fs_preds, target_names=["neg", "pos"], labels=[0, 1]))
print(f"Accuracy: {accuracy_score(y_test, fs_preds):.4f}")


#   Optional: Visualize confusion matrices

print("\nWeak Supervision Confusion Matrix:")
print("         Predicted")
print("         Neg  Pos")
cm_ws = confusion_matrix(y_test, preds, labels=[0, 1])
print(f"Actual Neg {cm_ws[0]}")
print(f"       Pos {cm_ws[1]}")

print("\nFully Supervised Confusion Matrix:")
print("         Predicted")
print("         Neg  Pos")
cm_fs = confusion_matrix(y_test, fs_preds, labels=[0, 1])
print(f"Actual Neg {cm_fs[0]}")
print(f"       Pos {cm_fs[1]}")


# Summary

print(f"Weak Supervision Accuracy: {accuracy_score(y_test, preds):.4f}")
print(f"Fully Supervised Accuracy: {accuracy_score(y_test, fs_preds):.4f}")
print(f"Labeled Examples Used (Weak): {len(y_train_filtered)} / {len(train)}")
print(f"Labeled Examples Used (Full): {len(train)} / {len(train)}")


Clearing IMDb dataset cache...
No cache found.

Downloading IMDb dataset (this may take a minute)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset loaded successfully!
Extracting subsets...
Train size: 2000 Test size: 500

First few training examples:
                                                text  label
0  There is no relation at all between Fortier an...      1
1  This movie is a great. The plot is very true t...      1
2  George P. Cosmatos' "Rambo: First Blood Part I...      0
3  In the process of trying to establish the audi...      1
4  Yeh, I know -- you're quivering with excitemen...      0

Label distribution in training data:
label
1    1000
0    1000
Name: count, dtype: int64

Cleaning text...
Text cleaning complete!

Applying labeling functions...


100%|██████████| 2000/2000 [00:00<00:00, 6682.89it/s]



Labeling Function Analysis:
             j Polarity  Coverage  Overlaps  Conflicts
lf_positive  0      [1]    0.4500    0.1385     0.1385
lf_negative  1      [0]    0.3755    0.1385     0.1385
lf_exclaim   2       []    0.0000    0.0000     0.0000

Training Label Model...


100%|██████████| 500/500 [00:00<00:00, 924.83epoch/s] 



Labeled 1374 out of 2000 training examples

Training classifier on 1374 labeled examples...
              precision    recall  f1-score   support

         neg       0.73      0.81      0.77       254
         pos       0.78      0.70      0.74       246

    accuracy                           0.76       500
   macro avg       0.76      0.76      0.75       500
weighted avg       0.76      0.76      0.76       500

Accuracy: 0.7560

Training fully supervised baseline...
              precision    recall  f1-score   support

         neg       0.84      0.81      0.82       254
         pos       0.81      0.84      0.82       246

    accuracy                           0.82       500
   macro avg       0.82      0.82      0.82       500
weighted avg       0.82      0.82      0.82       500

Accuracy: 0.8220

Weak Supervision Confusion Matrix:
         Predicted
         Neg  Pos
Actual Neg [207  47]
       Pos [ 75 171]

Fully Supervised Confusion Matrix:
         Predicted
         N

In [6]:
!pip install pandas datasets scikit-learn snorkel

Collecting snorkel
  Downloading snorkel-0.10.0-py3-none-any.whl.metadata (9.5 kB)
Collecting munkres>=1.0.6 (from snorkel)
  Downloading munkres-1.1.4-py2.py3-none-any.whl.metadata (980 bytes)
Downloading snorkel-0.10.0-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading munkres-1.1.4-py2.py3-none-any.whl (7.0 kB)
Installing collected packages: munkres, snorkel
Successfully installed munkres-1.1.4 snorkel-0.10.0
