In [1]:
import re

import numpy as np
import pandas as pd
from snorkel.labeling import labeling_function, PandasLFApplier, LFAnalysis
from snorkel.labeling.model import LabelModel

# Label encoding shared by the labeling functions and the evaluation cells.
# ABSTAIN is the value a labeling function returns when it has no opinion.
ABSTAIN = -1
HAM = 0
SPAM = 1

# Read the custom dataset and preview its first rows and overall size.
DATASET_PATH = 'custom_spam_dataset.csv'
df = pd.read_csv(DATASET_PATH)
preview = df.head(10)
print(preview)
print(f"\nDataset shape: {df.shape}")

                                                text  label
0  Congratulations! You won a free iPhone, click ...      1
1      Hey, are we still meeting for lunch tomorrow?      0
2  URGENT: Your account has been compromised, ver...      1
3        Can you send me the notes from class today?      0
4       Win $1000 cash prize! Text WIN to 12345 now!      1
5        I will be late to the meeting by 10 minutes      0
6     Click here to get FREE followers on Instagram!      1
7         Do you want to grab coffee this afternoon?      0
8  You have been selected for a special reward, c...      1
9  The project deadline has been moved to next Fr...      0

Dataset shape: (20, 2)


In [2]:
# Define Labeling Functions (LFs)
@labeling_function()
def lf_contains_link(x):
    """Vote SPAM when the text contains "http" or "click here" (case-insensitive); otherwise abstain."""
    message = x.text.lower()
    if "http" in message or "click here" in message:
        return SPAM
    return ABSTAIN

@labeling_function()
def lf_contains_free(x):
    """Vote SPAM when the text contains "free" in any letter case; otherwise abstain."""
    if "free" in x.text.lower():
        return SPAM
    return ABSTAIN

# Whole-word forms of "win"/"won", matched against lower-cased text.
# A plain substring test also fires on everyday words such as "following",
# "knowing", "window" or "wonderful". The trailing lookahead keeps the
# contraction "won't" from being read as "won".
_WIN_PATTERN = re.compile(r"\b(?:win|wins|winner|winners|winning|winnings|won)\b(?!['’]t\b)")


@labeling_function()
def lf_contains_win(x):
    """Vote SPAM when the text mentions winning as a standalone word.

    Args:
        x: A data point exposing the message body as ``x.text``.

    Returns:
        SPAM if a whole-word form of "win"/"won" occurs (case-insensitive),
        otherwise ABSTAIN.
    """
    message = x.text.lower()
    return SPAM if _WIN_PATTERN.search(message) else ABSTAIN

@labeling_function()
def lf_contains_urgent(x):
    """Vote SPAM when the text contains "urgent" or "congratulations" (case-insensitive); otherwise abstain."""
    message = x.text.lower()
    for trigger in ("urgent", "congratulations"):
        if trigger in message:
            return SPAM
    return ABSTAIN

# "earn" and its inflections as whole words, matched against lower-cased text.
# A plain substring test would also fire on "learn", "learning" or "yearn".
_EARN_PATTERN = re.compile(r"\bearn(?:s|ed|ing|ings)?\b")


@labeling_function()
def lf_contains_money(x):
    """Vote SPAM when the text talks about money.

    Args:
        x: A data point exposing the message body as ``x.text``.

    Returns:
        SPAM if the text contains a "$" sign, the substring "cash", or a
        whole-word form of "earn" (the last two case-insensitive),
        otherwise ABSTAIN.
    """
    message = x.text.lower()
    mentions_money = (
        "$" in x.text
        or _EARN_PATTERN.search(message) is not None
        or "cash" in message
    )
    return SPAM if mentions_money else ABSTAIN

@labeling_function()
def lf_normal_conversation(x):
    """Vote HAM when the text contains any everyday work/social keyword (case-insensitive); otherwise abstain."""
    message = x.text.lower()
    for keyword in ("meeting", "lunch", "coffee", "notes", "report", "deadline", "presentation"):
        if keyword in message:
            return HAM
    return ABSTAIN

# Collect the labeling functions; their order fixes the column order of the label matrix.
lfs = [
    lf_contains_link,
    lf_contains_free,
    lf_contains_win,
    lf_contains_urgent,
    lf_contains_money,
    lf_normal_conversation,
]

# Run every labeling function over every row of the dataframe.
applier = PandasLFApplier(lfs=lfs)
L_train = applier.apply(df=df)

# Report the matrix size, then the per-LF summary table.
print("Labeling function matrix shape:", L_train.shape)
print("\nLF Analysis:")
lf_summary = LFAnalysis(L=L_train, lfs=lfs).lf_summary()
print(lf_summary)

100%|██████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 3660.43it/s]

Labeling function matrix shape: (20, 6)

LF Analysis:
                        j Polarity  Coverage  Overlaps  Conflicts
lf_contains_link        0      [1]      0.10      0.10        0.0
lf_contains_free        1      [1]      0.20      0.15        0.0
lf_contains_win         2      [1]      0.15      0.15        0.0
lf_contains_urgent      3      [1]      0.10      0.05        0.0
lf_contains_money       4      [1]      0.15      0.05        0.0
lf_normal_conversation  5      [0]      0.40      0.00        0.0



  m = sparse.diags(np.ravel(self._L_sparse.max(axis=1).todense()))


In [3]:
# Train Label Model
# Only the label matrix is passed to fit(); the ground-truth column is used
# below purely for evaluation.
label_model = LabelModel(cardinality=2, verbose=True)
label_model.fit(L_train=L_train, n_epochs=500, lr=0.001, seed=123)

# Generate hard labels (label_model.predict_proba would give probabilistic ones).
# The result can contain ABSTAIN (-1) for rows the model does not label.
preds = label_model.predict(L=L_train)

# Compare predictions vs ground truth
df['predicted_label'] = preds
df['correct'] = df['predicted_label'] == df['label']

# An abstained row can never equal a 0/1 ground-truth label, so it counts as an
# error in the overall accuracy. Track abstains separately so the headline
# number can be read correctly.
labeled_mask = preds != ABSTAIN
n_abstained = int((~labeled_mask).sum())

print("Predictions vs Ground Truth:")
print(df[['text', 'label', 'predicted_label', 'correct']])
print(f"\nAccuracy: {df['correct'].mean():.2%}")
print(f"Spam detected: {(preds == SPAM).sum()}")
print(f"Ham detected: {(preds == HAM).sum()}")
print(f"Abstained: {n_abstained}")
if labeled_mask.any():
    # Guarded so an all-abstain run does not print a NaN accuracy.
    print(f"Accuracy on non-abstained rows: {df.loc[labeled_mask, 'correct'].mean():.2%}")

INFO:root:Computing O...
INFO:root:Estimating \mu...
  0%|                                                                         | 0/500 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.119]
INFO:root:[10 epochs]: TRAIN:[loss=0.112]
INFO:root:[20 epochs]: TRAIN:[loss=0.099]
INFO:root:[30 epochs]: TRAIN:[loss=0.087]
INFO:root:[40 epochs]: TRAIN:[loss=0.076]
INFO:root:[50 epochs]: TRAIN:[loss=0.067]
INFO:root:[60 epochs]: TRAIN:[loss=0.060]
INFO:root:[70 epochs]: TRAIN:[loss=0.055]
INFO:root:[80 epochs]: TRAIN:[loss=0.050]
INFO:root:[90 epochs]: TRAIN:[loss=0.047]
INFO:root:[100 epochs]: TRAIN:[loss=0.044]
INFO:root:[110 epochs]: TRAIN:[loss=0.042]
INFO:root:[120 epochs]: TRAIN:[loss=0.040]
INFO:root:[130 epochs]: TRAIN:[loss=0.038]
INFO:root:[140 epochs]: TRAIN:[loss=0.036]
INFO:root:[150 epochs]: TRAIN:[loss=0.035]
INFO:root:[160 epochs]: TRAIN:[loss=0.034]
INFO:root:[170 epochs]: TRAIN:[loss=0.033]
INFO:root:[180 epochs]: TRAIN:[loss=0.031]
INFO:root:[190 epochs]: TRAIN:[loss=

Predictions vs Ground Truth:
                                                 text  label  predicted_label  \
0   Congratulations! You won a free iPhone, click ...      1                1   
1       Hey, are we still meeting for lunch tomorrow?      0                0   
2   URGENT: Your account has been compromised, ver...      1                1   
3         Can you send me the notes from class today?      0                0   
4        Win $1000 cash prize! Text WIN to 12345 now!      1                1   
5         I will be late to the meeting by 10 minutes      0                0   
6      Click here to get FREE followers on Instagram!      1                1   
7          Do you want to grab coffee this afternoon?      0                0   
8   You have been selected for a special reward, c...      1               -1   
9   The project deadline has been moved to next Fr...      0                0   
10  Make money from home! Earn $500 daily with no ...      1                1   

In [4]:
# Summary: class balance of the ground truth, the accuracy computed in the
# evaluation cell, and a fixed closing statement.
banner = "=" * 50
n_spam = (df['label'] == SPAM).sum()
n_ham = (df['label'] == HAM).sum()

print(banner)
print("SNORKEL SPAM DETECTION - CUSTOM DATASET")
print(banner)
print(f"Total samples: {len(df)}")
print(f"Spam samples: {n_spam}")
print(f"Ham samples: {n_ham}")
print(f"\nModel Accuracy: {df['correct'].mean():.2%}")
print("\nConclusion:")
print("\n".join([
    "Snorkel successfully used weak supervision to",
    "label spam messages without manual annotation.",
    "Labeling functions captured key spam patterns",
    "like free offers, money, urgency and links.",
]))

SNORKEL SPAM DETECTION - CUSTOM DATASET
Total samples: 20
Spam samples: 9
Ham samples: 11

Model Accuracy: 80.00%

Conclusion:
Snorkel successfully used weak supervision to
label spam messages without manual annotation.
Labeling functions captured key spam patterns
like free offers, money, urgency and links.
