# ✂️ Snorkel Intro Tutorial: _Data Slicing_
> 정보 : 이 방법은 Slice-based Learning이라는 제목의 논문으로 [NeaurIPS 2019](https://arxiv.org/abs/1909.06349)에 accept되었다.

- 커뮤니케이션 가중 
    - 상위 의사결정자와 의견이 
- 방향성을 제시하지 못하고, 사소한걸 트집잡음.

In [1]:
import glob
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import subprocess

import pandas as pd

# Don't truncate text fields in the display
pd.set_option("display.max_colwidth", 0)

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
def load_spam_dataset(load_train_labels: bool = False, split_dev_valid: bool = False):
    filenames = sorted(glob.glob("data/Youtube*.csv"))

    dfs = []
    for i, filename in enumerate(filenames, start=1):
        df = pd.read_csv(filename)
        # Lowercase column names
        df.columns = map(str.lower, df.columns)
        # Remove comment_id field
        df = df.drop("comment_id", axis=1)
        # Add field indicating source video
        df["video"] = [i] * len(df)
        # Rename fields
        df = df.rename(columns={"class": "label", "content": "text"})
        # Shuffle order
        df = df.sample(frac=1, random_state=123).reset_index(drop=True)
        dfs.append(df)

    df_train = pd.concat(dfs[:4])
    df_dev = df_train.sample(100, random_state=123)

    if not load_train_labels:
        df_train["label"] = np.ones(len(df_train["label"])) * -1
    df_valid_test = dfs[4]
    df_valid, df_test = train_test_split(
        df_valid_test, test_size=250, random_state=123, stratify=df_valid_test.label
    )

    if split_dev_valid:
        return df_train, df_dev, df_valid, df_test
    else:
        return df_train, df_test

In [4]:
df_train, df_test = load_spam_dataset(load_train_labels=True)

- slice function

In [5]:
import re
from snorkel.slicing import slicing_function


@slicing_function()
def short_comment(x):
    """Ham comments are often short, such as 'cool video!'"""
    return len(x.text.split()) < 5


sfs = [short_comment]

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [6]:
from snorkel.slicing import slice_dataframe

short_comment_df = slice_dataframe(df_test, short_comment)

100%|██████████| 250/250 [00:00<00:00, 11817.47it/s]


In [7]:
short_comment_df[["text", "label"]].head()

Unnamed: 0,text,label
194,super music﻿,0
2,I like shakira..﻿,0
110,subscribe to my feed,1
263,Awesome ﻿,0
77,Nice,0


Train a simple classifier

In [8]:
def df_to_features(vectorizer, df, split):
    """Convert pandas DataFrame containing spam data to bag-of-words PyTorch features."""
    words = [row.text for i, row in df.iterrows()]

    if split == "train":
        feats = vectorizer.fit_transform(words)
    else:
        feats = vectorizer.transform(words)
    X = feats.todense()
    Y = df["label"].values
    return X, Y

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 1))
X_train, Y_train = df_to_features(vectorizer, df_train, "train")
X_test, Y_test = df_to_features(vectorizer, df_test, "test")

In [10]:
from sklearn.linear_model import LogisticRegression

sklearn_model = LogisticRegression(C=0.001, solver="liblinear")
sklearn_model.fit(X=X_train, y=Y_train)

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
from snorkel.utils import preds_to_probs

preds_test = sklearn_model.predict(X_test)
probs_test = preds_to_probs(preds_test, 2)

In [12]:
from sklearn.metrics import f1_score

print(f"Test set F1: {100 * f1_score(Y_test, preds_test):.1f}%")

Test set F1: 92.5%


Store slice metadata in S


In [13]:
from snorkel.slicing import PandasSFApplier

applier = PandasSFApplier(sfs)
S_test = applier.apply(df_test)

100%|██████████| 250/250 [00:00<00:00, 16337.03it/s]


In [14]:
from snorkel.analysis import Scorer

scorer = Scorer(metrics=["f1"])

In [15]:
scorer.score_slices(
    S=S_test, golds=Y_test, preds=preds_test, probs=probs_test, as_dataframe=True
)

Unnamed: 0,f1
overall,0.925
short_comment,0.666667


전체적으로 f1이 높지만 short_comment에 대해서는 f1이 낮네

- Write additional slicing functions (SFs)

In [16]:
from snorkel.slicing import SlicingFunction, slicing_function
from snorkel.preprocess import preprocessor


# Keyword-based SFs
def keyword_lookup(x, keywords):
    return any(word in x.text.lower() for word in keywords)


def make_keyword_sf(keywords):
    return SlicingFunction(
        name=f"keyword_{keywords[0]}",
        f=keyword_lookup,
        resources=dict(keywords=keywords),
    )


keyword_please = make_keyword_sf(keywords=["please", "plz"])


# Regex-based SFs
@slicing_function()
def regex_check_out(x):
    return bool(re.search(r"check.*out", x.text, flags=re.I))


@slicing_function()
def short_link(x):
    """Returns whether text matches common pattern for shortened ".ly" links."""
    return bool(re.search(r"\w+\.ly", x.text))


# Leverage preprocessor in SF
from textblob import TextBlob


@preprocessor(memoize=True)
def textblob_sentiment(x):
    scores = TextBlob(x.text)
    x.polarity = scores.sentiment.polarity
    return x


@slicing_function(pre=[textblob_sentiment])
def textblob_polarity(x):
    return x.polarity > 0.9

In [17]:
polarity_df = slice_dataframe(df_test, textblob_polarity)

100%|██████████| 250/250 [00:00<00:00, 455.35it/s]


In [18]:
polarity_df[["text", "label"]].head()

Unnamed: 0,text,label
263,Awesome ﻿,0
240,Shakira is the best dancer,0
261,OMG LISTEN TO THIS ITS SOO GOOD!! :D﻿,0
14,Shakira is very beautiful,0
114,awesome,0


In [19]:
extra_sfs = [keyword_please, regex_check_out, short_link, textblob_polarity]

sfs = [short_comment] + extra_sfs
slice_names = [sf.name for sf in sfs]

In [20]:
applier = PandasSFApplier(sfs)
S_test = applier.apply(df_test)

100%|██████████| 250/250 [00:00<00:00, 2635.51it/s]


In [21]:
scorer.score_slices(
    S=S_test, golds=Y_test, preds=preds_test, probs=probs_test, as_dataframe=True
)

Unnamed: 0,f1
overall,0.925
short_comment,0.666667
keyword_please,1.0
regex_check_out,1.0
short_link,0.5
textblob_polarity,0.727273


In [22]:
import torch
import torch.nn as nn
def get_pytorch_mlp(hidden_dim, num_layers):
    layers = []
    for _ in range(num_layers):
        layers.extend([nn.Linear(hidden_dim, hidden_dim), nn.ReLU()])
    return nn.Sequential(*layers)

In [23]:
from snorkel.slicing import SliceAwareClassifier

# Define model architecture
bow_dim = X_train.shape[1]
hidden_dim = bow_dim
mlp = get_pytorch_mlp(hidden_dim=hidden_dim, num_layers=2)

# Initialize slice model
slice_model = SliceAwareClassifier(
    base_architecture=mlp,
    head_dim=hidden_dim,
    slice_names=[sf.name for sf in sfs],
    scorer=scorer,
)

In [24]:
applier = PandasSFApplier(sfs)
S_train = applier.apply(df_train)
S_test = applier.apply(df_test)

100%|██████████| 1586/1586 [00:02<00:00, 543.45it/s]
100%|██████████| 250/250 [00:00<00:00, 2647.20it/s]


In [25]:
from snorkel.classification.data import DictDataset, DictDataLoader

In [26]:
def create_dict_dataloader(X, Y, split, **kwargs):
    """Create a DictDataLoader for bag-of-words features."""
    ds = DictDataset.from_tensors(torch.FloatTensor(X), torch.LongTensor(Y), split)
    return DictDataLoader(ds, **kwargs)

In [27]:
BATCH_SIZE = 64

train_dl = create_dict_dataloader(X_train, Y_train, "train")
train_dl_slice = slice_model.make_slice_dataloader(
    train_dl.dataset, S_train, shuffle=True, batch_size=BATCH_SIZE
)
test_dl = create_dict_dataloader(X_test, Y_test, "train")
test_dl_slice = slice_model.make_slice_dataloader(
    test_dl.dataset, S_test, shuffle=False, batch_size=BATCH_SIZE
)

In [28]:
from snorkel.classification import Trainer

# For demonstration purposes, we set n_epochs=2
trainer = Trainer(n_epochs=2, lr=1e-4, progress_bar=True)
trainer.fit(slice_model, [train_dl_slice])

Epoch 0:: 100%|██████████| 25/25 [00:01<00:00, 13.83it/s, model/all/train/loss=0.509, model/all/train/lr=0.0001]
Epoch 1:: 100%|██████████| 25/25 [00:01<00:00, 14.95it/s, model/all/train/loss=0.263, model/all/train/lr=0.0001]


In [29]:
slice_model.score_slices([test_dl_slice], as_dataframe=True)

Unnamed: 0,label,dataset,split,metric,score
0,task,SnorkelDataset,train,f1,0.941704
1,task_slice:short_comment_pred,SnorkelDataset,train,f1,0.769231
2,task_slice:keyword_please_pred,SnorkelDataset,train,f1,0.977778
3,task_slice:regex_check_out_pred,SnorkelDataset,train,f1,1.0
4,task_slice:short_link_pred,SnorkelDataset,train,f1,0.5
5,task_slice:textblob_polarity_pred,SnorkelDataset,train,f1,0.8
6,task_slice:base_pred,SnorkelDataset,train,f1,0.941704
