# Semi-supervised Classification on a Text Dataset
- semi-supervised 분류기를 20 newsgroups(뉴스그룹) 데이터셋으로 학습
    - 데이터셋 로더에 카테고리 이름을 지정하여 카테고리 수 조정 가능

In [1]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.semi_supervised import LabelSpreading, SelfTrainingClassifier

In [2]:
# Load the training split, restricted to the first five newsgroup categories.
data = fetch_20newsgroups(
    categories=[
        "alt.atheism",
        "comp.graphics",
        "comp.os.ms-windows.misc",
        "comp.sys.ibm.pc.hardware",
        "comp.sys.mac.hardware",
    ],
    subset="train",
)
# Report the dataset size. The empty last field, joined with newlines,
# produces the trailing blank line.
print(
    "%d documents" % len(data.filenames),
    "%d categories" % len(data.target_names),
    "",
    sep="\n",
)

2823 documents
5 categories



In [3]:
# Parameters
# Keyword arguments for every SGDClassifier built in this file.
# NOTE(review): "sdg" looks like a typo for "sgd"; the name is kept because
# other cells refer to it.
sdg_params = {"alpha": 1e-5, "penalty": "l2", "loss": "log_loss"}
# Keyword arguments for every CountVectorizer built in this file.
vectorizer_params = {"ngram_range": (1, 2), "min_df": 5, "max_df": 0.8}

In [4]:
def _to_dense(matrix):
    """Return the dense array form of *matrix* by calling its ``toarray()``."""
    return matrix.toarray()


def _build_text_pipeline(*final_steps):
    """Build a Pipeline of the shared text features followed by *final_steps*.

    Every pipeline in this file starts with the same two steps: a
    ``CountVectorizer`` configured by ``vectorizer_params`` (step ``"vect"``)
    and a ``TfidfTransformer`` (step ``"tfidf"``). New estimator instances
    are created on each call, so the pipelines never share estimator objects.

    Args:
        *final_steps: ``(name, estimator)`` tuples appended, in order, after
            the two shared feature-extraction steps.

    Returns:
        The assembled ``Pipeline``.
    """
    return Pipeline(
        [
            ("vect", CountVectorizer(**vectorizer_params)),
            ("tfidf", TfidfTransformer()),
            *final_steps,
        ]
    )


# Supervised pipeline
pipeline = _build_text_pipeline(("clf", SGDClassifier(**sdg_params)))

# Self-training pipeline: the same SGD classifier wrapped in
# SelfTrainingClassifier (verbose=True so that progress is printed).
st_pipeline = _build_text_pipeline(
    ("clf", SelfTrainingClassifier(SGDClassifier(**sdg_params), verbose=True)),
)

# Label-spreading pipeline
ls_pipeline = _build_text_pipeline(
    # Densify the TF-IDF output via toarray() before it reaches LabelSpreading.
    # NOTE(review): the original comment said LabelSpreading does not support
    # *dense* matrices, but this step converts to dense, so the limitation
    # presumably concerns sparse input. Confirm for the installed scikit-learn.
    # A named function is used instead of a lambda so that the transformer
    # (and therefore the pipeline) stays picklable.
    ("toarray", FunctionTransformer(_to_dense)),
    ("clf", LabelSpreading()),
)

In [5]:
def eval_and_print_metrics(clf, X_train, y_train, X_test, y_test):
    """Fit *clf* on the training data and print a short evaluation report.

    The report contains the number of training samples, how many training
    labels equal ``-1`` (reported as "unlabeled"), and the micro-averaged F1
    score of the fitted estimator's predictions on the test data, followed by
    a dashed separator and a blank line.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training samples.
        y_train: Training labels; ``-1`` is counted as unlabeled.
        X_test: Test samples.
        y_test: Ground-truth labels for ``X_test``.

    Returns:
        None. The estimator is fitted in place and results are printed.
    """
    print("Number of training samples:", len(X_train))
    n_unlabeled = len([label for label in y_train if label == -1])
    print("Unlabeled samples in training set:", n_unlabeled)

    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)
    micro_f1 = f1_score(y_test, predictions, average="micro")
    print("Micro-averaged F1 score on test set: %0.3f" % micro_f1)

    # Separator line followed by one blank line.
    print("-" * 10, end="\n\n")

In [6]:
if __name__ == "__main__":
    # Split the loaded documents and their targets into train and test sets.
    X = data.data
    y = data.target
    X_train, X_test, y_train, y_test = train_test_split(X, y)

    print("Supervised SGDClassifier on 100% of the data:")
    eval_and_print_metrics(pipeline, X_train, y_train, X_test, y_test)

    # Boolean mask that selects roughly 20% of the training samples.
    y_mask = np.random.rand(len(y_train)) < 0.2

    # X_20 and y_20 are the subset of the training set selected by the mask:
    # collect the (document, label) pairs, then transpose them into two lists.
    labeled_pairs = [
        (doc, label) for doc, label, keep in zip(X_train, y_train, y_mask) if keep
    ]
    X_20, y_20 = (list(column) for column in zip(*labeled_pairs))
    print("Supervised SGDClassifier on 20% of the training data:")
    eval_and_print_metrics(pipeline, X_20, y_20, X_test, y_test)

    # Mark every sample outside the mask as unlabeled (-1). This overwrites
    # y_train in place, so both runs below see only ~20% of the true labels.
    y_train[np.logical_not(y_mask)] = -1
    print("SelfTrainingClassifier on 20% of the training data (rest is unlabeled):")
    eval_and_print_metrics(st_pipeline, X_train, y_train, X_test, y_test)

    print("LabelSpreading on 20% of the data (rest is unlabeled):")
    eval_and_print_metrics(ls_pipeline, X_train, y_train, X_test, y_test)

Supervised SGDClassifier on 100% of the data:
Number of training samples: 2117
Unlabeled samples in training set: 0
Micro-averaged F1 score on test set: 0.911
----------

Supervised SGDClassifier on 20% of the training data:
Number of training samples: 405
Unlabeled samples in training set: 0
Micro-averaged F1 score on test set: 0.776
----------

SelfTrainingClassifier on 20% of the training data (rest is unlabeled):
Number of training samples: 2117
Unlabeled samples in training set: 1712
End of iteration 1, added 1047 new labels.
End of iteration 2, added 249 new labels.
End of iteration 3, added 67 new labels.
End of iteration 4, added 25 new labels.
End of iteration 5, added 14 new labels.
End of iteration 6, added 8 new labels.
End of iteration 7, added 6 new labels.
End of iteration 8, added 2 new labels.
End of iteration 9, added 1 new labels.
End of iteration 10, added 2 new labels.
Micro-averaged F1 score on test set: 0.841
----------

LabelSpreading on 20% of the data (rest is