# 얕은 학습 주제 모델링

여기서는 얕은 학습 접근 방식을 사용해 주제 모델을 만드는 방법을 보여준다. 이를 위해 이분 그래프의 문서-문서 투영에서 얻은 결과와 임베딩을 사용할 것이다.

**참고: 이 노트북은 01_자연어처리_그래프_생성(01_nlp_graph_creation) 노트북 이후에만 실행할 수 있다. 첫 번째 노트북에서 계산된 결과 중 일부가 여기에서 재사용되기 때문이다.**

### 데이터셋 불러오기

In [1]:
import pandas as pd

In [2]:
corpus = pd.read_pickle("corpus.p")

In [3]:
from collections import Counter
topics = Counter([label for document_labels in corpus["label"] for label in document_labels]).most_common(10)

In [4]:
topics

[('earn', 3964),
 ('acq', 2369),
 ('money-fx', 717),
 ('grain', 582),
 ('crude', 578),
 ('trade', 485),
 ('interest', 478),
 ('ship', 286),
 ('wheat', 283),
 ('corn', 237)]

In [5]:
topicsList = [topic[0] for topic in topics]
topicsSet = set(topicsList)
dataset = corpus[corpus["label"].apply(lambda x: len(topicsSet.intersection(x))>0)]

임베딩 교육을 "가상화"하는 클래스 만들기

In [6]:
from sklearn.base import BaseEstimator

class EmbeddingsTransformer(BaseEstimator):
    
    def __init__(self, embeddings_file):
        self.embeddings_file = embeddings_file
        
    def fit(self, *args, **kwargs):
        self.embeddings = pd.read_pickle(self.embeddings_file)
        return self
        
    def transform(self, X):
        return self.embeddings.loc[X.index]
    
    def fit_transform(self, X, y):
        return self.fit().transform(X)



In [7]:
from glob import glob 
files = glob("./embeddings/*")

In [8]:
graphEmbeddings = EmbeddingsTransformer(files[0]).fit()

학습/테스트 데이터셋 분할

In [9]:
def get_labels(corpus, topicsList=topicsList):
    return corpus["label"].apply(
        lambda labels: pd.Series({label: 1 for label in labels}).reindex(topicsList).fillna(0)
    )[topicsList]

In [10]:
def get_features(corpus):
    return corpus["parsed"] #graphEmbeddings.transform(corpus["parsed"])

In [11]:
def get_features_and_labels(corpus):
    return get_features(corpus), get_labels(corpus)

In [12]:
def train_test_split(corpus):
    graphIndex = [index for index in corpus.index if index in graphEmbeddings.embeddings.index]
    
    train_idx = [idx for idx in graphIndex if "training/" in idx]
    test_idx = [idx for idx in graphIndex if "test/" in idx]
    return corpus.loc[train_idx], corpus.loc[test_idx]

In [13]:
train, test = train_test_split(dataset)

모델 생성 및 교차 검증

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier 
from sklearn.multioutput import MultiOutputClassifier

In [15]:
model = MultiOutputClassifier(RandomForestClassifier())

In [16]:
pipeline = Pipeline([
    ("embeddings", graphEmbeddings),
    ("model", model)
])

In [17]:
from sklearn.model_selection import GridSearchCV

In [18]:
from sklearn.model_selection import RandomizedSearchCV

In [19]:
files

['./embeddings\\bipartiteGraphEmbeddings_10_20.p']

In [20]:
param_grid = {
    "embeddings__embeddings_file": files,
    "model__estimator__n_estimators": [50, 100], 
    "model__estimator__max_features": [0.2,0.3, "auto"], 
    #"model__estimator__max_depth": [3, 5]
}

In [21]:
features, labels = get_features_and_labels(train)

In [22]:
from sklearn.metrics import f1_score 

In [23]:
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, n_jobs=-1, 
                           scoring=lambda y_true, y_pred: f1_score(y_true, y_pred,average='weighted'))

In [24]:
model = grid_search.fit(features, labels)



In [25]:
model

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('embeddings',
                                        EmbeddingsTransformer(embeddings_file='./embeddings\\bipartiteGraphEmbeddings_10_20.p')),
                                       ('model',
                                        MultiOutputClassifier(estimator=RandomForestClassifier()))]),
             n_jobs=-1,
             param_grid={'embeddings__embeddings_file': ['./embeddings\\bipartiteGraphEmbeddings_10_20.p'],
                         'model__estimator__max_features': [0.2, 0.3, 'auto'],
                         'model__estimator__n_estimators': [50, 100]},
             scoring=<function <lambda> at 0x000001843EC00700>)

In [26]:
model.best_params_

{'embeddings__embeddings_file': './embeddings\\bipartiteGraphEmbeddings_10_20.p',
 'model__estimator__max_features': 0.2,
 'model__estimator__n_estimators': 50}

성능 평가

In [27]:
def get_predictions(model, features):
    return pd.DataFrame(
        model.predict(features), 
        columns=topicsList, 
        index=features.index
    )

In [28]:
preds = get_predictions(model, get_features(test))
labels = get_labels(test)

In [29]:
errors = 1 - (labels - preds).abs().sum().sum() / labels.abs().sum().sum()

In [30]:
errors

0.7086472909939002

In [31]:
from sklearn.metrics import classification_report

In [32]:
print(classification_report(labels, preds))

              precision    recall  f1-score   support

           0       0.97      0.93      0.95      1087
           1       0.93      0.83      0.88       719
           2       0.80      0.60      0.69       179
           3       0.91      0.80      0.85       149
           4       0.90      0.70      0.79       189
           5       0.89      0.43      0.58       117
           6       0.87      0.46      0.60       131
           7       0.83      0.27      0.41        89
           8       0.84      0.37      0.51        71
           9       0.48      0.21      0.30        56

   micro avg       0.93      0.77      0.84      2787
   macro avg       0.84      0.56      0.65      2787
weighted avg       0.91      0.77      0.82      2787
 samples avg       0.81      0.80      0.80      2787



  _warn_prf(average, modifier, msg_start, len(result))
