In [2]:
from ko_text import *
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# 불러오기

In [3]:
# Load the pre-tokenized documents from the CSV file.
token_df = pd.read_csv('Data/meta_morphs_final.csv', encoding='utf-8')

# To keep the file small, each 'Token' cell stores the tokens joined into one
# whitespace-separated string ('word word'); unpack every cell back into a
# list of tokens (['word', 'word']).
token_df['Token'] = token_df['Token'].apply(lambda joined: joined.split())

In [4]:
# Preview the first rows of the loaded frame (head() defaults to 5 rows).
token_df.head()

Unnamed: 0,Section,Text,Token,Num of Tokens
0,financial,\n\n\n텀블벅에서 크라우드 펀딩이 이뤄지고 있는 `아침달 시집`.\n\n ...,"[텀블벅, 크라, 우드, 펀딩, 이뤄지고, 아침, 시집, 많지, 않은, 금액, 으로...",263
1,economy,\n\n\n[사진 제공: 연합뉴스]\n\n 유류...,"[유류, 인하, 국제, 유가, 급락, 입어, 국내, 휘발유, 경유, 하락, 특히, ...",166
2,financial,부득이한 사정으로 매월 내는 보험료가 부담이 될 때 계약은 그대로 유지하면서 보험...,"[부득이, 사정, 매월, 내는, 보험료, 부담, 계약, 그대로, 유지, 보험료, 부...",314
3,estate,한때 `미분양의 늪`으로 통하던 경기도 파주시 부동산 시장이 달라지고 있다. 지난해...,"[한때, 미분, 하던, 경기도, 파주시, 부동산, 시장, 달라지고, 분양, 파주, ...",165
4,economy,\n\n\n인디고뱅크의 `미키인서울` 컬래버 맨투맨 <사진제공=월트디즈니코리아>\...,"[인디고, 뱅크, 미키, 서울, 컬래버, 투맨, 월트디즈니, 사의, 마스코트, 미키...",196


In [5]:
# (number of rows, number of columns) of the loaded frame.
token_df.shape

(41418, 4)

# Keyword 추출

In [6]:
# Instantiate the NLP helper.
# NOTE(review): `NLP` is not defined in this notebook; presumably it comes from
# the `from ko_text import *` star import - confirm. `nlp` is not referenced
# again in the visible part of the notebook.
nlp = NLP()

In [7]:
# Re-join every document's token list into a single space-separated string,
# collected into a plain Python list (one string per row of token_df).
token_corpus = token_df['Token'].map(' '.join).tolist()

# Train Test Split

In [5]:
# Random 80/20 split over the index labels of token_df: 80% of the labels are
# sampled (without replacement) for training, the remainder is used for testing.
train_size = round(len(token_df) * 0.8)
np.random.seed(0)  # fixed seed so the same split is drawn on every run
train_index_ls = np.random.choice(token_df.index, train_size, replace = False)

# Test labels = every label that was not sampled, kept in index order.
# Membership is checked against a set: `x in <ndarray>` rescans the whole
# array for each label, which made this step quadratic in the number of rows.
train_index_set = set(train_index_ls.tolist())
test_index_ls = [x for x in token_df.index if x not in train_index_set]

In [6]:
# Select the training rows by the index labels sampled for the training split.
train_df = token_df.loc[train_index_ls]
# Cell output: (rows, columns) of the training frame.
train_df.shape

(33865, 4)

In [7]:
# Select the held-out rows by the index labels that were not sampled for training.
test_df = token_df.loc[test_index_ls]
# Cell output: (rows, columns) of the test frame.
test_df.shape

(8466, 4)

In [8]:
# Count how many training rows carry each 'Section' label (class balance check).
# NOTE(review): `Counter` is not imported explicitly here; presumably it is
# provided by the `from ko_text import *` star import - confirm.
Counter(train_df['Section'])

Counter({'bio & tech': 1730,
         'business': 5055,
         'culture & art': 4105,
         'economy': 2611,
         'estate': 3958,
         'financial': 763,
         'it': 2125,
         'politics': 3810,
         'society': 3374,
         'stock': 2560,
         'world': 3774})

In [9]:
# Number of distinct 'Section' labels present in the training split.
len(set(train_df['Section']))

11

# **TF-IDF classification**

# **Naive Bayes**

In [12]:
# Naive Bayes text classifier: TF-IDF features feeding a one-vs-rest
# multinomial Naive Bayes model. The min_df given here is only a starting
# value; the grid below supplies the min_df values that are actually searched.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(min_df=3, max_features=100000)),
    ('clf', OneVsRestClassifier(
        estimator=MultinomialNB(alpha=1e-2, fit_prior=True, class_prior=None),
    )),
])

# Search grid; keys use the '<step name>__<parameter>' addressing of Pipeline.
# 'tfidf__max_df' and 'clf__estimator__alpha' are not part of the search.
parameters = dict(
    tfidf__min_df=(1, 5, 10),
    tfidf__ngram_range=[(1, 1), (1, 2)],
)

In [None]:
% time

train_corpus = [' '.join(doc) for doc in train_df['Token']]
test_corpus = [' '.join(doc) for doc in test_df['Token']]

y_train = train_df['Section'].tolist()
y_test = test_df['Section'].tolist()

grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(train_corpus, y_train)

In [None]:
# Show the selected configuration through the steps of the refitted pipeline.
print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

# measuring performance on test set
print("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
print(best_clf)

# Cross-validated score of the best parameter combination (training split only).
print(grid_search_tune.best_score_)

# The pipeline was fitted on space-joined strings, so it must be given
# `test_corpus`; the raw token lists in test_df['Token'] are not valid input
# for the vectorizer. `test_corpus` / `y_test` come from the grid-search cell.
predictions = best_clf.predict(test_corpus)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, predictions))

# **SVM**

In [9]:
# Linear SVM text classifier: TF-IDF features feeding a one-vs-rest LinearSVC.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(min_df=3, max_features=50000)),
    ('clf', OneVsRestClassifier(estimator=LinearSVC())),
])

# Search grid; keys use the '<step name>__<parameter>' addressing of Pipeline
# ('clf__estimator__*' reaches the LinearSVC wrapped by OneVsRestClassifier).
# 3 x 3 x 3 x 2 = 54 parameter combinations.
parameters = dict(
    tfidf__max_df=(0.25, 0.5, 0.75),
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    clf__estimator__C=[0.01, 0.1, 1],
    clf__estimator__class_weight=['balanced', None],
)

In [11]:
% time

train_corpus = [' '.join(doc) for doc in train_df['Token']]
test_corpus = [' '.join(doc) for doc in test_df['Token']]

y_train = train_df['Section'].tolist()
y_test = test_df['Section'].tolist()

grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(train_corpus, y_train)

Wall time: 998 µs
Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed: 21.8min
[Parallel(n_jobs=2)]: Done 108 out of 108 | elapsed: 83.7min finished


GridSearchCV(cv=2, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=50000, min_df=3,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
...lti_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=1))]),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'tfidf__max_df': (0.25, 0.5, 0.75), 'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)], 'clf__estimator__C': [0.01, 0.1, 1], 'clf__estimator__class_weight': ['balanced', None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [None]:
# Show the selected configuration through the steps of the refitted pipeline.
print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

# measuring performance on test set
print("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
print(best_clf)

# Cross-validated score of the best parameter combination (training split only).
print(grid_search_tune.best_score_)

# The pipeline was fitted on space-joined strings, so it must be given
# `test_corpus`; the raw token lists in test_df['Token'] are not valid input
# for the vectorizer. `test_corpus` / `y_test` come from the grid-search cell.
predictions = best_clf.predict(test_corpus)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, predictions))

# **Logistic Regression**

In [15]:
# Logistic-regression text classifier: TF-IDF features feeding a one-vs-rest
# LogisticRegression fitted with the 'sag' solver.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(min_df=3, max_features=30000)),
    ('clf', OneVsRestClassifier(estimator=LogisticRegression(solver='sag'))),
])

# Search grid; keys use the '<step name>__<parameter>' addressing of Pipeline
# ('clf__estimator__*' reaches the LogisticRegression wrapped by
# OneVsRestClassifier). 'tfidf__max_df' is not part of this search.
parameters = dict(
    tfidf__ngram_range=[(1, 1), (1, 2), (1, 3)],
    clf__estimator__C=[0.01, 0.1, 1],
    clf__estimator__class_weight=['balanced', None],
)


In [None]:
% time

train_corpus = [' '.join(doc) for doc in train_df['Token']]
test_corpus = [' '.join(doc) for doc in test_df['Token']]

y_train = train_df['Section'].tolist()
y_test = test_df['Section'].tolist()

grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=2, verbose=3)
grid_search_tune.fit(train_corpus, y_train)

In [15]:
# Show the selected configuration through the steps of the refitted pipeline.
print("Best parameters set:")
print(grid_search_tune.best_estimator_.steps)

# measuring performance on test set
print("Applying best classifier on test data:")
best_clf = grid_search_tune.best_estimator_
print(best_clf)

# Cross-validated score of the best parameter combination (training split only).
print(grid_search_tune.best_score_)

# The pipeline was fitted on space-joined strings, so it must be given
# `test_corpus`; the raw token lists in test_df['Token'] are not valid input
# for the vectorizer. `test_corpus` / `y_test` come from the grid-search cell.
predictions = best_clf.predict(test_corpus)

# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, predictions))

Best parameters set:
[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.75, max_features=30000, min_df=5,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='sag', tol=0.0001, verbose=0, warm_start=False),
          n_jobs=1))]
Applying best classifier on test data:
Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encod