In [None]:
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[K     |████████████████████████████████| 19.4 MB 58.6 MB/s 
Collecting JPype1>=0.7.0
  Downloading JPype1-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (448 kB)
[K     |████████████████████████████████| 448 kB 41.7 MB/s 
Installing collected packages: JPype1, konlpy
Successfully installed JPype1-1.3.0 konlpy-0.6.0


In [None]:
import pandas as pd
import re
from konlpy.tag import Okt,Mecab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score,f1_score
from lightgbm import LGBMClassifier

## EDA

In [None]:
# Load the labelled training data from a CSV in the working directory.
train = pd.read_csv('train_data.csv')

In [None]:
# Number of rows in the training set
train.shape[0]

45654

In [None]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,index,title,topic_idx
0,0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4
1,1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4
2,2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4
3,3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4
4,4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4


In [None]:
# Count missing values in each column
train.isna().sum()

index        0
title        0
topic_idx    0
dtype: int64

In [None]:
# Check the label distribution (number of rows per topic_idx value)
train['topic_idx'].value_counts()

4    7629
2    7362
5    6933
6    6751
1    6222
3    5933
0    4824
Name: topic_idx, dtype: int64

## 데이터 전처리


In [None]:
# Morphological analyzer (Okt); the cleaning() function below tags text with it.
okt=Okt() 

In [None]:
# Remove particles (Josa), endings (Eomi) and punctuation
def cleaning(text):
    """Rebuild `text` from its Okt tokens, dropping particles, endings and punctuation.

    The text is tagged with ``okt.pos(text, stem=True)``. Tokens whose tag is
    'Josa', 'Eomi' or 'Punctuation' are discarded; the remaining token strings
    are joined with single spaces and returned as one string.
    """
    kept = [
        token
        for token, tag in okt.pos(text, stem=True)  # stem=True: stemmed token forms
        if tag not in ['Josa', 'Eomi', 'Punctuation']
    ]
    return " ".join(kept)

# Replace each title with its cleaned, space-separated token string.
train['title'] = train['title'].apply(cleaning)

In [None]:
# Preview the titles after cleaning (tokens are now space-separated).
train.head()

Unnamed: 0,index,title,topic_idx
0,0,인천 → 핀란드 항공기 결항 휴가 철 여행객 분통,4
1,1,실리콘밸리 넘어서다 구글 15조원 들이다 美 전역 거점 화,4
2,2,이란 외무 긴장 완화 해결 책 미국 경제 전쟁 멈추다 것,4
3,3,NYT 클린턴 측근 韓 기업 특수 관계 조명 공과 사 맞다 물리다 종합,4
4,4,시진핑 트럼프 중미 무역 협상 조속 타결 희망,4


In [None]:
# tf-idf Vectorizing
def split(text):
  """Tokenize `text` by splitting on whitespace and return the list of pieces."""
  return text.split()

# token_pattern=None: the custom tokenizer replaces the regex pattern, so
# leaving the default in place only triggers the
# "The parameter 'token_pattern' will not be used" warning.
tfidfVect = TfidfVectorizer(tokenizer=split, token_pattern=None)
# Learn the vocabulary / IDF weights and vectorize the cleaned titles in one pass
# (equivalent to fit() followed by transform() on the same data).
# NOTE(review): the vectorizer is fit on every row of `train`, i.e. before the
# train/validation split made later in the notebook, so validation titles
# contribute to the vocabulary and IDF weights — consider fitting on the
# training part only.
tfidfMatrix = tfidfVect.fit_transform(train['title'])

  "The parameter 'token_pattern' will not be used"


In [None]:
# Show the vectorizer's configuration (its text repr).
print(tfidfVect)

TfidfVectorizer(tokenizer=<function split at 0x7f9430ec5dd0>)


In [None]:
# Display the sparse matrix summary (shape, dtype, number of stored values)
# instead of printing its individual (row, column) value entries.
tfidfMatrix

  (0, 30384)	0.303198955014393
  (0, 29166)	0.3362391623970149
  (0, 28669)	0.3700036247885556
  (0, 25514)	0.3044043022587192
  (0, 22162)	0.26389484173776045
  (0, 19349)	0.34201702125404715
  (0, 14336)	0.40122204342426404
  (0, 5320)	0.3732354398897556
  (0, 4069)	0.28026595532799803
  (1, 29937)	0.22865959899138585
  (1, 23330)	0.32506211538170515
  (1, 17786)	0.3890123413776047
  (1, 10130)	0.3640882559654885
  (1, 8347)	0.37608184708858755
  (1, 6293)	0.2731857007305464
  (1, 5043)	0.3601147329833301
  (1, 4239)	0.1855324251141215
  (1, 616)	0.4218834727540379
  (2, 29205)	0.32041220869587167
  (2, 25402)	0.3060167183549775
  (2, 23381)	0.3077745883554512
  (2, 21559)	0.23564325126772886
  (2, 20215)	0.32366964046937474
  (2, 20140)	0.33047889856695584
  (2, 12312)	0.25744674430114883
  :	:
  (45651, 23179)	0.19534039099561248
  (45651, 19656)	0.35466062341665416
  (45651, 17824)	0.3464598077175827
  (45651, 9426)	0.24749820937511133
  (45651, 5178)	0.21658261121350317
  (45651,

In [None]:
# train/valid dataset split

def split_dataset(tfidf, df, test_size=0.2, random_state=3):
    """Split features and labels into stratified train / hold-out parts.

    Parameters
    ----------
    tfidf : feature matrix, passed to `train_test_split` together with the
        labels taken from `df` (so both must have the same number of rows).
    df : DataFrame whose 'topic_idx' column holds the labels.
    test_size : share of the rows assigned to the hold-out part
        (default 0.2, the value previously hard-coded).
    random_state : seed forwarded to `train_test_split` so the split is
        repeatable (default 3, the value previously hard-coded).

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test)
    """
    y_data = df['topic_idx']

    # stratify=y_data: split in a stratified fashion using the labels as classes.
    parts = train_test_split(
        tfidf,
        y_data,
        test_size=test_size,
        random_state=random_state,
        stratify=y_data,
    )
    return tuple(parts)

X_train, X_test, y_train, y_test = split_dataset(tfidfMatrix,train)


## 모델 학습 

In [None]:
# Train a LightGBM classifier on the training split; only random_state is set
# (fixed seed), every other hyperparameter is left at its default.
lgbm = LGBMClassifier(random_state = 3)
lgbm.fit(X_train,y_train)

LGBMClassifier(random_state=3)

## 모델 평가


In [None]:
# Predict on the held-out split and compute its accuracy.
# NOTE: the name `pred` is reassigned later for the test-set predictions.
pred = lgbm.predict(X_test)
accuracy = accuracy_score(y_test,pred)

# '정확도' is Korean for "accuracy".
print('정확도', accuracy)

정확도 0.8076880954988501


## test 데이터 예측


In [None]:
# Load the test data from a CSV in the working directory.
test = pd.read_csv('test_data.csv')

In [None]:
# Run the test titles through the same cleaning() that was applied to the training titles
test['title'] = test['title'].apply(cleaning)

In [None]:
# Number of rows in the test set
test.shape[0]

In [None]:
# Vectorize the test titles with the vectorizer already fitted on the training
# titles (transform only, no refit).
tfidf_matrix_test = tfidfVect.transform(test['title'])

In [None]:
# Predict a label for every test title.
# NOTE: this overwrites `pred`, which previously held the validation predictions.
pred = lgbm.predict(tfidf_matrix_test)

## 제출 파일 생성


In [None]:
# Load the submission template from a CSV in the working directory.
submission = pd.read_csv('sample_submission.csv')

In [None]:
# Fill the label column with the test predictions.
# Assumes sample_submission.csv rows are in the same order as test_data.csv — TODO confirm.
submission['topic_idx'] = pred
submission.head()

In [None]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
submission.to_csv('fileNewsData.csv',index = False)