In [1]:
import pandas as pd
import numpy as np

In [2]:
df_hiphop = pd.read_csv('힙합.csv')
df_dance = pd.read_csv('댄스.csv')
df_ballad = pd.read_csv('발라드.csv')
df_trot = pd.read_csv('트로트.csv')

### 문서 단어 행렬(Document-Term Matrix, DTM)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
cv = CountVectorizer()
dtm = cv.fit_transform(df_hiphop.data)
dtm

<1164x32032 sparse matrix of type '<class 'numpy.int64'>'
	with 143639 stored elements in Compressed Sparse Row format>

In [5]:
sample_df = pd.DataFrame(dtm.toarray(), columns= cv.get_feature_names_out())
sample_df = sample_df.iloc[:,~sample_df.columns.isin(['새우','genkidama','nihao','glowin','ya','you','they','yacht','슬펐지만','wat','yall','새우던','새워','glowin','이였다고','odin','tym', '지겨내','the', 'my', 'it', 'me','우리', 'while', 'years', 'in', 'to', 'like','up', 'on','in','don','be','that','all','and','나를','love', 'your','우린','with'])]
sample_df

Unnamed: 0,aaaaaa,aain,aaron,ab,abandon,abandoned,abcd,abide,ability,able,...,힙찔,힙플,힙하,힙하대,힙할,힙합,힙합씬,힙합트레인,힙해,힛뎀
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1160,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1161,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1162,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### LDA 토픽 모델링

In [7]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=4)
lda.fit(sample_df)

In [8]:
for n, i in enumerate(lda.components_):
    idx = np.argsort(i)[::-1][:3]
    topic = cv.get_feature_names_out()[idx]
    print(f'Topic {n+1} : {topic}')

Topic 1 : ['xxk' 'waste' 'nignt']
Topic 2 : ['october' '같다고' '슬퍼하는']
Topic 3 : ['다를' '이었던' '증기기관차']
Topic 4 : ['사라졌던' '이었던' 'glow']


In [9]:
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [10]:
from tqdm import tqdm
from konlpy.tag import Okt
# Okt 객체 생성
okt = Okt()

In [47]:
box = []
for i in tqdm(df_hiphop['data']):
    words_with_pos = okt.pos(i)
    words = [
        word for word, pos in words_with_pos if 
        ('*' in word and pos == 'Punctuation') or
        (pos == 'Verb') or
        (pos == 'Noun') or
        (pos == 'Adjective') or
        (pos == 'Alpha') and 
        (word not in ['새우','genkidama','glowin','ya','wasting','you','yacht','나','너','내','해','안','속','날','그','난','슬펐지만','wat','yall','새우던','새워','glowin','이였다고','odin','tym', '지겨내','the', 'my', 'it', 'me','우리', 'while', 'years', 'in', 'to', 'like','up', 'on','in','don','be','that','all','and','나를','love', 'your','우린','with']) 
    ]
    words = [i for i in words if len(i) != 1]
    box.append(words)

100%|██████████████████████████████████████████████████████████████████████████████| 1164/1164 [00:20<00:00, 56.85it/s]


In [48]:
dic = corpora.Dictionary(box)

corpus = []
for i in box:
    a = dic.doc2bow(i)
    corpus.append(a)

### 토픽 모델링 분류 정확성 시각화

In [49]:
lda = LdaModel(corpus, num_topics=4, id2word=dic)

pyLDAvis.enable_notebook()
pyLDAvis_display = gensimvis.prepare(lda, corpus, dic)
pyLDAvis_display

  default_term_info = default_term_info.sort_values(


### tf_idf 모델

In [15]:
df = pd.concat([df_ballad,df_dance, df_hiphop, df_trot])
df

Unnamed: 0,data,label
0,잠깐 날 떠난줄 알았는데 날 기다려도 오지 않는 너 년 잊혀진단 친구 위로 사실 될...,발라드
1,수만 가지 생각 돌다 저편 사라진다 열 들떠 붉어진 얼굴 위로 찬 바람 겨 붙은 모...,발라드
2,힘든 거 였니 아픈 거 였니 너 이별 하는 슬픈 거 였니 너 울 웃던 이 계절 전부...,발라드
3,마음 문 활짝 열고 귀 기울여 기다리면 침묵 저편 들려오는 내 음성 다정한 그대 모...,발라드
4,넌 꿈 뭐 난 꿈 찾고 있어요 여기저기 둘러보고 것 것 만져 보고 경험 하며 꿈 찾...,발라드
...,...,...
995,이태원 프리덤 이태원 프리덤 나를 사랑 채워줘요 사랑 배터리 됐나 봐요 당신 없인 ...,트로트
996,바람 불면 꽃 바람 꿈 그리던 님 찾아 오려나 설레는 가슴 나 사랑 해 영원히 사랑...,트로트
997,목포 행 완행열차 마지막 기차 떠나가고 늦은 밤 홀로 한잔 술 몸 기댄다 우리 사랑...,트로트
998,나 간직 싶기에 이름 밝힌 적도 없었지요 기억 문 열고 들어와 내 앞 서 있는 그대...,트로트


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_df=0.8, min_df=0.005,
                        stop_words=['you', 'the', 'my', 'it', 'me','우리', 'while','하는', 'years', 'in', 'to', 'like','up', 'on','in','don','시간','be','that','all','and','사랑','나를','love', 'your','can','우린','with','지난', '이번', '위한'],
                       max_features=50,
                       ngram_range=(1,3))


In [17]:
a = tfidf.fit_transform(df['data'])
df_dtm = pd.DataFrame(a.toarray(), columns= tfidf.get_feature_names_out())
df_dtm

Unnamed: 0,baby,do,for,get,go,got,is,just,know,la,...,않아,없는,없어,오늘,이제,있어,지금,하나,하루,하지
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.312140,0.482149,0.0,0.000000,0.000000,0.000000,0.173994,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.313591,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.723191,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4109,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.096474,0.000000,0.0,0.000000,0.160065,0.050137,0.107554,0.0,0.0
4110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0
4111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.419368,0.000000,0.000000,0.000000,0.0,0.0
4112,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.541179,0.000000,0.0,0.0


In [18]:
from sklearn.model_selection import train_test_split

In [19]:
data = df_dtm.to_numpy()
target = df['label'].to_numpy() 
train_input, test_input, train_target, test_target = train_test_split(data, target, test_size = 0.2)

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
rf = RandomForestClassifier()
rf.fit(train_input, train_target)
rf.score(test_input, test_target)

0.6257594167679222

In [51]:
rf = RandomForestClassifier(oob_score=True)
rf.fit(train_input, train_target)
rf.score(test_input, test_target)

0.6464155528554071

In [52]:
rf.predict(train_input)

array(['힙합', '힙합', '댄스', ..., '트로트', '트로트', '댄스'], dtype=object)

In [54]:
from sklearn.metrics import classification_report
result = classification_report(test_target, rf.predict(test_input))
print(result)

              precision    recall  f1-score   support

          댄스       0.59      0.44      0.50       183
         발라드       0.58      0.63      0.60       193
         트로트       0.70      0.83      0.76       203
          힙합       0.68      0.66      0.67       244

    accuracy                           0.65       823
   macro avg       0.64      0.64      0.63       823
weighted avg       0.64      0.65      0.64       823



In [28]:
import joblib
joblib.dump(rf, 'tfidf_rf.pkl')

['tfidf_rf.pkl']

In [30]:
loaded_model = joblib.load('tfidf_rf.pkl')

### 힙합 예측

In [22]:
#다이나믹듀오 smoke
text = """Light it up Light it up Light it up

나는 달리거나 넘어지거나
둘 중에 하나야 브레이크 없는 bike
택도 없는 것들을 택도 안 뗀 옷 위로 stack it up
난 절대 빠꾸 없는 type

I’m gonna smoke you up
I’mma smoke you
I’m gonna smoke you up
I’mma smoke you
I’m gonna smoke you up
I’mma smoke you
싹 다 부수고 원상복구해 (light it up)

적자생존
아마 난 진짜 1
내일 없는 애들 빈 수레가 요란해 that’s why I’m shooting
To your 골대 cuz you have no keeper
차린 것 없는 밥상 들이밀지 말고 zip up
저기 빈털터리들 재떨이에 털어 넣고 twerkin‘
Then I’mma smoke another chance
You know that I’mma chop it
Lazy ho
그리고 또 stupid thug
모자란 애들 들이 마시고 뱉어 that’s wassup

내 입김은 태풍 내가 후 하고 불면 넌 힘없이 쓰러지는 가로수
무대 위 조명은 늘 파란불 내가 짓밟고 가는 넌 횡단보도 위에 가로줄
어차피 너무 기운 시소 이쯤 되면 너에게 필요한 건 시도 아닌 기도
난 입으로 널 패지 구타
처맞은 것처럼 네 뺨은 붉게 불타

나는 달리거나 넘어지거나
둘 중에 하나야 브레이크 없는 bike
택도 없는 것들을 택도 안 뗀 옷 위로 stack it up 난 절대 빠꾸 없는 type

I’m gonna smoke you up
I’mma smoke you
I’m gonna smoke you up
I’mma smoke you
I’m gonna smoke you up
I’mma smoke you
싹 다 부수고 원상복구해 (light it up)

끝났어 파티는
잔 돌려 ice water
여긴 아무도 없네
날 상대할 카리스마가
타자 팔자에 난 outsider
죽거나 싹 쓸어 주먹 안에 주사위
타고난 dice roller
지그시 밟아주지 부득이 싸움 나면
속도 조절 어린이 보호구역부터 아우토반
까다롭게 굴어 사우스포
갈기지 턱주가리 카운터
넌 피식 쓰러지는 나무토막

I’mma south side baddie
Collect all these veggies
Lap top 위에서 money dance
넌 계속해라 copy
Nothing's dynamic in ur life
저주 같지
다 끝난 파티 뒤에서
꽁초나 하나 줍길
Man I can't curse to you
Cuz you already die for it
Sorry that I’m so stable in my life
I’m done with it
Better get your money
Or u better get ma number
다 피고 남은 꽁초 더밀
꽂아줄게 주머니에

나는 달리거나 넘어지거나
둘 중에 하나야 브레이크 없는 bike
택도 없는 것들을 택도 안 뗀 옷 위로 stack it up 난 절대 빠꾸 없는 type

I’m gonna smoke you up
I’mma smoke you
I’m gonna smoke you up
I’mma smoke you
I’m gonna smoke you up
I’mma smoke you
싹 다 부수고 원상복구해 (light it up)

하이킥 로우킥에 넌 쓰러지지 픽픽
풀린 두 다리 눈 콧바람은 씩씩
네 목을 조르는 내 두 다리 사이로 보이는 네 흰자위
힘없이 탭탭 질식
팔다리를 꺾는 암바와 니바
네 자존심을 꺾는 짬바와 이빨
넌 피투성이 사람들이 기겁해
난 관대해 더 버티지 마 받아줄게 기권패

다 겪었지 대우차부터 테슬라
다 꺾였지 같이 짬밥 먹던 랩 스타
We stand strong
완력보다 강한 펜촉
우습게 봐도 오래 버티는 게 센 놈
상어 밥도 안돼 넌 그냥 벵에돔
엄마 젖은 사치 이유식을 맥여 더
빈약한 커리어 세치 혀로 채 썰어
태운 다음 재 털어

I’m gonna smoke you up
I’mma smoke you
I’m gonna smoke you up
I’mma smoke you
I’m gonna smoke you up
I’mma smoke you
싹 다 부수고 원상복구해 (light it up)

I’m gonna smoke you up
I’mma smoke you
I’m gonna smoke you up
I’mma smoke you
I’m gonna smoke you up
I’mma smoke you
싹 다 부수고 원상복구해 (light it up)"""

In [25]:
t = okt.pos(text)
t = [word for word, pos in t if ('*' in word and pos == 'Punctuation') or (pos == 'Verb') or (pos == 'Noun') or (pos == 'Noun') or (pos == 'Adjective') or (pos == 'Alpha')]
t = " ".join(t)

g = tfidf.transform([t]).toarray()
rf.predict(g)

array(['힙합'], dtype=object)

### 트로트 예측

In [32]:
# 장윤정 어머나
text = """어머나 어머나 이러지 마세요   
여자의 마음은 갈대랍니다
안돼요 왜이래요 묻지말아요
더이상 내게 원하시면 안돼요
오늘 처음 만난 당신이지만
내 사랑인걸요
헤어지면 남이 되어
모른척 하겠지만
좋아해요 사랑해요
거짓말처럼 당신을 사랑해요
소설속에 영화속에
멋진 주인공은 아니지만
괜찮아요 말해봐요
당신 위해서라면 다 줄게요
어머나 어머나 이러지 마세요
여자의 마음은 바람입니다
안돼요 왜이래요 잡지말아요
더이상 내게 바라시면 안돼요
오늘 처음 만난 당신이지만
내 사랑인걸요
헤어지면 남이 되어
모른 척 하겠지만
좋아해요 사랑해요
거짓말처럼 당신을 사랑해요
소설속에 영화속에
멋진 주인공은 아니지만
괜찮아요 말해봐요
당신 위해서라면 다 줄게요
소설속에 영화속에
멋진 주인공은 아니지만
괜찮아요 말해봐요
당신 위해서라면 다 줄게요"""

In [33]:
t = okt.pos(text)
t = [word for word, pos in t if ('*' in word and pos == 'Punctuation') or (pos == 'Verb') or (pos == 'Noun') or (pos == 'Noun') or (pos == 'Adjective') or (pos == 'Alpha')]
t = " ".join(t)

g = tfidf.transform([t]).toarray()
rf.predict(g)

array(['트로트'], dtype=object)

### 댄스

In [38]:
# 뉴진스 supershy
text = """I’m super shy, super shy
But wait a minute while I
Make you mine, make you mine
떨리는 지금도
You’re on my mind
All the time
I wanna tell you but I’m
Super shy, super shy

I’m super shy, super shy
But wait a minute while I
Make you mine, make you mine
떨리는 지금도
You’re on my mind
All the time
I wanna tell you but I’m
Super shy, super shy

And I wanna go out with you
Where you wanna go? (Huh?)
Find a lil spot
Just sit and talk
Looking pretty
Follow me
우리 둘이 나란히
보이지? (봐)
내 눈이 (heh)
갑자기
빛나지
When you say
I’m your dream

You don’t even know my name
Do ya?
You don’t even know my name
Do ya-a?
누구보다도

I’m super shy, super shy
But wait a minute while I
Make you mine, make you mine
떨리는 지금도
You’re on my mind
All the time
I wanna tell you but I’m
Super shy, super shy

I’m super shy, super shy
But wait a minute while I
Make you mine, make you mine
떨리는 지금도
You’re on my mind
All the time
I wanna tell you but I’m
Super shy, super shy

나 원래 말도 잘하고 그런데 왜 이런지
I don’t like that
Something odd about you
Yeah you’re special and you know it
You’re the top babe

I’m super shy, super shy
But wait a minute while I
Make you mine, make you mine
떨리는 지금도
You’re on my mind
All the time
I wanna tell you but I’m
Super shy, super shy

I’m super shy, super shy
But wait a minute while I
Make you mine, make you mine
떨리는 지금도
You’re on my mind
All the time
I wanna tell you but I’m
Super shy, super shy

You don’t even know my name
Do ya?
You don’t even know my name
Do ya-a?
누구보다도
You don’t even know my name
Do ya?
You don’t even know my name
Do ya-a?"""

In [39]:
t = okt.pos(text)
t = [word for word, pos in t if ('*' in word and pos == 'Punctuation') or (pos == 'Verb') or (pos == 'Noun') or (pos == 'Noun') or (pos == 'Adjective') or (pos == 'Alpha')]
t = " ".join(t)

g = tfidf.transform([t]).toarray()
rf.predict(g)

array(['댄스'], dtype=object)

### 발라드

In [36]:
# 박재정 헤어지자말해요
text = """헤어지자고 말하려 오늘
너에게 가다가 우리 추억 생각해 봤어
처음 본 네 얼굴
마주친 눈동자
가까스로 본 너의 그 미소들
손을 잡고 늘 걷던 거리에
첫눈을 보다가 문득 고백했던 그 순간
가보고 싶었던 식당
난생처음 준비한 선물
고맙다는 너의 그 눈물들이
바뀔까 봐 두려워
그대 먼저 헤어지자 말해요
나는 사실 그대에게 좋은 사람이 아녜요
그대 이제 날 떠난다 말해요
잠시라도 이 행복을 느껴서 고마웠다고
시간이 지나고 나면 나는
어쩔 수 없을 걸 문득 너의 사진 보겠지
새로 사귄 친구 함께
웃음 띤 네 얼굴 보면
말할 수 없을 묘한 감정들이
힘들단 걸 알지만
그대 먼저 헤어지자 말해요
나는 사실 그대에게 좋은 사람이 아녜요
그대 이제 날 떠난다 말해요
잠시라도 이 행복을 느껴서 고마웠다고
한 번은 널 볼 수 있을까
이기적인 거 나도 잘 알아
그땐 그럴 수밖에 없던
어린 내게 한 번만 더 기회를 주길
그댈 정말 사랑했다 말해요
나는 사실 그대에게
좋은 사람이 되고 싶었어
영영 다신 못 본다 해도
그댈 위한 이 노래가
당신을 영원히 사랑할 테니"""

In [37]:
t = okt.pos(text)
t = [word for word, pos in t if ('*' in word and pos == 'Punctuation') or (pos == 'Verb') or (pos == 'Noun') or (pos == 'Noun') or (pos == 'Adjective') or (pos == 'Alpha')]
t = " ".join(t)

g = tfidf.transform([t]).toarray()
rf.predict(g)

array(['트로트'], dtype=object)

In [40]:
# 임영웅 사랑은 늘 도망가
text = """눈물이 난다 이 길을 걸으면
그 사람 손길이 자꾸 생각이 난다
붙잡지 못하고 가슴만 떨었지
내 아름답던 사람아
사랑이란 게 참 쓰린 거더라
잡으려 할수록 더 멀어지더라
이별이란 게 참 쉬운 거더라
내 잊지 못할 사람아
사랑아 왜 도망가
수줍은 아이처럼
행여 놓아버릴까 봐
꼭 움켜쥐지만
그리움이 쫓아 사랑은 늘 도망가
잠시 쉬어가면 좋을 텐데
바람이 분다 옷깃을 세워도
차가운 이별의 눈물이 차올라
잊지 못해서 가슴에 사무친
내 소중했던 사람아
사랑아 왜 도망가
수줍은 아이처럼
행여 놓아버릴까 봐
꼭 움켜쥐지만
그리움이 쫓아 사랑은 늘 도망가
잠시 쉬어가면 좋을 텐데
기다림도 애태움도 다 버려야 하는데
무얼 찾아 이 길을 서성일까
무얼 찾아 여기 있나
사랑아 왜 도망가
수줍은 아이처럼
행여 놓아버릴까 봐
꼭 움켜쥐지만
그리움이 쫓아 사랑은 늘 도망가
잠시 쉬어가면 좋을 텐데
잠시 쉬어가면 좋을 텐데"""

In [41]:
t = okt.pos(text)
t = [word for word, pos in t if ('*' in word and pos == 'Punctuation') or (pos == 'Verb') or (pos == 'Noun') or (pos == 'Noun') or (pos == 'Adjective') or (pos == 'Alpha')]
t = " ".join(t)

g = tfidf.transform([t]).toarray()
rf.predict(g)

array(['트로트'], dtype=object)