In [None]:
# Mount Google Drive inside the Colab runtime; later cells read their CSV inputs
# from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
from numpy import dot
from numpy.linalg import norm
import pandas as pd
import os
import re

In [None]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence_transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 k

In [None]:
# Read the label-1 CSV from Drive and keep only its "comments" column (a Series).
label1 = pd.read_csv("/content/drive/MyDrive/2023-2 주분/댓글쪼개기_완전판4.csv")["comments"]

In [None]:
# Read the label-0 CSV (stored in cp949 encoding) from Drive and keep only its
# "content" column (a Series).
label0 = pd.read_csv(
    "/content/drive/MyDrive/2023-2 주분/innocent_유머.csv",
    encoding="cp949",
)["content"]

In [None]:
# Rows with a missing value would make re.sub raise a TypeError, so drop them first.
label0 = label0.dropna()

# Remove parenthesised "(...)" tags with a regular expression.
pattern = r'\([^)]*\)'
label0 = label0.apply(lambda x : re.sub(pattern=pattern, repl="", string=x))

# Collapse runs of whitespace into single spaces (this also trims both ends).
label0 = label0.apply(lambda x : ' '.join(x.split()))

# Draw a random subset of at most 10000 sentences to keep the label0 side at a
# manageable size. random_state makes the draw reproducible across runs, and the
# min() guard avoids a ValueError when fewer than 10000 rows are available.
label0 = label0.sample(n=min(10000, len(label0)), random_state=42)

# Remove duplicate label0 sentences. Because this happens after sampling, the
# final list can be shorter than the sampled size.
label0 = pd.Series(label0.unique())

# Downstream cells expect a plain Python list of strings.
label0 = label0.to_list()

In [None]:
model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')

# https://github.com/snunlp/KR-SBERT
# KR-SBERT : A pretrained Korean-specific Sentence-BERT model developed by Computational Linguistics Lab at Seoul National University.
# The model was trained on an augmented STS dataset (TODO: find out how the data was augmented).

Downloading (…)635b2/.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)3f97e635b2/README.md:   0%|          | 0.00/4.02k [00:00<?, ?B/s]

Downloading (…)97e635b2/config.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)635b2/tokenizer.json:   0%|          | 0.00/967k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

Downloading (…)3f97e635b2/vocab.txt:   0%|          | 0.00/336k [00:00<?, ?B/s]

Downloading (…)7e635b2/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
# Embed both corpora in batches of 32, showing a progress bar for each.
vec0 = model.encode(label0, show_progress_bar=True, batch_size=32)
# label1 is a pandas Series at this point; pass a plain list (like label0) so the
# encoder's positional access does not depend on the Series' index labels.
vec1 = model.encode(list(label1), show_progress_bar=True, batch_size=32)

Batches:   0%|          | 0/310 [00:00<?, ?it/s]

Batches:   0%|          | 0/54 [00:00<?, ?it/s]

In [None]:
# Empty accumulator with the three result columns; the per-sentence similarity
# rankings are concatenated onto it in a later cell.
similarity = pd.DataFrame(columns=["label0", "label1", "score"])

In [None]:
def cos_sim(A, B):
    """Return the cosine similarity of two vectors, floored at 0.0.

    Args:
        A: Numeric vector (array-like); intended for 1-D input.
        B: Numeric vector of the same length as A.

    Returns:
        dot(A, B) / (norm(A) * norm(B)), with negative values replaced by 0.0.
        When either vector has zero norm the cosine is undefined and 0.0 is
        returned.
    """
    denominator = norm(A) * norm(B)
    if denominator == 0:
        # A zero vector would otherwise trigger a 0/0 division: NaN plus a
        # RuntimeWarning, with the result depending on how max() compares NaN.
        return 0.0
    return max(0.0, dot(A, B) / denominator)

In [None]:
def return_mostsim_label0(sentence, embedding=None):
    """Rank every label1 sentence by cosine similarity to one label0 sentence.

    Reads the notebook-level names `model`, `vec1` (the label1 embeddings,
    iterated row by row) and `label1` (the label1 sentences).

    Args:
        sentence: The label0 sentence to compare against all label1 sentences.
        embedding: Optional precomputed embedding of `sentence`. When omitted,
            the sentence is encoded with `model`, as before.

    Returns:
        A DataFrame with columns "label0", "label1" and "score", one row per
        label1 sentence, sorted by score in descending order. Scores are
        floored at 0.0; rows whose vectors have zero norm score 0.0.
    """
    # (1) Embed the input label0 sentence unless the caller already did.
    if embedding is None:
        embedding = model.encode(sentence)
    embedding = np.asarray(embedding)
    label1_vectors = np.asarray(vec1)

    # (2) Cosine similarity against every label1 vector in one vectorised pass,
    #     instead of one Python-level call (and one norm(embedding)) per row.
    dots = label1_vectors @ embedding
    denominators = norm(label1_vectors, axis=1) * norm(embedding)
    cosine = np.divide(
        dots,
        denominators,
        out=np.zeros(len(denominators), dtype=np.float64),
        where=denominators > 0,  # leave 0.0 where the cosine is undefined
    )
    score = np.maximum(cosine, 0.0)

    # Pair scores with label1 by position (row i of vec1 belongs to the i-th
    # label1 sentence) rather than by index-label alignment.
    temp = pd.DataFrame({"label0": sentence,
                         "label1": list(label1),
                         "score": score})

    return temp.sort_values(by="score", ascending=False)

In [None]:
# Compute one ranking frame per label0 sentence and concatenate them in a single
# call. Calling pd.concat inside the loop re-copies the whole accumulated frame on
# every iteration, which is quadratic in the number of label0 sentences.
similarity = pd.concat(
    [similarity, *(return_mostsim_label0(sentence) for sentence in label0)]
)

In [None]:
# Write the full similarity table to a CSV at a relative path (resolved against the
# current working directory); index=False leaves the row index out of the file.
similarity.to_csv("KR-SBERT_코사인유사도_유머.csv", index=False)

In [None]:
# Display the similarity table as the cell output (the notebook renders a truncated view).
similarity

Unnamed: 0,label0,label1,score
369,"여기서 무야호 리믹스를 했다면,",멍청한 대가리로 H 흉내만 내려다 망한 경우,0.386388
323,"여기서 무야호 리믹스를 했다면,",영화폭망 퇴물 D를 왜 E한테 붙임? 제왑 언플징하네,0.377223
325,"여기서 무야호 리믹스를 했다면,",영화폭망 퇴물 D를 왜 E한테 붙임? 제왑 언플징하네,0.377223
1091,"여기서 무야호 리믹스를 했다면,",이 사람이 1막 내내 계속 캠코더로 찍엇어요. 3열 19번! 이 사람이 찍었어요,0.370172
148,"여기서 무야호 리믹스를 했다면,",C 청산의 망상은 모리배들의 작란이다,0.368654
...,...,...,...
493,2020 보는사람,더러운 돈 받았으면 쪽팔린 줄 아세요. 댁들 수령님 서울구치소에 계신데 면회나 다녀...,0.000000
1462,2020 보는사람,우리나라는 특권층과 독점 재벌만이 잘 살 수 있고 노동자와 농민은 굶주리고 있으며 ...,0.000000
459,2020 보는사람,경찰ㆍ변호사도 죽이고 싶지 않으면 가만있어라,0.000000
1339,2020 보는사람,왜왜왜,0.000000


In [None]:
# Load a saved similarity table for inspection.
# NOTE(review): the cells above write "KR-SBERT_코사인유사도_유머.csv", not this "_ver1"
# file — presumably it was produced by an earlier run; confirm it exists under
# /content before running this cell.
temp=pd.read_csv('/content/KR-SBERT_코사인유사도_ver1.csv')

In [None]:
# Show the loaded pairs ordered from the highest to the lowest similarity score.
temp.sort_values(by="score", ascending=False)

Unnamed: 0,label0,label1,score
2132370,대머리,대머리,1.000000
15106140,"대머리 ,",대머리,0.920430
13765500,이중인격 앀,이중인격자,0.803288
571140,찌르레기,부스레기,0.781154
4442580,개 개,견 같은 새끼,0.776665
...,...,...,...
5695991,해지돼지 살그만빼 살좀있을때가 백만배 이뻐,자취방에서 물뽕 한잔 하자고 말 걸어 봐야겠다,0.000000
5695990,해지돼지 살그만빼 살좀있을때가 백만배 이뻐,망언,0.000000
5695989,해지돼지 살그만빼 살좀있을때가 백만배 이뻐,주차장에서 심야에 왜 나오느냐.,0.000000
5695988,해지돼지 살그만빼 살좀있을때가 백만배 이뻐,건물주답게 시차 따위는 문제가 되지 않는 모양이다. 마치 한국에서 게임을 하는 게 ...,0.000000
