In [1]:
import pandas as pd

df = pd.read_csv('data_split_3.tsv', delimiter='\t')
df = df.set_index('index')
df = df[~df.duplicated(subset='text')]
df = df.reset_index(drop=True)
df['text'] = df['text'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", regex=True)
df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 955369 entries, 0 to 955368
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   text    955368 non-null  object 
 1   label   955368 non-null  float64
dtypes: float64(1), object(1)
memory usage: 14.6+ MB


In [2]:
from transformers import BertTokenizer
from konlpy.tag import Okt
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

In [4]:
df = df.dropna()
df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 955367 entries, 0 to 955366
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    955367 non-null  object
 1   label   955367 non-null  int32 
dtypes: int32(1), object(1)
memory usage: 10.9+ MB


In [5]:
from tqdm import tqdm

tqdm.pandas()

stop_words = ['의', '가', '이', '은', '들', '는', '좀', '잘', '걍', '과', '도', '를', '으로', '자', '에', '와', '한', '하다', '되다', '이다', '있다', '없다', '가다', '오다', '보다', '그', '저']
okt = Okt()

def preprocess_text(review, stop_words):
    tokens = okt.morphs(review)  # 형태소 분석으로 토큰화
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)  # 공백으로 연결하여 반환

# 전처리 적용
df['processed_text'] = df['text'].progress_apply(lambda x: preprocess_text(x, stop_words))

100%|██████████| 955367/955367 [32:36<00:00, 488.29it/s]  


In [6]:
from sentence_transformers import SentenceTransformer

# 4. S-BERT 임베딩 생성
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
df['embedding'] = df['processed_text'].progress_apply(lambda x: model.encode(x).tolist())




100%|██████████| 955367/955367 [5:06:22<00:00, 51.97it/s]   


In [7]:
model.save('sentence_transformer_model')