In [5]:
import pandas as pd
import re
import glob
import os
from tqdm import tqdm
tqdm.pandas()

from konlpy.tag import Okt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import time
import tqdm
import jpype

In [6]:
# Project directory that holds the input CSV (and, later, the checkpoint files).
folder_path = '/Users/jaesolshin/Documents/GitHub/trendpop/'

# Full path of the merged, preprocessed comment dump.
file_path = os.path.join(folder_path, 'KPOP_comments_merged_preprocessed.csv')

# Load every comment into memory; later cells read the 'comment' and
# 'comment_id_key' columns from this frame.
comments_df = pd.read_csv(filepath_or_buffer=file_path)

In [7]:
# Noun extractor instance (konlpy Okt tagger), shared by extract_nouns.
okt = Okt()

# Stop words: nouns that are dropped from the extracted word lists.
stopwords = [
    'timecode', '진짜', '이번', '사람', '정말', '뭔데', '그룹', '우리',
    '생각', '댓글', '느낌', '계속', '지금', '최고', '영상', '처음',
    '축하', '대박', '이건', '당신', '제발', '항상', '아주', '다음',
    '정도', '모두', '보고', '그냥', '다시', '그것', '역시', '점점',
    '오늘', '요즘', '가장', '부분', '전부', '제일', '너머', '내용',
    '뭔가', '모습', '근데', '너무', '아니', '사람들', '같아요', '데리',
    '여러분', '세상', '자기', '다른',
]

# Proper nouns that should be kept whole even when the tagger returns only
# a fragment of them.
custom_nouns = [
    '에스파',
    '조회수',
    '하이브',
    '어도어',
    'BTS',
    '아이브',
    '르세라핌',
]

# Extract only the nouns from a piece of text.
def extract_nouns(text, stopwords=stopwords, custom_nouns=custom_nouns):
    """Return the unique nouns found in ``text``.

    Nouns come from ``okt.nouns(text)``. A noun that is a substring of a
    ``custom_nouns`` entry which itself occurs in ``text`` is replaced by that
    entry (the first matching entry in ``custom_nouns`` order wins), so a
    proper noun that the tagger split into pieces is restored.

    Args:
        text: The comment text. Anything that is not a non-empty ``str``
            (for example the NaN pandas uses for a missing comment) yields
            an empty list instead of being passed to the tagger.
        stopwords: Iterable of nouns to discard.
        custom_nouns: Iterable of proper nouns to restore.

    Returns:
        A list of distinct nouns longer than one character that are not stop
        words, in order of first appearance.
    """
    # Missing comments arrive from pandas as NaN (a float); there is nothing to tag.
    if not isinstance(text, str) or not text:
        return []

    # Noun extraction via Okt.
    nouns = okt.nouns(text)

    # Only proper nouns that actually occur in this text can replace a noun;
    # this does not depend on the individual noun, so compute it once.
    present_customs = [custom for custom in custom_nouns if custom in text]
    stopword_set = set(stopwords)

    # A dict keeps insertion order, giving a reproducible, order-preserving
    # de-duplication (a plain set would return the words in arbitrary order).
    final_nouns = {}
    for noun in nouns:
        # Replace the noun with the first present proper noun that contains it.
        token = next((custom for custom in present_customs if noun in custom), noun)
        # Keep only tokens of two or more characters that are not stop words.
        if len(token) > 1 and token not in stopword_set:
            final_nouns[token] = None

    return list(final_nouns)

# Smoke test: print the extracted nouns for two sample comments.
for sample_comment in (
    "신우석 감독님 폼 미치셨네요;;ㄹㅇ 이 노래로 큐피트를 표현할 생각을 하시다니",
    "에스파 조회수 1억뷰 돌파",
):
    print(extract_nouns(sample_comment))

['큐피트', '신우석', '표현', '노래', '감독']
['에스파', '돌파', '조회수']


In [8]:
import os
import pandas as pd
import concurrent.futures
from tqdm import tqdm  # tqdm 함수 불러오기

def extract_nouns_batch(dataframe, folder_path, batch_size=5, initialize=True, verbose=True):
    """Run ``extract_nouns`` over ``dataframe['comment']`` in batches, checkpointing to disk.

    Each processed batch is appended to ``<folder_path>/processing.csv`` with an
    added ``word_list`` column. After the rows are on disk, the index label of
    the batch's last row is written to ``<folder_path>/process_status.txt`` as
    ``status : <label>``.

    Processing always starts at the first row of ``dataframe``. To resume an
    interrupted run, pass only the rows that have not been processed yet.

    Args:
        dataframe: Frame with a ``comment`` column.
        folder_path: Directory for the checkpoint files; created if missing.
        batch_size: Number of rows per batch; must be at least 1.
        initialize: If true, ``processing.csv`` is reset to a header-only file
            first. If false and the file exists, new rows are appended to it.
        verbose: If true, print the row range of every batch.

    Returns:
        None. Results are written to ``processing.csv``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Create the output directory if it does not exist yet.
    os.makedirs(folder_path, exist_ok=True)

    temp_file_path = os.path.join(folder_path, 'processing.csv')
    status_path = os.path.join(folder_path, 'process_status.txt')

    # Column layout of the rows this run produces.
    out_columns = list(dataframe.columns)
    if 'word_list' not in out_columns:
        out_columns.append('word_list')

    # When appending to an existing checkpoint, read only its header.
    existing_columns = None
    if not initialize and os.path.exists(temp_file_path):
        try:
            existing_columns = list(pd.read_csv(temp_file_path, nrows=0).columns)
        except pd.errors.EmptyDataError:
            existing_columns = None  # blank file: treat it as if it were missing

    if existing_columns is None:
        # Fresh start: write a header-only file.
        pd.DataFrame(columns=out_columns).to_csv(temp_file_path, index=False)
    else:
        # Keep the existing column order and add any columns it lacks.
        merged_columns = existing_columns + [c for c in out_columns if c not in existing_columns]
        if merged_columns != existing_columns:
            # One-off rewrite so the header matches the rows appended below.
            pd.read_csv(temp_file_path).reindex(columns=merged_columns).to_csv(
                temp_file_path, index=False
            )
        out_columns = merged_columns

    # Total number of batches (ceiling division).
    total_rows = len(dataframe)
    batch_time = (total_rows + batch_size - 1) // batch_size

    # One thread pool is reused for all batches instead of creating one per batch.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for n_try in tqdm(range(batch_time)):
            # Slice the current batch.
            start_idx = n_try * batch_size
            end_idx = min(start_idx + batch_size, total_rows)
            if verbose:
                print(f"Processing batch: {start_idx} to {end_idx}")

            frac = dataframe.iloc[start_idx:end_idx].copy()
            if frac.empty:
                continue

            # Noun extraction; executor.map returns results in input order.
            frac['word_list'] = list(executor.map(extract_nouns, frac['comment']))

            # Append only this batch; the file is never re-read or rewritten
            # inside the loop.
            frac.reindex(columns=out_columns).to_csv(
                temp_file_path, mode='a', header=False, index=False
            )

            # Record progress only after the rows are on disk, so the status
            # file never points past what processing.csv contains.
            with open(status_path, 'w', encoding='utf-8') as f:
                f.write(f"status : {frac.index[-1]}")  # last index label of the batch

In [9]:
# Read the last processed index recorded in process_status.txt
# (the file content has the form "status : <index>").
with open('process_status.txt', 'r', encoding='utf-8') as f:
    content = f.readline()
    last_idx = int(content.split()[2])
    print(last_idx)

# Check that the comment at last_idx is the last comment saved in processing2.csv.
# NOTE(review): last_idx is used as a position here; this matches the recorded
# index label only while comments_df keeps its default 0..n-1 index.
start_idx = last_idx + 1
last_comment = comments_df.iloc[last_idx]['comment_id_key']
# Only the key column is needed for the comparison.
saved_comment = pd.read_csv('processing2.csv', usecols=['comment_id_key']).iloc[-1]['comment_id_key']

if last_comment == saved_comment:
    print('OK - proceed to next stage')
elif pd.isna(last_comment) and pd.isna(saved_comment):
    # NaN never compares equal to NaN, so two missing keys cannot confirm
    # (or refute) that the rows are the same.
    print('WARNING - comment_id_key is missing on both rows; checkpoint could not be verified')
else:
    # Continuing from a wrong offset would skip or duplicate comments.
    raise ValueError(
        f"Checkpoint mismatch: comments_df row {last_idx} has comment_id_key "
        f"{last_comment!r}, but processing2.csv ends with {saved_comment!r}"
    )

# Rows that still have to go through the batch processor.
comments_df2 = comments_df.iloc[start_idx:]

636198
OK - proceed to next stage


In [10]:
# Run the batch extractor.
import time
folder_path = '/Users/jaesolshin/Documents/GitHub/trendpop/'
# Process only the remaining rows (comments_df2). Passing the full comments_df
# with initialize=False would start again at row 0 and append rows that are
# already saved in processing2.csv.
extract_nouns_batch(comments_df2, folder_path, batch_size=100, initialize=False, verbose=True)
# Sanity check: the rows in both checkpoint files together should add up to
# the number of rows in comments_df.
print(len(pd.read_csv('processing.csv'))+len(pd.read_csv('processing2.csv')))

  0%|          | 0/119 [00:00<?, ?it/s]

Processing batch: 0 to 100


  1%|          | 1/119 [00:01<02:07,  1.08s/it]

Processing batch: 100 to 200


  2%|▏         | 2/119 [00:02<02:02,  1.05s/it]

Processing batch: 200 to 300


  3%|▎         | 3/119 [00:03<02:00,  1.04s/it]

Processing batch: 300 to 400


  3%|▎         | 4/119 [00:04<01:59,  1.04s/it]

Processing batch: 400 to 500


  4%|▍         | 5/119 [00:05<01:57,  1.03s/it]

Processing batch: 500 to 600


  5%|▌         | 6/119 [00:06<01:55,  1.02s/it]

Processing batch: 600 to 700


  6%|▌         | 7/119 [00:07<01:57,  1.05s/it]

Processing batch: 700 to 800


  7%|▋         | 8/119 [00:08<01:56,  1.05s/it]

Processing batch: 800 to 900


  8%|▊         | 9/119 [00:09<01:54,  1.04s/it]

Processing batch: 900 to 1000


  8%|▊         | 10/119 [00:10<01:53,  1.04s/it]

Processing batch: 1000 to 1100


  9%|▉         | 11/119 [00:11<01:49,  1.02s/it]

Processing batch: 1100 to 1200


 10%|█         | 12/119 [00:12<01:48,  1.02s/it]

Processing batch: 1200 to 1300


 11%|█         | 13/119 [00:13<01:47,  1.02s/it]

Processing batch: 1300 to 1400


 12%|█▏        | 14/119 [00:14<01:48,  1.03s/it]

Processing batch: 1400 to 1500


 13%|█▎        | 15/119 [00:15<01:47,  1.04s/it]

Processing batch: 1500 to 1600


 13%|█▎        | 16/119 [00:16<01:45,  1.02s/it]

Processing batch: 1600 to 1700


 14%|█▍        | 17/119 [00:17<01:43,  1.01s/it]

Processing batch: 1700 to 1800


 15%|█▌        | 18/119 [00:18<01:42,  1.01s/it]

Processing batch: 1800 to 1900


 16%|█▌        | 19/119 [00:19<01:42,  1.02s/it]

Processing batch: 1900 to 2000


 17%|█▋        | 20/119 [00:20<01:40,  1.02s/it]

Processing batch: 2000 to 2100


 18%|█▊        | 21/119 [00:21<01:41,  1.04s/it]

Processing batch: 2100 to 2200


 18%|█▊        | 22/119 [00:22<01:41,  1.05s/it]

Processing batch: 2200 to 2300


 19%|█▉        | 23/119 [00:23<01:40,  1.05s/it]

Processing batch: 2300 to 2400


 20%|██        | 24/119 [00:24<01:39,  1.04s/it]

Processing batch: 2400 to 2500


 21%|██        | 25/119 [00:25<01:35,  1.02s/it]

Processing batch: 2500 to 2600


 22%|██▏       | 26/119 [00:26<01:35,  1.02s/it]

Processing batch: 2600 to 2700


 23%|██▎       | 27/119 [00:27<01:33,  1.01s/it]

Processing batch: 2700 to 2800


 24%|██▎       | 28/119 [00:28<01:32,  1.02s/it]

Processing batch: 2800 to 2900


 24%|██▍       | 29/119 [00:29<01:31,  1.01s/it]

Processing batch: 2900 to 3000


 25%|██▌       | 30/119 [00:30<01:30,  1.02s/it]

Processing batch: 3000 to 3100


 26%|██▌       | 31/119 [00:31<01:27,  1.00it/s]

Processing batch: 3100 to 3200


 27%|██▋       | 32/119 [00:32<01:27,  1.00s/it]

Processing batch: 3200 to 3300


 28%|██▊       | 33/119 [00:33<01:25,  1.00it/s]

Processing batch: 3300 to 3400


 29%|██▊       | 34/119 [00:34<01:24,  1.00it/s]

Processing batch: 3400 to 3500


 29%|██▉       | 35/119 [00:35<01:23,  1.01it/s]

Processing batch: 3500 to 3600


 30%|███       | 36/119 [00:36<01:24,  1.01s/it]

Processing batch: 3600 to 3700


 31%|███       | 37/119 [00:37<01:23,  1.02s/it]

Processing batch: 3700 to 3800


 32%|███▏      | 38/119 [00:38<01:21,  1.01s/it]

Processing batch: 3800 to 3900


 33%|███▎      | 39/119 [00:39<01:20,  1.01s/it]

Processing batch: 3900 to 4000


 34%|███▎      | 40/119 [00:40<01:19,  1.00s/it]

Processing batch: 4000 to 4100


 34%|███▍      | 41/119 [00:41<01:18,  1.01s/it]

Processing batch: 4100 to 4200


 35%|███▌      | 42/119 [00:42<01:16,  1.00it/s]

Processing batch: 4200 to 4300


 36%|███▌      | 43/119 [00:43<01:15,  1.01it/s]

Processing batch: 4300 to 4400


 37%|███▋      | 44/119 [00:44<01:15,  1.01s/it]

Processing batch: 4400 to 4500


 38%|███▊      | 45/119 [00:45<01:14,  1.01s/it]

Processing batch: 4500 to 4600


 39%|███▊      | 46/119 [00:46<01:13,  1.01s/it]

Processing batch: 4600 to 4700


 39%|███▉      | 47/119 [00:47<01:12,  1.01s/it]

Processing batch: 4700 to 4800


 40%|████      | 48/119 [00:48<01:12,  1.02s/it]

Processing batch: 4800 to 4900


 41%|████      | 49/119 [00:49<01:11,  1.02s/it]

Processing batch: 4900 to 5000


 42%|████▏     | 50/119 [00:50<01:09,  1.01s/it]

Processing batch: 5000 to 5100


 43%|████▎     | 51/119 [00:51<01:09,  1.02s/it]

Processing batch: 5100 to 5200


 44%|████▎     | 52/119 [00:52<01:06,  1.01it/s]

Processing batch: 5200 to 5300


 45%|████▍     | 53/119 [00:53<01:06,  1.01s/it]

Processing batch: 5300 to 5400


 45%|████▌     | 54/119 [00:55<01:06,  1.02s/it]

Processing batch: 5400 to 5500


 46%|████▌     | 55/119 [00:56<01:04,  1.01s/it]

Processing batch: 5500 to 5600


 47%|████▋     | 56/119 [00:57<01:04,  1.02s/it]

Processing batch: 5600 to 5700


 48%|████▊     | 57/119 [00:58<01:02,  1.02s/it]

Processing batch: 5700 to 5800


 49%|████▊     | 58/119 [00:59<01:02,  1.02s/it]

Processing batch: 5800 to 5900


 50%|████▉     | 59/119 [01:00<01:00,  1.01s/it]

Processing batch: 5900 to 6000


 50%|█████     | 60/119 [01:01<01:00,  1.03s/it]

Processing batch: 6000 to 6100


 51%|█████▏    | 61/119 [01:02<01:00,  1.03s/it]

Processing batch: 6100 to 6200


 52%|█████▏    | 62/119 [01:03<00:58,  1.03s/it]

Processing batch: 6200 to 6300


 53%|█████▎    | 63/119 [01:04<00:57,  1.03s/it]

Processing batch: 6300 to 6400


 54%|█████▍    | 64/119 [01:05<00:57,  1.04s/it]

Processing batch: 6400 to 6500


 55%|█████▍    | 65/119 [01:06<00:56,  1.05s/it]

Processing batch: 6500 to 6600


 55%|█████▌    | 66/119 [01:07<00:55,  1.05s/it]

Processing batch: 6600 to 6700


 56%|█████▋    | 67/119 [01:08<00:54,  1.06s/it]

Processing batch: 6700 to 6800


 57%|█████▋    | 68/119 [01:09<00:54,  1.06s/it]

Processing batch: 6800 to 6900


 58%|█████▊    | 69/119 [01:10<00:52,  1.06s/it]

Processing batch: 6900 to 7000


 59%|█████▉    | 70/119 [01:11<00:52,  1.06s/it]

Processing batch: 7000 to 7100


 60%|█████▉    | 71/119 [01:12<00:50,  1.05s/it]

Processing batch: 7100 to 7200


 61%|██████    | 72/119 [01:13<00:48,  1.04s/it]

Processing batch: 7200 to 7300


 61%|██████▏   | 73/119 [01:14<00:48,  1.05s/it]

Processing batch: 7300 to 7400


 62%|██████▏   | 74/119 [01:15<00:47,  1.05s/it]

Processing batch: 7400 to 7500


 63%|██████▎   | 75/119 [01:16<00:46,  1.06s/it]

Processing batch: 7500 to 7600


 64%|██████▍   | 76/119 [01:18<00:45,  1.06s/it]

Processing batch: 7600 to 7700


 65%|██████▍   | 77/119 [01:19<00:44,  1.06s/it]

Processing batch: 7700 to 7800


 66%|██████▌   | 78/119 [01:20<00:43,  1.05s/it]

Processing batch: 7800 to 7900


 66%|██████▋   | 79/119 [01:21<00:42,  1.06s/it]

Processing batch: 7900 to 8000


 67%|██████▋   | 80/119 [01:22<00:41,  1.07s/it]

Processing batch: 8000 to 8100


 68%|██████▊   | 81/119 [01:23<00:40,  1.07s/it]

Processing batch: 8100 to 8200


 69%|██████▉   | 82/119 [01:24<00:38,  1.05s/it]

Processing batch: 8200 to 8300


 70%|██████▉   | 83/119 [01:25<00:37,  1.06s/it]

Processing batch: 8300 to 8400


 71%|███████   | 84/119 [01:26<00:36,  1.05s/it]

Processing batch: 8400 to 8500


 71%|███████▏  | 85/119 [01:27<00:35,  1.05s/it]

Processing batch: 8500 to 8600


 72%|███████▏  | 86/119 [01:28<00:34,  1.04s/it]

Processing batch: 8600 to 8700


 73%|███████▎  | 87/119 [01:29<00:33,  1.05s/it]

Processing batch: 8700 to 8800


 74%|███████▍  | 88/119 [01:30<00:32,  1.06s/it]

Processing batch: 8800 to 8900


 75%|███████▍  | 89/119 [01:31<00:31,  1.05s/it]

Processing batch: 8900 to 9000


 76%|███████▌  | 90/119 [01:32<00:30,  1.06s/it]

Processing batch: 9000 to 9100


 76%|███████▋  | 91/119 [01:33<00:29,  1.06s/it]

Processing batch: 9100 to 9200


 77%|███████▋  | 92/119 [01:34<00:28,  1.04s/it]

Processing batch: 9200 to 9300


 78%|███████▊  | 93/119 [01:35<00:27,  1.04s/it]

Processing batch: 9300 to 9400


 79%|███████▉  | 94/119 [01:37<00:26,  1.06s/it]

Processing batch: 9400 to 9500


 80%|███████▉  | 95/119 [01:38<00:25,  1.05s/it]

Processing batch: 9500 to 9600


 81%|████████  | 96/119 [01:39<00:24,  1.06s/it]

Processing batch: 9600 to 9700


 82%|████████▏ | 97/119 [01:40<00:23,  1.05s/it]

Processing batch: 9700 to 9800


 82%|████████▏ | 98/119 [01:41<00:22,  1.05s/it]

Processing batch: 9800 to 9900


 83%|████████▎ | 99/119 [01:42<00:21,  1.05s/it]

Processing batch: 9900 to 10000


 84%|████████▍ | 100/119 [01:43<00:19,  1.05s/it]

Processing batch: 10000 to 10100


 85%|████████▍ | 101/119 [01:44<00:19,  1.06s/it]

Processing batch: 10100 to 10200


 86%|████████▌ | 102/119 [01:45<00:17,  1.04s/it]

Processing batch: 10200 to 10300


 87%|████████▋ | 103/119 [01:46<00:16,  1.05s/it]

Processing batch: 10300 to 10400


 87%|████████▋ | 104/119 [01:47<00:15,  1.05s/it]

Processing batch: 10400 to 10500


 88%|████████▊ | 105/119 [01:48<00:14,  1.05s/it]

Processing batch: 10500 to 10600


 89%|████████▉ | 106/119 [01:49<00:13,  1.05s/it]

Processing batch: 10600 to 10700


 90%|████████▉ | 107/119 [01:50<00:12,  1.05s/it]

Processing batch: 10700 to 10800


 91%|█████████ | 108/119 [01:51<00:11,  1.06s/it]

Processing batch: 10800 to 10900


 92%|█████████▏| 109/119 [01:52<00:10,  1.06s/it]

Processing batch: 10900 to 11000


 92%|█████████▏| 110/119 [01:53<00:09,  1.06s/it]

Processing batch: 11000 to 11100


 93%|█████████▎| 111/119 [01:54<00:08,  1.05s/it]

Processing batch: 11100 to 11200


 94%|█████████▍| 112/119 [01:55<00:07,  1.05s/it]

Processing batch: 11200 to 11300


 95%|█████████▍| 113/119 [01:56<00:06,  1.05s/it]

Processing batch: 11300 to 11400


 96%|█████████▌| 114/119 [01:58<00:05,  1.06s/it]

Processing batch: 11400 to 11500


 97%|█████████▋| 115/119 [01:59<00:04,  1.08s/it]

Processing batch: 11500 to 11600


 97%|█████████▋| 116/119 [02:00<00:03,  1.08s/it]

Processing batch: 11600 to 11700


 98%|█████████▊| 117/119 [02:01<00:02,  1.08s/it]

Processing batch: 11700 to 11800


 99%|█████████▉| 118/119 [02:02<00:01,  1.08s/it]

Processing batch: 11800 to 11861


100%|██████████| 119/119 [02:03<00:00,  1.04s/it]


648060


In [11]:
# Display the single row at position 501999 as a Series.
comments_df.iloc[501999]

Group                                                      NewJeans
Title                                                           OMG
Unnamed: 0                                                    62619
comment_type                                                  reply
comment_id_key                                                  NaN
parent_id_key                            Ugzb3AmPpDAzi_9Ced94AaABAg
comment           OMG  ETA 같이 스밍해주세요. 한 곡들으면 집계안된다함!!!! 좋아요 댓글달아...
author                                               @Bemyself_NINI
date                                           2024-04-28T16:02:13Z
likes                                                             6
Name: 501999, dtype: object