# 파일명 변경 및 정크 데이터 삭제

In [42]:
import os
import re
from langdetect import detect
from mutagen.mp3 import MP3
from datetime import datetime  # datetime 클래스를 명시적으로 임포트

directory = './data/'  # folder that holds the MP3 files

# Collect the names of all '.mp3' entries in `directory`, ordered by
# os.path.getctime (ascending).
mp3_files = sorted(
    (entry for entry in os.listdir(directory) if entry.endswith('.mp3')),
    key=lambda entry: os.path.getctime(os.path.join(directory, entry)),
)

def clean_filename(filename):
    """Return *filename* with every disallowed character deleted.

    A character is kept only if it matches ``\\w`` (word characters,
    including the underscore), ``\\s`` (whitespace) or is a hyphen.
    For ``str`` patterns ``\\w`` is Unicode-aware, so non-ASCII letters
    are kept as well.  Everything else -- including ``.`` -- is removed,
    so a trailing ".mp3" comes back as "mp3".
    """
    disallowed = re.compile(r'[^\w\s-]')
    return disallowed.sub('', filename)

# Process every listed MP3: delete junk files, rename the ones that are kept.
# The index comes from enumerate(), so deleted or failed files still consume
# a number and the final numbering can contain gaps.
for index, filename in enumerate(mp3_files, 1):
    try:
        file_path = os.path.join(directory, filename)
        audio = MP3(file_path)

        # Junk filter 1: delete files whose name is not detected as English.
        if detect(filename) != 'en':
            os.remove(file_path)
            continue

        # Junk filter 2: delete recordings shorter than 7 minutes (420 s).
        if audio.info.length < 420:
            os.remove(file_path)
            continue

        # Split the extension off BEFORE cleaning. clean_filename() deletes
        # the '.', so cleaning the full name would fuse "mp3" into the title
        # and produce names ending in "mp3.mp3".
        extension = '.mp3'
        if filename.endswith(extension):
            stem = filename[:-len(extension)]
        else:
            stem = filename
        clean_name = clean_filename(stem)

        # Date stamp in compact YYYYMMDD form, taken from the file's
        # modification time.
        # NOTE(review): the listing is ordered by ctime while the stamp uses
        # mtime -- confirm that mtime is the intended date source.
        creation_time = datetime.fromtimestamp(os.path.getmtime(file_path)).strftime('%Y%m%d')

        # New name layout: <index>_<date>_<title>.mp3
        # NOTE(review): names written here already end in a single ".mp3", so
        # a follow-up pass that trims a fixed-width "mp3.mp3" tail is no
        # longer needed for (and must not be applied to) these files.
        new_filename = f"{index}_{creation_time}_{clean_name}{extension}"
        new_file_path = os.path.join(directory, new_filename)
        os.rename(file_path, new_file_path)

    except Exception as e:
        # Best-effort batch job: report the failure and move on to the next file.
        print(f"Error processing file {filename}: {e}")


In [44]:
import os

# Repair names whose extension dot was stripped during renaming, leaving a
# tail of "mp3.mp3" (e.g. "titlemp3.mp3" -> "title.mp3").
fused_suffix = 'mp3.mp3'
for filename in os.listdir(directory):
    # Only touch names that really carry the fused tail. Trimming a fixed
    # 7 characters from every "*mp3" name would eat title characters from
    # files that are already correct, and would corrupt names on a re-run.
    if not filename.endswith(fused_suffix):
        continue

    correct_filename = filename[:-len(fused_suffix)] + '.mp3'
    os.rename(os.path.join(directory, filename), os.path.join(directory, correct_filename))


In [41]:
import os

def remove_specific_part(filename, part_to_remove):
    """Return *filename* with the first occurrence of *part_to_remove* cut out.

    Only the leftmost match is removed; later occurrences are kept.  When
    *part_to_remove* does not occur (or is empty) the name is returned
    unchanged.
    """
    start = filename.find(part_to_remove)
    if start == -1:
        return filename
    end = start + len(part_to_remove)
    return filename[:start] + filename[end:]

# Strip a leading "<index>_<date>_" prefix from every MP3 in `directory`.
for filename in os.listdir(directory):
    if not filename.endswith('.mp3'):
        continue

    parts = filename.split('_')

    # Only strip when the name really starts with "<index>_<date>_".
    # Without this check the first two underscore-separated fields of ANY
    # name would be removed, which eats title text on files that have no
    # prefix (for example when this cell is run a second time).
    has_prefix = (
        len(parts) >= 3
        and parts[0].isdigit()
        and parts[1].replace('-', '').isdigit()
    )
    if not has_prefix:
        continue

    # The part to remove is the first two fields plus their trailing '_'.
    part_to_remove = '_'.join(parts[0:2]) + '_'
    new_filename = remove_specific_part(filename, part_to_remove)

    source_path = os.path.join(directory, filename)
    target_path = os.path.join(directory, new_filename)

    # Two files can share a title once the index is gone; os.rename would
    # silently replace the existing file on POSIX, so skip instead.
    if os.path.exists(target_path):
        print(f"Skipping {filename}: {new_filename} already exists")
        continue

    os.rename(source_path, target_path)


# 위스퍼를 통해 텍스트화

In [1]:
import os
import natsort
from tqdm import tqdm
import time
import stable_whisper

directory_path = "./youtube/_data"          # tree that is searched for .mp3 files
model_path = "./youtube/_whisper_model"     # model location passed to load_faster_whisper
transcribe_dir = "./youtube/_transcribe"    # where the .txt transcripts are written

# Make sure the output folder exists; otherwise the first open(..., 'w')
# below fails with FileNotFoundError.
os.makedirs(transcribe_dir, exist_ok=True)

# Load Whisper model
model = stable_whisper.load_faster_whisper(model_path, device="cuda", compute_type="float16")

# Walk through the directory tree; one progress bar is shown per folder.
for root, _dirs, files in os.walk(directory_path):
    # Keep only the .mp3 files so the progress-bar total matches the number
    # of update() calls (counting every file left the bar short of 100 %
    # whenever a folder contained anything other than .mp3 files).
    audio_files = natsort.natsorted([name for name in files if name.endswith(".mp3")])
    if not audio_files:
        continue

    with tqdm(total=len(audio_files), desc="Processing files", unit="file") as pbar:
        for file in audio_files:
            video_file_path = os.path.join(root, file)
            video_filename = os.path.splitext(file)[0]
            txt_file_path = os.path.join(transcribe_dir, f"{video_filename}.txt")

            # An existing transcript means this file was handled in an earlier
            # run; skip it silently (printing here would break up the bar).
            if not os.path.exists(txt_file_path):
                # Transcribe the audio file.
                result = model.transcribe_stable(video_file_path, vad=True, language="en", verbose=False)

                # Save the transcript text.
                with open(txt_file_path, 'w', encoding='utf-8') as txt_file:
                    txt_file.write(result.text)

                # tqdm.write keeps the message from overwriting the bar.
                tqdm.write(f"Processing {file}")

            # Count the file as done whether it was transcribed or skipped.
            pbar.update(1)


Processing files:   4%|▍         | 133/2979 [00:00<00:03, 747.98file/s]

Skipping 1_20201220_Market equilibrium  Supply demand and market equilibrium  Microeconomics  Khan Academy PEMkfgrifDw.txt as it already exists.
Skipping 2_20190806_How To Calculate Market Share in Excel 5VmTJ9LrLek.txt as it already exists.
Skipping 3_20170606_What are derivatives - MoneyWeek Investment Tutorials Wjlw7ZpZVK4.txt as it already exists.
Skipping 4_20230109_Stock Market Video Continued Upside Expected 3xvzZWw0AuI.txt as it already exists.
Skipping 5_20190804_Stock Market Video Live Trade Analysis For Profit SA3iAiIuEmA.txt as it already exists.
Skipping 6_20231003_PHP Stock Market Analyzer - 3 - Connecting to the Database 7Ub8uHHTTwo.txt as it already exists.
Skipping 7_20231012_PHP Stock Market Analyzer - 6 - Downloading and Saving Data WgE-PdJUVsg.txt as it already exists.
Skipping 8_20171013_PHP Stock Market Analyzer - 15 - Getting Tomorrows Date dEWNTqEr0lQ.txt as it already exists.
Skipping 9_20190726_Stock Market Video Earnings On Tap Light Volume Float 8wOPR-d9akA.

Processing files:   9%|▉         | 275/2979 [00:00<00:03, 756.46file/s]

Skipping 137_20210729_STOCK MARKET TRADING TRADING THE TREND REVERSAL IS AS EASY AS 1 2 3 suQH16fYX_E.txt as it already exists.
Skipping 138_20230821_Stock Market Analysis for Week Ending June 7 2013 7cJ-3Zo1_5E.txt as it already exists.
Skipping 139_20190716_Stock Market Analysis for Week Ending June 21 2013 3dxTMAkpYJU.txt as it already exists.
Skipping 140_20200125_How to make Big Pips Target trading the Forex Stock Market marketfy cKhprVKmfjE.txt as it already exists.
Skipping 141_20231114_What Is a Hot Sector and Industry in Stock Market I Technical Analysis on Whiteboard _IgZweSArcc.txt as it already exists.
Skipping 142_20230819_Stock Market Analysis for First Half of 2013 FyXgdi1SJg0.txt as it already exists.
Skipping 143_20150404_Big Story - Local Algos Dominate Indian Stock Market Xxg9Bc6RQGg.txt as it already exists.
Skipping 144_20171003_Stock Market Meltdown How to Prepare Yourself for the Upcoming Stock Market Crash AYSwiyjTCMg.txt as it already exists.
Skipping 145_20170

Processing files:  14%|█▍        | 422/2979 [00:00<00:03, 775.59file/s]

Skipping 282_20190805_Turning Bullish  Stock Market Video aR1aw6GI_L0.txt as it already exists.
Skipping 283_20171004_How To Profit From A US Stock Market Holiday i-MZ2fOxZ0Q.txt as it already exists.
Skipping 284_20171004_Stocks Course for Dummies Definitions  What Must You Know Stock Market Course 102 Part 0210 lSR_FHhWYuk.txt as it already exists.
Skipping 285_20210214_US Stock Market Hits Another RECORD HIGH as Greece Exit Fear Increases z4S3InyxE2A.txt as it already exists.
Skipping 286_20230825_Stock Market Analysis for Week Ending Feb 20 2015 5syFY2SeRbU.txt as it already exists.
Skipping 287_20181112_stock returns regression in excel E4BGGpsQrOM.txt as it already exists.
Skipping 288_20221224_US Dollar  Stock Market BOOM QE and Negative Interest Rates Go VIRAL uVM1d386CXw.txt as it already exists.
Skipping 289_20181002_Capitalism and the Dutch East India Company Crash Course World History 229 zPIhMJGWiM8.txt as it already exists.
Skipping 290_20230920_Annuity Education Can a Fi

Processing files:  19%|█▊        | 557/2979 [00:00<00:03, 741.27file/s]

Skipping 430_20160527_What news will be a catalyst for the stock market jrVVSBNSev0.txt as it already exists.
Skipping 431_20160528_Buyers In Control   Stock Market Analysis May 27 2016 PdiEcrPoWac.txt as it already exists.
Skipping 433_20210701_Startup Funding Explained Everything You Need to Know 677ZtSMr4-4.txt as it already exists.
Skipping 434_20160609_China Stock Market 13 FLASH CRASH Heres Why UtKoxLPb9SQ.txt as it already exists.
Skipping 435_20160615_Stock Market Trading Routine - 7 day game plan LvwjfP-cILE.txt as it already exists.
Skipping 436_20160621_Forex investment Stock Market and Forex Trading Stocks Forex Investments Forex Trader OeOz58RvtKg.txt as it already exists.
Skipping 437_20160622_Wyckoff Stock Market Techniques - Point and Figure Charts 2UzxYjzRM1c.txt as it already exists.
Skipping 438_20181112_How to Read Stock Market Charts - What is Technical Analysis G8cKZrMvWHc.txt as it already exists.
Skipping 439_20160713_Stock Market Hits ALL-TIME HIGH Japan New QE

                                                                       

Skipping 566_20170607_Stock Market Analysis June 2 2017 dmYKF-MeTZw.txt as it already exists.
Skipping 567_20201104_Stock Market Weekly Recap June 2 2017 -9ARNMvWOFQ.txt as it already exists.
Skipping 568_20170605_Is Technical Analysis enough for trading In Hindi  Bazaar Bites Episode-36  Sunil Minglani KGRDamI2ANk.txt as it already exists.
Skipping 569_20220804_What are Indices  How to Trade Them 4OB9YpIcAO8.txt as it already exists.
Skipping 570_20170607_Highlights from Stock Pe Charcha Event  Sunil Minglani  Stock Market Education vxNIJh0FXK0.txt as it already exists.
Skipping 571_20170610_Stock Market Ready to COLLAPSE According To Bill Gross Heres Why fnN8vJ1Lr5I.txt as it already exists.
Skipping 572_20170610_12 Best Books on Stock Market for Beginners in India nS9K2NSt75Y.txt as it already exists.
Skipping 573_20170618_These Charts PROVE Stock Market Manipulation Most Convincing Chart Ive Seen OwaY_4JXpdY.txt as it already exists.
Skipping 574_20170612_Robinhood APP - How to INV

Processing files:  29%|██▉       | 862/2979 [00:01<00:02, 796.66file/s]

Skipping 726_20180321_Why ONLY 1 SHARE of a Stock  Stock Market Portfolio Explained N42HXtzP3RQ.txt as it already exists.
Skipping 727_20180318_Technical Analysis of Stock Market  the Magazine Cover hBNtkTnYYlM.txt as it already exists.
Skipping 728_20181029_WHAT A STOCK MARKET CRASH LOOKS LIKE  iT4e41sSodU.txt as it already exists.
Skipping 729_20180321_What is the Most Important Thing in Your Life  Sankarsh Chanda  TEDxNMIMSBangalore BrPt4FfF0oc.txt as it already exists.
Skipping 730_20180321_5 Reasons TO BUY a Stock  Stock Market Investing dNqIkI6b7y8.txt as it already exists.
Skipping 731_20180322_SWOT Analysis for Stocks HINDI s94Fnc1wwHI.txt as it already exists.
Skipping 732_20180323_Stock Market Cycles - askSlim Special Presentation 032218 PmT2oTTF8NE.txt as it already exists.
Skipping 733_20180323_Stock Market in Japan 1989 What happened after rising interest rates ffoCiG-YOxw.txt as it already exists.
Skipping 734_20190729_39 Stock Market Statistics That Will Blow Your Mind -

Processing files:  30%|██▉       | 890/2979 [00:01<00:02, 786.80file/s]

Skipping 874_20180930_5 CORPORATE ACTIONS AND ITS IMPACT ON STOCK PRICES ZiHFA9EDYVA.txt as it already exists.
Skipping 875_20180930_Stock Market Current Condition  Infibeam stock Down  share market rumor aNmPsjcbJpk.txt as it already exists.
Skipping 876_20180930_BEST WEBSITE for Stock Market  How to Use Economic Times for Stock Market  Tamil Share rrTYh3HzTQE.txt as it already exists.
Skipping 877_20181004_Technical Analysis of Stock Market  High-Lows 5880uN1ULTA.txt as it already exists.
Skipping 878_20181004_Stock Market Updates and NEWS 4th October 2018  Tamil Share YDIGxSVN3PI.txt as it already exists.
Skipping 879_20200102_Resources to Start Coding Trading Algorithms vkljN8jeWV0.txt as it already exists.
Skipping 880_20190421_How AI Traders Will Dominate Hedge Fund Industry    Marshall Chang  TEDxBeaconStreetSalon lzaBbQKUtAA.txt as it already exists.
Skipping 881_20181005_3 Reasons The Stock Market Could CRASH 20 or More BkiKMjgkOX8.txt as it already exists.
Skipping 882_201810

