In [None]:
# First, install the necessary library
!pip install youtube_transcript_api


Collecting youtube_transcript_api
  Downloading youtube_transcript_api-1.0.3-py3-none-any.whl.metadata (23 kB)
Downloading youtube_transcript_api-1.0.3-py3-none-any.whl (2.2 MB)
   ---------------------------------------- 0.0/2.2 MB ? eta -:--:--
   -------------------------------------- - 2.1/2.2 MB 11.7 MB/s eta 0:00:01
   ---------------------------------------- 2.2/2.2 MB 11.1 MB/s eta 0:00:00
Installing collected packages: youtube_transcript_api
Successfully installed youtube_transcript_api-1.0.3


### METODE 1: YOUTUBE TRANSKRIP API TO TXT

In [3]:
import whisper
import os
import re
from pytube import YouTube
import datetime

def get_video_id(url):
    """Extract video ID from YouTube URL"""
    video_id_match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', url)
    if video_id_match:
        return video_id_match.group(1)
    return None

def download_youtube_audio(url, output_path="audio"):
    """Download audio from YouTube video"""
    try:
        # Create output directory if it doesn't exist
        if not os.path.exists(output_path):
            os.makedirs(output_path)
            
        # Create YouTube object
        yt = YouTube(url)
        
        # Get video details for file naming
        video_id = get_video_id(url)
        video_title = yt.title
        safe_title = re.sub(r'[^\w\-_]', '_', video_title)
        
        print(f"Downloading audio from: {video_title}")
        
        # Get audio stream and download
        audio_stream = yt.streams.filter(only_audio=True).first()
        output_file = audio_stream.download(output_path=output_path)
        
        # Rename to mp3
        base, ext = os.path.splitext(output_file)
        new_file = f"{base}.mp3"
        os.rename(output_file, new_file)
        
        print(f"Audio downloaded to: {new_file}")
        return new_file, safe_title
        
    except Exception as e:
        print(f"Error downloading YouTube audio: {e}")
        return None, None

def transcribe_audio(audio_path, language="id"):
    """Transcribe audio using OpenAI Whisper"""
    try:
        print(f"Loading Whisper model...")
        model = whisper.load_model("medium")  # You can change model size: tiny, base, small, medium, large
        
        print(f"Transcribing audio... (this may take some time)")
        result = model.transcribe(audio_path, language=language)
        
        return result["text"]
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return None

# Main process
def process_youtube_speech(youtube_url, language="id"):
    """Process YouTube video: download audio and transcribe"""
    # Download audio
    audio_file, video_title = download_youtube_audio(youtube_url)
    
    if not audio_file:
        return
    
    # Transcribe audio
    transcription = transcribe_audio(audio_file, language=language)
    
    if not transcription:
        return
    
    # Save transcription to file
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"Soekarno_{timestamp}.txt"
    
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(f"Source: {youtube_url}\n")
        f.write(f"Title: {video_title}\n")
        f.write(f"Transcribed: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write(transcription)
    
    print(f"✓ Transcription complete! Saved to {output_filename}")
    print("\nPreview:")
    print(transcription[:500] + "..." if len(transcription) > 500 else transcription)
    
    return output_filename

# Example usage
youtube_url = "https://www.youtube.com/watch?v=N_Cp4KBrRZw"  # Change to your Soekarno speech video
process_youtube_speech(youtube_url, language="id")

Error downloading YouTube audio: HTTP Error 400: Bad Request


### METHOD 2: OPEN AI WHISPER

In [1]:
# Install all required packages
!pip install openai-whisper ffmpeg-python pytube



In [3]:
!pip install yt-dlp

Collecting yt-dlp
  Downloading yt_dlp-2025.5.22-py3-none-any.whl.metadata (174 kB)
Downloading yt_dlp-2025.5.22-py3-none-any.whl (3.3 MB)
   ---------------------------------------- 0.0/3.3 MB ? eta -:--:--
   ------------------------- -------------- 2.1/3.3 MB 10.7 MB/s eta 0:00:01
   ---------------------------------------- 3.3/3.3 MB 9.6 MB/s eta 0:00:00
Installing collected packages: yt-dlp
Successfully installed yt-dlp-2025.5.22


In [4]:
# Option 1: Try to install FFmpeg via pip (simpler but may not always work)
!pip install ffmpeg-python

# Option 2: Try to install via conda if you're using Anaconda
# !conda install -c conda-forge ffmpeg



In [5]:
import whisper
import os
import re
from pytube import YouTube
import datetime
import yt_dlp

def get_video_id(url):
    """Extract video ID from YouTube URL"""
    video_id_match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', url)
    if video_id_match:
        return video_id_match.group(1)
    return None


def download_with_ytdlp(url, output_path="audio", ffmpeg_location=None):
    """Alternative download using yt-dlp with ffmpeg handling"""
    try:
        if not os.path.exists(output_path):
            os.makedirs(output_path)
            
        video_id = get_video_id(url)
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        output_file = f"{output_path}/audio_{video_id}_{timestamp}.mp3"
        
        ydl_opts = {
            'format': 'bestaudio/best',
            'outtmpl': output_file,
            'quiet': False,
            'no_warnings': False
        }
        
        # Add ffmpeg location if provided
        if ffmpeg_location:
            ydl_opts['ffmpeg_location'] = ffmpeg_location
            ydl_opts['postprocessors'] = [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }]
        
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
            video_title = info.get('title', f"video_{video_id}")
            
        print(f"Audio downloaded to: {output_file}")
        return output_file, video_title
        
    except Exception as e:
        print(f"Error downloading with yt-dlp: {e}")
        return None, None

def download_youtube_audio(url, output_path="audio"):
    """Download audio from YouTube video using pytube"""
    try:
        # Create output directory if it doesn't exist
        if not os.path.exists(output_path):
            os.makedirs(output_path)
            
        # Create YouTube object
        yt = YouTube(url)
        
        # Get video details for file naming
        video_id = get_video_id(url)
        video_title = yt.title
        safe_title = re.sub(r'[^\w\-_]', '_', video_title)
        
        print(f"Downloading audio from: {video_title}")
        
        # Get audio stream and download
        audio_stream = yt.streams.filter(only_audio=True).first()
        output_file = audio_stream.download(output_path=output_path)
        
        # Rename to mp3
        base, ext = os.path.splitext(output_file)
        new_file = f"{base}.mp3"
        os.rename(output_file, new_file)
        
        print(f"Audio downloaded to: {new_file}")
        return new_file, safe_title
        
    except Exception as e:
        print(f"Error with pytube: {e}")
        return None, None
def transcribe_audio(audio_path, language="id"):
    """Transcribe audio using OpenAI Whisper"""
    try:
        print(f"Loading Whisper model...")
        model = whisper.load_model("medium")  # You can change model size: tiny, base, small, medium, large
        
        print(f"Transcribing audio... (this may take some time)")
        result = model.transcribe(audio_path, language=language)
        
        return result["text"]
    except Exception as e:
        print(f"Error transcribing audio: {e}")
        return None

def process_youtube_speech(youtube_url, language="id"):
    """Process YouTube video: download audio and transcribe"""
    # Try downloading with pytube first
    audio_file, video_title = download_youtube_audio(youtube_url)
    
    # If pytube fails, try yt-dlp
    if not audio_file:
        audio_file, video_title = download_with_ytdlp(youtube_url)
    
    if not audio_file:
        print("Failed to download audio using both methods.")
        return
    
    # Rest of the function remains the same
    # Transcribe audio
    transcription = transcribe_audio(audio_file, language=language)
    
    if not transcription:
        return
    
    # Save transcription to file
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    output_filename = f"Soekarno_{timestamp}.txt"
    
    with open(output_filename, "w", encoding="utf-8") as f:
        f.write(f"Source: {youtube_url}\n")
        f.write(f"Title: {video_title}\n")
        f.write(f"Transcribed: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
        f.write(transcription)
    
    print(f"✓ Transcription complete! Saved to {output_filename}")
    print("\nPreview:")
    print(transcription[:500] + "..." if len(transcription) > 500 else transcription)
    
    return output_filename

# Example usage
youtube_url = "https://www.youtube.com/watch?v=bcIk9n6nRUo&ab_channel=HendriTeja"  # Change to your Soekarno speech video
process_youtube_speech(youtube_url, language="id")

Error with pytube: HTTP Error 400: Bad Request
[youtube] Extracting URL: https://www.youtube.com/watch?v=bcIk9n6nRUo&ab_channel=HendriTeja
[youtube] bcIk9n6nRUo: Downloading webpage
[youtube] bcIk9n6nRUo: Downloading tv client config
[youtube] bcIk9n6nRUo: Downloading tv player API JSON
[youtube] bcIk9n6nRUo: Downloading ios player API JSON
[youtube] bcIk9n6nRUo: Downloading m3u8 information
[info] bcIk9n6nRUo: Downloading 1 format(s): 251
[download] Destination: audio\audio_bcIk9n6nRUo_20250523_220126.mp3
[download] 100% of    2.25MiB in 00:00:00 at 4.43MiB/s   
Audio downloaded to: audio/audio_bcIk9n6nRUo_20250523_220126.mp3
Loading Whisper model...


 20%|███████▋                              | 294M/1.42G [00:43<02:54, 7.01MiB/s]


KeyboardInterrupt: 

### Scrapping from Website

In [12]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep

# URL target
url = "https://www.goodreads.com/author/quotes/661589.Sukarno"

# User-Agent header untuk meniru browser
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

quotes_list = []

while True:
    # Mengirim permintaan HTTP
    response = requests.get(url, headers=headers)
    
    # Memeriksa status response
    if response.status_code != 200:
        print(f"Gagal mengakses halaman. Kode status: {response.status_code}")
        break
        
    # Parsing konten HTML
    soup = BeautifulSoup(response.text, 'html.parser')
    
    # Mencari semua elemen kutipan
    quotes = soup.find_all('div', class_='quoteText')
    
    # Mengekstrak teks kutipan
    for quote in quotes:
        text = quote.get_text(strip=True).split('―')[0]
        text = text.replace('“', '').replace('”', '').strip()
        quotes_list.append(text)
    
    # Mencari link halaman berikutnya
    next_page = soup.find('a', class_='next_page')
    
    if not next_page:
        break  # Tidak ada halaman berikutnya
        
    url = "https://www.goodreads.com" + next_page['href']
    sleep(2)  # Jeda untuk menghormati server

# Membuat DataFrame
df = pd.DataFrame(quotes_list, columns=['Quotes'])

# Menyimpan ke CSV
df.to_csv('quotes_soekarno.csv', index=False)

# Menampilkan hasil
print(f"Berhasil mengumpulkan {len(df)} kutipan:")
df.head()

Berhasil mengumpulkan 25 kutipan:


Unnamed: 0,Quotes
0,"Kami menggoyangkan langit, menggempakan darat,..."
1,"This country, the Republic of Indonesia, does ..."
2,"Bebek berjalan berbondong-bondong, akan tetapi..."
3,I hate imperialism. I detest colonialism. And ...
4,"Learning without thinking is useless, but thin..."


In [None]:
import pandas as pd
import json
import os
import re

base_path = r"c:\PythonVSCenv\Capstone\scrapping\output"
csv_path = r"c:\PythonVSCenv\Capstone\scrapping"
hatta_csv = os.path.join(csv_path, "quotes_hatta.csv")
soekarno_csv = os.path.join(csv_path, "quotes_soekarno.csv")
output_json = os.path.join(base_path, "content_author_quotes.json")

def load_quotes(path):
    if not os.path.exists(path):
        return []
    df = pd.read_csv(path)
    col = 'Quotes' if 'Quotes' in df.columns else df.columns[0]
    return [q.strip().strip('"') for q in df[col].dropna().astype(str) if q.strip()]

tags = [
    'greeting','whoami','nationalism','revolution','independence',
    'unity','advice','international','struggle','goodbye'
]

keywords = {
    'greeting': ['selamat pagi','halo','hai','salam'],
    'whoami': ['aku adalah','saya adalah','identitas saya'],
    'nationalism': ['bangsa','negara','nasional','tanah air'],
    'revolution': ['revolusi','letusan','mengguncang','gempakan'],
    'independence': ['merdeka','kemerdekaan','bebas','penjajahan','kolonialisme'],
    'unity': ['persatuan','bersatu','kesatuan','gotong royong'],
    'advice': ['nasihat','petuah','bijak','pelajaran','ingatlah'],
    'international': ['internasional','dunia','global','antar bangsa'],
    'struggle': ['perjuangan','pengorbanan','berjuang','tantangan'],
    'goodbye': ['selamat tinggal','sampai jumpa','berpisah']
}

def categorize(quotes):
    cat = {t: [] for t in tags}
    for q in quotes:
        ql = q.lower()
        for t in tags:
            for kw in keywords[t]:
                if re.search(r'\b'+re.escape(kw)+r'\b', ql):
                    cat[t].append(q)
                    break
    return cat

def build_intents(cat):
    out = []
    for t, qs in cat.items():
        if not qs: continue
        inp = [
            f"berikan kutipan tentang {t}",
            f"quotes mengenai {t}",
            f"{t} quotes"
        ]
        if t=='whoami': inp += ["siapa kamu","tentang dirimu"]
        if t=='greeting': inp += ["apa kabar","selamat pagi"]
        out.append({"tag": t, "input": inp, "responses": list(dict.fromkeys(qs))})
    return out

hatta = load_quotes(hatta_csv)
soekarno = load_quotes(soekarno_csv)

data = {
    "Hatta": {"intents": build_intents(categorize(hatta))},
    "Soekarno": {"intents": build_intents(categorize(soekarno))}
}

with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print("Saved to", output_json)


Saved to c:\PythonVSCenv\Capstone\scrapping\output2\content_author_quotes.json


In [15]:
import pandas as pd
import json
import os
import re

base_path = r"c:\PythonVSCenv\Capstone\scrapping\output2"
csv_path = r"c:\PythonVSCenv\Capstone\scrapping"
hatta_csv = os.path.join(csv_path, "quotes_hatta.csv")
soekarno_csv = os.path.join(csv_path, "quotes_soekarno.csv")
output_json = os.path.join(base_path, "content_by_author_and_tags.json")

def load_quotes(path):
    if not os.path.exists(path):
        return []
    df = pd.read_csv(path)
    col = 'Quotes' if 'Quotes' in df.columns else df.columns[0]
    return [q.strip().strip('"') for q in df[col].dropna().astype(str) if q.strip()]

tags = [
    'greeting','whoami','nationalism','revolution','independence',
    'unity','advice','international','struggle','goodbye'
]

keywords = {
    'greeting': ['selamat pagi','halo','hai','salam'],
    'whoami': ['aku adalah','saya adalah','identitas saya'],
    'nationalism': ['bangsa','negara','nasional','tanah air'],
    'revolution': ['revolusi','letusan','mengguncang','gempakan'],
    'independence': ['merdeka','kemerdekaan','bebas','penjajahan','kolonialisme'],
    'unity': ['persatuan','bersatu','kesatuan','gotong royong'],
    'advice': ['nasihat','petuah','bijak','pelajaran','ingatlah'],
    'international': ['internasional','dunia','global','antar bangsa'],
    'struggle': ['perjuangan','pengorbanan','berjuang','tantangan'],
    'goodbye': ['selamat tinggal','sampai jumpa','berpisah']
}

def categorize(quotes):
    cat = {t: [] for t in tags}
    for q in quotes:
        ql = q.lower()
        for t in tags:
            for kw in keywords[t]:
                if re.search(r'\b'+re.escape(kw)+r'\b', ql):
                    cat[t].append(q)
                    break
    return cat

def build_intents(cat):
    out = []
    for t, qs in cat.items():
        if not qs: continue
        inp = [
            f"berikan kutipan tentang {t}",
            f"quotes mengenai {t}",
            f"{t} quotes"
        ]
        if t=='whoami': inp += ["siapa kamu","tentang dirimu"]
        if t=='greeting': inp += ["apa kabar","selamat pagi"]
        out.append({"tag": t, "input": inp, "responses": list(dict.fromkeys(qs))})
    return out

hatta = load_quotes(hatta_csv)
soekarno = load_quotes(soekarno_csv)

data = {
    "Hatta": {"intents": build_intents(categorize(hatta))},
    "Soekarno": {"intents": build_intents(categorize(soekarno))}
}

with open(output_json, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

print("Saved to", output_json)

Saved to c:\PythonVSCenv\Capstone\scrapping\output2\content_by_author_and_tags.json


Process

In [None]:
import pandas as pd
import json
import os
import re

# Path configuration
base_path = r"c:\PythonVSCenv\Capstone\scrapping\output"
csv_path = r"c:\PythonVSCenv\Capstone\scrapping"

# Output files
hatta_output = os.path.join(base_path, "content_hatta.json")
soekarno_output = os.path.join(base_path, "content_soekarno.json")

# Create output directory
os.makedirs(base_path, exist_ok=True)

# Tags and keywords
tags = ['greeting','whoami','nationalism','revolution','independence',
        'unity','advice','international','struggle','goodbye']

keywords = {
    'nationalism': ['bangsa','negara','nasional','tanah air'],
    'revolution': ['revolusi','letusan','mengguncang','gempakan'],
    'independence': ['merdeka','kemerdekaan','bebas','penjajahan'],
    'struggle': ['perjuangan','pengorbanan','berjuang','tantangan']
    # ... other keywords
}

# ================= PROCESS HATTA =================
print("Processing Hatta quotes...")

# Load Hatta CSV
df = pd.read_csv(os.path.join(csv_path, "quotes_hatta.csv"))
hatta_quotes = [q.strip().strip('"') for q in df['Quotes'].dropna().astype(str)]

# Categorize Hatta quotes
hatta_categories = {t: [] for t in tags}
for q in hatta_quotes:
    ql = q.lower()
    for t in tags:
        for kw in keywords.get(t, []):
            if re.search(r'\b'+re.escape(kw)+r'\b', ql):
                hatta_categories[t].append(q)
                break

# Build Hatta intents
hatta_intents = []
for t, qs in hatta_categories.items():
    if qs:
        intent = {
            "tag": t,
            "input": [f"kutipan {t}", f"quotes {t}"],
            "responses": list(dict.fromkeys(qs))
        }
        hatta_intents.append(intent)

# Save Hatta JSON
hatta_data = {"author": "Hatta", "intents": hatta_intents}
with open(hatta_output, 'w', encoding='utf-8') as f:
    json.dump(hatta_data, f, ensure_ascii=False, indent=4)

# ================= PROCESS SOEKARNO =================
print("Processing Soekarno quotes...")

# Load Soekarno CSV
df = pd.read_csv(os.path.join(csv_path, "quotes_soekarno.csv"))
soekarno_quotes = [q.strip().strip('"') for q in df['Quotes'].dropna().astype(str)]

# Categorize Soekarno quotes
soekarno_categories = {t: [] for t in tags}
for q in soekarno_quotes:
    ql = q.lower()
    for t in tags:
        for kw in keywords.get(t, []):
            if re.search(r'\b'+re.escape(kw)+r'\b', ql):
                soekarno_categories[t].append(q)
                break

# Build Soekarno intents
soekarno_intents = []
for t, qs in soekarno_categories.items():
    if qs:
        intent = {
            "tag": t,
            "input": [f"kutipan {t}", f"quotes {t}"],
            "responses": list(dict.fromkeys(qs))
        }
        soekarno_intents.append(intent)

# Save Soekarno JSON
soekarno_data = {"author": "Soekarno", "intents": soekarno_intents}
with open(soekarno_output, 'w', encoding='utf-8') as f:
    json.dump(soekarno_data, f, ensure_ascii=False, indent=4)

print(f"✅ Files created: content_hatta.json & content_soekarno.json")
print(f"📊 Hatta: {len(hatta_quotes)} quotes → {len(hatta_intents)} intents")
print(f"📊 Soekarno: {len(soekarno_quotes)} quotes → {len(soekarno_intents)} intents")

Processing Hatta quotes...
Processing Soekarno quotes...
✅ Files created: content_hatta.json & content_soekarno.json
📊 Hatta: 17 quotes → 3 intents
📊 Soekarno: 25 quotes → 4 intents


#### Melakukan pergantian format ke json untuk quotes quotes content json 

In [None]:
import pandas as pd
import json
import os

# Define base path for the files
base_path = r"c:\PythonVSCenv\Capstone\scrapping"

# Define input CSV file names and output JSON file name
hatta_csv_file = 'quotes_hatta.csv'
soekarno_csv_file = 'quotes_soekarno.csv' # Pastikan file ini ada
output_json_file = 'content.json'

# Construct full paths
hatta_csv_path = os.path.join(base_path, hatta_csv_file)
soekarno_csv_path = os.path.join(base_path, soekarno_csv_file)
output_json_path = os.path.join(base_path, output_json_file)

def load_quotes_from_csv(csv_path):
    """Reads quotes from a CSV file."""
    if not os.path.exists(csv_path):
        print(f"Peringatan: File tidak ditemukan - {csv_path}")
        return []
    try:
        df = pd.read_csv(csv_path)
        # Asumsikan kolom pertama berisi kutipan, atau kolom bernama 'Quotes'
        if 'Quotes' in df.columns:
            quotes = df['Quotes'].dropna().astype(str).tolist()
        elif not df.empty:
            quotes = df.iloc[:, 0].dropna().astype(str).tolist()
        else:
            quotes = []
        
        # Membersihkan kutipan dari tanda kutip ganda yang mungkin ada di awal/akhir dari CSV
        cleaned_quotes = [q.strip('"') for q in quotes]
        return cleaned_quotes
    except Exception as e:
        print(f"Error saat membaca {csv_path}: {e}")
        return []

# Load quotes
hatta_quotes = load_quotes_from_csv(hatta_csv_path)
soekarno_quotes = load_quotes_from_csv(soekarno_csv_path) # Pastikan file quotes_soekarno.csv ada

# Initialize intents list
intents_list = []

# Create intent for Hatta's quotes
if hatta_quotes:
    hatta_intent = {
        "tag": "quote_hatta",
        "input": [
            "kutipan hatta", "quotes hatta", "kata bijak hatta", 
            "pemikiran hatta", "nasihat hatta"
        ],
        "responses": hatta_quotes
    }
    intents_list.append(hatta_intent)
else:
    print(f"Tidak ada kutipan yang dimuat dari {hatta_csv_path}")

# Create intent for Soekarno's quotes
if soekarno_quotes:
    soekarno_intent = {
        "tag": "quote_soekarno",
        "input": [
            "kutipan soekarno", "quotes soekarno", "kata bijak soekarno", 
            "pemikiran soekarno", "pidato soekarno", "nasihat soekarno"
        ],
        "responses": soekarno_quotes
    }
    intents_list.append(soekarno_intent)
else:
    print(f"Tidak ada kutipan yang dimuat dari {soekarno_csv_path}. Pastikan file tersebut ada dan formatnya benar.")

# Final JSON structure
final_json_data = {
    "intents": intents_list
}

# Write to JSON file
try:
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(final_json_data, f, ensure_ascii=False, indent=4)
    print(f"Berhasil mengonversi kutipan ke JSON: {output_json_path}")
    if not intents_list:
        print("Peringatan: File JSON yang dihasilkan kosong karena tidak ada kutipan yang ditemukan.")
except Exception as e:
    print(f"Error saat menulis file JSON: {e}")


import pandas as pd
import json
import os

# Define base path for the files
base_path = r"c:\PythonVSCenv\Capstone\scrapping\output"

# Define input CSV file names and output JSON file name
hatta_csv_file = 'quotes_hatta.csv'
soekarno_csv_file = 'quotes_soekarno.csv' # Pastikan file ini ada
output_json_file = 'content.json'

# Construct full paths
hatta_csv_path = os.path.join(base_path, hatta_csv_file)
soekarno_csv_path = os.path.join(base_path, soekarno_csv_file)
output_json_path = os.path.join(base_path, output_json_file)

def load_quotes_from_csv(csv_path):
    """Reads quotes from a CSV file."""
    if not os.path.exists(csv_path):
        return []
    df = pd.read_csv(csv_path)
    if 'Quotes' in df.columns:
        quotes = df['Quotes'].dropna().astype(str).tolist()
    cleaned_quotes = [q.strip('"') for q in quotes]
    return cleaned_quotes

# Load quotes
hatta_quotes = load_quotes_from_csv(hatta_csv_path)
soekarno_quotes = load_quotes_from_csv(soekarno_csv_path) # Pastikan file quotes_soekarno.csv ada

# Initialize intents list
intents_list = []

# Create intent for Hatta's quotes
if hatta_quotes:
    hatta_intent = {
        "tag": "quote_hatta",
        "input": [
            "kutipan hatta", "quotes hatta", "kata bijak hatta", 
            "pemikiran hatta", "nasihat hatta"
        ],
        "responses": hatta_quotes
    }
    intents_list.append(hatta_intent)
else:
    print(f"Tidak ada kutipan yang dimuat dari {hatta_csv_path}")

# Create intent for Soekarno's quotes
if soekarno_quotes:
    soekarno_intent = {
        "tag": "quote_soekarno",
        "input": [
            "kutipan soekarno", "quotes soekarno", "kata bijak soekarno", 
            "pemikiran soekarno", "pidato soekarno", "nasihat soekarno"
        ],
        "responses": soekarno_quotes
    }
    intents_list.append(soekarno_intent)
else:
    print(f"Tidak ada kutipan yang dimuat dari {soekarno_csv_path}. Pastikan file tersebut ada dan formatnya benar.")

# Final JSON structure
final_json_data = {
    "intents": intents_list
}

# Write to JSON file
try:
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(final_json_data, f, ensure_ascii=False, indent=4)
    print(f"Berhasil mengonversi kutipan ke JSON: {output_json_path}")
    if not intents_list:
        print("Peringatan: File JSON yang dihasilkan kosong karena tidak ada kutipan yang ditemukan.")
except Exception as e:
    print(f"Error saat menulis file JSON: {e}")

Berhasil mengonversi kutipan ke JSON: c:\PythonVSCenv\Capstone\scrapping\content.json
Peringatan: File tidak ditemukan - c:\PythonVSCenv\Capstone\scrapping\output2\quotes_hatta.csv
Peringatan: File tidak ditemukan - c:\PythonVSCenv\Capstone\scrapping\output2\quotes_soekarno.csv
Tidak ada kutipan yang dimuat dari c:\PythonVSCenv\Capstone\scrapping\output2\quotes_hatta.csv
Tidak ada kutipan yang dimuat dari c:\PythonVSCenv\Capstone\scrapping\output2\quotes_soekarno.csv. Pastikan file tersebut ada dan formatnya benar.
Berhasil mengonversi kutipan ke JSON: c:\PythonVSCenv\Capstone\scrapping\output2\content.json
Peringatan: File JSON yang dihasilkan kosong karena tidak ada kutipan yang ditemukan.


Versi Sederhana


In [None]:
import pandas as pd
import json

# Ultra simple version - minimal code
def quick_convert():
    """Convert CSV to JSON """
    
    # Read both CSV files
    try:
        hatta = pd.read_csv(r"c:\PythonVSCenv\Capstone\scrapping\quotes_hatta.csv")['Quotes'].dropna().tolist()
    except:
        hatta = []
    
    try:
        soekarno = pd.read_csv(r"c:\PythonVSCenv\Capstone\scrapping\quotes_soekarno.csv")['Quotes'].dropna().tolist()
    except:
        soekarno = []
    
    # Create JSON structure
    data = {
        "intents": [
            {
                "tag": "quote_hatta",
                "input": ["kutipan hatta", "quotes hatta", "kata bijak hatta"],
                "responses": [str(q).strip('"') for q in hatta]
            },
            {
                "tag": "quote_soekarno", 
                "input": ["kutipan soekarno", "quotes soekarno", "kata bijak soekarno"],
                "responses": [str(q).strip('"') for q in soekarno]
            }
        ]
    }
    
    # Save to JSON
    with open(r"c:\PythonVSCenv\Capstone\scrapping\content.json", 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    
    print(f"✅ Converted! Hatta: {len(hatta)} quotes, Soekarno: {len(soekarno)} quotes")

# Run
quick_convert()

Library dipecah


In [None]:
# URL target - halaman quotes Soekarno di Goodreads
url = "https://www.goodreads.com/author/quotes/661589.Sukarno"

# Header untuk meniru browser (menghindari blocking)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}

quotes_list = []  # List untuk menyimpan semua kutipan

In [None]:
while True:
    # Kirim request ke halaman
    response = requests.get(url, headers=headers)
    
    # Cek status response
    if response.status_code != 200:
        print(f"Gagal mengakses halaman. Status: {response.status_code}")
        break