In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
import re

# Base URL
# Starting point for the crawl: the query string requests page 1 of the
# listing with 8 entries per page. Later pages are not computed from this
# string; the main loop follows the pager link found by get_next_page_url().
base_url = "https://www.sprm.gov.my/index.php?r=site%2Findex&page_id=96&language=en&page=1&per-page=8"

# Accumulates one raw DataFrame per scraped page (appended by the main loop,
# then concatenated once scraping has finished).
all_dataframes = []

def get_next_page_url(soup):
    """Return the absolute URL of the next results page, or None.

    The pager's "next" control is located as the ``<a>`` element whose text
    is exactly '»'.

    Args:
        soup: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        The absolute URL to fetch next, or None when the page has no '»'
        link or that link carries no ``href`` attribute.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import block.
    from urllib.parse import urljoin

    next_link = soup.find('a', string='»')
    if next_link is None:
        return None

    href = next_link.attrs.get('href')
    if href is None:
        return None

    # urljoin resolves site-relative hrefs ("/index.php?...") exactly like the
    # previous string concatenation did, but also copes with hrefs that are
    # already absolute or that lack a leading slash.
    return urljoin("https://www.sprm.gov.my", href)

def get_person_data(soup):
    """Parse every ``<table>`` on the page into a single DataFrame.

    All table elements are serialised back to HTML, handed to
    ``pd.read_html`` in one go, and the resulting frames are stacked
    vertically with a fresh index.

    Returns an empty DataFrame when the page has no tables, when parsing
    raises (the error is printed), or when parsing yields no frames.
    """
    table_tags = soup.find_all('table')
    if not table_tags:
        return pd.DataFrame()

    markup = StringIO(''.join(map(str, table_tags)))

    try:
        parsed_frames = pd.read_html(markup)
    except Exception as e:
        print(f"Error reading HTML tables: {e}")
        return pd.DataFrame()

    if parsed_frames:
        return pd.concat(parsed_frames, ignore_index=True)
    return pd.DataFrame()

def reorganize_dataframe(df):
    """Turn the stacked label/value rows into one row per accused person.

    Expected input layout (as consumed by this function): column 0 holds a
    row label, column 1 its value. A row labelled 'Accused' starts a new
    person record; a row labelled '#' is a case row whose columns 1-4 are
    stored under the keys 'No Kes', 'Ringkasan Pertuduhan', 'Kesalahan' and
    'Hukuman'; any other non-missing label becomes a field named after the
    label. Rows whose label is missing are ignored.

    Args:
        df: Raw DataFrame whose columns are addressed by the labels 0..4.

    Returns:
        A DataFrame with one row per record. Case fields are flattened into
        columns named 'Case <n> <field>' (n starting at 1); missing case
        values are omitted.
    """
    records = []
    current = {}

    for _, row in df.iterrows():
        label = row[0]
        # Skip unlabeled rows first: a missing label can never start a record,
        # and testing it up front avoids comparing NA-like values to strings.
        if pd.isna(label):
            continue

        if label == 'Accused':
            # A new person begins: flush the record collected so far.
            if current:
                records.append(current)
            current = {'Cases': []}

        if label == '#':
            # setdefault (instead of indexing) keeps a case row that shows up
            # before any 'Accused' row from raising KeyError.
            current.setdefault('Cases', []).append({
                'No Kes': row[1],
                'Ringkasan Pertuduhan': row[2],
                'Kesalahan': row[3],
                'Hukuman': row[4],
            })
        else:
            # Includes the 'Accused' row itself, which stores the name.
            current[label] = row[1]

    if current:
        records.append(current)

    flattened = []
    for record in records:
        flat = {key: value for key, value in record.items() if key != 'Cases'}
        for number, case in enumerate(record.get('Cases', []), start=1):
            for field, value in case.items():
                if pd.notna(value):
                    flat[f'Case {number} {field}'] = value
        flattened.append(flat)

    return pd.DataFrame(flattened)

def get_image_urls(soup):
    """Collect absolute URLs for every JPEG ``<img>`` on the page.

    Only tags that have a ``src`` attribute ending in '.jpg' or '.jpeg' are
    kept; each ``src`` is prefixed with the site root. Document order is
    preserved.
    """
    jpeg_suffixes = ('.jpg', '.jpeg')
    collected = []
    for tag in soup.find_all('img'):
        if 'src' not in tag.attrs:
            continue
        source = tag['src']
        if source.endswith(jpeg_suffixes):
            collected.append("https://www.sprm.gov.my" + source)
    return collected

def extract_name_from_url(url):
    """Derive a title-cased name from an image URL, or return None.

    The URL must contain ``pesalah/<slug>-<8 digits>.jpg`` (or ``.jpeg``);
    the hyphens in ``<slug>`` become spaces and each word is capitalised.
    """
    found = re.search(r'pesalah/(.*?)-\d{8}\.(jpg|jpeg)', url)
    if not found:
        return None
    slug = found.group(1)
    return slug.replace('-', ' ').title()

def normalize_name(name):
    """Return *name* reduced to its alphabetic characters, lower-cased.

    Spaces, digits and punctuation are dropped so that differently formatted
    spellings of the same name compare equal.
    """
    letters_only = [char for char in name if char.isalpha()]
    return ''.join(letters_only).lower()

def add_image_urls_to_df(df, img_urls):
    """Add an 'Image URL' column matching each accused to a scraped image.

    Image URLs are grouped by the normalized name extracted from the URL.
    Each row then takes the first unused URL for its normalized 'Accused'
    name, so people sharing a name receive distinct images in scrape order.
    Rows without a match get the site's "no image" placeholder URL.

    Args:
        df: DataFrame of accused records; modified in place.
        img_urls: Iterable of image URLs collected from the scraped pages.

    Returns:
        The same ``df`` object with the 'Image URL' column added.
    """
    placeholder = "https://www.sprm.gov.my/admin/images/noimage.jpg"

    urls_by_name = {}
    for url in img_urls:
        name = extract_name_from_url(url)
        if name:
            urls_by_name.setdefault(normalize_name(name), []).append(url)

    def find_image_url(accused_name):
        # Records without an 'Accused' field surface as NaN (a float), which
        # cannot be normalized; treat them as having no image.
        if not isinstance(accused_name, str):
            return placeholder
        candidates = urls_by_name.get(normalize_name(accused_name))
        if candidates:
            # Consume the URL so a namesake further down gets the next one.
            return candidates.pop(0)
        return placeholder

    if 'Accused' not in df.columns:
        # Nothing to match against (e.g. an empty scrape): every row, if any,
        # falls back to the placeholder instead of raising KeyError.
        df['Image URL'] = placeholder
        return df

    df['Image URL'] = df['Accused'].apply(find_image_url)
    return df

# Crawl the listing page by page, following the pager until no next link is
# found or a request fails.
current_url = base_url
page_num = 1

all_image_urls = []

while current_url:
    print(f"Scraping page {page_num}")
    try:
        # Without a timeout a stalled connection would block the crawl
        # forever; a timeout surfaces as a RequestException handled below.
        response = requests.get(current_url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"HTTP request error: {e}")
        break

    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract person data from the page
    df = get_person_data(soup)
    if not df.empty:
        all_dataframes.append(df)

    # Extract image URLs from the page
    img_urls = get_image_urls(soup)
    all_image_urls.extend(img_urls)

    current_url = get_next_page_url(soup)
    page_num += 1

if not all_dataframes:
    # pd.concat raises ValueError on an empty list, so stop cleanly when the
    # crawl produced no tables (e.g. the very first request failed).
    print("No tables were scraped; nothing to save.")
else:
    # Combine all dataframes
    df_merged = pd.concat(all_dataframes, axis=0, ignore_index=True)

    # Reorganize data
    final_df = reorganize_dataframe(df_merged)

    # Add image URLs to final DataFrame
    final_df = add_image_urls_to_df(final_df, all_image_urls)

    # Print names with 'no image' URLs
    no_image_df = final_df[final_df['Image URL'] == 'https://www.sprm.gov.my/admin/images/noimage.jpg']
    print("Entries with no image URL:")
    print(no_image_df[['Accused', 'Image URL']])

    # Save to CSV
    csv_filename = '1-2024-07-29_sprm_data.csv'
    final_df.to_csv(csv_filename, index=False)

    # Convert to JSON (one JSON object per line)
    json_filename = '1-2024-07-29_sprm_data.json'
    final_df.to_json(json_filename, orient='records', lines=True)

    # Print the final DataFrame
    print("Final DataFrame:")
    print(final_df)

Scraping page 1
Scraping page 2
Scraping page 3
Scraping page 4
Scraping page 5
Scraping page 6
Scraping page 7
Scraping page 8
Scraping page 9
Scraping page 10
Scraping page 11
Scraping page 12
Scraping page 13
Scraping page 14
Scraping page 15
Scraping page 16
Scraping page 17
Scraping page 18
Scraping page 19
Scraping page 20
Scraping page 21
Scraping page 22
Scraping page 23
Scraping page 24
Scraping page 25
Scraping page 26
Scraping page 27
Scraping page 28
Scraping page 29
Scraping page 30
Scraping page 31
Scraping page 32
Scraping page 33
Scraping page 34
Scraping page 35
Scraping page 36
Scraping page 37
Scraping page 38
Scraping page 39
Scraping page 40
Scraping page 41
Scraping page 42
Scraping page 43
Scraping page 44
Scraping page 45
Entries with no image URL:
                                 Accused  \
9                         Low Khim Seong   
38                      Rhymei bin Kasim   
52                    Roslan bin Zakaria   
53            Mohd Asyraf bin Mohd Fauzi 