In [1]:
import requests
import pandas as pd
from api import ParliamentDataRetriever

In [2]:
df_cleaned = ParliamentDataRetriever.get_pmg_bills_data()
df_cleaned

Unnamed: 0,Title,Status,Introduced by,Created at,Updated at,URL,Document
0,National Water Amendment Bill,Under consideration by the National Assembly.,Minister of Water and Sanitation,2026-01-22T05:35:46.584278+00:00,2026-02-19T07:04:22.547456+00:00,http://api.pmg.org.za/bill/1321/,https://pmg.org.za/files/B1-2026_National_Wate...
2,Public Procurement Amendment Bill,Under consideration by the National Assembly.,"Mr MJ Cuthbert, MP",2026-02-25T09:00:52.021018+00:00,2026-02-25T12:26:11.720624+00:00,http://api.pmg.org.za/bill/1329/,No document available
3,Special Appropriation Bill,Under consideration by the National Assembly.,Minister of Finance,2026-02-25T08:58:28.437276+00:00,2026-02-25T12:15:34.886768+00:00,http://api.pmg.org.za/bill/1328/,https://pmg.org.za/files/B3-2026_Special_Appro...
4,Appropriation Bill,Under consideration by the National Assembly.,Minister of Finance,2026-02-25T08:53:38.001494+00:00,2026-02-25T12:15:03.533161+00:00,http://api.pmg.org.za/bill/1326/,https://pmg.org.za/files/B4-2026_Appropriation...
5,Traditional and Khoi-San Leadership Bill,No status available,Minister of Cooperative Governance and Traditi...,2026-02-27T18:03:54.932271+00:00,2026-02-27T18:03:54.932271+00:00,http://api.pmg.org.za/bill/1331/,No document available
6,Public Finance Management Amendment Bill,No status available,Mr S. Thambo (EFF),2026-01-16T09:48:03.674827+00:00,2026-01-16T09:48:03.674827+00:00,http://api.pmg.org.za/bill/1320/,https://pmg.org.za/files/260115Public-Finance-...
7,Labour Law Amendment Bill,No status available,Minister of Employment and Labour,2026-02-27T07:29:34.228869+00:00,2026-02-27T07:29:34.228869+00:00,http://api.pmg.org.za/bill/1330/,https://pmg.org.za/files/1/260226LabourLawAmen...
8,Division of Revenue Bill,Under consideration by the National Assembly.,Minister of Finance,2026-02-25T08:56:01.037467+00:00,2026-02-25T12:09:51.020998+00:00,http://api.pmg.org.za/bill/1327/,https://pmg.org.za/files/B5-2026_DoRB.pdf
9,South African National Petroleum Company Bill,Under consideration by the National Assembly.,Minister of Mineral and Petroleum Resources,2026-02-12T06:43:52.655768+00:00,2026-02-12T06:43:52.655768+00:00,http://api.pmg.org.za/bill/1325/,https://pmg.org.za/files/B2-2026_SA_National_P...
10,Water Services Amendement Bill,Under consideration by the National Assembly.,Minister of Water and Sanitation,2025-10-04T06:06:03.099974+00:00,2025-10-04T06:06:03.099974+00:00,http://api.pmg.org.za/bill/1301/,https://pmg.org.za/files/B24-2025_Water_Servic...


In [3]:
# Load previous snapshot
with open('old_data.json', 'r') as f:
    old_data = pd.read_json(f, orient='records', lines=True)

# Normalize and compute date differences
# Use .loc to safely set a new column on the DataFrame copy
df_cleaned.loc[:, 'date_diff'] = pd.to_datetime(df_cleaned['Updated at']) - pd.to_datetime(df_cleaned['Created at'])
df_cleaned.loc[:, 'date_diff'] = df_cleaned['date_diff'].dt.days
df_sorted = df_cleaned.sort_values(by='date_diff', ascending=True).copy()

# Quick shapes
print(f"Shapes before dedupe: df_sorted={df_sorted.shape}, old_data={old_data.shape}")

# Check for duplicate Titles which can cause a many-to-one merge and therefore extra rows in the merged result
old_dup_count = old_data['Title'].duplicated().sum() if 'Title' in old_data.columns else 0
new_dup_count = df_sorted['Title'].duplicated().sum() if 'Title' in df_sorted.columns else 0
print(f"Duplicate Title counts -> old_data: {old_dup_count}, df_sorted: {new_dup_count}")

if old_dup_count > 0:
    print("Sample duplicate Titles in old_data:")
    display(old_data[old_data['Title'].duplicated(keep=False)].head(10))

if new_dup_count > 0:
    print("Sample duplicate Titles in df_sorted:")
    display(df_sorted[df_sorted['Title'].duplicated(keep=False)].head(10))

# Deduplicate by Title to ensure a 1:1 merge (keep first occurrence). This avoids expanded rows when the right side has duplicates.
if old_dup_count > 0:
    old_data = old_data.drop_duplicates(subset=['Title'], keep='first').reset_index(drop=True)
if new_dup_count > 0:
    df_sorted = df_sorted.drop_duplicates(subset=['Title'], keep='first').reset_index(drop=True)

print(f"Shapes after dedupe: df_sorted={df_sorted.shape}, old_data={old_data.shape}")

# Compare and detect adds/removals/changes. Use a validated merge to catch unexpected many-to-many matches.
from pandas.errors import MergeError

if not df_sorted.equals(old_data):
    if df_sorted.shape[0] > old_data.shape[0]:
        print("New bills have been added.")
        new_bills = df_sorted[~df_sorted['Title'].isin(old_data['Title'])]
        print(new_bills)
    elif df_sorted.shape[0] < old_data.shape[0]:
        print("Some bills have been removed.")
        removed_bills = old_data[~old_data['Title'].isin(df_sorted['Title'])]
        print(removed_bills)
    else:
        print("The number of bills is the same, but there are changes in the details.")
        try:
            merged = df_sorted.merge(old_data, on='Title', suffixes=('_new', '_old'), how='left', validate='one_to_one')
        except MergeError as e:
            print("MergeError: multiple matches detected for some Titles in old_data. Showing counts >1:")
            counts = old_data['Title'].value_counts()
            print(counts[counts > 1])
            # Fall back to deduped right-side (we already deduped above), but show a warning
            merged = df_sorted.merge(old_data.drop_duplicates(subset=['Title'], keep='first'), on='Title', suffixes=('_new', '_old'), how='left')
        # Vectorized comparison for status changes (safer than iterrows)
        if 'Status_new' not in merged.columns and 'Status_old' not in merged.columns:
            # After merge the status columns from each DF will be suffixed; construct names dynamically
            # If original 'Status' column existed, pandas created 'Status_new' and 'Status_old'.
            pass
        status_new_col = 'Status_new' if 'Status_new' in merged.columns else 'Status'
        status_old_col = 'Status_old' if 'Status_old' in merged.columns else 'Status'
        # Select rows where statuses differ (handle NaN safely)
        changes = merged[(merged[status_new_col].fillna('') != merged[status_old_col].fillna(''))]
        for _, row in changes.iterrows():
            print(f"Bill '{row['Title']}' has a status change from '{row.get(status_old_col)}' to '{row.get(status_new_col)}'.")
else:
    print("No changes detected in the bills.")

# Persist current snapshot for next run
with open('old_data.json', 'w') as f:
    df_cleaned.to_json(f, orient='records', lines=True)

Shapes before dedupe: df_sorted=(50, 8), old_data=(38, 6)
Duplicate Title counts -> old_data: 0, df_sorted: 9
Sample duplicate Titles in df_sorted:


   0.   3. 190.   0.   0. 107.   0.   0.   0.  44.   0.   0.   0.   0.
 142.   0.  82.   0.  82.   0.  97.   0.   0.  97.   0.   0.   0.   0.
 308. 190.   0.  nan  nan  nan  nan  nan]' has dtype incompatible with timedelta64[ns], please explicitly cast to a compatible dtype first.
  df_cleaned.loc[:, 'date_diff'] = df_cleaned['date_diff'].dt.days


Unnamed: 0,Title,Status,Introduced by,Created at,Updated at,URL,Document,date_diff
3,Special Appropriation Bill,Under consideration by the National Assembly.,Minister of Finance,2026-02-25T08:58:28.437276+00:00,2026-02-25T12:15:34.886768+00:00,http://api.pmg.org.za/bill/1328/,https://pmg.org.za/files/B3-2026_Special_Appro...,0.0
4,Appropriation Bill,Under consideration by the National Assembly.,Minister of Finance,2026-02-25T08:53:38.001494+00:00,2026-02-25T12:15:03.533161+00:00,http://api.pmg.org.za/bill/1326/,https://pmg.org.za/files/B4-2026_Appropriation...,0.0
8,Division of Revenue Bill,Under consideration by the National Assembly.,Minister of Finance,2026-02-25T08:56:01.037467+00:00,2026-02-25T12:09:51.020998+00:00,http://api.pmg.org.za/bill/1327/,https://pmg.org.za/files/B5-2026_DoRB.pdf,0.0
14,Appropriation Bill,Withdrawn,Minister of Finance,2025-03-12T07:24:21.223534+00:00,2025-04-25T09:41:35.667971+00:00,http://api.pmg.org.za/bill/1266/,https://pmg.org.za/files/B6-2025_Appropriation...,44.0
24,Division of Revenue Bill,Withdrawn,Minister of Finance,2025-03-12T07:26:01.062218+00:00,2025-04-25T09:41:44.095583+00:00,http://api.pmg.org.za/bill/1267/,https://pmg.org.za/files/B7-2025_Division_of_R...,44.0
32,Division of Revenue Bill,Act commenced,Minister of Finance,2025-05-21T06:49:05.733305+00:00,2025-08-11T14:17:08.290963+00:00,http://api.pmg.org.za/bill/1280/,https://pmg.org.za/files/B15-2025_Division_of_...,82.0
35,Appropriation Bill,Act commenced,Minister of Finance,2025-05-21T06:46:00.583189+00:00,2025-08-11T11:28:59.144092+00:00,http://api.pmg.org.za/bill/1279/,https://pmg.org.za/files/B16-2025_Appropriatio...,82.0
30,Special Appropriation Bill,Act commenced,Minister of Finance,2025-09-28T08:41:57.654183+00:00,2026-02-17T12:33:52.866466+00:00,http://api.pmg.org.za/bill/1297/,https://pmg.org.za/files/Special_Appropriation...,142.0
1,,,,,,,No document available,
27,,,,,,,No document available,


Shapes after dedupe: df_sorted=(41, 8), old_data=(38, 6)
New bills have been added.
                                       Title  \
10    Constitution Nineteenth Amendment Bill   
23  Traditional and Khoi-San Leadership Bill   
27                 Labour Law Amendment Bill   
40                                       NaN   

                                           Status  \
10  Under consideration by the National Assembly.   
23                            No status available   
27                            No status available   
40                                            NaN   

                                        Introduced by  \
10                           Mr George Michalakis, MP   
23  Minister of Cooperative Governance and Traditi...   
27                  Minister of Employment and Labour   
40                                                NaN   

                          Created at                        Updated at  \
10  2025-02-28T06:03:41.104557+00:00  2025-02-28T

In [4]:
# Persist current snapshot for next run
with open('old_data.json', 'w') as f:
    df_sorted.to_json(f, orient='records', lines=True)