In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
warnings.simplefilter(action='ignore', category=FutureWarning)

### Population by state

In [2]:
# Make sure the bronze staging folder exists so that this cell's write (and the
# later cells that write to the same folder) do not fail on a fresh checkout.
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs("./staging/bronze", exist_ok=True)

# Source parquet for the state-level population table (DOSM storage endpoint).
URL_DATA = 'https://storage.dosm.gov.my/population/population_state.parquet'

# Load the remote parquet file into a DataFrame.
state_population = pd.read_parquet(URL_DATA)

# Persist the frame as-is to the bronze layer; the index is not written.
state_population.to_parquet("./staging/bronze/state_population_bronze_data.parquet", index=False)

### Population (national)

In [3]:
# Bronze ingest: read the national population parquet from the DOSM storage
# URL and write it straight back out under ./staging/bronze (index dropped).
URL_DATA = 'https://storage.dosm.gov.my/population/population_malaysia.parquet'
national_population = pd.read_parquet(URL_DATA)
national_population.to_parquet(
    path="./staging/bronze/national_population_bronze_data.parquet",
    index=False,
)

### Inflation rate (national)

In [4]:
# Bronze ingest: read the annual CPI inflation parquet from the DOSM storage
# URL and write it straight back out under ./staging/bronze (index dropped).
URL_DATA = 'https://storage.dosm.gov.my/cpi/cpi_2d_annual_inflation.parquet'
inflation_national = pd.read_parquet(URL_DATA)
inflation_national.to_parquet(
    path="./staging/bronze/inflation_national_bronze_data.parquet",
    index=False,
)

### Annual Real GDP by State & Economic Sector

In [5]:
# Bronze ingest: read the state-level real GDP (supply side) parquet from the
# DOSM storage URL and write it back out under ./staging/bronze (index dropped).
URL_DATA = 'https://storage.dosm.gov.my/gdp/gdp_state_real_supply.parquet'
gdp_state = pd.read_parquet(URL_DATA)
gdp_state.to_parquet(
    path="./staging/bronze/gdp_state_bronze_data.parquet",
    index=False,
)

### Annual Real GDP by national & Economic Sector

In [6]:
# Bronze ingest: read the annual real GDP/GNI parquet from the DOSM storage
# URL and write it back out under ./staging/bronze (index dropped).
URL_DATA = 'https://storage.dosm.gov.my/gdp/gdp_gni_annual_real.parquet'
gdp_national = pd.read_parquet(URL_DATA)
gdp_national.to_parquet(
    path="./staging/bronze/gdp_national_bronze_data.parquet",
    index=False,
)


### Labour force by state

In [7]:
# Bronze ingest: read the labour-force-by-state (and sex, per the file name)
# parquet from the DOSM storage URL and write it back out under
# ./staging/bronze (index dropped).
URL_DATA = 'https://storage.dosm.gov.my/labour/lfs_state_sex.parquet'
Labour_force_state = pd.read_parquet(URL_DATA)
Labour_force_state.to_parquet(
    path="./staging/bronze/Labour_force_state_bronze_data.parquet",
    index=False,
)

### Labour force (national)

In [8]:
# Bronze ingest: read the yearly labour force parquet from the DOSM storage
# URL and write it back out under ./staging/bronze (index dropped).
URL_DATA = 'https://storage.dosm.gov.my/labour/lfs_year.parquet'
Labour_force_national = pd.read_parquet(URL_DATA)
Labour_force_national.to_parquet(
    path="./staging/bronze/Labour_force_national_bronze_data.parquet",
    index=False,
)

### Household income by state

In [9]:
# Bronze ingest: read the household-income-by-state parquet from the DOSM
# storage URL and write it back out under ./staging/bronze (index dropped).
URL_DATA = 'https://storage.dosm.gov.my/hies/hh_income_state.parquet'
house_hold_income_state = pd.read_parquet(URL_DATA)
house_hold_income_state.to_parquet(
    path="./staging/bronze/house_hold_income_state_bronze_data.parquet",
    index=False,
)

### Household income (national)

In [10]:
# Bronze ingest: read the household income parquet (the national-level
# counterpart of the state file, going by the variable and file names) from
# the DOSM storage URL and write it back out under ./staging/bronze
# (index dropped).
URL_DATA = 'https://storage.dosm.gov.my/hies/hh_income.parquet'
house_hold_income_national = pd.read_parquet(URL_DATA)
house_hold_income_national.to_parquet(
    path="./staging/bronze/house_hold_income_national_bronze_data.parquet",
    index=False,
)

### Household expenditure state

In [11]:

import pdfplumber

# Path to the PDF file holding the household expenditure table.
pdf_path = './data sources/Mean_Expenditure_2016_2022.pdf'

# Collect every table pdfplumber can detect, across all pages.
all_tables = []

with pdfplumber.open(pdf_path) as pdf:
    for page in pdf.pages:
        page_tables = page.extract_tables()

        # Only extend when something was actually extracted from this page.
        if page_tables:
            all_tables.extend(page_tables)

# Fail loudly when nothing was extracted. Previously this case only printed a
# message and then evaluated `df`, which raises NameError on a fresh kernel or,
# worse, silently shows a stale `df` left over from an earlier run.
if not all_tables:
    raise ValueError(f"No tables found in the PDF: {pdf_path}")

# Build the DataFrame from the first table: row 0 supplies the column names,
# the remaining rows are the data.
first_table = all_tables[0]
df = pd.DataFrame(first_table[1:], columns=first_table[0])

df

Unnamed: 0,Negeri\nState,Penengah / Median,Purata / Mean
0,,2016 * 2019 * 2022,2016 * 2019 * 2022
1,Malaysia\nJohor\nKedah\nKelantan\nMelaka\nNege...,"3,334 3,683 4,282\n3,635 4,038 4,636\n2,428 2,...","4,080 4,609 5,150\n4,167 4,826 5,342\n3,071 3,..."


In [12]:
from langchain_openai import ChatOpenAI
from langgraph.prebuilt import create_react_agent
from langchain_experimental.agents import create_pandas_dataframe_agent

# Reshape the raw PDF table in `df` (built by the PDF-extraction cell) into a
# tidy year/state/mean/median frame by delegating the transformation to an
# LLM-driven pandas agent, then pull the resulting `new_df` out of the agent's
# Python REPL tool.

# SECURITY: never hardcode credentials in a notebook. The key is read from the
# environment instead; a key that was ever committed in plain text should be
# treated as compromised and revoked/rotated.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before running this cell."
    )

# Assuming `df` is already defined
model = ChatOpenAI(
    model="gpt-4o-mini",
    openai_api_key=openai_api_key,
    temperature=0
)

# Create the pandas agent
agent = create_pandas_dataframe_agent(
    llm=model,
    df=df,
    verbose=True,
    agent_type="tool-calling",       # or AgentType.ZERO_SHOT_REACT_DESCRIPTION
    # Opt-in flag: the agent executes model-generated Python in this kernel.
    # Only run this against trusted inputs.
    allow_dangerous_code=True
)

# Run the transformation prompt. `agent.run` is deprecated (it emitted a
# deprecation warning in the recorded output); `invoke` takes the prompt under
# the "input" key and returns a dict whose "output" entry is the final answer.
result = agent.invoke(
    {
        "input": """
    take this dataframe, and transform this dataframe into a new dataframe with this columns ['year','state','expenditure_mean_state(RM)','expenditure_median_state(RM)']. Then assign the result to a variable named new_df and show its head.
    
    #IMPORTANT#
    1) Make sure the each data pulled from the df is assigned to the new dataframe correctly, corresponding to the correct column names.
    2) Aggregate the year to be in ASCENDING ORDER from the minimum year to the maximum year, each year having all of the states and the mean and median values for that year. like group by year and then state.
    3) make sure the states column contain this state names only: Johor, Kedah, Kelantan, Melaka, Negeri Sembilan, Pahang, Pulau Pinang, Perak, Perlis, Sabah, Sarawak, Selangor, Terengganu, W.P. Kuala Lumpur, W.P. Putrajaya, W.P. Labuan.
    
    """
    }
)
output = result["output"]

print(output)

# 1. Inspect the tools to confirm which one is the REPL
for i, tool in enumerate(agent.tools):
    print(i, tool.name)

# 2. Grab the REPL tool (usually the one named "python_repl_ast")
repl_tools = [t for t in agent.tools if t.name.startswith("python_repl")]
if not repl_tools:
    raise RuntimeError("The agent exposes no python_repl* tool; cannot retrieve new_df.")
repl_tool = repl_tools[0]

# 3. Extract new_df from its locals. The model is asked to create it, but that
#    is not guaranteed, so check before indexing.
if "new_df" not in repl_tool.locals:
    raise RuntimeError("The agent did not define `new_df`; re-run or adjust the prompt.")
new_df = repl_tool.locals["new_df"]

# 4. Validate the shape of what the model produced before it enters the pipeline.
if not isinstance(new_df, pd.DataFrame):
    raise TypeError(f"Expected new_df to be a DataFrame, got {type(new_df).__name__}")
expected_columns = [
    'year',
    'state',
    'expenditure_mean_state(RM)',
    'expenditure_median_state(RM)',
]
missing_columns = [col for col in expected_columns if col not in new_df.columns]
if missing_columns:
    raise ValueError(f"new_df is missing expected columns: {missing_columns}")

# NOTE(review): in the recorded run the agent's REPL call re-typed the figures
# as literal "Sample data" instead of reading them from `df`, so the values are
# whatever the model transcribed. Spot-check new_df against the PDF, or replace
# this step with deterministic parsing, before trusting the bronze output.
house_hold_expenditure = new_df


  output = agent.run(




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "import pandas as pd\n\n# Sample data based on the provided dataframe structure\ndata = {\n    'Negeri': ['Malaysia', 'Johor', 'Kedah', 'Kelantan', 'Melaka', 'Negeri Sembilan', 'Pahang', 'Pulau Pinang', 'Perak', 'Perlis', 'Selangor', 'Terengganu', 'Sabah', 'Sarawak', 'W.P. Kuala Lumpur', 'W.P. Labuan', 'W.P. Putrajaya'],\n    'Penengah / Median': ['3,334 3,683 4,282', '3,635 4,038 4,636', '2,428 2,766 3,298', '2,444 2,655 3,048', '3,617 3,938 4,861', '3,128 3,717 3,869', '2,761 3,107 3,513', '3,411 3,809 4,481', '2,850 2,938 3,341', '2,927 3,166 3,355', '4,432 4,874 5,747', '3,371 3,846 4,216', '2,111 2,279 2,770', '2,665 2,955 3,379', '5,222 5,719 6,232', '3,516 3,553 3,822', '5,153 6,486 6,644'],\n    'Purata / Mean': ['4,080 4,609 5,150', '4,167 4,826 5,342', '3,071 3,373 3,765', '2,884 3,231 3,505', '4,394 4,983 5,707', '3,700 4,362 4,678', '3,332 3,661 4,107', '4,219 4,651 5,3

In [13]:
# Bind the agent-produced frame (`new_df`) to its pipeline name and display it
# so the extracted values can be inspected before they are written to bronze.
house_hold_expenditure = new_df
house_hold_expenditure

Unnamed: 0,year,state,expenditure_mean_state(RM),expenditure_median_state(RM)
0,2016,Johor,4167,3635
1,2016,Kedah,3071,2428
2,2016,Kelantan,2884,2444
3,2016,Melaka,4394,3617
4,2016,Negeri Sembilan,3700,3128
5,2016,Pahang,3332,2761
6,2016,Perak,3251,2850
7,2016,Perlis,3090,2927
8,2016,Pulau Pinang,4219,3411
9,2016,Sabah,2601,2111


In [14]:
# Persist the household expenditure frame to the bronze layer (index dropped).
house_hold_expenditure.to_parquet(
    path="./staging/bronze/house_hold_expenditure_bronze_data.parquet",
    index=False,
)

### Relative poverty compared to state household median income

In [15]:
# Bronze ingest: read the household-poverty-by-state parquet from the DOSM
# storage URL and write it back out under ./staging/bronze (index dropped).
URL_DATA = 'https://storage.dosm.gov.my/hies/hh_poverty_state.parquet'
house_hold_poverty_state = pd.read_parquet(URL_DATA)
house_hold_poverty_state.to_parquet(
    path="./staging/bronze/house_hold_poverty_state_bronze_data.parquet",
    index=False,
)

### Relative poverty compared to national household median income

In [16]:
# Bronze ingest: read the household poverty parquet (national-level, going by
# the variable name) from the DOSM storage URL and write it back out under
# ./staging/bronze (index dropped).
URL_DATA = 'https://storage.dosm.gov.my/hies/hh_poverty.parquet'
house_hold_poverty_national = pd.read_parquet(URL_DATA)
house_hold_poverty_national.to_parquet(
    path="./staging/bronze/house_hold_poverty_national_bronze_data.parquet",
    index=False,
)

In [None]:
import os

import pandas as pd
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_openai import OpenAI

# Stand-alone demo of the pandas DataFrame agent on a small hand-made table.
# NOTE(review): this cell rebinds `df` and `agent`, replacing the PDF-derived
# frame and the agent created in earlier cells. Run it last, or remove it if it
# is only leftover scratch work.

# SECURITY: read the OpenAI key from the environment rather than embedding it
# in the notebook. A key that was ever committed in plain text should be
# treated as compromised and revoked/rotated.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before running this cell."
    )

# Create your DataFrame
df = pd.DataFrame({
    "country": ["United States", "United Kingdom", "France", "Germany", "Italy", "Spain", "Canada", "Australia", "Japan", "China"],
    "revenue": [5000, 3200, 2900, 4100, 2300, 2100, 2500, 2600, 4500, 7000]
})

# Initialize the OpenAI LLM with the API key
llm = OpenAI(temperature=0, api_key=api_key)

# Create the pandas dataframe agent. allow_dangerous_code=True matches the
# other agent cell in this notebook, which notes the flag is required; it lets
# the agent execute model-generated Python in this kernel.
agent = create_pandas_dataframe_agent(
    llm,
    df,
    verbose=True,
    allow_dangerous_code=True,
)

# Ask your question
response = agent.invoke("Which are the top 5 countries by sales?")

print(response)


: 