Pull data from CSO 

In [1]:
import requests
import pandas as pd
from pyjstat import pyjstat
from ydata_profiling import ProfileReport
import pkg_resources

# --- STEP 3: Fetch dataset from CSO API ---
url = "https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/HPA02/JSON-stat/2.0/en"

response = requests.get(url)
response.raise_for_status()  # Check for errors

dataset = pyjstat.Dataset.read(response.text)

In [2]:

df = dataset.write('dataframe')
df.head()
# --- STEP 4: Clean column names (optional) ---
df.columns = df.columns.str.replace(' ', '_').str.lower()
# remove any rows where value is less than or equal to 0
print("all rows", len(df))
df = df[df['value'] > 0]
# remove any county that says 'all_counties'
df = df[df['county'] != 'All Counties']
# remove any stamp_duy_event that says 'Filings'
df = df[df['stamp_duty_event'] != 'Filings']
# print unique values in column 'type_of_buyer'
# Filter for Mean and Median Sale Price
df_mean = df[(df['statistic'] == 'Mean Sale Price') & (df['type_of_buyer'] == 'Household Buyer - First-Time Buyer Owner-Occupier')]
df_median = df[(df['statistic'] == 'Median Price') & (df['type_of_buyer'] == 'Household Buyer - First-Time Buyer Owner-Occupier')]
# # Rename for clarity
df_mean = df_mean.rename(columns={'value': 'mean_sale_price'})
df_median = df_median.rename(columns={'value': 'median_price'})

# Merge on region, dwelling, and quarter
df_prices = pd.merge(
    df_mean[['county', 'dwelling_status', 'year', 'mean_sale_price']],
    df_median[['county', 'dwelling_status', 'year', 'median_price']],
    on=['county', 'dwelling_status', 'year'],
    how='inner'
)
print("all rows left", len(df_prices))
df_prices.head()

all rows 174960
all rows left 9025


Unnamed: 0,county,dwelling_status,year,mean_sale_price,median_price
0,Carlow,All Dwelling Statuses,2010,163472.0,166250.0
1,Carlow,All Dwelling Statuses,2010,163472.0,169557.0
2,Carlow,All Dwelling Statuses,2010,179946.0,166250.0
3,Carlow,All Dwelling Statuses,2010,179946.0,169557.0
4,Carlow,All Dwelling Statuses,2010,56391.0,166250.0


In [3]:
# --- STEP 5: Display a quick feature summary ---
feature_summary = pd.DataFrame({
    'Data_Type': df_prices.dtypes,
    'Unique_Values': df_prices.nunique(),
    'Missing_Values': df_prices.isnull().sum(),
    'Sample_Value': [df_prices[col].dropna().iloc[0] for col in df_prices.columns]
})
display(feature_summary)

# --- STEP 6: Show first few rows for context ---
display(df_prices.head())

# --- STEP 7: Generate interactive profiling report ---
profile = ProfileReport(
    df_prices,
    title="CSO HPA02 - Residential Property Transactions Overview",
    explorative=True,
    minimal=True  # Faster rendering for large datasets
)

# Display the report inline in Jupyter
profile.to_notebook_iframe()

Unnamed: 0,Data_Type,Unique_Values,Missing_Values,Sample_Value
county,object,26,0,Carlow
dwelling_status,object,3,0,All Dwelling Statuses
year,object,15,0,2010
mean_sale_price,float64,3170,0,163472.0
median_price,float64,1355,0,166250.0


Unnamed: 0,county,dwelling_status,year,mean_sale_price,median_price
0,Carlow,All Dwelling Statuses,2010,163472.0,166250.0
1,Carlow,All Dwelling Statuses,2010,163472.0,169557.0
2,Carlow,All Dwelling Statuses,2010,179946.0,166250.0
3,Carlow,All Dwelling Statuses,2010,179946.0,169557.0
4,Carlow,All Dwelling Statuses,2010,56391.0,166250.0


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:00<00:00, 1385.08it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Model Setup and training

In [4]:
df_prices['gap_ratio'] = df_prices['mean_sale_price'] / df_prices['median_price']
df_prices['year'] = pd.to_numeric(df_prices['year'], errors='coerce')
df_prices = df_prices[(df_prices['year'] >= 2010) & (df_prices['year'] <= 2023)]

# Encode features
df_encoded = pd.get_dummies(df_prices, columns=['county', 'dwelling_status'], drop_first=True)

# Define target
y = df_encoded['gap_ratio']  
X = df_encoded.drop(columns=['mean_sale_price', 'median_price', 'gap_ratio'])



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import mlflow
import mlflow.sklearn

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# MLflow setup
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("irish_housing_price_gap")

with mlflow.start_run(run_name="RF_Predict_Gap_2010_2023"):
    # Log parameters
    mlflow.log_param("target", "gap_ratio (€ difference)")
    mlflow.log_param("model_type", "RandomForestRegressor")
    mlflow.log_param("n_estimators", 200)
    mlflow.log_param("train_years", "2010–2023")

    # Log core metrics
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("R2", r2)

    # 🔥 Log mean price gap per county (for drift monitoring)
    mean_gap_by_county = df_prices.groupby('county')['gap_ratio'].mean()
    mlflow.log_metrics({f"gap_{c}": round(g, 2) for c, g in mean_gap_by_county.items()})

    # Log model and register
    mlflow.sklearn.log_model(model, name="HousingPriceGapPredictor")


print("✅ Model predicting price gap logged successfully!")


2025/10/06 00:13:28 INFO mlflow.tracking.fluent: Experiment with name 'irish_housing_price_gap' does not exist. Creating a new experiment.


✅ Model predicting price gap logged successfully!
