### 0.1 READ THE DATA SOURCE

In [1]:
import configparser

# Initiate the configparser
config = configparser.ConfigParser()

# Read the config ini file. read() silently skips files it cannot open and
# returns the list of files it actually parsed, so an empty result means the
# configuration was not loaded; fail here with a clear message instead of a
# later KeyError on the missing 'path' section.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "Could not read 'config.ini' from the current working directory"
    )

# Read the csv file path
csv_file_path = config['path']['house_market']

### 0.2 DEFINE THE HTML DISPLAY

In [2]:
from IPython.display import display, HTML
import pandas as pd

# Define the css that will make the table scrollable
css = """ 
.output {
    max-height: 500px; /* Adjiust as needed*/
    overflow: scroll /* Allows scrolling*/
}
"""

# Apply the css for the scrollable output.
# The style element must be closed with </style>; an unclosed element leaves
# the browser to guess where the stylesheet ends.
display(HTML('<style>{}</style>'.format(css)))

# Configure pandas display options for better visuals
pd.set_option('display.max_rows', None) # Display all rows, adjust as needed
pd.set_option('display.max_columns', None) # Display all columns, adjust as needed
pd.set_option('display.max_colwidth', 90) # Set max column width for long text
pd.set_option('display.float_format', '{:,.2f}'.format) # Format floating-point numbers

____________
### 1. DATA EXPOSURE

In [3]:
# Load the housing market csv from the configured path.
# The 'UTF-8-SIG' codec drops a leading byte-order mark when one is present.
housing_market = pd.read_csv(
    csv_file_path,
    encoding='UTF-8-SIG',
)

# Function to display the data overview
def display_data(df, title):
    """
    Display an HTML overview of a DataFrame in the notebook output.

    The sections are rendered in this order: title, shape, data types,
    summary statistics (``df.describe()``) and the complete dataset
    (rendered without the index column).

    Parameters
    1. df: DataFrame to describe
    2. title: Heading text; inserted into the markup as-is (not escaped)

    Returns
    - None; everything is shown through ``display``
    """
    # Build every fragment first so nothing is shown if one of them fails
    sections = [
        f"<h3>{title}</h3>",
        f"<div><b>Dataframe Shape:</b>{df.shape}</div>",
        "<div><b>Data Types:</b></div>",
        df.dtypes.to_frame().to_html(),
        "<div><b>Summary Statistics:</b></div>",
        df.describe().to_html(),
        "<div><b>Complete Dataset:</b></div>",
        df.to_html(index=False),
    ]

    for markup in sections:
        display(HTML(markup))

# Call the function to render the overview of the raw housing market data
display_data(
    df=housing_market,
    title="HOUSING MARKET DATA OVERVIEW",
)




Unnamed: 0,0
Country,object
Year,int64
House Price Index,float64
Rent Index,float64
Affordability Ratio,float64
Mortgage Rate (%),float64
Inflation Rate (%),float64
GDP Growth (%),float64
Population Growth (%),float64
Urbanization Rate (%),float64


Unnamed: 0,Year,House Price Index,Rent Index,Affordability Ratio,Mortgage Rate (%),Inflation Rate (%),GDP Growth (%),Population Growth (%),Urbanization Rate (%),Construction Index
count,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0,200.0
mean,2019.5,130.38,83.05,7.24,4.15,3.65,2.13,0.72,74.77,111.2
std,2.88,28.75,21.44,2.58,1.38,1.88,2.41,1.04,8.73,24.31
min,2015.0,80.55,50.35,3.04,1.54,0.53,-1.92,-0.96,60.17,70.97
25%,2017.0,104.14,60.47,5.03,3.05,1.94,-0.1,-0.18,66.92,90.18
50%,2019.5,129.19,83.72,7.38,4.33,3.66,2.31,0.72,75.1,110.59
75%,2022.0,157.13,100.6,9.28,5.22,5.26,4.27,1.62,82.68,133.78
max,2024.0,179.97,119.86,11.88,6.49,6.91,5.96,2.5,89.79,149.74


Country,Year,House Price Index,Rent Index,Affordability Ratio,Mortgage Rate (%),Inflation Rate (%),GDP Growth (%),Population Growth (%),Urbanization Rate (%),Construction Index
USA,2015,117.45,116.55,9.59,4.49,1.51,-0.75,-0.8,85.99,118.09
USA,2016,150.81,51.44,11.73,5.66,1.88,-0.55,-0.36,69.13,111.98
USA,2017,123.19,70.39,8.51,2.2,2.4,0.93,0.6,83.56,85.97
USA,2018,131.42,91.47,3.42,4.54,1.61,-1.48,2.32,88.97,134.67
USA,2019,110.46,56.84,9.16,3.7,1.29,1.96,-0.88,87.28,90.7
USA,2020,146.25,71.82,7.68,4.23,1.7,5.76,1.71,88.18,141.59
USA,2021,139.79,114.53,3.8,2.48,0.79,0.6,0.36,68.14,136.3
USA,2022,115.68,69.67,7.88,2.2,5.71,-1.4,2.45,83.17,85.9
USA,2023,80.55,107.08,9.36,5.15,5.51,-1.41,0.25,63.48,139.05
USA,2024,142.33,73.16,3.57,3.05,2.61,3.84,1.23,86.62,107.78


#### MARKET INDICATORS OVERVIEW

- **House Price Index:** Measures average price changes in repeat sales or refinancings of the same properties. [Investopedia](https://www.investopedia.com/terms/h/house-price-index-hpi.asp)
- **Rent Index:** The change in rental prices over time, considering geography and property type.
- **Affordability Ratio:** The general cost of living in the region, and the capability of affording basic living goods. [Investopedia](https://www.investopedia.com/terms/a/affordability-index.asp). 
- **Mortgage Rate (%):** The interest charged for a home loan, highly sensitive to economic conditions. [Investopedia](https://www.investopedia.com/terms/m/mortgage-rate.asp)
- **Population Growth (%):** The change in population size over time. [WorldBank](https://data.worldbank.org/indicator/SP.POP.GROW?end=2023&start=1961&view=chart)
- **Urbanization Rate (%):** The number of persons residing in an area defined as "urban" per 100 total population. [WorldBank](https://databank.worldbank.org/metadataglossary/world-development-indicators/series/SP.URB.TOTL.IN.ZS)
- **Construction Index:** Changes in the cost of construction, or price fluctuation of required resources.

____________
### 2. DATA PREPARATION


##### A. DEFINE AND APPLY A STANDARD CLEANING FUNCTION

In [4]:
# Define a standard cleaning function
def clean_dataset(df, dtype_mapping=None, index_col=None):
    """
    Standard data cleaning function.

    Works on a copy of ``df``; the caller's DataFrame is not modified.
    The steps run in this order: data type conversion, missing value count
    (reported only, nothing is filled or dropped), duplicate row removal,
    optional index assignment.

    Parameters
    1. df: Input DataFrame
    2. dtype_mapping: Dictionary of column: dtype for conversion
    3. index_col: Column to set as index; skipped when it is None or when
       the column is not present in the DataFrame

    Returns
    - Cleaned DataFrame
    - As a side effect, displays the cleaning report through
      ``display_cleaning_report``

    Raises
    - ValueError: a column of dtype_mapping is not in the DataFrame, or a
      conversion fails (the original error is chained as the cause)
    """
    # Initiate the cleaning report
    report = {
        'original_shape': df.shape,
        'duplicates_removed': 0,
        'missing_values': {},
        'type_changes': {},
        'index_set': None,
        'final_shape': None
    }

    # Create working copy
    df_clean = df.copy()

    # 1. Handle data type conversions
    if dtype_mapping:
        for col, dtype in dtype_mapping.items():
            if col not in df_clean.columns:
                raise ValueError(f"Column '{col}' not found in DataFrame")
            old_type = str(df_clean[col].dtype)
            try:
                df_clean[col] = df_clean[col].astype(dtype)
            except (ValueError, TypeError) as e:
                # Chain the cause so the traceback keeps the underlying error
                raise ValueError(
                    f"Failed to convert column '{col}' to {dtype}: {str(e)}"
                ) from e
            new_type = str(df_clean[col].dtype)
            if old_type != new_type:
                report['type_changes'][col] = f"{old_type} → {new_type}"

    # 2. Identify missing values and report only.
    # Counted before duplicates are dropped; stored as plain ints.
    missing_counts = df_clean.isna().sum()
    report['missing_values'] = {
        col: int(count) for col, count in missing_counts.items() if count > 0
    }

    # 3. Remove duplicates (the first occurrence of each row is kept)
    report['duplicates_removed'] = int(df_clean.duplicated().sum())
    df_clean = df_clean.drop_duplicates()

    # 4. Set index if specified.
    # Compare against None so a falsy but valid label (e.g. a column named 0)
    # is not ignored.
    if index_col is not None and index_col in df_clean.columns:
        df_clean = df_clean.set_index(index_col)
        report['index_set'] = index_col

    # 5. Final metadata
    report['final_shape'] = df_clean.shape

    # Display report
    display_cleaning_report(report)

    return df_clean

def display_cleaning_report(report):
    """Display compact cleaning report

    Renders the report dictionary as a styled two-column HTML table
    (Operation / Details) in the notebook output.

    Parameters
    1. report: Dictionary with the keys 'original_shape', 'final_shape',
       'duplicates_removed', 'missing_values' (column: count),
       'type_changes' (column: description) and 'index_set'

    Rows
    - Original Shape, Final Shape, Duplicates Removed and Data Type Changes
      are always shown ("None" when no data type changed)
    - Missing Values Found is shown only when the mapping is not empty
    - Index Set is shown only when 'index_set' is not None

    Every value taken from the report is HTML-escaped before it is placed
    in the table, so column names containing characters such as '<' or '&'
    cannot break the markup.

    Returns
    - None; the table is shown through ``display``
    """
    # Function-scope import: the notebook's import cells do not provide it
    from html import escape

    def _row(label, details_markup):
        # One table row; details_markup must already be safe HTML
        return f"""
        <tr>
            <td>{label}</td>
            <td>{details_markup}</td>
        </tr>"""

    def _listing(mapping):
        # "key: value" lines separated by <br>, with both sides escaped
        return "".join(
            f"{escape(str(key))}: {escape(str(value))}<br>"
            for key, value in mapping.items()
        )

    parts = ["""
    <style>
        .cleaning-report {
            font-family: Arial, sans-serif;
            border-collapse: collapse;
            width: 100%;
            margin-bottom: 20px;
        }
        .cleaning-report th {
            background-color: #f2f2f2;
            text-align: left;
            padding: 8px;
            border: 1px solid #ddd;
        }
        .cleaning-report td {
            padding: 8px;
            border: 1px solid #ddd;
        }
        .cleaning-report tr:nth-child(even) {
            background-color: #f9f9f9;
        }
    </style>
    <h3>Data Cleaning Report</h3>
    <table class="cleaning-report">
        <tr>
            <th>Operation</th>
            <th>Details</th>
        </tr>"""]

    # Basic stats
    parts.append(_row("Original Shape", escape(str(report['original_shape']))))
    parts.append(_row("Final Shape", escape(str(report['final_shape']))))
    parts.append(_row("Duplicates Removed", escape(str(report['duplicates_removed']))))

    # Missing values
    if report['missing_values']:
        parts.append(_row("Missing Values Found", _listing(report['missing_values'])))

    # Type changes
    type_changes = report['type_changes']
    parts.append(_row("Data Type Changes", _listing(type_changes) if type_changes else "None"))

    # Index setting; compare against None so a falsy label such as 0 is shown
    if report['index_set'] is not None:
        parts.append(_row("Index Set", escape(str(report['index_set']))))

    parts.append("\n    </table>")
    display(HTML("".join(parts)))

In [5]:
# Target dtype per column: the country label stays an object column, the
# year is an integer, and every indicator column is a float
dtype_spec = {
    'Country': 'object',
    'Year': 'int64',
    **dict.fromkeys(
        [
            'House Price Index',
            'Rent Index',
            'Affordability Ratio',
            'Mortgage Rate (%)',
            'Inflation Rate (%)',
            'GDP Growth (%)',
            'Population Growth (%)',
            'Urbanization Rate (%)',
            'Construction Index',
        ],
        'float64',
    ),
}

# Run the standard cleaning on the raw data; no index column is set
cleaned_df = clean_dataset(housing_market, dtype_mapping=dtype_spec, index_col=None)

Operation,Details
Original Shape,"(200, 11)"
Final Shape,"(200, 11)"
Duplicates Removed,0
Data Type Changes,


##### B. CREATE DERIVED FIELDS TO OPTIMIZE ANALYSIS

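- **Real house price index:** House Price Index adjusted for inflation: `House Price Index / (1 + Inflation Rate (%) / 100)`.
- **Real mortgage cost:** Mortgage rate net of inflation: `Mortgage Rate (%) - Inflation Rate (%)`.
- **Ownership rent spread:** House-price-to-rent ratio scaled by the mortgage rate: `(House Price Index / Rent Index) * Mortgage Rate (%)`.
- **Affordability GDP population score:** Affordability Ratio adjusted for GDP growth and population growth: `Affordability Ratio * (1 + GDP Growth (%) / 100) / (1 + Population Growth (%) / 100)`.
- **Construction urbanization ratio:** Construction Index relative to the urbanization rate: `Construction Index / (Urbanization Rate (%) / 100)`.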

In [13]:
# Add the derived indicator columns with a single assign() call; each lambda
# receives the frame and returns the values of one new column
cleaned_df = cleaned_df.assign(
    # House prices adjusted for inflation
    derived_real_house_price_index=lambda frame: frame['House Price Index'].div(
        1 + frame['Inflation Rate (%)'].div(100)
    ),
    # Mortgage rate net of inflation
    derived_real_mortgage_cost=lambda frame: frame['Mortgage Rate (%)'].sub(
        frame['Inflation Rate (%)']
    ),
    # Price-to-rent ratio scaled by the mortgage rate
    derived_ownership_rent_spread=lambda frame: (
        frame['House Price Index']
        .div(frame['Rent Index'])
        .mul(frame['Mortgage Rate (%)'])
    ),
    # Affordability ratio scaled up by GDP growth and down by population growth
    derived_affordability_gdp_population_score=lambda frame: (
        frame['Affordability Ratio']
        .mul(1 + frame['GDP Growth (%)'].div(100))
        .div(1 + frame['Population Growth (%)'].div(100))
    ),
    # Construction index relative to the urbanization rate (as a fraction)
    derived_construction_urbanization_ratio=lambda frame: frame['Construction Index'].div(
        frame['Urbanization Rate (%)'].div(100)
    ),
)

# Display the first 20 rows of the table, including the derived fields
display(HTML(cleaned_df.head(20).to_html()))

Unnamed: 0,Country,Year,House Price Index,Rent Index,Affordability Ratio,Mortgage Rate (%),Inflation Rate (%),GDP Growth (%),Population Growth (%),Urbanization Rate (%),Construction Index,derived_real_house_price_index,derived_real_mortgage_cost,derived_ownership_rent_spread,derived_affordability_gdp_population_score,derived_construction_urbanization_ratio
0,USA,2015,117.45,116.55,9.59,4.49,1.51,-0.75,-0.8,85.99,118.09,115.7,2.98,4.53,9.59,137.34
1,USA,2016,150.81,51.44,11.73,5.66,1.88,-0.55,-0.36,69.13,111.98,148.02,3.78,16.6,11.71,161.99
2,USA,2017,123.19,70.39,8.51,2.2,2.4,0.93,0.6,83.56,85.97,120.31,-0.2,3.85,8.53,102.89
3,USA,2018,131.42,91.47,3.42,4.54,1.61,-1.48,2.32,88.97,134.67,129.34,2.93,6.52,3.29,151.37
4,USA,2019,110.46,56.84,9.16,3.7,1.29,1.96,-0.88,87.28,90.7,109.05,2.41,7.19,9.42,103.92
5,USA,2020,146.25,71.82,7.68,4.23,1.7,5.76,1.71,88.18,141.59,143.81,2.53,8.62,7.99,160.56
6,USA,2021,139.79,114.53,3.8,2.48,0.79,0.6,0.36,68.14,136.3,138.69,1.69,3.03,3.81,200.03
7,USA,2022,115.68,69.67,7.88,2.2,5.71,-1.4,2.45,83.17,85.9,109.42,-3.51,3.66,7.59,103.28
8,USA,2023,80.55,107.08,9.36,5.15,5.51,-1.41,0.25,63.48,139.05,76.34,-0.37,3.87,9.21,219.06
9,USA,2024,142.33,73.16,3.57,3.05,2.61,3.84,1.23,86.62,107.78,138.7,0.44,5.94,3.66,124.43


______________
### 3. DISCOVERY QUESTIONS

##### A. HOW HAS THE REAL HOUSE PRICE GROWTH CHANGED OVER TIME?

##### B. WHAT IS THE COST OF OWNING VS RENTING, RELATIVE TO MORTGAGE RATES?

##### C. WHICH YEARS SAW THE LARGEST AFFORDABILITY SHIFTS?

##### D. HOW DOES CONSTRUCTION RELATE TO URBANIZATION TRENDS?

##### E. WHICH COUNTRIES HAVE THE MOST VOLATILE HOUSE PRICES?

##### F. WHAT IS THE RELATIONSHIP BETWEEN POPULATION GROWTH AND HOUSING DEMAND?