Data source: https://www.kaggle.com/datasets/vikrishnan/boston-house-prices

In [133]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Remove the column-width limit so long text cells (such as the legend and
# interpretation sentences in the correlation summary table) are not truncated.
pd.set_option('display.max_colwidth', None)

In [2]:
import plotly.express as px
import plotly.graph_objects as go

from plotly.subplots import make_subplots
import plotly.figure_factory as ff

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer
from sklearn import metrics

from scipy import stats
import joblib

In [5]:
from plots.kdeplots import plots_kde, plots_kde_outliers

In [6]:
# Column names, in file order. They are passed explicitly to read_csv
# (presumably because the raw file has no header row -- confirm against the file).
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]

# Location of the raw data file.
# NOTE(review): this is an absolute, machine-specific path; change this single
# constant to point at your local copy of housing.csv before running.
DATA_PATH = "F:/Data/datas/housing.csv"

# Fields in the file are separated by runs of whitespace, hence the regex delimiter.
df = pd.read_csv(DATA_PATH, delimiter=r"\s+", names=column_names)

In [7]:
# Columns handled as categorical features in this notebook.
CAT_COLS = ['CHAS', 'RAD']

# Every remaining column of df, in original column order
# (this includes the MEDV target column).
NUM_COLS = [col for col in df.columns if col not in CAT_COLS]

### Data Legend

- CRIM: per capita crime rate by town
- ZN: proportion of residential land zoned for lots over 25,000 sq.ft.
- INDUS: proportion of non-retail business acres per town
- CHAS: Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
- NOX: nitric oxides concentration (parts per 10 million)
- RM: average number of rooms per dwelling
- AGE: proportion of owner-occupied units built prior to 1940
- DIS: weighted distances to five Boston employment centers
- RAD: index of accessibility to radial highways
- TAX: full-value property-tax rate per \$10,000
- PTRATIO: pupil-teacher ratio by town 
- B: 1000(Bk−0.63)^2 where Bk is the proportion of blacks by town
    - The formula is non-linear: the larger the difference between Bk and 0.63, the higher B becomes.
    - Intuitively, not all Black residents belong to the lower class, so a larger Black population may also indicate a more comfortable place for upper-class Black residents when racial issues matter in society.
- LSTAT: \%lower status of the population
- MEDV: Median value of owner-occupied homes in $1000s

### Categorical Features Compositions

In [50]:
def plot_pie(churn_counts, textposition='inside', textinfo='percent+label', sorted_pie=False, **layouts):
    """Draw a donut chart from a series of counts.

    Parameters
    ----------
    churn_counts :
        Object exposing ``.index`` (slice names) and ``.values`` (slice sizes),
        e.g. the result of ``Series.value_counts()``.
    textposition, textinfo :
        Forwarded to ``fig.update_traces`` to control the slice labels.
    sorted_pie : bool
        When true, the labels and values stored on the first trace are
        reordered by ascending value.
    **layouts :
        Forwarded unchanged to ``fig.update_layout`` (title, title_x, ...).

    Returns
    -------
    Whatever ``fig.update_layout(**layouts)`` returns.
    """
    donut = px.pie(names=churn_counts.index, values=churn_counts.values, hole=0.5)
    donut.update_traces(textposition=textposition, textinfo=textinfo)

    if not sorted_pie:
        return donut.update_layout(**layouts)

    # NOTE(review): this reaches into the figure's private `_data` list and
    # assumes labels/values support fancy indexing (NumPy arrays) -- confirm
    # this still holds after a plotly upgrade.
    first_trace = donut._data[0]
    ascending_order = np.argsort(first_trace['values'])
    first_trace['labels'] = first_trace['labels'][ascending_order]
    first_trace['values'] = first_trace['values'][ascending_order]

    return donut.update_layout(**layouts)

In [56]:
# Share of rows per RAD value, with labels drawn outside the slices.
rad_counts = df['RAD'].value_counts()
plot_pie(
    rad_counts,
    textposition='outside',
    sorted_pie=True,
    title='RAD Values Compositions',
    title_x=0.5,
)

In [37]:
# Share of rows per CHAS value, labelled with both percentage and raw count.
chas_counts = df['CHAS'].value_counts()
plot_pie(
    chas_counts,
    title='CHAS Values Compositions',
    title_x=0.5,
    textinfo='percent+value',
)

### Numerical Features Distributions

In [16]:
# Distribution plots for every column in NUM_COLS, drawn by the project helper
# plots.kdeplots.plots_kde.
# NOTE(review): the positional 3 is presumably the number of subplot columns --
# confirm against the helper's signature.
plots_kde(df, NUM_COLS, 3, v_space=0.1, height=900)

In [15]:
# Same columns passed to the project helper plots.kdeplots.plots_kde_outliers.
# NOTE(review): presumably highlights outliers on each distribution, with 3 as
# the subplot column count -- confirm against the helper's implementation.
plots_kde_outliers(df, NUM_COLS, 3)

In [62]:
def replace_outliers(df, columns):
    """Cap outliers in the given columns using the 1.5 x IQR rule.

    For each column, values above ``Q3 + 1.5 * IQR`` are replaced by that
    upper cap and values below ``Q1 - 1.5 * IQR`` are replaced by that lower
    cap. The number of capped values on each side is printed when non-zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a copy is processed and returned.
    columns : iterable
        Names of the numeric columns to cap.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with outliers in ``columns`` replaced by the caps.
    """
    df = df.copy()

    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1

        top_cap = Q3 + (IQR * 1.5)
        bottom_cap = Q1 - (IQR * 1.5)

        # Count both sides before touching the column so the reported numbers
        # describe the original data, each against its own cap.
        n_above = int((df[col] > top_cap).sum())
        n_below = int((df[col] < bottom_cap).sum())

        if n_above:
            print("above top cap:", n_above)
        if n_below:
            print("below bottom cap:", n_below)

        # Leave the column untouched (including its dtype) when nothing is out
        # of range; otherwise cap both sides in one vectorised step.
        if n_above or n_below:
            df[col] = df[col].clip(lower=bottom_cap, upper=top_cap)

    return df

### Features Correlations

In [60]:
# Pairwise correlation matrix of all columns, rounded to two decimals so the
# in-cell annotations stay readable.
corr_matrix = df.corr().round(2)

fig_corr_heatmap = px.imshow(
    corr_matrix,
    text_auto=True,
    height=700,
    template='plotly_white',
    color_continuous_midpoint=0,
)
fig_corr_heatmap.show()

### Features Correlations to Target Value

In [129]:
# Correlation of every feature with the MEDV target, sorted ascending.
# MEDV's correlation with itself is dropped, and the feature names are moved
# from the index into a regular 'index' column.
df_corr = (
    df.corr()['MEDV']
    .sort_values()
    .drop('MEDV')
    .reset_index()
)
df_corr

Unnamed: 0,index,MEDV
0,LSTAT,-0.737663
1,PTRATIO,-0.507787
2,INDUS,-0.483725
3,TAX,-0.468536
4,NOX,-0.427321
5,CRIM,-0.388305
6,RAD,-0.381626
7,AGE,-0.376955
8,CHAS,0.17526
9,DIS,0.249929


In [59]:
# One bar per feature showing its correlation with MEDV (rounded to 3 decimals).
fig_corr_bar = px.bar(
    df_corr.round(3),
    x='index',
    y='MEDV',
    color='index',
    template='plotly_white',
    text_auto=True,
)
fig_corr_bar.show()

### Summary

- There are two categorical features, "CHAS" and "RAD", each with fewer than 10 unique values.
- For the numerical features:
    - 5 features are highly skewed (skew > 1):
        - CRIM
        - ZN
        - DIS
        - B
        - MEDV (target value)
    - 5 features are moderately skewed (skew 0.5-1.0):
        - NOX
        - AGE
        - TAX
        - PTRATIO
        - LSTAT
    - 2 features have near-zero skew:
        - INDUS
        - RM
- Features with negative correlations indicate factors that tend to decrease housing prices (e.g., high crime rate, pollution, lower status population).
- Features with positive correlations suggest factors that contribute to higher housing prices (e.g., larger lots, more rooms, proximity to the Charles River).

In [100]:
# Human-readable description of each column, keyed by column name.
# Mapped onto the feature names to build the 'Legend' column of the
# correlation summary table.
map_legend = {
    "CRIM": "per capita crime rate by town",
    "ZN": "proportion of residential land zoned for lots over 25,000 sq.ft.",
    "INDUS": "proportion of non-retail business acres per town",
    "CHAS": "Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)",
    "NOX": "nitric oxides concentration (parts per 10 million)",
    "RM": "average number of rooms per dwelling",
    "AGE": "proportion of owner-occupied units built prior to 1940",
    "DIS": "weighted distances to five Boston employment centers",
    "RAD": "index of accessibility to radial highways",
    "TAX": "full-value property-tax rate per $10,000",
    "PTRATIO": "pupil-teacher ratio by town",
    # The transform is squared, matching the formula in the Data Legend section.
    "B": "1000(Bk−0.63)^2 where Bk is the proportion of blacks by town",
    "LSTAT": "%lower status of the population",
    "MEDV": "Median value of owner-occupied homes in $1000s"
}

In [138]:
# Display the feature-to-MEDV correlation table again (same frame as built above).
df_corr

Unnamed: 0,index,MEDV
0,LSTAT,-0.737663
1,PTRATIO,-0.507787
2,INDUS,-0.483725
3,TAX,-0.468536
4,NOX,-0.427321
5,CRIM,-0.388305
6,RAD,-0.381626
7,AGE,-0.376955
8,CHAS,0.17526
9,DIS,0.249929


In [139]:
# Plain-language reading of each feature's correlation with MEDV, keyed by
# feature name. Mapped onto the summary table's index to build its
# 'Interpretation' column. There is no "MEDV" entry: the target itself is
# dropped from df_corr before the summary is built.
map_interpretation = {
    "CRIM": "areas with higher crime rates tend to have lower median home values.",
    "ZN": "areas with larger lots tend to have higher median home values.",
    "INDUS": "industrial or business-heavy areas tend to have lower median home values.",
    "CHAS": "being near the river slightly increases median home values.",
    "NOX": "more polluted areas tend to have lower median home values.",
    "RM": "homes with more rooms tend to have higher median values.",
    "AGE": "areas with more old homes tend to have lower median home values.",
    "DIS": "homes further from employment hubs may have slightly higher values, possibly due to more suburban or desirable residential locations.",
    "RAD": "homes near major roads or highways are less desirable and thus lower in value.",
    "TAX": "areas with higher taxes tend to have lower median home values.",
    "PTRATIO": "homes in areas with larger class sizes (imply lower student-teacher engagement) tend to have lower median values.",
    "B": "a black population proportion significantly different from 0.63 tend to have higher housing prices.",
    "LSTAT": "as the proportion of lower status residents increases, the median home value tends to decrease."
}

In [140]:
# Build the summary table from df_corr without modifying df_corr itself:
# attach the legend text, give the columns presentation names, index by
# feature, then attach the interpretation text looked up by feature name.
df_corr_summary = (
    df_corr
    .assign(Legend=lambda frame: frame['index'].map(map_legend))
    .rename(columns={"MEDV": "Pearson-Correlations to MEDV", "index": "Feature"})
    .set_index('Feature')
    .assign(Interpretation=lambda frame: frame.index.map(map_interpretation))
)

In [141]:
# Display the final summary: correlation, legend and interpretation per feature.
df_corr_summary

Unnamed: 0_level_0,Pearson-Correlations to MEDV,Legend,Interpretation
Feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
LSTAT,-0.737663,%lower status of the population,"as the proportion of lower status residents increases, the median home value tends to decrease."
PTRATIO,-0.507787,pupil-teacher ratio by town,homes in areas with larger class sizes (imply lower student-teacher engagement) tend to have lower median values.
INDUS,-0.483725,proportion of non-retail business acres per town,industrial or business-heavy areas tend to have lower median home values.
TAX,-0.468536,"full-value property-tax rate per $10,000",areas with higher taxes tend to have lower median home values.
NOX,-0.427321,nitric oxides concentration (parts per 10 million),more polluted areas tend to have lower median home values.
CRIM,-0.388305,per capita crime rate by town,areas with higher crime rates tend to have lower median home values.
RAD,-0.381626,index of accessibility to radial highways,homes near major roads or highways are less desirable and thus lower in value.
AGE,-0.376955,proportion of owner-occupied units built prior to 1940,areas with more old homes tend to have lower median home values.
CHAS,0.17526,Charles River dummy variable (= 1 if tract bounds river; 0 otherwise),being near the river slightly increases median home values.
DIS,0.249929,weighted distances to ﬁve Boston employment centers,"homes further from employment hubs may have slightly higher values, possibly due to more suburban or desirable residential locations."
