In [None]:
# Import all required libraries for data analysis and visualization
import pandas as pd  
import matplotlib.pyplot as plt  
import numpy as np  
import matplotlib.ticker as mticker 

## Data Loading and Exploration

Load the real estate dataset and examine its structure.

In [None]:
# Read the realtor CSV into a DataFrame, report its dimensions and column
# names, then preview the first rows as the cell output.
housing_df = pd.read_csv('realtor-data.zip.csv')
print(
    f"Dataset shape: {housing_df.shape}",
    f"Columns: {housing_df.columns.tolist()}",
    sep="\n",
)
housing_df.head()

## Analysis 1: Top 5 States by House Sales

Identify which states have the most real estate transactions.

In [None]:
# Count rows per state and keep the five most frequent states.
# value_counts() already returns counts in descending order (missing states
# are skipped), so head(5) gives the top five without a second sort.
# to_dict() keeps that order and yields plain Python ints as values.
top_five_states = housing_df['state'].value_counts().head(5).to_dict()
print("Top 5 states with most houses sold:")
for state, count in top_five_states.items():
    print(f"  {state}: {count}")

In [None]:
# Bar chart of the five most frequent states, one bar per state
fig, ax = plt.subplots(figsize=(8, 6))
bars = ax.bar(list(top_five_states.keys()), list(top_five_states.values()),
              width=0.7, color='mediumseagreen')
ax.set_title('States with most houses sold', fontsize=16)

# The counts are printed on the bars, so the y ticks, the x tick marks and
# the surrounding frame are removed to reduce clutter
ax.set_yticks([])
ax.tick_params(axis='x', length=0)
ax.spines[['top', 'right', 'left', 'bottom']].set_visible(False)

# Apply light gray background color
fig.patch.set_facecolor('lightgray')
ax.set_facecolor('lightgray')

# Label each bar with its count. bar_label offsets the text from the bar top
# in points, so the gap does not depend on the magnitude of the counts
# (a fixed data-unit offset only looks right for one data scale).
ax.bar_label(bars, labels=[f'{bar.get_height():,.0f}' for bar in bars],
             padding=3, fontsize=10)
ax.margins(y=0.1)  # headroom so the tallest bar's label is not clipped

# Highlight the first bar (the largest count) with a darker green
if len(bars) > 0:
    bars[0].set_color('forestgreen')
plt.show()

## Analysis 2: House Size Distribution

Analyze the distribution of house sizes and compare mean, median, and mode.
- **Mean**: Average value (sensitive to outliers)
- **Median**: Middle value (robust to outliers)
- **Mode**: Most frequent value

In [None]:
# Summarise the house_size column with three measures of central tendency
size_column = housing_df['house_size']
mean = size_column.mean()
median = size_column.median()
mode = size_column.mode()[0]  # mode() returns a Series; take its first entry

print(f"House Size Statistics:")
print(f"  Mean:   {mean:,.0f} sqft")
print(f"  Median: {median:,.0f} sqft")
print(f"  Mode:   {mode:,.0f} sqft")

# Plain Python list of the sizes, consumed by the histogram cell
house_sizes = size_column.tolist()

In [None]:
# Histogram of house sizes with vertical markers for mean, median and mode
fig, ax = plt.subplots(figsize=(8, 6))

# Show y tick labels in scientific notation: power limits of (0, 0) apply it
# to all values, and useMathText renders the exponent as 10^n
scalar_formatter = mticker.ScalarFormatter(useMathText=True)
scalar_formatter.set_powerlimits((0, 0))
ax.yaxis.set_major_formatter(scalar_formatter)

# 30 equal-width bins over 0-6000 sqft; sizes outside that range are ignored
# by the histogram (the statistics below are computed elsewhere on the full
# column, so their markers are not restricted to this range)
ax.hist(house_sizes, bins=30, range=(0, 6000), color='skyblue',
        edgecolor='white')
ax.set_title('House size distribution', fontsize=14)
ax.set_xlabel('Size (sqft)')
ax.set_ylabel('Frequency')
ax.spines[['top', 'right']].set_visible(False)

# Apply lavender background for aesthetic appearance
ax.set_facecolor('lavender')
fig.set_facecolor('lavender')

# Reference lines for the three statistics. All labels use the same rounded
# ':,.0f' format as the printed summary; truncating with int() could show a
# value one lower than the printed one and fails on NaN.
ax.axvline(mean, color='red', linestyle='--', linewidth=2,
           label=f'Mean: {mean:,.0f}')
ax.axvline(median, color='green', linestyle='-', linewidth=2,
           label=f'Median: {median:,.0f}')
ax.axvline(mode, color='orange', linestyle=':', linewidth=3,
           label=f'Mode: {mode:,.0f}')
ax.legend(
    loc='best',
    facecolor='lavender',
    edgecolor='black',
    framealpha=1,
    fontsize=10
)
plt.show()

## Analysis 3: Boston Market - Size vs Price

Explore the relationship between house size and price in Boston.
This scatter plot reveals market trends and pricing patterns.

In [None]:
# Keep only the rows whose city value is exactly 'Boston'
# NOTE(review): the filter uses the city name alone; if the data contains a
# "Boston" in more than one state they are all included — confirm whether a
# state condition is needed as well.
is_boston = housing_df['city'].eq('Boston')
boston_df = housing_df[is_boston]
print(f"Boston properties: {len(boston_df)}")
print(f"Price range: ${boston_df['price'].min():,.0f} - ${boston_df['price'].max():,.0f}")
boston_df.head()

In [None]:
# Scatter plot of house size against price for the Boston subset
fig, ax = plt.subplots(figsize=(8, 6))

# Build a formatter for this axis instead of reusing one created in another
# cell: a formatter is bound to the axis it is set on, and a local instance
# keeps this cell runnable on its own.
price_formatter = mticker.ScalarFormatter(useMathText=True)
price_formatter.set_powerlimits((0, 0))  # scientific notation for all ticks
ax.yaxis.set_major_formatter(price_formatter)

# Each point is one property; colour encodes the same price as the y position
scatter = ax.scatter(boston_df['house_size'], boston_df['price'],
                     alpha=0.6, s=50, edgecolors='none',
                     c=boston_df['price'], cmap='viridis')

# Configure plot labels, title, and axis limits for clarity
ax.set_title('House size vs Price in Boston', fontsize=14)
ax.set_xlabel('Size (sqft)', fontsize=11)
ax.set_ylabel('Price ($)', fontsize=11)
# Fixed view window: points beyond these limits are not shown
ax.set_xlim(0, 6500)
ax.set_ylim(0, 12000000)
ax.spines[['top', 'right']].set_visible(False)

# Add colorbar legend showing price scale
cbar = fig.colorbar(scatter, ax=ax, label='Price ($)')

# Apply light gray background for better contrast
fig.set_facecolor('#f0f0f0')
ax.set_facecolor('#f0f0f0')
fig.tight_layout()
plt.show()

## Analysis 4: Historical Price Trends (1900-2025)

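Examine how average house prices have evolved over 125 years of market history.
This time-series analysis groups properties by the year of `prev_sold_date` and averages `price` to look for long-term real estate market cycles and inflation trends. Caveat: this only reflects historical prices if `price` records the amount paid at that previous sale; confirm the column's meaning before drawing conclusions.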

In [None]:
# Data Preparation: build the frame used for the time-series analysis.
# Dates are parsed first (unparseable values become NaT because of
# errors='coerce') and rows without a usable date are dropped afterwards, so
# both originally-missing and unparseable dates are removed. Dropping before
# parsing would leave the coerced NaT rows in the "clean" frame.
housing_df_clean = (
    housing_df
    .assign(prev_sold_date=pd.to_datetime(housing_df['prev_sold_date'],
                                          errors='coerce'))
    .dropna(subset=['prev_sold_date'])
    .reset_index(drop=True)
)
housing_df_clean.info()

In [None]:
# Time-Series Aggregation: mean price for each year of prev_sold_date
sale_year = housing_df_clean['prev_sold_date'].dt.year
price_by_year = housing_df_clean.groupby(sale_year)['price'].mean().reset_index()
price_by_year.columns = ['year', 'avg_price']
# dt.year is float when the date column contains NaT; groupby drops those
# keys, so the remaining years can safely be stored as integers
price_by_year['year'] = price_by_year['year'].astype('int64')

# Outlier removal: drop the single year with the highest average price.
# This is unconditional (no statistical test is applied), so the removed year
# is reported to keep the step visible. The guard skips the step when there
# is no valid average to compare, where idxmax would otherwise fail.
if price_by_year['avg_price'].notna().any():
    max_price_idx = price_by_year['avg_price'].idxmax()
    dropped_row = price_by_year.loc[max_price_idx]
    print(f"Removed year {int(dropped_row['year'])} "
          f"(average price ${dropped_row['avg_price']:,.0f})")
    price_by_year = price_by_year.drop(max_price_idx).reset_index(drop=True)
price_by_year

In [None]:
# Time-Series Visualization: line chart of the average price per year
fig, ax = plt.subplots(figsize=(8, 6))

# Build a formatter for this axis instead of reusing one created in another
# cell: a formatter is bound to the axis it is set on, and a local instance
# keeps this cell runnable on its own.
avg_price_formatter = mticker.ScalarFormatter(useMathText=True)
avg_price_formatter.set_powerlimits((0, 0))  # scientific notation for all ticks
ax.yaxis.set_major_formatter(avg_price_formatter)

# Take the year range for the title from the data being plotted, so the
# title cannot disagree with the chart
plotted_years = price_by_year['year']
if plotted_years.notna().any():
    chart_title = (f'Average House Prices by Year '
                   f'({int(plotted_years.min())}-{int(plotted_years.max())})')
else:
    chart_title = 'Average House Prices by Year'

# Bright yellow-green line for visibility on the dark background
ax.plot(price_by_year['year'], price_by_year['avg_price'],
        linestyle='-', linewidth=2, color='#ccff00')
ax.set_title(chart_title, fontsize=14, fontweight='bold')
ax.set_xlabel('Year', fontsize=11)
ax.set_ylabel('Average Price ($)', fontsize=11)
ax.spines[['top', 'right']].set_visible(False)

# Dark theme: dark background with white text and ticks
ax.set_facecolor('#2c3e50')
fig.set_facecolor('#2c3e50')

ax.tick_params(colors='white')
ax.xaxis.label.set_color('white')
ax.yaxis.label.set_color('white')
ax.title.set_color('white')

fig.tight_layout()
plt.show()

## Summary & Key Findings

### Analysis 1: Geographic Market Distribution
- **Objective**: Identify the most active real estate markets by transaction volume
- **Method**: Count properties per state and rank top 5
- **Key Result**: Five states dominate the dataset, showing geographic concentration
- **Business Impact**: Focus marketing efforts on high-volume states

### Analysis 2: Property Size Distribution
- **Objective**: Understand typical house size and identify distribution shape
- **Method**: Calculate mean, median, and mode; visualize with histogram
- **Key Finding**: Mean > Median indicates right-skewed distribution (luxury outliers)
- **Interpretation**: Median is more representative of typical house size than mean

### Analysis 3: Boston Market Dynamics
- **Objective**: Analyze price-to-size relationship in a major metro area
- **Method**: Scatter plot with color gradient for price visualization
- **Key Pattern**: The scatter plot suggests a positive relationship between size and price (no correlation coefficient is computed in this notebook)
- **Market Insight**: Larger properties command premium prices; useful for appraisal models

### Analysis 4: Long-Term Price Evolution
- **Objective**: Track housing price trends across 125 years (1900-2025)
- **Method**: Group by year and calculate mean price; remove the single year with the highest average price as an outlier
- **Key Observation**: Consistent upward trend demonstrating real estate appreciation
- **Economic Implication**: Real estate has served as an effective inflation hedge

### Conclusions & Recommendations
1. **Investment Strategy**: Real estate shows consistent long-term appreciation
2. **Market Segmentation**: Regional analysis crucial due to geographic variation
3. **Data Quality**: Remove outliers for accurate trend analysis
4. **Valuation Model**: Use median and comparable properties for more accurate pricing