# Exploratory Data Analysis - SmartRent Manhattan

This notebook performs comprehensive exploratory data analysis on the StreetEasy Manhattan rental dataset.



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Take the directory two levels above the current working directory and put
# it first on sys.path, presumably so the `src` package imported below resolves.
# NOTE(review): os.path.abspath('') is the kernel's working directory, not
# necessarily this notebook's folder. If the notebook lives directly in
# <project>/notebooks, two dirname() calls land one level ABOVE the project
# root — confirm the intended layout.
project_root = os.path.dirname(os.path.dirname(os.path.abspath('')))
sys.path.insert(0, project_root)

from src.data_loader import load_raw_data, get_data_info

# Process-wide plotting defaults: style sheet, seaborn palette, figure size.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 6)


## 1. Load and Inspect Data


In [None]:
# Expected location of the raw CSV, relative to the project root.
data_path = os.path.join(project_root, 'data', 'raw', 'manhattan.csv')

# When the file is missing, df is set to None so that the later cells
# (which all test `df is not None`) skip themselves instead of failing.
if not os.path.exists(data_path):
    print(f"Raw data file not found at: {data_path}")
    print("Please ensure manhattan.csv is in data/raw/ directory")
    df = None
else:
    df = load_raw_data(data_path)
    print(f"Dataset loaded successfully!")
    print(f"Shape: {df.shape}")
    print(f"\nColumns: {df.columns.tolist()}")


## 2. Data Overview


In [None]:
# Quick look at the loaded frame: head, schema, summary stats, missing values.
if df is not None:
    print("First few rows:")
    display(df.head())

    print("\nDataset Info:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()

    print("\nBasic Statistics:")
    display(df.describe())

    print("\nMissing Values:")
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    if missing.empty:
        # Say so explicitly rather than printing an empty Series repr.
        print("No missing values found.")
    else:
        print(missing)


## 3. Target Variable Analysis


In [None]:
# Distribution of the target variable (rent): histogram, box plot, summary stats.
if df is not None and 'rent' in df.columns:
    # Drop missing rents once, up front. The other histogram cells already
    # call .dropna() before plotting, and boxplot does not skip NaN on its
    # own: a single missing value would leave the box empty.
    rent = df['rent'].dropna()

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    axes[0].hist(rent, bins=50, edgecolor='black', alpha=0.7)
    axes[0].set_xlabel('Rent ($)')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Distribution of Rental Prices')
    axes[0].grid(True, alpha=0.3)

    # Vertical is the default box orientation, so no orientation argument
    # is passed.
    axes[1].boxplot(rent)
    axes[1].set_ylabel('Rent ($)')
    axes[1].set_title('Box Plot of Rental Prices')
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # pandas reductions skip NaN by default, so these figures match what
    # the raw column would report.
    print("Rent Statistics:")
    print(f"  Mean: ${rent.mean():,.2f}")
    print(f"  Median: ${rent.median():,.2f}")
    print(f"  Std: ${rent.std():,.2f}")
    print(f"  Min: ${rent.min():,.2f}")
    print(f"  Max: ${rent.max():,.2f}")


## 4. Feature Distributions


In [None]:
# Histograms of the numeric features (target excluded), laid out three per row.
if df is not None:
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    if 'rent' in numeric_cols:
        numeric_cols.remove('rent')

    # Only the first nine features are drawn. The grid is sized from that
    # subset, not from every numeric column; otherwise a wide dataset
    # produces a tall figure whose extra rows are created and then deleted.
    plot_cols = numeric_cols[:9]

    if plot_cols:
        n_cols = 3
        n_rows = (len(plot_cols) + n_cols - 1) // n_cols  # ceiling division

        # squeeze=False always returns a 2-D array of axes, so flatten()
        # works the same for one row as for several.
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows),
                                 squeeze=False)
        axes = axes.flatten()

        for ax, col in zip(axes, plot_cols):
            ax.hist(df[col].dropna(), bins=30, edgecolor='black', alpha=0.7)
            ax.set_xlabel(col)
            ax.set_ylabel('Frequency')
            ax.set_title(f'Distribution of {col}')
            ax.grid(True, alpha=0.3)

        # Remove the unused cells in the last row of the grid.
        for ax in axes[len(plot_cols):]:
            fig.delaxes(ax)

        plt.tight_layout()
        plt.show()


## 5. Correlation Analysis


In [None]:
# Pairwise correlations between numeric columns, plus a ranking against rent.
if df is not None and 'rent' in df.columns:
    numeric_cols = list(df.select_dtypes(include=[np.number]).columns)

    # A correlation matrix needs at least two numeric columns.
    if len(numeric_cols) >= 2:
        corr_matrix = df.loc[:, numeric_cols].corr()

        heatmap_options = {
            'annot': True,
            'fmt': '.2f',
            'cmap': 'coolwarm',
            'center': 0,
            'square': True,
            'linewidths': 1,
            'cbar_kws': {"shrink": 0.8},
        }
        plt.figure(figsize=(12, 10))
        sns.heatmap(corr_matrix, **heatmap_options)
        plt.title('Correlation Matrix of Numerical Features')
        plt.tight_layout()
        plt.show()

        # rent appears in the matrix only if it was detected as numeric.
        if 'rent' in corr_matrix.columns:
            rent_corr = corr_matrix['rent'].sort_values(ascending=False)
            print("\nCorrelation with Rent:")
            print(rent_corr)


## 6. Neighborhood Analysis


In [None]:
# Per-neighborhood rent summary (mean, count, std), ranked by mean rent.
if df is not None and {'neighborhood', 'rent'}.issubset(df.columns):
    rent_by_neighborhood = df.groupby('neighborhood')['rent']
    neighborhood_stats = rent_by_neighborhood.agg(['mean', 'count', 'std'])
    neighborhood_stats = neighborhood_stats.sort_values('mean', ascending=False)
    neighborhood_stats.columns = ['Avg Rent', 'Count', 'Std']

    # The frame is sorted most-expensive first, so head/tail give the extremes.
    top_10 = neighborhood_stats.head(10)
    bottom_10 = neighborhood_stats.tail(10)

    print("Top 10 Neighborhoods by Average Rent:")
    display(top_10)

    print("\nBottom 10 Neighborhoods by Average Rent:")
    display(bottom_10)

    bar_positions = range(len(top_10))
    plt.figure(figsize=(14, 8))
    plt.barh(bar_positions, top_10['Avg Rent'])
    plt.yticks(bar_positions, top_10.index)
    plt.xlabel('Average Rent ($)')
    plt.title('Top 10 Neighborhoods by Average Rent')
    # Flip the axis so the highest-rent neighborhood is drawn at the top.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()


## 7. Amenity Analysis


In [None]:
# How common each amenity flag is, and the mean-rent gap between listings
# with and without it.
if df is not None:
    amenity_cols = [
        'no_fee', 'has_roofdeck', 'has_washer_dryer', 'has_doorman',
        'has_elevator', 'has_dishwasher', 'has_patio', 'has_gym',
    ]

    # Keep only the flags that are actually present in this dataset.
    available_amenities = [name for name in amenity_cols if name in df.columns]

    if available_amenities:
        amenity_counts = df[available_amenities].sum().sort_values(ascending=False)

        bar_positions = range(len(amenity_counts))
        plt.figure(figsize=(10, 6))
        plt.barh(bar_positions, amenity_counts.values)
        plt.yticks(bar_positions, amenity_counts.index)
        plt.xlabel('Number of Properties')
        plt.title('Amenity Availability Across Properties')
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.show()

        if 'rent' in df.columns:
            # Mean rent where the flag equals 1 minus mean rent where it equals 0.
            amenity_rent_impact = {
                amenity: (df.loc[df[amenity] == 1, 'rent'].mean()
                          - df.loc[df[amenity] == 0, 'rent'].mean())
                for amenity in available_amenities
            }

            impact_df = pd.DataFrame({
                'Amenity': list(amenity_rent_impact.keys()),
                'Rent Difference ($)': list(amenity_rent_impact.values()),
            })
            impact_df = impact_df.sort_values('Rent Difference ($)', ascending=False)

            print("\nRent Impact of Amenities:")
            display(impact_df)


## 8. Size and Rent Relationship


In [None]:
# Rent against apartment size: overall scatter, scatter split by bedroom
# count, and price-per-square-foot statistics.
if df is not None and 'size_sqft' in df.columns and 'rent' in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    axes[0].scatter(df['size_sqft'], df['rent'], alpha=0.5, s=10)
    axes[0].set_xlabel('Size (sqft)')
    axes[0].set_ylabel('Rent ($)')
    axes[0].set_title('Rent vs Size')
    axes[0].grid(True, alpha=0.3)

    if 'bedrooms' in df.columns:
        # dropna() keeps NaN out of the category list; NaN never compares
        # equal, so it would only add an empty "nan BR" legend entry.
        # Only the five smallest bedroom counts are shown.
        for bedrooms in sorted(df['bedrooms'].dropna().unique())[:5]:
            subset = df[df['bedrooms'] == bedrooms]
            axes[1].scatter(subset['size_sqft'], subset['rent'],
                            alpha=0.5, s=10, label=f'{bedrooms} BR')
        axes[1].set_xlabel('Size (sqft)')
        axes[1].set_ylabel('Rent ($)')
        axes[1].set_title('Rent vs Size by Bedrooms')
        axes[1].legend()
        axes[1].grid(True, alpha=0.3)
    else:
        # Nothing to draw on the right; hide the empty frame.
        axes[1].set_visible(False)

    plt.tight_layout()
    plt.show()

    # Treat a zero size as missing: dividing by it would yield inf, which
    # makes the mean infinite and the std NaN. NaN rows are skipped by the
    # reductions below.
    df['price_per_sqft'] = df['rent'] / df['size_sqft'].replace(0, np.nan)
    print("\nPrice per Square Foot Statistics:")
    print(f"  Mean: ${df['price_per_sqft'].mean():,.2f}")
    print(f"  Median: ${df['price_per_sqft'].median():,.2f}")
    print(f"  Std: ${df['price_per_sqft'].std():,.2f}")


## 9. Building Age Analysis


In [None]:
# Building age: its distribution, and mean rent across ten equal-width age bins.
if df is not None and 'building_age_yrs' in df.columns and 'rent' in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    axes[0].hist(df['building_age_yrs'].dropna(), bins=30, edgecolor='black', alpha=0.7)
    axes[0].set_xlabel('Building Age (years)')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Distribution of Building Ages')
    axes[0].grid(True, alpha=0.3)

    # pd.cut yields a categorical key. observed=False is passed explicitly
    # so every bin keeps its slot on the x-axis (empty bins show as gaps)
    # whatever the installed pandas version's default is; it also avoids
    # the warning newer versions emit when the argument is omitted.
    age_bins = pd.cut(df['building_age_yrs'], bins=10)
    age_rent = df.groupby(age_bins, observed=False)['rent'].mean()

    positions = range(len(age_rent))
    axes[1].plot(positions, age_rent.values, marker='o')
    axes[1].set_xlabel('Age Group')
    axes[1].set_ylabel('Average Rent ($)')
    axes[1].set_title('Average Rent by Building Age')
    axes[1].set_xticks(positions)
    # Label each bin by its edges, truncated to whole years.
    axes[1].set_xticklabels([f"{int(interval.left)}-{int(interval.right)}"
                             for interval in age_rent.index], rotation=45, ha='right')
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()


## 10. Summary and Insights

### Key Findings:
1. **Rental Price Distribution**: [Add insights from rent analysis]
2. **Feature Correlations**: [Add insights from correlation matrix]
3. **Neighborhood Impact**: [Add insights from neighborhood analysis]
4. **Amenity Effects**: [Add insights from amenity analysis]
5. **Size-Rent Relationship**: [Add insights from size analysis]
6. **Building Age Impact**: [Add insights from age analysis]

### Next Steps:
- Proceed to data preprocessing
- Feature engineering
- Model training
