# Housing-in-Buenos-Aires
Starter notebook for the Housing-in-Buenos-Aires project.

In [2]:
# =========================================================
# Starter Notebook: Housing-in-Buenos-Aires Analysis
# =========================================================

# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error
from category_encoders import OneHotEncoder
import os
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)

In [12]:
os.makedirs(os.path.expanduser("~/Desktop/Housing-in-Buenos-Aires/data/raw"), exist_ok=True)

# Sample data for Buenos Aires apartments
data = {
    "place_with_parent_names": [
        "Argentina|Buenos Aires|Palermo",
        "Argentina|Buenos Aires|Recoleta",
        "Argentina|Buenos Aires|Palermo",
        "Argentina|Buenos Aires|Caballito",
        "Argentina|Buenos Aires|Belgrano"
    ],
    "property_type": ["apartment", "apartment", "apartment", "apartment", "apartment"],
    "price_aprox_usd": [95000, 87000, 99000, 92000, 88000],
    "surface_covered_in_m2": [55, 45, 60, 50, 48],
    "lat-lon": ["-34.58,-58.41", "-34.59,-58.38", "-34.57,-58.42", "-34.60,-58.44", "-34.56,-58.39"],
    "surface_total_in_m2": [60, 50, 65, 55, 53],
    "price_usd_per_m2": [1727, 1933, 1650, 1840, 1830],
    "floor": [3, 2, 5, 4, 2],
    "rooms": [2, 1, 3, 2, 1],
    "expenses": [200, 150, 250, 180, 160],
    "operation": ["sale", "sale", "sale", "sale", "sale"],
    "currency": ["USD", "USD", "USD", "USD", "USD"],
    "properati_url": ["url1", "url2", "url3", "url4", "url5"],
    "price": [95000, 87000, 99000, 92000, 88000],
    "price_aprox_local_currency": [0,0,0,0,0],
    "price_per_m2": [0,0,0,0,0]
}

# Create DataFrame
df = pd.DataFrame(data)

# Save to CSV
csv_path = os.path.expanduser("~/Desktop/Housing-in-Buenos-Aires/data/raw/housing_data.csv")
df.to_csv(csv_path, index=False)

print(f"Sample housing data created at: {csv_path}")


Sample housing data created at: /Users/ayoub.elfilali/Desktop/Housing-in-Buenos-Aires/data/raw/housing_data.csv


In [13]:
# =========================================================
# Paths
# =========================================================
RAW_DATA_PATH = "../data/raw"
OUTPUT_PATH = "../outputs"
os.makedirs(OUTPUT_PATH, exist_ok=True)

In [14]:
# =========================================================
# Load all housing CSVs
# =========================================================
from glob import glob
import pandas as pd

# Get all CSV files matching the pattern
all_files = glob("data/raw/*.csv")
print("Files found:", all_files)

dfs = []
for file in all_files:
    df_file = pd.read_csv(file)
    dfs.append(df_file)

# Only concatenate if thereâ€™s at least one DataFrame
if dfs:
    df = pd.concat(dfs, ignore_index=True)
    print(f"Loaded {len(all_files)} files, total rows: {len(df)}")
else:
    print("No CSV files found in the folder.")
    df = pd.DataFrame()  # create empty DataFrame to avoid breaking the rest of the notebook


Files found: []
No CSV files found in the folder.


In [15]:
# =========================================================
# Wrangle function
# =========================================================
def wrangle(df, city_name="Buenos Aires"):
    # Filter apartments in city < 100,000 USD
    mask_city = df["place_with_parent_names"].str.contains(city_name, na=False)
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 100_000
    df = df[mask_city & mask_apt & mask_price].copy()

    # Remove outliers for covered area
    low, high = df["surface_covered_in_m2"].quantile([0.1, 0.9])
    df = df[df["surface_covered_in_m2"].between(low, high)]

    # Split lat-lon column
    df[["lat", "lon"]] = df["lat-lon"].str.split(",", expand=True).astype(float)
    df.drop(columns="lat-lon", inplace=True)

    # Extract borough
    df["borough"] = df["place_with_parent_names"].str.split("|", expand=True)[1]
    df.drop(columns="place_with_parent_names", inplace=True)

    # Drop unneeded columns
    drop_cols = ["surface_total_in_m2", "price_usd_per_m2", "floor", "rooms", 
                 "expenses", "operation", "property_type", "currency", 
                 "properati_url", "price", "price_aprox_local_currency", 
                 "price_per_m2"]
    df = df.drop(columns=[col for col in drop_cols if col in df.columns])

    return df


In [16]:
# Apply wrangling
def wrangle(df, city_name="Buenos Aires"):
    # Example: adjust column names to match your CSV
    mask_city = df["place_name"].str.contains(city_name, na=False)
    mask_apt = df["property_type"] == "apartment"
    mask_price = df["price_aprox_usd"] < 100_000
    df = df[mask_city & mask_apt & mask_price]
    # Continue with the rest of your wrangling...
    return df
df = pd.read_csv("Housing-in-Buenos-Aires/data/raw/housing_data.csv")

print("Data wrangled successfully")
display(df.head())

FileNotFoundError: [Errno 2] No such file or directory: 'Housing-in-Buenos-Aires/data/raw/housing_data.csv'

In [9]:
# =========================================================
# Train-test split
# =========================================================
target = "price_aprox_usd"
features = ["surface_covered_in_m2", "lat", "lon", "borough"]

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

KeyError: "None of [Index(['surface_covered_in_m2', 'lat', 'lon', 'borough'], dtype='object')] are in the [columns]"

In [None]:
# =========================================================
# Baseline model: Mean predictor
# =========================================================
y_mean = y_train.mean()
y_pred_baseline = [y_mean] * len(y_train)
baseline_mae = mean_absolute_error(y_train, y_pred_baseline)
print(f"Baseline MAE (train): {baseline_mae:.2f}")


In [None]:
# =========================================================
# Ridge regression pipeline
# =========================================================
model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    Ridge()
)

In [None]:
# Fit model
model.fit(X_train, y_train)


In [None]:
# Predict
y_test_pred = pd.Series(model.predict(X_test), index=y_test.index)
mae_test = mean_absolute_error(y_test, y_test_pred)
print(f"Ridge regression MAE (test): {mae_test:.2f}")


In [None]:
# =========================================================
# Feature importance
# =========================================================
intercept = model.named_steps["ridge"].intercept_
coefficients = model.named_steps["ridge"].coef_
features_encoded = model.named_steps["onehotencoder"].get_feature_names()

feat_imp = pd.Series(coefficients, index=features_encoded).sort_values(key=abs, ascending=False)
print("Top feature importances:")
display(feat_imp.head(10))

In [None]:

# =========================================================
# Quick plots
# =========================================================
plt.figure(figsize=(10,6))
sns.histplot(df[target], bins=30, kde=True)
plt.title("Distribution of Property Prices")
plt.xlabel("Price (USD)")
plt.ylabel("Count")
plt.savefig(os.path.join(OUTPUT_PATH, "price_distribution.png"))
plt.show()

# Save feature importance plot
plt.figure(figsize=(12,6))
feat_imp.head(10).plot(kind="barh")
plt.title("Top 10 Feature Importances (Ridge Regression)")
plt.gca().invert_yaxis()
plt.savefig(os.path.join(OUTPUT_PATH, "feature_importance.png"))
plt.show()