In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import VimeoVideo
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.utils.validation import check_is_fitted

warnings.simplefilter(action="ignore", category=FutureWarning)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/brasil-real-estate/Brasile-real-estate-dataset.csv


In [2]:
def wrangle(
    filepath,
    brl_to_usd=0.18,
    property_type="apartment",
    state="Pernambuco",
    max_price_brl=4_000_000,
    area_quantiles=(0.05, 0.95),
):
    """Load the real-estate CSV and return a cleaned, filtered DataFrame.

    Steps, in order:
      1. Read the CSV (latin1 encoding).
      2. Coerce ``price_brl`` and ``area_m2`` to numeric; unparseable
         entries become NaN.
      3. Add ``price_in_usd`` = ``price_brl`` * ``brl_to_usd``.
      4. Keep rows whose ``property_type`` equals ``property_type``, whose
         ``state`` contains ``state`` (literal substring), whose price is
         present and strictly below ``max_price_brl``, and whose area is
         present.
      5. Trim area outliers: keep rows whose ``area_m2`` lies between the
         lower and upper ``area_quantiles`` of the remaining rows
         (both bounds inclusive).

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file.
    brl_to_usd : float, default 0.18
        Multiplier used to convert ``price_brl`` into ``price_in_usd``.
    property_type : str, default "apartment"
        Exact value required in the ``property_type`` column.
    state : str, default "Pernambuco"
        Substring required in the ``state`` column (missing states are dropped).
    max_price_brl : float, default 4_000_000
        Exclusive upper bound on ``price_brl``.
    area_quantiles : tuple of two floats, default (0.05, 0.95)
        Lower and upper quantiles of ``area_m2`` used for outlier trimming.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with a fresh 0..n-1 index.
    """
    df = pd.read_csv(filepath, encoding="latin1")

    # Coerce so malformed entries become NaN instead of raising.
    for column in ("price_brl", "area_m2"):
        df[column] = pd.to_numeric(df[column], errors="coerce")

    df["price_in_usd"] = df["price_brl"] * brl_to_usd

    keep = (
        (df["property_type"] == property_type)
        # regex=False: treat the state name as plain text, not a pattern.
        & df["state"].str.contains(state, na=False, regex=False)
        & df["price_brl"].notna()
        & (df["price_brl"] < max_price_brl)
        # Rows without an area cannot take part in the quantile trim below.
        & df["area_m2"].notna()
    )
    df = df[keep]

    # Quantiles are computed on the already-filtered subset, not the whole file.
    low, high = df["area_m2"].quantile(list(area_quantiles))
    df = df[df["area_m2"].between(low, high, inclusive="both")]

    return df.reset_index(drop=True)


In [3]:
# Load and clean the dataset, then summarise the resulting frame
data_path = "/kaggle/input/brasil-real-estate/Brasile-real-estate-dataset.csv"
df1 = wrangle(data_path)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Unnamed: 0     277 non-null    int64  
 1   property_type  277 non-null    object 
 2   state          277 non-null    object 
 3   region         277 non-null    object 
 4   lat            277 non-null    float64
 5   lon            277 non-null    float64
 6   area_m2        277 non-null    float64
 7   price_brl      277 non-null    float64
 8   price_in_usd   277 non-null    float64
dtypes: float64(5), int64(1), object(3)
memory usage: 19.6+ KB


In [4]:
# Plot every apartment on a street map, coloured by its price in USD
fig = px.scatter_mapbox(
    data_frame=df1,
    lat="lat",
    lon="lon",
    color="price_in_usd",
    hover_data=["price_in_usd"],  # show the price in the hover tooltip
    width=600,
    height=600,
)
fig.update_layout(mapbox_style="open-street-map")
fig.show()

In [5]:
# 3D view of price against location
axis_labels = {"lon": "longitude", "lat": "latitude", "price_in_usd": "price"}
fig = px.scatter_3d(
    df1, x="lon", y="lat", z="price_in_usd", labels=axis_labels, width=600, height=500
)

# Small markers with a dark outline
marker_style = {"size": 4, "line": {"width": 2, "color": "DarkSlateGrey"}}
fig.update_traces(marker=marker_style, selector={"mode": "markers"})

fig.show()

In [6]:
# Feature matrix: the two location columns, in the order the model will see them
features = ["lon", "lat"]
X_train = df1.loc[:, features]
X_train.head()

Unnamed: 0,lon,lat
0,-34.906326,-8.134204
1,-34.903924,-8.126664
2,-34.907601,-8.12555
3,-34.89592,-8.120249
4,-34.906906,-8.142666


In [7]:
# Target vector: the price column derived in wrangle()
target = "price_in_usd"
y_train = df1.loc[:, target]
y_train.head()

0     74560.1364
1    152713.5354
2     53898.8904
3    152713.5354
4     83543.2848
Name: price_in_usd, dtype: float64

In [8]:
# Baseline "model": always predict the mean of the training target
y_mean = y_train.mean()
y_mean

119140.01916823104

In [9]:
# One copy of the mean per training row, so it can be scored like real predictions
y_pred_baseline = [y_mean for _row in range(len(y_train))]
y_pred_baseline[:5]

[119140.01916823104,
 119140.01916823104,
 119140.01916823104,
 119140.01916823104,
 119140.01916823104]

In [10]:
# Error of the constant-mean baseline; the fitted model has to beat this number
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

print("Mean apt price", round(y_mean, 2))
print("Baseline MAE:", round(mae_baseline, 2))

Mean apt price 119140.02
Baseline MAE: 43005.07


In [11]:
# Stand-alone imputer with default settings, used to inspect the imputation step on its own
imputer = SimpleImputer()

In [12]:
# Fit the imputer on the training features
imputer.fit(X_train)

In [13]:
# Apply the fitted imputer, then wrap the result in a DataFrame (reusing the
# original column names) so .info() can confirm no missing values remain.
XT_train = imputer.transform(X_train)
pd.DataFrame(XT_train, columns=X_train.columns).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   lon     277 non-null    float64
 1   lat     277 non-null    float64
dtypes: float64(2)
memory usage: 4.5 KB


In [14]:
# Pipeline: impute missing feature values, then fit a linear regression.
# The regression step is later looked up as named_steps["linearregression"].
model = make_pipeline(
    SimpleImputer(),
    LinearRegression()
)

In [15]:
# Fit the whole pipeline on all rows of X_train / y_train
model.fit(X_train, y_train)

In [16]:
# In-sample predictions: the model is scored on the same rows it was fitted on
y_pred_training = model.predict(X_train)

In [17]:
# Training error of the fitted model, to compare against the baseline MAE
mae_training = mean_absolute_error(y_train, y_pred_training)
print("Training MAE:", round(mae_training, 2))

Training MAE: 42904.69


In [18]:

# NOTE(review): X_test / y_test are sampled from X_train / y_train, i.e. from
# the same rows `model` was already fitted on. Any metric computed on them is
# therefore not an out-of-sample estimate. For a genuine hold-out, split the
# data before fitting and fit on the training part only.
_, X_test, _, y_test = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

print("Training data:", X_train.shape, y_train.shape)
print("Test data:", X_test.shape, y_test.shape)


Training data: (277, 2) (277,)
Test data: (56, 2) (56,)


In [19]:
# Keep X_test's row labels on the predictions so they line up with y_test in
# any label-aligned operation (e.g. y_test - y_pred_test). Without the index,
# the Series would be labelled 0..n-1 and pair up with the wrong rows.
y_pred_test = pd.Series(model.predict(X_test), index=X_test.index)
y_pred_test.head()


0    117286.328942
1    123749.167546
2    111829.261786
3    113449.241977
4    120296.554184
dtype: float64

In [20]:
# Pull the fitted regression step out of the pipeline once, then round its parameters
regressor = model.named_steps["linearregression"]
intercept = regressor.intercept_.round()
coefficients = regressor.coef_.round()

print("Intercept:", intercept)
print("Coefficients:", coefficients)

Intercept: 5521092.0
Coefficients: [133107.  93547.]


In [21]:
# Coefficients come in the order of the fitted features, ["lon", "lat"]
coef_lon, coef_lat = coefficients  # unpack the two values

In [22]:
# Write the fitted model as an equation, using the rounded intercept and coefficients
print(
    f"price = {intercept} + ({coef_lon} * longitude) + ({coef_lat} * latitude)"
)

price = 5521092.0 + (133107.0 * longitude) + (93547.0 * latitude)


In [23]:
# Create 3D scatter plot of the observed prices
fig = px.scatter_3d(
    df1,
    x= "lon",
    y= "lat",
    z= "price_in_usd",
    labels={"lon": "longitude", "lat": "latitude", "price_in_usd": "price"},
    width=600,
    height=500,
)

# Build a 10 x 10 grid spanning the observed longitude / latitude range
x_plane = np.linspace(df1["lon"].min(), df1["lon"].max(), 10)
y_plane = np.linspace(df1["lat"].min(), df1["lat"].max(), 10)
xx, yy = np.meshgrid(x_plane, y_plane)

# Predict the price at every grid node, then reshape back to the grid.
# Predicting only at the paired points (x_plane[i], y_plane[i]) and tiling that
# row would draw a surface that ignores latitude, not the fitted plane.
grid = pd.DataFrame({"lon": xx.ravel(), "lat": yy.ravel()})
z_plane = model.predict(grid)
zz = z_plane.reshape(xx.shape)

# Add plane to figure
fig.add_trace(go.Surface(x=xx, y=yy, z=zz))

# Refine formatting (applies to the scatter markers only)
fig.update_traces(
    marker={"size": 4, "line": {"width": 2, "color": "DarkSlateGrey"}},
    selector={"mode": "markers"},
)

# Display figure
fig.show()