# Brazilian Real Estate Market Analysis

Complete analysis of Brazilian real estate data examining regional differences and the relationship between home size and price.

## 1. Import Packages

In [None]:
# Import Matplotlib, pandas, and plotly
import matplotlib.pyplot as plt
import pandas as df  # legacy alias; `df` is later rebound to the combined DataFrame
import pandas as pd
import plotly.express as px

## 2. Import and Explore First Dataset

In [None]:
# Load the first raw dataset, then show its schema and first rows.
# pandas is reached through `pd` rather than `df`: the name `df` is rebound to
# the combined DataFrame further down, after which `df.read_csv` would fail if
# this cell were run again.
df1 = pd.read_csv("data/brasil-real-estate-1.csv")
df1.info()
df1.head()

## 3. Data Cleaning - Dataset 1

In [None]:
# Discard, in place, every row of df1 that has at least one missing value,
# then preview what is left.
df1.dropna(axis="index", how="any", inplace=True)
df1.head()

In [None]:
# Break the comma-separated "lat-lon" text column into two float columns,
# "lat" and "lon", converting both pieces in a single step.
df1[["lat", "lon"]] = (
    df1["lat-lon"].str.split(",", expand=True).astype(float)
)
df1.head()

In [None]:
# Derive the "state" column from place_with_parent_names.
# The text is split on "|" and the piece at position 2 is kept
# (assumes that position always holds the state - TODO confirm against data).
place_parts = df1["place_with_parent_names"].str.split("|", expand=True)
df1["state"] = place_parts[2]
df1.head()

In [None]:
# Check the cleaned data: print df1's column dtypes and non-null counts after
# the null-row removal, lat/lon split, and state extraction above.
# NOTE(review): none of the cleaning steps shown converts price_usd - confirm
# it is already numeric in the raw file before it feeds statistics and plots.
df1.info()

In [None]:
# The raw text columns have been replaced by lat / lon / state, so remove
# them from df1 in place and preview the result.
obsolete_columns = ["lat-lon", "place_with_parent_names"]
df1.drop(columns=obsolete_columns, inplace=True)
df1.head()

In [None]:
# Write the cleaned first dataset to disk, leaving out the row index.
clean_csv_path = "data/brasil-real-estate-clean-1.csv"
df1.to_csv(clean_csv_path, index=False)

## 4. Import and Clean Second Dataset

In [None]:
# Load the second raw dataset and show its schema.
# pandas is reached through `pd` rather than `df`: the name `df` is rebound to
# the combined DataFrame further down, after which `df.read_csv` would fail if
# this cell were run again.
df2 = pd.read_csv("data/brasil-real-estate-2.csv")
df2.info()

In [None]:
# Add a price_usd column to df2 by converting price_brl from Brazilian Reais.
# Exchange rate used: 1 USD = 3.19 BRL.
BRL_PER_USD = 3.19
df2["price_usd"] = df2["price_brl"] / BRL_PER_USD
df2.info()

In [None]:
# price_brl is redundant once price_usd exists: delete it from df2, then
# discard (in place) any row that still has a missing value.
del df2["price_brl"]
df2.dropna(axis="index", how="any", inplace=True)
df2.head()

## 5. Combine Datasets

In [None]:
# Stack df1 and df2 into a single DataFrame; ignore_index=True discards the
# original row labels and renumbers the combined rows.
# pandas is called through `pd` here: this assignment rebinds `df` to the
# combined DataFrame, so calling `df.concat` would only work on the very first
# run and fail with AttributeError whenever the cell is executed again.
df = pd.concat([df1, df2], ignore_index=True)
df.head()

## 6. Exploratory Data Analysis

In [None]:
# Descriptive statistics for home size and price across the combined data.
stat_columns = ["area_m2", "price_usd"]
summary_stats = df[stat_columns].describe()
print(summary_stats)

In [None]:
# Histogram of price_usd over the whole combined DataFrame.
# Every row is plotted: taking only the first 20,000 rows would drop the tail
# of the concatenated data and make this chart disagree with the summary
# statistics and the boxplot, which both use the full columns.
plt.figure(figsize=(10, 6))
plt.hist(df["price_usd"], bins=50, edgecolor="black")
plt.xlabel("Price [USD]")
plt.ylabel("Frequency")
plt.title("Distribution of Home Prices")
plt.show()

In [None]:
# Horizontal boxplot of home sizes (area_m2), drawn through the Axes API.
fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(df["area_m2"], vert=False)
ax.set_xlabel("Area [sq meters]")
ax.set_title("Distribution of Home Sizes")
plt.show()

## 7. Regional Analysis

In [None]:
# Average price_usd per region, ordered from the cheapest region to the
# most expensive one.
prices_grouped_by_region = df.groupby("region")["price_usd"]
mean_price_by_region = prices_grouped_by_region.mean().sort_values()
print(mean_price_by_region)

In [None]:
# Bar chart of the regional mean prices; labels are set on the Axes object
# returned by the pandas plot call, and the x tick labels are tilted 45 degrees.
ax = mean_price_by_region.plot(kind="bar", figsize=(10, 6))
ax.set_xlabel("Region")
ax.set_ylabel("Mean Price [USD]")
ax.set_title("Mean Home Price by Region")
plt.xticks(rotation=45)
plt.show()

## 8. Southern Region Analysis

In [None]:
# Keep only the rows whose region is exactly "South".
is_south = df["region"] == "South"
df_south = df[is_south]
df_south.head()

In [None]:
# Number of listings per state within the South subset.
south_state_labels = df_south["state"]
homes_by_state = south_state_labels.value_counts()
print(homes_by_state)

In [None]:
# Price vs. area scatter plot for the South state with the most listings.
# homes_by_state comes from value_counts(), so its first index label is the
# state with the highest count.
largest_state = homes_by_state.index[0]

# Restrict the South data to that state.
in_largest_state = df_south["state"] == largest_state
df_largest_state = df_south[in_largest_state]

# Draw the scatter plot through the Axes API.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(
    df_largest_state["area_m2"],
    df_largest_state["price_usd"],
    alpha=0.5,
)
ax.set_xlabel("Area [sq meters]")
ax.set_ylabel("Price [USD]")
ax.set_title(f"{largest_state}: Price vs. Area")
plt.show()

In [None]:
# Correlation between area_m2 and price_usd for each state in the South
# region, collected into a {state: coefficient} dictionary.
def _area_price_corr(state_rows):
    """Return the correlation of area_m2 with price_usd for the given rows."""
    return state_rows["area_m2"].corr(state_rows["price_usd"])


south_states_corr = {
    state_name: _area_price_corr(df_south[df_south["state"] == state_name])
    for state_name in df_south["state"].unique()
}

print(south_states_corr)