In [None]:
# WQU-Applied_Data_science_Lab-First_project
# Housing in Mexico
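In this project, I worked with a dataset of 21,000 properties for sale in Mexico, listed on the real estate website Properati.com. The goal was to determine whether sale prices are influenced more by property size or by location. Note: the code below applies the same workflow to the Brazilian listings in `data/brasil-real-estate-1.csv` and `data/brasil-real-estate-2.csv`.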
# Import Matplotlib, pandas, and plotly
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

# Load the first listings file and print its column summary
df1 = pd.read_csv("data/brasil-real-estate-1.csv")
df1.info()

# Print the column summary once more and preview the first rows
df1.info()
df1.head()

# Discard every row that contains a missing value, then re-check the summary
df1.dropna(inplace=True)
df1.info()

# Break the combined "lat-lon" text into two float columns, "lat" and "lon"
coordinates = df1["lat-lon"].str.split(",", expand=True).astype(float)
df1[["lat", "lon"]] = coordinates
df1.head()

# "state" is the piece at position 2 of the "|"-separated place string
place_parts = df1["place_with_parent_names"].str.split("|", expand=True)
df1["state"] = place_parts[2]
df1.head()

# Strip the "$" sign and "," separators so "price_usd" can be cast to float
price_text = df1["price_usd"].str.replace("$", "", regex=False)
price_text = price_text.str.replace(",", "")
df1["price_usd"] = price_text.astype(float)
df1.head()

# The raw source columns are no longer needed once the new ones are derived
df1 = df1.drop(columns=["lat-lon", "place_with_parent_names"])

# Load the second listings file and print its column summary
df2 = pd.read_csv("data/brasil-real-estate-2.csv")
df2.info()

# Derive "price_usd" from "price_brl".
# 3.19 is the BRL-per-USD rate hard-coded by this analysis; naming it makes
# the conversion explicit and easy to update.
BRL_PER_USD = 3.19
df2["price_usd"] = df2["price_brl"] / BRL_PER_USD
df2.head()

# "price_brl" is redundant once "price_usd" exists; then drop incomplete rows
df2 = df2.drop(columns="price_brl")
df2.dropna(inplace=True)
df2.head()

# Drop the raw "lat-lon" and "place_with_parent_names" columns if present.
# NOTE(review): this file is never split into "lat"/"lon" above, so it may not
# contain these columns at all -- errors="ignore" keeps the step from raising
# KeyError in that case. Confirm against the df2.info() output.
df2 = df2.drop(columns=["lat-lon", "place_with_parent_names"], errors="ignore")
df2.head()

# Combine both cleaned frames. Every later step reads `df`, which was
# previously never assigned, so the analysis below failed with NameError.
df = pd.concat([df1, df2], ignore_index=True)
df.head()

# Plot every listing in df on an interactive map
brazil_center = {"lat": -14.2, "lon": -51.9}  # Map will be centered on Brazil
fig = px.scatter_mapbox(
    data_frame=df,
    lat="lat",
    lon="lon",
    hover_data=["price_usd"],  # Display price when hovering mouse over house
    center=brazil_center,
    width=600,
    height=600,
)

# Select the "open-street-map" base layer and render the figure
fig.update_layout(mapbox_style="open-street-map")
fig.show()

# Descriptive statistics (via describe()) for the home size and price columns
stat_columns = ["area_m2", "price_usd"]
summary_stats = df[stat_columns].describe()
summary_stats

# Histogram of "price_usd".
# Each chart starts its own figure: without plt.figure(), consecutive pyplot
# calls draw onto the same implicit axes and the later labels/title overwrite
# the earlier ones when the file is run top to bottom.
plt.figure()
plt.hist(df["price_usd"])
plt.xlabel("Price [USD]")
plt.ylabel("Frequency")
plt.title("Distribution of Home Prices")
plt.show()

# Horizontal boxplot of "area_m2", on a separate figure
plt.figure()
plt.boxplot(df["area_m2"], vert=False)
plt.xlabel("Area [sq meters]")
plt.title("Distribution of Home Sizes")
plt.show()

# Mean home price per region, ordered from most to least expensive
mean_price_by_region = (
    df.groupby("region")["price_usd"].mean().sort_values(ascending=False)
)
mean_price_by_region

# Bar chart of mean_price_by_region using pyplot directly.
# A fresh figure keeps this chart from being drawn on top of an earlier one.
plt.figure()
plt.bar(mean_price_by_region.index, mean_price_by_region)
plt.xlabel("Region")
plt.ylabel("Mean Price [USD]")
plt.title("Mean Home Price by Region")
plt.show()

# The same chart built through the pandas plotting API. It is given its own
# axes explicitly; otherwise it would reuse the current pyplot axes and draw
# a second set of bars over the chart above.
region_fig, region_ax = plt.subplots()
mean_price_by_region.plot(
    kind="bar",
    xlabel="Region",
    ylabel="Mean Price [USD]",
    title="Mean Home Price by Region",
    ax=region_ax,
)
plt.show()

# Keep only the homes whose "region" is "South"
df_south = df[df["region"] == "South"]
df_south.head()

# Number of properties per state in the South. value_counts() orders the
# states by descending count, so head(3) keeps the three states with the
# most listings. (This step previously appeared twice, verbatim; the
# redundant copy is removed.)
homes_by_state = df_south["state"].value_counts().head(3)
homes_by_state

# Scatter plot of price against area for the homes in Rio Grande do Sul
df_largest = df_south[df_south["state"] == "Rio Grande do Sul"]

# Use a new figure so the points are not drawn over an earlier chart
plt.figure()
plt.scatter(x=df_largest["area_m2"], y=df_largest["price_usd"])
plt.xlabel("Area [sq meters]")
plt.ylabel("Price [USD]")
plt.title("Rio Grande do Sul: Price vs. Area")
plt.show()

# Distinct states that appear among the rows whose region is "South"
south_states = df[df["region"] == "South"]["state"].unique()


def _area_price_corr(state_name):
    """Return the correlation of "area_m2" with "price_usd" for one state."""
    rows = df[df["state"] == state_name]
    return rows["area_m2"].corr(rows["price_usd"])


# Map each southern state to its area/price correlation coefficient
south_states_corr = {name: _area_price_corr(name) for name in south_states}

south_states_corr

South_states = df[df["region"] == "South"]["state"].unique()
