In [1]:
# Introduction and Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

# Goals: Calculate and visualize Sales ROI, Rental ROI, and Overall ROI for real estate investment.

In [2]:
# Data Import
# Short dataset label -> relative path of the CSV that backs it.
files = dict(
    inventory="./Metro_invt_fs_uc_sfrcondo_sm_month.csv",
    sales_data="./Metro_sales_count_now_uc_sfrcondo_month.csv",
    zhvf_growth="./Metro_zhvf_growth_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv",
    zori="./Metro_zori_uc_sfrcondomfr_sm_month.csv",
)
# Read every CSV once and keep the resulting frame under its label,
# in the same order as `files`.
dataframes = dict(zip(files, map(pd.read_csv, files.values())))

In [3]:
# Display datasets
# Print each dataset's label followed by its first rows (DataFrame.head()),
# with a blank line between datasets.
for name in dataframes:
    df = dataframes[name]
    print(f"Dataset: {name}")
    print(df.head(), "\n")

Dataset: inventory
   RegionID  SizeRank       RegionName RegionType StateName  2018-03-31  \
0    102001         0    United States    country       NaN   1421530.0   
1    394913         1     New York, NY        msa        NY     73707.0   
2    753899         2  Los Angeles, CA        msa        CA     21998.0   
3    394463         3      Chicago, IL        msa        IL     38581.0   
4    394514         4       Dallas, TX        msa        TX     24043.0   

   2018-04-30  2018-05-31  2018-06-30  2018-07-31  ...  2024-01-31  \
0   1500196.0   1592417.0   1660615.0   1709144.0  ...    890491.0   
1     80345.0     85864.0     90067.0     91881.0  ...     36461.0   
2     23784.0     25605.0     27109.0     28811.0  ...     14058.0   
3     42253.0     45757.0     47492.0     48984.0  ...     19092.0   
4     25876.0     28225.0     30490.0     32408.0  ...     21664.0   

   2024-02-29  2024-03-31  2024-04-30  2024-05-31  2024-06-30  2024-07-31  \
0    876361.0    913841.0    967

In [4]:
# Data Cleaning
# A row is kept only if at least this fraction of its cells are non-missing.
MIN_NON_NULL_FRACTION = 0.8

for name, df in dataframes.items():
    # `dropna(thresh=...)` is documented as an integer count of required
    # non-NA values. Rounding the fractional cutoff up gives the same rows as
    # comparing the (integer) non-NA count against the raw float.
    min_non_null = int(np.ceil(len(df.columns) * MIN_NON_NULL_FRACTION))
    # Rebind to a new frame instead of mutating in place, so frames displayed
    # by earlier cells are not silently changed and re-running is idempotent.
    # Only values of existing keys are replaced, which is safe while iterating.
    dataframes[name] = df.dropna(thresh=min_non_null)
print("Data cleaning completed.")

Data cleaning completed.


In [5]:
# Sales ROI Analysis
sales_data = dataframes["sales_data"]
latest_date = sales_data.columns[sales_data.columns.str.match(r"\d{4}-\d{2}-\d{2}")].max()
growth_column = sales_data.columns[sales_data.columns.str.contains("growth", case=False)].max()

sales_data[latest_date] = pd.to_numeric(sales_data[latest_date], errors="coerce")
sales_data[growth_column] = pd.to_numeric(sales_data[growth_column], errors="coerce")
purchase_price = sales_data[latest_date]
sales_data["Projected_Price"] = purchase_price * (1 + sales_data[growth_column] / 100)

KeyError: nan

In [None]:
# Show every column label of the sales dataset to see what is available.
available_columns = sales_data.columns
print(available_columns)

In [None]:
# Re-derive the two column names and print them for inspection.
column_labels = sales_data.columns
is_date_like = column_labels.str.match(r"\d{4}-\d{2}-\d{2}")
mentions_growth = column_labels.str.contains("growth", case=False)

# If no label matches, `.max()` on the empty selection yields NaN rather than
# a column name, which shows up directly in the printout below.
latest_date = column_labels[is_date_like].max()
growth_column = column_labels[mentions_growth].max()

print("Latest Date Column:", latest_date)
print("Growth Column:", growth_column)

In [None]:
# Coerce both working columns to numeric; unparseable entries become NaN.
for column in (latest_date, growth_column):
    sales_data[column] = pd.to_numeric(sales_data[column], errors="coerce")

In [None]:
# Print the first rows of each working column, latest-date column first.
for column in (latest_date, growth_column):
    print(sales_data[column].head())

In [None]:
# Columns that must hold a numeric value for the projection.
required_columns = [latest_date, growth_column]

# Coerce both columns to numeric first; unparseable entries become NaN.
for column in required_columns:
    sales_data[column] = pd.to_numeric(sales_data[column], errors="coerce")

# Then print the rows that are NaN in each column, one column at a time.
for column in required_columns:
    missing_mask = sales_data[column].isna()
    print(sales_data[missing_mask])

# Drop rows with NaN in these columns
sales_data.dropna(subset=required_columns, inplace=True)

In [None]:
# Latest-date column is used as the base value for the projection.
purchase_price = sales_data[latest_date]
# Growth is treated as a percentage: divide by 100 and add 1 to get a multiplier.
growth_multiplier = sales_data[growth_column] / 100 + 1
sales_data["Projected_Price"] = purchase_price * growth_multiplier