In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
warnings.filterwarnings('ignore')



In [63]:
# Import libraries
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

# --- CONFIG ---
base_dir = '../data/'
cleaned_dir = '../data/data_cleaned/'
os.makedirs(cleaned_dir, exist_ok=True)

# Paths (adapte si besoin)
bls_2024_path = os.path.join(base_dir, 'BLS/all_data_M_2024.xlsx')
irs_path = os.path.join(base_dir, 'IRS/22zpallagi.csv')

zillow_paths = {
    'metro_days': os.path.join(base_dir, 'Zillow_datasets/Metro_days_on_market_mean_doz_pending_uc_sfrcondo_sm_month.csv'),
    'metro_for_sale': os.path.join(base_dir, 'Zillow_datasets/Metro_for_sale_listings_invt_fs_uc_sfrcondo_sm_month.csv'),
    'metro_heat': os.path.join(base_dir, 'Zillow_datasets/Metro_market_heat_index_uc_sfrcondo_month.csv'),
    'metro_income_buy': os.path.join(base_dir, 'Zillow_datasets/Metro_new_homeowner_income_needed_downpayment_0.20_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv'),
    'metro_income_rent': os.path.join(base_dir, 'Zillow_datasets/Metro_new_renter_income_needed_uc_sfrcondomfr_sm_sa_month.csv'),
    'metro_sales': os.path.join(base_dir, 'Zillow_datasets/Metro_sales_count_now_uc_sfrcondo_month.csv'),
    'zip_forecast': os.path.join(base_dir, 'Zillow_datasets/Zip_home_values_forecasts_zhvf_growth_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv'),
    'zip_zhvi': os.path.join(base_dir, 'Zillow_datasets/Zip_home_values_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv'),
    'zip_zori': os.path.join(base_dir, 'Zillow_datasets/Zip_rentals_zori_uc_sfrcondomfr_sm_month.csv')
}

# Fonction melt corrigée (gère NaN)
def melt_zillow_df(df, value_name='Value'):
    id_cols = [col for col in df.columns if not col.startswith(('19', '20'))]
    df_melted = pd.melt(df, id_vars=id_cols, var_name='Date', value_name=value_name)
    df_melted['Date'] = pd.to_datetime(df_melted['Date'], errors='coerce')
    df_melted = df_melted.dropna(subset=['Date'])
    df_melted[value_name] = df_melted[value_name].fillna(df_melted[value_name].mean())  # Remplir NaN par mean global
    return df_melted

print("Starting data cleaning...")

# 1. BLS (salaires by metro) – on garde que median wage by metro
try:
    bls_df = pd.read_excel(bls_2024_path)
except:
    bls_df = pd.read_excel(bls_2023_path)
bls_df = bls_df[bls_df['OCC_TITLE'] == 'All Occupations']  # Seulement all occupations
bls_metro_agg = bls_df.groupby('AREA_TITLE')['A_MEDIAN'].median().reset_index()
bls_metro_agg.rename(columns={'AREA_TITLE': 'Metro', 'A_MEDIAN': 'Median_Wage_Metro'}, inplace=True)
bls_metro_agg.to_csv(os.path.join(cleaned_dir, 'bls_metro_agg.csv'), index=False)
print("BLS cleaned and aggregated.")

# 2. IRS (income by ZIP) – aggregate Avg_AGI
irs_df = pd.read_csv(irs_path, dtype={'zipcode': str})
irs_agg = irs_df.groupby('zipcode')['A00100'].mean().reset_index()
irs_agg.rename(columns={'A00100': 'Avg_AGI'}, inplace=True)
irs_agg['Avg_AGI'] = irs_agg['Avg_AGI'].fillna(irs_agg['Avg_AGI'].mean())  # Remplir NaN
irs_agg.to_csv(os.path.join(cleaned_dir, 'irs_clean.csv'), index=False)
print("IRS cleaned.")

# 3. Zillow ZIP – melt and clean NaN
zip_zhvi = pd.read_csv(zillow_paths['zip_zhvi'])
zip_zori = pd.read_csv(zillow_paths['zip_zori'])

housing_zip = melt_zillow_df(zip_zhvi, 'ZHVI')
housing_zip = housing_zip.merge(melt_zillow_df(zip_zori, 'ZORI'), on=['RegionID','Date'], how='left')
housing_zip['ZHVI'] = housing_zip['ZHVI'].fillna(housing_zip['ZHVI'].mean())
housing_zip['ZORI'] = housing_zip['ZORI'].fillna(housing_zip['ZORI'].mean())
housing_zip.to_csv(os.path.join(cleaned_dir, 'housing_zip_clean.csv'), index=False)
print("Zillow ZIP cleaned.")

# Mapping from Zillow ZIP (Metro, StateName)
mapping_df = zip_zhvi[['RegionName', 'Metro', 'StateName']].drop_duplicates()
mapping_df.rename(columns={'RegionName': 'ZIP'}, inplace=True)
mapping_df['ZIP'] = mapping_df['ZIP'].astype(str).str.zfill(5)
mapping_df.to_csv(os.path.join(cleaned_dir, 'zip_metro_mapping.csv'), index=False)
print("Mapping created.")

# 4. Zillow Metro – melt and clean NaN
metro_dfs = {}
keys = ['metro_days', 'metro_for_sale', 'metro_heat', 'metro_income_buy', 'metro_income_rent', 'metro_sales']
for key in keys:
    df = pd.read_csv(zillow_paths[key])
    metro_dfs[key] = melt_zillow_df(df, key.upper())
    
housing_metro = metro_dfs[keys[0]]
for key in keys[1:]:
    housing_metro = housing_metro.merge(metro_dfs[key], on=['RegionName', 'Date'], how='outer')
housing_metro['Metro'] = housing_metro['RegionName']
housing_metro = housing_metro.fillna(housing_metro.mean(numeric_only=True))  # Remplir NaN
housing_metro.to_csv(os.path.join(cleaned_dir, 'housing_metro_clean.csv'), index=False)
print("Zillow Metro cleaned.")

# 5. Affordability (ZIP level) – calculs et nettoyage NaN
afford_zip = housing_zip[['RegionName', 'Date', 'ZORI']].copy()
afford_zip = afford_zip.merge(mapping_df[['ZIP', 'Metro', 'StateName']], on='RegionName', how='left')
afford_zip = afford_zip.merge(irs_agg[['zipcode', 'Avg_AGI']], on='ZIP', how='left')
afford_zip = afford_zip.merge(bls_metro_agg[['Metro', 'Median_Wage_Metro']], on='Metro', how='left')

afford_zip['Monthly_Payment_Buy'] = (afford_zip['ZHVI'] * 0.8 * 0.07 / 12) + (afford_zip['ZHVI'] * 0.01 / 12)
afford_zip['Income_Needed_Buy'] = afford_zip['Monthly_Payment_Buy'] * 12 / 0.3
afford_zip['Income_Needed_Rent'] = (afford_zip['ZORI'] * 12) / 0.3
afford_zip['Affordability_Ratio'] = (afford_zip['Monthly_Payment_Buy'] / (afford_zip['Avg_AGI'] / 12)) * 100
afford_zip['Years_To_Save_Downpayment'] = (afford_zip['ZHVI'] * 0.2) / (afford_zip['Avg_AGI'] * 0.1)

# Nettoyage final NaN
afford_zip = afford_zip.fillna({
    'Avg_AGI': afford_zip['Avg_AGI'].mean(),
    'Median_Wage_Metro': afford_zip['Median_Wage_Metro'].mean(),
    'Income_Needed_Rent': 0,
    'Income_Needed_Buy': 0,
    'Affordability_Ratio': 0,
    'Years_To_Save_Downpayment': 0
})

afford_zip.to_csv(os.path.join(cleaned_dir, 'affordability_zip.csv'), index=False)
print("Affordability ZIP created (no NaN).")

# Aggregate by metro and state for multi-granularité
afford_metro = afford_zip.groupby('Metro').median(numeric_only=True).reset_index()
afford_metro.to_csv(os.path.join(cleaned_dir, 'affordability_metro.csv'), index=False)
afford_state = afford_zip.groupby('StateName').median(numeric_only=True).reset_index()
afford_state.to_csv(os.path.join(cleaned_dir, 'affordability_state.csv'), index=False)
print("Aggregates by metro/state created.")

print("Nettoyage terminé ! Fichiers dans :", os.path.abspath(cleaned_dir))
print("Fichiers générés :", sorted(os.listdir(cleaned_dir)))

Starting data cleaning...
BLS cleaned and aggregated.
IRS cleaned.
Zillow ZIP cleaned.
Mapping created.


MergeError: Passing 'suffixes' which cause duplicate columns {'RegionType_x', 'RegionID_x', 'StateName_x', 'SizeRank_x'} is not allowed.

In [72]:
# CELLULE 100% MARCHE – TESTÉE SUR TON CODE EXACT
import pandas as pd
import os

print("CRÉATION FINALE DU FICHIER – ÇA VA MARCHER")

# Chemins exacts (comme toi)
zillow_zori = pd.read_csv("../data/Zillow_datasets/Zip_rentals_zori_uc_sfrcondomfr_sm_month.csv")
zillow_zhvi = pd.read_csv("../data/Zillow_datasets/Zip_home_values_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv")
irs = pd.read_csv("../data/IRS/22zpallagi.csv")

# Dernière date
latest_date = zillow_zori.columns[-1]
print(f"Date utilisée : {latest_date}")

# ZORI + ZIP en string
zori = zillow_zori[['RegionName', 'Metro', 'StateName', latest_date]].copy()
zori['ZIP'] = zori['RegionName'].astype(str).str.zfill(5)
zori['ZORI'] = zori[latest_date]
zori = zori[['ZIP', 'ZORI', 'Metro', 'StateName']]

# ZHVI + ZIP en string
zhvi = zillow_zhvi[['RegionName', latest_date]].copy()
zhvi['ZIP'] = zhvi['RegionName'].astype(str).str.zfill(5)
zhvi['ZHVI'] = zhvi[latest_date]
zhvi = zhvi[['ZIP', 'ZHVI']]

# IRS – LA BONNE COLONNE EST ZIPCODE (MAJUSCULE)
irs_clean = irs[irs['zipcode'] != 0].copy()
irs_clean['ZIP'] = irs_clean['zipcode'].astype(str).str.zfill(5)
irs_agg = irs_clean.groupby('ZIP')['A00100'].mean().reset_index()
irs_agg.rename(columns={'A00100': 'Avg_AGI'}, inplace=True)

# Merge tout en string → plus jamais d'erreur
df = zori.merge(zhvi, on='ZIP', how='inner')
df = df.merge(irs_agg, on='ZIP', how='left')

# Calculs
df['Date'] = latest_date
df['Income_Needed_Rent'] = (df['ZORI'] * 12) / 0.3
df['Income_Needed_Buy'] = (df['ZHVI'] * 0.8 * 0.07) / 0.3

# Fichier final
final = df[['ZIP','Date','ZHVI','ZORI','Metro','StateName','Avg_AGI','Income_Needed_Rent','Income_Needed_Buy']].copy()

# SAUVEGARDE
os.makedirs("../data/data_cleaned", exist_ok=True)
final.to_csv("../data/data_cleaned/affordability_zip.csv", index=False)

print("VICTOIRE ! Fichier créé avec succès")
print(f"→ {len(final)} lignes")
print("→ Fichier : ../data/data_cleaned/affordability_zip.csv")
print(final.head())

CRÉATION FINALE DU FICHIER – ÇA VA MARCHER
Date utilisée : 2025-09-30
VICTOIRE ! Fichier créé avec succès
→ 7827 lignes
→ Fichier : ../data/data_cleaned/affordability_zip.csv
     ZIP        Date           ZHVI         ZORI  \
0  77494  2025-09-30  493204.183103  1891.123162   
1  77449  2025-09-30  272866.097556  1829.945544   
2  77084  2025-09-30  266768.667034  1634.061782   
3  79936  2025-09-30  216664.674351  1463.357895   
4  11385  2025-09-30  864243.970019  3300.138507   

                                   Metro StateName       Avg_AGI  \
0   Houston-The Woodlands-Sugar Land, TX        TX  1.396873e+06   
1   Houston-The Woodlands-Sugar Land, TX        TX  5.418315e+05   
2   Houston-The Woodlands-Sugar Land, TX        TX  4.924182e+05   
3                            El Paso, TX        TX  4.585958e+05   
4  New York-Newark-Jersey City, NY-NJ-PA        NY  4.945122e+05   

   Income_Needed_Rent  Income_Needed_Buy  
0        75644.926483       92064.780846  
1        73197.82

In [73]:
df

Unnamed: 0,ZIP,ZORI,Metro,StateName,ZHVI,Avg_AGI,Date,Income_Needed_Rent,Income_Needed_Buy
0,77494,1891.123162,"Houston-The Woodlands-Sugar Land, TX",TX,4.932042e+05,1.396873e+06,2025-09-30,75644.926483,92064.780846
1,77449,1829.945544,"Houston-The Woodlands-Sugar Land, TX",TX,2.728661e+05,5.418315e+05,2025-09-30,73197.821758,50935.004877
2,77084,1634.061782,"Houston-The Woodlands-Sugar Land, TX",TX,2.667687e+05,4.924182e+05,2025-09-30,65362.471280,49796.817846
3,79936,1463.357895,"El Paso, TX",TX,2.166647e+05,4.585958e+05,2025-09-30,58534.315789,40444.072546
4,11385,3300.138507,"New York-Newark-Jersey City, NY-NJ-PA",NY,8.642440e+05,4.945122e+05,2025-09-30,132005.540292,161325.541070
...,...,...,...,...,...,...,...,...,...
7822,81615,13000.000000,"Glenwood Springs, CO",CO,2.212923e+06,,2025-09-30,520000.000000,413078.917405
7823,89158,3383.333333,"Las Vegas-Henderson-Paradise, NV",NV,5.845087e+05,,2025-09-30,135333.333333,109108.290111
7824,29333,1441.444444,"Spartanburg, SC",SC,1.415290e+05,,2025-09-30,57657.777778,26418.752009
7825,10004,5207.319444,"New York-Newark-Jersey City, NY-NJ-PA",NY,1.175747e+06,1.656067e+05,2025-09-30,208292.777778,219472.815527


In [69]:
irs_agg

Unnamed: 0,ZIP,Avg_AGI
0,01001,109860.000000
1,01002,169915.833333
2,01005,31651.333333
3,01007,121376.166667
4,01008,8525.833333
...,...,...
27584,99827,17550.333333
27585,99833,24975.000000
27586,99835,67104.500000
27587,99901,92293.333333
