In [1]:
import pickle

def load(save_path):
    with open(save_path, "rb") as f:
       data = pickle.load(f)
    print("‚úÖ Data loaded successfully!")

    return data

def show(data):

    """
    Inspect the structure of the 'data' dictionary.
    For each year, it prints the available keys, types, and shapes if applicable.
    """
    print("üîé Exploring data structure\n" + "-"*60)
    for year_key, year_data in data.items():
        print(f"{year_key}")
        
        if not isinstance(year_data, dict):
            print(f"  ‚ö†Ô∏è Expected dict, got {type(year_data)}\n")
            continue
        
        # Loop through subkeys (like imgs_array, iris_index, etc.)
        for key, value in year_data.items():
            info = f"  ‚îú‚îÄ {key:<25} ‚Üí "
            
            # Describe arrays
            if isinstance(value, (list, tuple)):
                info += f"list[{len(value)}]"
            elif isinstance(value, dict):
                info += f"dict[{len(value)}]"
            elif hasattr(value, "shape"):
                info += f"array shape={value.shape}, dtype={getattr(value, 'dtype', 'N/A')}"
            else:
                info += str(type(value))
            
            print(info)
        
        print("-"*60)

def save(data, save_path):
    with open(save_path, "wb") as f:
       pickle.dump(data, f)
    print(f"‚úÖ Data saved successfully at: {save_path}")


                                    Beginning Population

In [2]:
import os
import pandas as pd
from dummy_variables import *

def load_population_data(folder_path):
    """
    Searches for a Population file in the given folder (CSV/XLS/XLSX),
    auto-detects file format, encoding, separator, and correct header row,
    then returns a clean pandas DataFrame.
    """
    # --- Find the file ---
    pop_file = None
    for f in os.listdir(folder_path):
        if "population" in f.lower():
            pop_file = os.path.join(folder_path, f)
            break

    if pop_file is None:
        raise FileNotFoundError(f"‚ùå No Population file found in {folder_path}")

    ext = os.path.splitext(pop_file)[1].lower()
    print(f"üìÇ Found file: {pop_file}")

    # --- Excel files ---
    if ext in [".xls", ".xlsx"]:
        # Try to detect where the real header is (often row 5‚Äì6)
        for skip in range(0, 10):
            df = pd.read_excel(pop_file, skiprows=skip)
            cols = [str(c).lower() for c in df.columns]
            if any(k in cols for k in ["iris", "reg", "dep", "p13_pop"]):
                print(f"‚úÖ Detected header row {skip} in Excel file")
                return df
        print("‚ö†Ô∏è Could not auto-detect header, defaulting to first row.")
        return pd.read_excel(pop_file)

    # --- CSV files ---
    elif ext == ".csv":
        for enc in ["utf-8", "latin1", "cp1252"]:
            try:
                with open(pop_file, "r", encoding=enc) as f:
                    sample = f.read(2048)
                sep = ";" if sample.count(";") > sample.count(",") else ","
                df = pd.read_csv(pop_file, sep=sep, encoding=enc)
                cols = [str(c).lower() for c in df.columns]
                if any(k in cols for k in ["iris", "reg", "dep", "p17_pop"]):
                    print(f"‚úÖ Loaded CSV with sep='{sep}', encoding='{enc}'")
                    return df
            except Exception as e:
                print(f"‚ö†Ô∏è Failed with encoding={enc}: {e}")
        raise ValueError(f"‚ùå Could not read {pop_file} with any common encoding.")

    else:
        raise ValueError(f"‚ùå Unsupported file type: {ext}")


In [3]:
path2013 = r"C:\Users\adamh\Desktop\IRIS\2013"
path2014 = r"C:\Users\adamh\Desktop\IRIS\2014"
path2015 = r"C:\Users\adamh\Desktop\IRIS\2015"
path2016 = r"C:\Users\adamh\Desktop\IRIS\2016"
path2017 = r"C:\Users\adamh\Desktop\IRIS\2017"
path2018 = r"C:\Users\adamh\Desktop\IRIS\2018"
path2019 = r"C:\Users\adamh\Desktop\IRIS\2019"
path2020 = r"C:\Users\adamh\Desktop\IRIS\2020"
path2021 = r"C:\Users\adamh\Desktop\IRIS\2021"

Population2013 = load_population_data(path2013)
Population2014 = load_population_data(path2014)
Population2015 = load_population_data(path2015)
Population2016 = load_population_data(path2016)
Population2017 = load_population_data(path2017)
Population2018 = load_population_data(path2018)
Population2019 = load_population_data(path2019)
Population2020 = load_population_data(path2020)
Population2021 = load_population_data(path2021)






üìÇ Found file: C:\Users\adamh\Desktop\IRIS\2013\Population.xls
‚úÖ Detected header row 4 in Excel file
üìÇ Found file: C:\Users\adamh\Desktop\IRIS\2014\Population.xls
‚úÖ Detected header row 4 in Excel file
üìÇ Found file: C:\Users\adamh\Desktop\IRIS\2015\Population.xls
‚úÖ Detected header row 4 in Excel file
üìÇ Found file: C:\Users\adamh\Desktop\IRIS\2016\Population.xls
‚úÖ Detected header row 4 in Excel file
üìÇ Found file: C:\Users\adamh\Desktop\IRIS\2017\Population.CSV


  df = pd.read_csv(pop_file, sep=sep, encoding=enc)


‚úÖ Loaded CSV with sep=';', encoding='utf-8'
üìÇ Found file: C:\Users\adamh\Desktop\IRIS\2018\Population.CSV


  df = pd.read_csv(pop_file, sep=sep, encoding=enc)


‚úÖ Loaded CSV with sep=';', encoding='utf-8'
üìÇ Found file: C:\Users\adamh\Desktop\IRIS\2019\Population.CSV


  df = pd.read_csv(pop_file, sep=sep, encoding=enc)


‚úÖ Loaded CSV with sep=';', encoding='utf-8'
üìÇ Found file: C:\Users\adamh\Desktop\IRIS\2020\Population.CSV


  df = pd.read_csv(pop_file, sep=sep, encoding=enc)


‚úÖ Loaded CSV with sep=';', encoding='utf-8'
üìÇ Found file: C:\Users\adamh\Desktop\IRIS\2021\Population.CSV
‚úÖ Loaded CSV with sep=';', encoding='utf-8'


  df = pd.read_csv(pop_file, sep=sep, encoding=enc)


In [4]:
# --- Build renamers and clean DataFrames for 2017‚Üí2021 ---

# List of (year, DataFrame) pairs
pop_dfs = {
    2017: Population2017,
    2018: Population2018,
    2019: Population2019,
    2020: Population2020,
    2021: Population2021
}

cleaned_populations = {}

for year, df in pop_dfs.items():
    prefix = f"P{str(year)[-2:]}_"
    cprefix = f"C{str(year)[-2:]}_"

    renamer = {}
    keep_cols = []

    for column in df.columns:
        if column == "IRIS":
            renamer[column] = "IRIS"
            keep_cols.append(column)
        elif column.startswith(prefix) or column.startswith(cprefix):
            new_name = column[4:]  # remove prefix like P17_ or C17_
            renamer[column] = new_name
            keep_cols.append(column)

    # Apply renaming and drop everything else
    cleaned_df = df[keep_cols].rename(columns=renamer)
    cleaned_populations[year] = cleaned_df

    print(f"‚úÖ {year}: kept {len(cleaned_df.columns)} columns ‚Üí {list(cleaned_df.columns)[:10]}")

# Access cleaned DataFrames
Pop2017 = cleaned_populations[2017]
Pop2018 = cleaned_populations[2018]
Pop2019 = cleaned_populations[2019]
Pop2020 = cleaned_populations[2020]
Pop2021 = cleaned_populations[2021]
   


‚úÖ 2017: kept 73 columns ‚Üí ['IRIS', 'POP', 'POP0002', 'POP0305', 'POP0610', 'POP1117', 'POP1824', 'POP2539', 'POP4054', 'POP5564']
‚úÖ 2018: kept 73 columns ‚Üí ['IRIS', 'POP', 'POP0002', 'POP0305', 'POP0610', 'POP1117', 'POP1824', 'POP2539', 'POP4054', 'POP5564']
‚úÖ 2019: kept 73 columns ‚Üí ['IRIS', 'POP', 'POP0002', 'POP0305', 'POP0610', 'POP1117', 'POP1824', 'POP2539', 'POP4054', 'POP5564']
‚úÖ 2020: kept 73 columns ‚Üí ['IRIS', 'POP', 'POP0002', 'POP0305', 'POP0610', 'POP1117', 'POP1824', 'POP2539', 'POP4054', 'POP5564']
‚úÖ 2021: kept 73 columns ‚Üí ['IRIS', 'POP', 'POP0002', 'POP0305', 'POP0610', 'POP1117', 'POP1824', 'POP2539', 'POP4054', 'POP5564']


In [5]:
columns = list(Pop2017.columns)
Pop2013 = Population2013.copy()

for column in Pop2013.columns:
    if column != "IRIS" and "Pop" not in column:
        Pop2013 = Pop2013.drop(column, axis=1)
print(len(Pop2013.columns))
Pop2013.columns = columns


Pop2014 = Population2014.copy()

for column in Pop2014.columns:
    if column != "IRIS" and "Pop" not in column:
        Pop2014 = Pop2014.drop(column, axis=1)
print(len(Pop2014.columns))
Pop2014.columns = columns


Pop2015 = Population2015.copy()

for column in Pop2015.columns:
    if column != "IRIS" and "Pop" not in column:
        Pop2015 = Pop2015.drop(column, axis=1)
print(len(Pop2015.columns))
Pop2015.columns = columns


Pop2016 = Population2016.copy()

for column in Pop2016.columns:
    if column != "IRIS" and "Pop" not in column:
        Pop2016 = Pop2016.drop(column, axis=1)

print(len(Pop2016.columns))
Pop2016.columns = columns


73
73
73
73


In [6]:
for col in list(Pop2016.columns):
    print(col)

IRIS
POP
POP0002
POP0305
POP0610
POP1117
POP1824
POP2539
POP4054
POP5564
POP6579
POP80P
POP0014
POP1529
POP3044
POP4559
POP6074
POP75P
POP0019
POP2064
POP65P
POPH
H0014
H1529
H3044
H4559
H6074
H75P
H0019
H2064
H65P
POPF
F0014
F1529
F3044
F4559
F6074
F75P
F0019
F2064
F65P
POP15P
POP15P_CS1
POP15P_CS2
POP15P_CS3
POP15P_CS4
POP15P_CS5
POP15P_CS6
POP15P_CS7
POP15P_CS8
H15P
H15P_CS1
H15P_CS2
H15P_CS3
H15P_CS4
H15P_CS5
H15P_CS6
H15P_CS7
H15P_CS8
F15P
F15P_CS1
F15P_CS2
F15P_CS3
F15P_CS4
F15P_CS5
F15P_CS6
F15P_CS7
F15P_CS8
POP_FR
POP_ETR
POP_IMM
PMEN
PHORMEN


                                 Finished Population

                                Beginning actifs

In [7]:
import os
import pandas as pd

base_path = r"C:\Users\adamh\Desktop\IRIS"
years = range(2013, 2022)

for year in years:
    folder_path = os.path.join(base_path, str(year))
    df = None  # placeholder
    
    for ext in ['.xlsx', '.xls', '.csv']:
        file_path = os.path.join(folder_path, f"activity{ext}")
        if not os.path.exists(file_path):
            continue

        print(f"\n=== Reading {file_path} ===")
        try:
            if ext == '.csv':
                # Most CSVs are semicolon-separated in later years
                df = pd.read_csv(file_path, sep=';', low_memory=False)
            else:
                # Try to detect the correct header row (where 'IRIS' or 'Code IRIS' appears)
                header_row = None
                for i in range(10):
                    test_df = pd.read_excel(file_path, header=i, nrows=1)
                    if 'IRIS' in test_df.columns or 'Code IRIS' in test_df.columns:
                        header_row = i
                        break
                if header_row is not None:
                    df = pd.read_excel(file_path, header=header_row)
                else:
                    df = pd.read_excel(file_path)

            if df is not None:
                var_name = f"activity_{year}"
                globals()[var_name] = df  # create dynamic variable
                print(f"‚úÖ Created variable '{var_name}' with {len(df)} rows and {len(df.columns)} columns")
                print(f"Columns: {list(df.columns)[:10]} ...")  # show first 10 columns
                
            else:
                print("‚ö†Ô∏è Could not load file.")
                
        except Exception as e:
            print(f"‚ùå Error reading {file_path}: {e}")
            
        break  # Stop checking extensions once one is found
    else:
        print(f"\nNo activity file found for {year}")



=== Reading C:\Users\adamh\Desktop\IRIS\2013\activity.xls ===
‚úÖ Created variable 'activity_2013' with 50844 rows and 115 columns
Columns: ['IRIS', 'R√©gion', 'R√©gion 2016', 'D√©partement', 'Unit√© urbaine', 'Commune ou ARM', 'Libell√© commune ou ARM', 'TRIRIS', 'Grand quartier', "Libell√© de l'IRIS"] ...

=== Reading C:\Users\adamh\Desktop\IRIS\2014\activity.xls ===
‚úÖ Created variable 'activity_2014' with 50095 rows and 114 columns
Columns: ['IRIS', 'R√©gion', 'D√©partement', 'Unit√© urbaine', 'Commune ou ARM', 'Libell√© commune ou ARM', 'TRIRIS', 'Grand quartier', "Libell√© de l'IRIS", "Type d'IRIS"] ...

=== Reading C:\Users\adamh\Desktop\IRIS\2015\activity.xls ===
‚úÖ Created variable 'activity_2015' with 49694 rows and 114 columns
Columns: ['IRIS', 'R√©gion', 'D√©partement', 'Unit√© urbaine', 'Commune ou ARM', 'Libell√© commune ou ARM', 'TRIRIS', 'Grand quartier', "Libell√© de l'IRIS", "Type d'IRIS"] ...

=== Reading C:\Users\adamh\Desktop\IRIS\2016\activity.xls ===
‚úÖ Creat

In [8]:
drop_columns_2013  = [
    "R√©gion",
    "R√©gion 2016",
    "D√©partement",
    "Unit√© urbaine",
    "Commune ou ARM",
    "Libell√© commune ou ARM",
    "TRIRIS",
    "Grand quartier",
    "Libell√© de l'IRIS",
    "Type d'IRIS",
    "Modification de l'IRIS",
    "Label de l'IRIS",
    "Pop 15-64 ans en 2013 (princ)",
    "Pop 15-24 ans en 2013 (princ)",
    "Pop 25-54 ans en 2013 (princ)",
    "Pop 55-64 ans en 2013 (princ)",
    "Pop 15-64 ans Hommes en 2013 (princ)",
    "Pop 15-24 ans Hommes en 2013 (princ)",
    "Pop 25-54 ans Hommes en 2013 (princ)",
    "Pop 55-64 ans Hommes en 2013 (princ)",
    "Pop 15-64 ans Femmes en 2013 (princ)",
    "Pop 15-24 ans Femmes en 2013 (princ)",
    "Pop 25-54 ans Femmes en 2013 (princ)",
    "Pop 55-64 ans Femmes en 2013 (princ)"
]
drop_columns_2014 = [
    "R√©gion",
    "D√©partement",
    "Unit√© urbaine",
    "Commune ou ARM",
    "Libell√© commune ou ARM",
    "TRIRIS",
    "Grand quartier",
    "Libell√© de l'IRIS",
    "Type d'IRIS",
    "Modification de l'IRIS",
    "Label de l'IRIS",
    "Pop 15-64 ans en 2014 (princ)",
    "Pop 15-24 ans en 2014 (princ)",
    "Pop 25-54 ans en 2014 (princ)",
    "Pop 55-64 ans en 2014 (princ)",
    "Pop 15-64 ans Hommes en 2014 (princ)",
    "Pop 15-24 ans Hommes en 2014 (princ)",
    "Pop 25-54 ans Hommes en 2014 (princ)",
    "Pop 55-64 ans Hommes en 2014 (princ)",
    "Pop 15-64 ans Femmes en 2014 (princ)",
    "Pop 15-24 ans Femmes en 2014 (princ)",
    "Pop 25-54 ans Femmes en 2014 (princ)",
    "Pop 55-64 ans Femmes en 2014 (princ)"
]
drop_columns_2015 = [
    "R√©gion",
    "D√©partement",
    "Unit√© urbaine",
    "Commune ou ARM",
    "Libell√© commune ou ARM",
    "TRIRIS",
    "Grand quartier",
    "Libell√© de l'IRIS",
    "Type d'IRIS",
    "Modification de l'IRIS",
    "Label de l'IRIS",
    "Pop 15-64 ans en 2015 (princ)",
    "Pop 15-24 ans en 2015 (princ)",
    "Pop 25-54 ans en 2015 (princ)",
    "Pop 55-64 ans en 2015 (princ)",
    "Pop 15-64 ans Hommes en 2015 (princ)",
    "Pop 15-24 ans Hommes en 2015 (princ)",
    "Pop 25-54 ans Hommes en 2015 (princ)",
    "Pop 55-64 ans Hommes en 2015 (princ)",
    "Pop 15-64 ans Femmes en 2015 (princ)",
    "Pop 15-24 ans Femmes en 2015 (princ)",
    "Pop 25-54 ans Femmes en 2015 (princ)",
    "Pop 55-64 ans Femmes en 2015 (princ)"
]
drop_columns_2016 =  [
    "R√©gion",
    "D√©partement",
    "Unit√© urbaine",
    "Commune ou ARM",
    "Libell√© commune ou ARM",
    "TRIRIS",
    "Grand quartier",
    "Libell√© de l'IRIS",
    "Type d'IRIS",
    "Modification de l'IRIS",
    "Label de l'IRIS",
    "Pop 15-64 ans en 2016 (princ)",
    "Pop 15-24 ans en 2016 (princ)",
    "Pop 25-54 ans en 2016 (princ)",
    "Pop 55-64 ans en 2016 (princ)",
    "Pop 15-64 ans Hommes en 2016 (princ)",
    "Pop 15-24 ans Hommes en 2016 (princ)",
    "Pop 25-54 ans Hommes en 2016 (princ)",
    "Pop 55-64 ans Hommes en 2016 (princ)",
    "Pop 15-64 ans Femmes en 2016 (princ)",
    "Pop 15-24 ans Femmes en 2016 (princ)",
    "Pop 25-54 ans Femmes en 2016 (princ)",
    "Pop 55-64 ans Femmes en 2016 (princ)"
]
drop_columns_2017 = [
    "COM", "TYP_IRIS", "MODIF_IRIS", "LAB_IRIS",
    "P17_POP1564", "P17_POP1524", "P17_POP2554", "P17_POP5564",
    "P17_H1564", "P17_H1524", "P17_H2554", "P17_H5564",
    "P17_F1564", "P17_F1524", "P17_F2554", "P17_F5564"
]
drop_columns_2018 = [
    "COM", "TYP_IRIS", "MODIF_IRIS", "LAB_IRIS",
    "P18_POP1564", "P18_POP1524", "P18_POP2554", "P18_POP5564",
    "P18_H1564", "P18_H1524", "P18_H2554", "P18_H5564",
    "P18_F1564", "P18_F1524", "P18_F2554", "P18_F5564"
]
drop_columns_2019 = [
    "COM", "TYP_IRIS", "MODIF_IRIS", "LAB_IRIS",
    "P19_POP1564", "P19_POP1524", "P19_POP2554", "P19_POP5564",
    "P19_H1564", "P19_H1524", "P19_H2554", "P19_H5564",
    "P19_F1564", "P19_F1524", "P19_F2554", "P19_F5564"
]
drop_columns_2020 = [
    "COM",
    "TYP_IRIS",
    "LAB_IRIS",
    "P20_POP1564",
    "P20_POP1524",
    "P20_POP2554",
    "P20_POP5564",
    "P20_H1564",
    "P20_H1524",
    "P20_H2554",
    "P20_H5564",
    "P20_F1564",
    "P20_F1524",
    "P20_F2554",
    "P20_F5564"
]
drop_columns_2021  = [
    "COM",
    "TYP_IRIS",
    "LAB_IRIS",
    "P21_POP1564",
    "P21_POP1524",
    "P21_POP2554",
    "P21_POP5564",
    "P21_H1564",
    "P21_H1524",
    "P21_H2554",
    "P21_H5564",
    "P21_F1564",
    "P21_F1524",
    "P21_F2554",
    "P21_F5564"
]


cleaned_activity_2013 = activity_2013.drop(columns=drop_columns_2013)
cleaned_activity_2014 = activity_2014.drop(columns=drop_columns_2014)
cleaned_activity_2015 = activity_2015.drop(columns=drop_columns_2015)
cleaned_activity_2016 = activity_2016.drop(columns=drop_columns_2016)
cleaned_activity_2017 = activity_2017.drop(columns=drop_columns_2017)
cleaned_activity_2018 = activity_2018.drop(columns=drop_columns_2018)
cleaned_activity_2019 = activity_2019.drop(columns=drop_columns_2019)
cleaned_activity_2020 = activity_2020.drop(columns=drop_columns_2020)
cleaned_activity_2021 = activity_2021.drop(columns=drop_columns_2021)


In [9]:
print("2013:", len(cleaned_activity_2013.columns))
print("2014:", len(cleaned_activity_2014.columns))
print("2015:", len(cleaned_activity_2015.columns))
print("2016:", len(cleaned_activity_2016.columns))
print("2017:", len(cleaned_activity_2017.columns))
print("2018:", len(cleaned_activity_2018.columns))
print("2019:", len(cleaned_activity_2019.columns))
print("2020:", len(cleaned_activity_2020.columns))
print("2021:", len(cleaned_activity_2021.columns))




2013: 91
2014: 91
2015: 91
2016: 91
2017: 92
2018: 92
2019: 104
2020: 104
2021: 104


In [10]:
# Define standardized columns to keep for 2017‚Äì2021
target_cols = [
    'IRIS', 'ACT1564', 'ACT1524', 'ACT2554', 'ACT5564',
    'HACT1564', 'HACT1524', 'HACT2554', 'HACT5564',
    'FACT1564', 'FACT1524', 'FACT2554', 'FACT5564',
    'ACTOCC1564', 'ACTOCC1524', 'ACTOCC2554', 'ACTOCC5564',
    'HACTOCC1564', 'HACTOCC1524', 'HACTOCC2554', 'HACTOCC5564',
    'FACTOCC1564', 'FACTOCC1524', 'FACTOCC2554', 'FACTOCC5564'
]

# Group your dataframes by year
activity_dfs = {
    2013: cleaned_activity_2013,
    2014: cleaned_activity_2014,
    2015: cleaned_activity_2015,
    2016: cleaned_activity_2016,
    2017: cleaned_activity_2017,
    2018: cleaned_activity_2018,
    2019: cleaned_activity_2019,
    2020: cleaned_activity_2020,
    2021: cleaned_activity_2021
}

# Process and rename
for year, df in activity_dfs.items():
    if year <= 2016:
        # Keep only the first 25 columns
        cleaned_df = df.iloc[:, :25].copy()
    else:
        # Handle prefix like P17_, P18_, etc.
        prefix = f"P{str(year)[-2:]}_"
        cols_to_keep = []
        for col in df.columns:
            if col == 'IRIS':
                cols_to_keep.append(col)
            elif any(col.startswith(prefix + base) for base in target_cols[1:]):
                cols_to_keep.append(col)

        # Keep and rename (remove prefix)
        cleaned_df = df[cols_to_keep].copy()
        cleaned_df.columns = [
            col.replace(prefix, '') if col != 'IRIS' else col
            for col in cleaned_df.columns
        ]
    
    # --- Standardize column names to target structure ---
    if len(cleaned_df.columns) == len(target_cols):
        cleaned_df.columns = target_cols
    else:
        # Safety check: if some columns missing, align with NaNs
        cleaned_df = cleaned_df.reindex(columns=target_cols)
    
    # Save to variable
    locals()[f"cleaned_activity_{year}"] = cleaned_df

# Optional verification
for year in range(2013, 2022):
    df = locals()[f"cleaned_activity_{year}"]
    print(f"{year}: {df.shape[1]} cols | first cols: {df.columns.tolist()} ...")


2013: 25 cols | first cols: ['IRIS', 'ACT1564', 'ACT1524', 'ACT2554', 'ACT5564', 'HACT1564', 'HACT1524', 'HACT2554', 'HACT5564', 'FACT1564', 'FACT1524', 'FACT2554', 'FACT5564', 'ACTOCC1564', 'ACTOCC1524', 'ACTOCC2554', 'ACTOCC5564', 'HACTOCC1564', 'HACTOCC1524', 'HACTOCC2554', 'HACTOCC5564', 'FACTOCC1564', 'FACTOCC1524', 'FACTOCC2554', 'FACTOCC5564'] ...
2014: 25 cols | first cols: ['IRIS', 'ACT1564', 'ACT1524', 'ACT2554', 'ACT5564', 'HACT1564', 'HACT1524', 'HACT2554', 'HACT5564', 'FACT1564', 'FACT1524', 'FACT2554', 'FACT5564', 'ACTOCC1564', 'ACTOCC1524', 'ACTOCC2554', 'ACTOCC5564', 'HACTOCC1564', 'HACTOCC1524', 'HACTOCC2554', 'HACTOCC5564', 'FACTOCC1564', 'FACTOCC1524', 'FACTOCC2554', 'FACTOCC5564'] ...
2015: 25 cols | first cols: ['IRIS', 'ACT1564', 'ACT1524', 'ACT2554', 'ACT5564', 'HACT1564', 'HACT1524', 'HACT2554', 'HACT5564', 'FACT1564', 'FACT1524', 'FACT2554', 'FACT5564', 'ACTOCC1564', 'ACTOCC1524', 'ACTOCC2554', 'ACTOCC5564', 'HACTOCC1564', 'HACTOCC1524', 'HACTOCC2554', 'HACTOCC

                                  Finished activity

In [None]:
import numpy as np
import pandas as pd

# === Configuration ===
pop_data = {
    2013: Pop2013,
    2014: Pop2014,
    2015: Pop2015,
    2016: Pop2016,
    2017: Pop2017,
    2018: Pop2018,
    2019: Pop2019,
    2020: Pop2020,
    2021: Pop2021
}

# --- Helper function ---
def compute_weighted_vector(pop_df, iris_props):
    """Compute a weighted socioeconomic vector for a patch given its iris proportions."""
    vector = []
    cols = [c for c in pop_df.columns if c != "IRIS"]
    pop_df = pop_df.set_index("IRIS")

    for col in cols:
        weighted_sum, total_weight = 0.0, 0.0
        for iris_code, weight in iris_props.items():
            if iris_code in pop_df.index:
                val = pop_df.loc[iris_code, col]
                if pd.notna(val):
                    weighted_sum += val * weight
                    total_weight += weight
        vector.append(weighted_sum / total_weight if total_weight > 0 else np.nan)
    return np.array(vector, dtype=float)


# --- Main computation ---
for year in range(2013, 2022):
    year_str = str(year)
    if year_str not in data:
        print(f"‚ö†Ô∏è Skipping {year} ‚Äî not found in data")
        continue

    if year not in pop_data:
        print(f"‚ö†Ô∏è Missing population data for {year}")
        continue

    print(f"üìä Processing {year} ...")
    pop_df = pop_data[year]

    # Get IRIS proportions per patch
    iris_props_dict = data[year_str]["iris_patch_proportions"]

    socioeconomic_vectors = []
    for patch_id, iris_props in iris_props_dict.items():
        socioeconomic_vector = compute_weighted_vector(pop_df, iris_props)
        socioeconomic_vectors.append(socioeconomic_vector)

    # Stack into an array and attach to data
    socioeconomic_vectors = np.stack(socioeconomic_vectors)
    data[year_str]["socioeconomic_vector"] = socioeconomic_vectors

    print(f"‚úÖ {year}: stored socioeconomic vectors, shape = {socioeconomic_vectors.shape}")

print("\nüéØ All socioeconomic vectors computed and stored in data!")


NameError: name 'data' is not defined

                                    Socioeconomical vector begins

In [11]:
save_path = r"C:\Users\adamh\Desktop\Satelite_images\Montpelier_data.pkl"
data = load(save_path)
show(data)

‚úÖ Data loaded successfully!
üîé Exploring data structure
------------------------------------------------------------
2013
  ‚îú‚îÄ imgs_array                ‚Üí array shape=(500, 128, 128, 3), dtype=uint8
  ‚îú‚îÄ space_coordinates         ‚Üí array shape=(500, 128, 128, 2), dtype=float32
  ‚îú‚îÄ iris_index                ‚Üí array shape=(500, 128, 128), dtype=object
------------------------------------------------------------
2014
  ‚îú‚îÄ imgs_array                ‚Üí array shape=(500, 128, 128, 3), dtype=uint8
  ‚îú‚îÄ space_coordinates         ‚Üí array shape=(500, 128, 128, 2), dtype=float32
  ‚îú‚îÄ iris_index                ‚Üí array shape=(500, 128, 128), dtype=object
------------------------------------------------------------
2015
  ‚îú‚îÄ imgs_array                ‚Üí array shape=(500, 128, 128, 3), dtype=uint8
  ‚îú‚îÄ space_coordinates         ‚Üí array shape=(500, 128, 128, 2), dtype=float32
  ‚îú‚îÄ iris_index                ‚Üí array shape=(500, 128, 128), dtype=ob

In [24]:
from collections import Counter
import numpy as np
import pandas as pd


save_path_Paris = r"C:\Users\adamh\Desktop\Satelite_images\Paris_data.pkl"
save_path_Lyon = r"C:\Users\adamh\Desktop\Satelite_images\Lyon_data.pkl"
save_path_Toulouse = r"C:\Users\adamh\Desktop\Satelite_images\Toulouse_data.pkl"
save_path_Bordeaux = r"C:\Users\adamh\Desktop\Satelite_images\Bordeaux_data.pkl"

datas = [load(save_path_Paris), load(save_path_Lyon), load(save_path_Toulouse), load(save_path_Bordeaux)]
save_paths = [save_path_Paris, save_path_Lyon, save_path_Toulouse, save_path_Bordeaux]

for data in datas:
    for year, content in data.items():
        if not year.isdigit():
            continue
        if "iris_index" not in content:
            print(f"‚ö†Ô∏è No iris_index for {year}, skipping...")
            continue

        print(f"\nüìä Computing per-patch IRIS proportions for {year}...")

        iris_index = content["iris_index"]  # shape (num_patches, H, W)
        num_patches = iris_index.shape[0]

        iris_patch_proportions = {}

        for patch_idx in range(num_patches):
            patch = iris_index[patch_idx]

            # Flatten and clean NaNs
            flat = patch.flatten()
            flat = flat[~pd.isna(flat)]

            if len(flat) == 0:
                iris_patch_proportions[patch_idx] = {}
                continue

            # Count frequency of each IRIS code
            counts = Counter(flat)
            total = sum(counts.values())

            # Compute proportions
            proportions = {code: count / total for code, count in counts.items()}

            iris_patch_proportions[patch_idx] = proportions

        # Store in data structure
        data[year]["iris_patch_proportions"] = iris_patch_proportions

        print(f"‚úÖ Added iris_patch_proportions for {year} ({num_patches} patches)")


‚úÖ Data loaded successfully!
‚úÖ Data loaded successfully!
‚úÖ Data loaded successfully!
‚úÖ Data loaded successfully!

üìä Computing per-patch IRIS proportions for 2013...
‚úÖ Added iris_patch_proportions for 2013 (500 patches)

üìä Computing per-patch IRIS proportions for 2014...
‚úÖ Added iris_patch_proportions for 2014 (500 patches)

üìä Computing per-patch IRIS proportions for 2015...
‚úÖ Added iris_patch_proportions for 2015 (500 patches)

üìä Computing per-patch IRIS proportions for 2016...
‚úÖ Added iris_patch_proportions for 2016 (500 patches)

üìä Computing per-patch IRIS proportions for 2017...
‚úÖ Added iris_patch_proportions for 2017 (500 patches)

üìä Computing per-patch IRIS proportions for 2018...
‚úÖ Added iris_patch_proportions for 2018 (500 patches)

üìä Computing per-patch IRIS proportions for 2019...
‚úÖ Added iris_patch_proportions for 2019 (500 patches)

üìä Computing per-patch IRIS proportions for 2020...
‚úÖ Added iris_patch_proportions for 2020 (500 p

In [12]:
from collections import Counter
import numpy as np
import pandas as pd


save_path_Lyon = r"C:\Users\adamh\Desktop\Satelite_images\Montpelier_data.pkl"


datas = [load(save_path_Lyon)]
save_paths = [save_path_Lyon]

for data in datas:
    for year, content in data.items():
        if not year.isdigit():
            continue
        if "iris_index" not in content:
            print(f"‚ö†Ô∏è No iris_index for {year}, skipping...")
            continue

        print(f"\nüìä Computing per-patch IRIS proportions for {year}...")

        iris_index = content["iris_index"]  # shape (num_patches, H, W)
        num_patches = iris_index.shape[0]

        iris_patch_proportions = {}

        for patch_idx in range(num_patches):
            patch = iris_index[patch_idx]

            # Flatten and clean NaNs
            flat = patch.flatten()
            flat = flat[~pd.isna(flat)]

            if len(flat) == 0:
                iris_patch_proportions[patch_idx] = {}
                continue

            # Count frequency of each IRIS code
            counts = Counter(flat)
            total = sum(counts.values())

            # Compute proportions
            proportions = {code: count / total for code, count in counts.items()}

            iris_patch_proportions[patch_idx] = proportions

        # Store in data structure
        data[year]["iris_patch_proportions"] = iris_patch_proportions

        print(f"‚úÖ Added iris_patch_proportions for {year} ({num_patches} patches)")


‚úÖ Data loaded successfully!

üìä Computing per-patch IRIS proportions for 2013...
‚úÖ Added iris_patch_proportions for 2013 (500 patches)

üìä Computing per-patch IRIS proportions for 2014...
‚úÖ Added iris_patch_proportions for 2014 (500 patches)

üìä Computing per-patch IRIS proportions for 2015...
‚úÖ Added iris_patch_proportions for 2015 (500 patches)

üìä Computing per-patch IRIS proportions for 2016...
‚úÖ Added iris_patch_proportions for 2016 (500 patches)

üìä Computing per-patch IRIS proportions for 2017...
‚úÖ Added iris_patch_proportions for 2017 (500 patches)

üìä Computing per-patch IRIS proportions for 2018...
‚úÖ Added iris_patch_proportions for 2018 (500 patches)

üìä Computing per-patch IRIS proportions for 2019...
‚úÖ Added iris_patch_proportions for 2019 (500 patches)

üìä Computing per-patch IRIS proportions for 2020...
‚úÖ Added iris_patch_proportions for 2020 (500 patches)

üìä Computing per-patch IRIS proportions for 2021...
‚úÖ Added iris_patch_propor

In [13]:
import numpy as np
import pandas as pd

# === Configuration ===
pop_data = {
    2013: Pop2013,
    2014: Pop2014,
    2015: Pop2015,
    2016: Pop2016,
    2017: Pop2017,
    2018: Pop2018,
    2019: Pop2019,
    2020: Pop2020,
    2021: Pop2021
}

activity_data = {
    2013: cleaned_activity_2013,
    2014: cleaned_activity_2014,
    2015: cleaned_activity_2015,
    2016: cleaned_activity_2016,
    2017: cleaned_activity_2017,
    2018: cleaned_activity_2018,
    2019: cleaned_activity_2019,
    2020: cleaned_activity_2020,
    2021: cleaned_activity_2021
}


# --- Helper function ---
def compute_weighted_vector(df, iris_props):
    """
    Compute a weighted socioeconomic vector for a patch given its IRIS proportions.
    Works for any dataframe with 'IRIS' as key and numeric columns.
    """
    vector = []
    cols = [c for c in df.columns if c != "IRIS"]
    df = df.set_index("IRIS")

    for col in cols:
        weighted_sum, total_weight = 0.0, 0.0
        for iris_code, weight in iris_props.items():
            if iris_code in df.index:
                val = df.loc[iris_code, col]
                if pd.notna(val):
                    weighted_sum += val * weight
                    total_weight += weight
        vector.append(weighted_sum / total_weight if total_weight > 0 else np.nan)
    return np.array(vector, dtype=float)

for i, data in enumerate(datas):
    # --- Main computation ---
    for year in range(2013, 2022):
        year_str = str(year)
        if year_str not in data:
            print(f"‚ö†Ô∏è Skipping {year} ‚Äî not found in data")
            continue

        if year not in pop_data or year not in activity_data:
            print(f"‚ö†Ô∏è Missing pop or activity data for {year}")
            continue

        print(f"üìä Processing {year} ...")
        pop_df = pop_data[year]
        act_df = activity_data[year]

        # Ensure same IRIS formatting
        pop_df["IRIS"] = pop_df["IRIS"].astype(str)
        act_df["IRIS"] = act_df["IRIS"].astype(str)

        # Merge population and activity data on IRIS
        merged_df = pd.merge(pop_df, act_df, on="IRIS", how="inner")

        # Get IRIS proportions per patch
        iris_props_dict = data[year_str]["iris_patch_proportions"]

        socioeconomic_vectors = []
        for patch_id, iris_props in iris_props_dict.items():
            socioeconomic_vector = compute_weighted_vector(merged_df, iris_props)
            socioeconomic_vectors.append(socioeconomic_vector)

        # Stack into array and attach to data
        socioeconomic_vectors = np.stack(socioeconomic_vectors)
        data[year_str]["socioeconomic_vector"] = socioeconomic_vectors

        print(f"‚úÖ {year}: stored socioeconomic vectors, shape = {socioeconomic_vectors.shape}")

    print("\nüéØ All socioeconomic vectors (population + activity) computed and stored in data!")
    save(data ,save_paths[i])


üìä Processing 2013 ...
‚úÖ 2013: stored socioeconomic vectors, shape = (500, 96)
üìä Processing 2014 ...
‚úÖ 2014: stored socioeconomic vectors, shape = (500, 96)
üìä Processing 2015 ...
‚úÖ 2015: stored socioeconomic vectors, shape = (500, 96)
üìä Processing 2016 ...
‚úÖ 2016: stored socioeconomic vectors, shape = (500, 96)
üìä Processing 2017 ...
‚úÖ 2017: stored socioeconomic vectors, shape = (500, 96)
üìä Processing 2018 ...
‚úÖ 2018: stored socioeconomic vectors, shape = (500, 96)
üìä Processing 2019 ...
‚úÖ 2019: stored socioeconomic vectors, shape = (500, 96)
üìä Processing 2020 ...
‚úÖ 2020: stored socioeconomic vectors, shape = (500, 96)
üìä Processing 2021 ...
‚úÖ 2021: stored socioeconomic vectors, shape = (500, 96)

üéØ All socioeconomic vectors (population + activity) computed and stored in data!
‚úÖ Data saved successfully at: C:\Users\adamh\Desktop\Satelite_images\Montpelier_data.pkl


In [19]:
show(datas[0])

üîé Exploring data structure
------------------------------------------------------------
2013
  ‚îú‚îÄ imgs_array                ‚Üí array shape=(500, 128, 128, 3), dtype=uint8
  ‚îú‚îÄ space_coordinates         ‚Üí array shape=(500, 128, 128, 2), dtype=float32
  ‚îú‚îÄ iris_index                ‚Üí array shape=(500, 128, 128), dtype=object
  ‚îú‚îÄ iris_patch_proportions    ‚Üí dict[500]
  ‚îú‚îÄ socioeconomic_vector      ‚Üí array shape=(500, 96), dtype=float64
------------------------------------------------------------
2014
  ‚îú‚îÄ imgs_array                ‚Üí array shape=(500, 128, 128, 3), dtype=uint8
  ‚îú‚îÄ space_coordinates         ‚Üí array shape=(500, 128, 128, 2), dtype=float32
  ‚îú‚îÄ iris_index                ‚Üí array shape=(500, 128, 128), dtype=object
  ‚îú‚îÄ iris_patch_proportions    ‚Üí dict[500]
  ‚îú‚îÄ socioeconomic_vector      ‚Üí array shape=(500, 96), dtype=float64
------------------------------------------------------------
2015
  ‚îú‚îÄ imgs_array     

                                    Socioeconomocal vector ends

In [62]:
import pickle
import os

# Path to where you want to save
save_path = r"C:\Users\adamh\Desktop\Satelite_images\patches_data.pkl"

# Save the full data object
with open(save_path, "wb") as f:
    pickle.dump(data, f)

print(f"‚úÖ Data saved successfully at: {save_path}")


‚úÖ Data saved successfully at: C:\Users\adamh\Desktop\Satelite_images\patches_data.pkl


In [22]:
show(data)

üîé Exploring data structure
------------------------------------------------------------
2008
  ‚îú‚îÄ imgs_array                ‚Üí array shape=(500, 128, 128, 3), dtype=uint8
  ‚îú‚îÄ space_coordinates         ‚Üí array shape=(500, 128, 128, 2), dtype=float32
------------------------------------------------------------
2009
  ‚îú‚îÄ imgs_array                ‚Üí array shape=(500, 128, 128, 3), dtype=uint8
  ‚îú‚îÄ space_coordinates         ‚Üí array shape=(500, 128, 128, 2), dtype=float32
------------------------------------------------------------
2010
  ‚îú‚îÄ imgs_array                ‚Üí array shape=(500, 128, 128, 3), dtype=uint8
  ‚îú‚îÄ space_coordinates         ‚Üí array shape=(500, 128, 128, 2), dtype=float32
------------------------------------------------------------
2011
  ‚îú‚îÄ imgs_array                ‚Üí array shape=(500, 128, 128, 3), dtype=uint8
  ‚îú‚îÄ space_coordinates         ‚Üí array shape=(500, 128, 128, 2), dtype=float32
  ‚îú‚îÄ iris_index               

In [5]:
path = r"C:\Users\adamh\Desktop\Satelite_images\pkl_files\Paris.pkl"
show(load(path))

AttributeError: Can't get attribute 'TemporalPatch' on <module '__main__'>