In [1]:
import polars as pl
import random
import numpy as np
random.seed(2059)

In [2]:
Household_ids = list(range(1, 251))
# Generate a random number of people in each household (between 2 and 6)
Household_sizes = [random.randint(2, 6) for i in range(250)]


In [3]:
household_data = pl.DataFrame(
    {
        "Household_id": (Household_ids),
        "Household_size": (Household_sizes),
    }
)

In [4]:
# Create the expanded dataset where each household ID is repeated based on the number of people
expanded_rows = []
for household_id, people_count in zip(Household_ids, Household_sizes):
    for person_id in range(1, people_count + 1):
        expanded_rows.append((household_id, person_id))

# Create the full table as a polars DataFrame
full_table = pl.DataFrame(expanded_rows, schema=["Household_ids", "Person_id"])

# Print the first few rows of the full table
print(full_table.head())

shape: (5, 2)
┌───────────────┬───────────┐
│ Household_ids ┆ Person_id │
│ ---           ┆ ---       │
│ i64           ┆ i64       │
╞═══════════════╪═══════════╡
│ 1             ┆ 1         │
│ 1             ┆ 2         │
│ 1             ┆ 3         │
│ 1             ┆ 4         │
│ 1             ┆ 5         │
└───────────────┴───────────┘


In [5]:
#getting the number of rows in the full table
n_total = full_table.shape[0]
# Generate Household_Targetted with a 70-30 probability
Household_Targeted = np.random.choice([0, 1], size=len(Household_ids), replace=True, p=[0.70, 0.30])

# Create the targets DataFrame using polars
targets = pl.DataFrame({
    "Household_ids": Household_ids,
    "Household_Targeted": Household_Targeted
})


In [6]:
#merge the full table with the targets
full_table = full_table.join(targets, on="Household_ids", how="left")
#create a new column in the full table to indicate if the person is targetted
individual_targeted = full_table["Household_Targeted"] * np.random.choice([0, 1], size=full_table.shape[0], replace=True)
full_table = full_table.with_columns(pl.Series(name="Individual_targeted", values=individual_targeted))


In [7]:
# Calculate the proportion of targeted individuals for each household
prop_targeted_individuals = (
    full_table.groupby("Household_ids")
    .agg(pl.col("Individual_targeted").sum().alias("sum_targeted"),
         pl.count().alias("total"))
    .with_columns((pl.col("sum_targeted") / pl.col("total")).alias("Prop"))
    .select(["Household_ids", "Prop"])
)

# Join the proportions back to the full table
full_table = full_table.join(prop_targeted_individuals, on="Household_ids")


  full_table.groupby("Household_ids")
  pl.count().alias("total"))


In [8]:
# Initialize Spending column with NaN values
full_table = full_table.with_columns(pl.lit(None).alias("Spending"))

# Calculate Spending for each individual
spending_values = []
for i in range(full_table.shape[0]):
    prop_value = full_table[i, "Prop"]
    household_targeted_value = full_table[i, "Household_Targeted"]
    individual_targeted_value = full_table[i, "Individual_targeted"]
    
    spending = (
        np.random.normal(prop_value + 5, 2) * 2 +
        np.random.normal(household_targeted_value + 5, 2) * 3 +
        np.random.normal(individual_targeted_value + 5, 2) * 5
    )
    spending_values.append(spending)

# Add the Spending column to the full_table
full_table = full_table.with_columns(pl.Series("Spending", spending_values))


In [9]:
full_table.head(25)

Household_ids,Person_id,Household_Targeted,Individual_targeted,Prop,Spending
i64,i64,i32,i32,f64,f64
1,1,1,0,0.2,35.855788
1,2,1,0,0.2,60.161033
1,3,1,0,0.2,57.036896
1,4,1,0,0.2,45.500101
1,5,1,1,0.2,81.608664
2,1,0,0,0.0,55.300047
2,2,0,0,0.0,40.061803
3,1,0,0,0.0,33.614603
3,2,0,0,0.0,34.472399
3,3,0,0,0.0,40.711113
