## Generate URs by type. The URs are generated in this notebook and then hardcoded into the test cases.

In [26]:

import pandas as pd


# Load the full MovieLens-1M table and list the attributes we want to inspect.
df = pd.read_csv("/Users/faresa/Desktop/TVD/data/Movie_Lens/movielens-1m-full.csv")
COLS = ["Title", "Genres", "Occupation", "Zip-code", "Gender", "Age", "Rating"]

n = len(df)

# For every attribute, report which values cover more than 10% of all rows
# ("rich") and peek at the first ten values that cover less than 1% ("sparse").
for c in COLS:
    # Share of all rows held by each distinct value (missing values are not counted
    # by value_counts, but the denominator is still the full row count).
    freq = df[c].value_counts() / n

    rich = freq.loc[freq > 0.10]
    sparse = freq.loc[freq < 0.01]

    print(
        f"\n{c}",
        "  RICH (>10%):",
        rich,
        "  SPARSE (<1%):",
        sparse.head(10),  # just to inspect
        sep="\n",
    )



Title
  RICH (>10%):
Series([], Name: count, dtype: float64)
  SPARSE (<1%):
Title
American Beauty (1999)                                   0.003427
Star Wars: Episode IV - A New Hope (1977)                0.002990
Star Wars: Episode V - The Empire Strikes Back (1980)    0.002989
Star Wars: Episode VI - Return of the Jedi (1983)        0.002882
Jurassic Park (1993)                                     0.002671
Saving Private Ryan (1998)                               0.002652
Terminator 2: Judgment Day (1991)                        0.002648
Matrix, The (1999)                                       0.002589
Back to the Future (1985)                                0.002582
Silence of the Lambs, The (1991)                         0.002577
Name: count, dtype: float64

Genres
  RICH (>10%):
Genres
Comedy    0.116859
Drama     0.111400
Name: count, dtype: float64
  SPARSE (<1%):
Genres
Comedy|Horror                       0.009679
Comedy|Sci-Fi                       0.009309
Children's|Comedy  

In [75]:
import random

# ----------------------------
# CONFIG
# ----------------------------
# Columns for which value pools are built (passed to build_pools_topk below).
COLS = ["Title", "Genres", "Occupation", "Zip-code", "Gender", "Age", "Rating"]
# Number of top-ranked values per column that make up the "rich" pool.
K_RICH = 3

# Columns a UR may draw from. Gender and Rating are low-cardinality attributes
# and are excluded from both rich and sparse URs.
RICH_ALLOWED_COLS = ["Title", "Genres", "Occupation", "Zip-code", "Age"]
SPARSE_ALLOWED_COLS = ["Title", "Genres", "Occupation", "Zip-code", "Age"]  # exclude Gender, Rating


# ----------------------------
# POOLS: top-K rich, tail sparse (rank-based, per column)
# ----------------------------
def build_pools_topk(df, cols, k=3):
    """Split each column's distinct values into a "rich" and a "sparse" pool by rank.

    Values are ranked by how often they occur (most frequent first; missing
    values are counted as a value because ``dropna=False`` is used).

    Args:
        df: DataFrame holding the data.
        cols: iterable of column names to build pools for.
        k: how many top-ranked values go into the "rich" pool.

    Returns:
        dict mapping column -> {"rich": [...], "sparse": [...]}, where "rich"
        holds the top ``k`` values and "sparse" holds every remaining value.
        For a column with at most ``k`` distinct values the "sparse" list is
        empty; there is no fallback to the rich values.
    """
    def _split_by_rank(col):
        # Distinct values ordered by descending frequency.
        ranked = df[col].value_counts(dropna=False).index.tolist()
        cut = min(k, len(ranked))
        return {"rich": ranked[:cut], "sparse": ranked[cut:]}

    return {col: _split_by_rank(col) for col in cols}

# Module-level pools read by sample_vals(): {column: {"rich": [...], "sparse": [...]}}.
POOLS = build_pools_topk(df, COLS, k=K_RICH)

# ----------------------------
# Sampling (NO repeats within a single attribute list)
# ----------------------------
def sample_vals(col, kind, k):
    """Draw up to ``k`` distinct values from the module-level pool of a column.

    Args:
        col: column name, a key of ``POOLS``.
        kind: pool to draw from, "rich" or "sparse".
        k: requested number of values; capped at the pool size.

    Returns:
        A list of distinct values in random order.

    Raises:
        ValueError: if the selected pool is empty.
    """
    pool = POOLS[col][kind]
    if not pool:
        raise ValueError(f"No {kind} values for column {col}")
    # Sampling without replacement guarantees no repeats within one attribute list.
    return random.sample(pool, min(k, len(pool)))

# ----------------------------
# UR Generator
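# Deep:    1–3 attrs
# Shallow: 4–7 attrs requested, capped at the number of allowed columns (5)
# Values per attribute (capped at the pool size):
#   deep:    3–6
#   shallow: 1–2   (edit k_low/k_high in gen_ur if you want different)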
# ----------------------------
def print_ur(name, ur, indent=12):
    """Print a UR as a ``name = {...}`` dict literal that can be pasted into code.

    Numeric values are printed bare; every other value (and every key) is
    printed as a double-quoted string with quotes, backslashes and control
    characters escaped, so values such as ``Say "Hi" (1999)`` still yield a
    valid literal.

    Args:
        name: variable name to print on the left-hand side.
        ur: mapping of attribute name -> list of values.
        indent: number of spaces before each value line (keys use 8 spaces).
    """
    import json  # local import so this cell does not depend on another cell's imports

    def _quoted(text):
        # JSON string escapes (\" \\ \n \uXXXX) are also valid Python string escapes.
        return json.dumps(str(text), ensure_ascii=False)

    key_pad = " " * 8
    val_pad = " " * indent

    print(f"{name} = {{")
    for key, vals in ur.items():
        print(f"{key_pad}{_quoted(key)}: [")
        for v in vals:
            # Numbers are emitted as-is (bool is an int subclass, so True/False stay bare).
            rendered = f"{v}" if isinstance(v, (int, float)) else _quoted(v)
            print(f"{val_pad}{rendered},")
        print(f"{key_pad}],")
    print("}")

def gen_ur(df, kind="rich", shape="deep"):
    """Generate one random UR as {attribute: [values]}.

    Args:
        df: accepted for call compatibility; not used by the generator itself
            (values come from the module-level pools via ``sample_vals``).
        kind: "rich" draws from the rich pools; any other value draws from
            the sparse pools.
        shape: "deep" picks 1–3 attributes with 3–6 values each; any other
            value is treated as shallow: 4–7 attributes requested with 1–2
            values each.

    Returns:
        dict mapping each chosen attribute to its list of sampled values.
        The attribute count is capped at the number of allowed columns, and
        ``sample_vals`` caps the value count at the pool size.
    """
    # Gender/Rating are excluded through the allowed-column lists.
    allowed = RICH_ALLOWED_COLS if kind == "rich" else SPARSE_ALLOWED_COLS

    # (attribute-count range, values-per-attribute range) for the requested shape.
    attr_range, val_range = ((1, 3), (3, 6)) if shape == "deep" else ((4, 7), (1, 2))

    wanted = random.randint(*attr_range)
    chosen = random.sample(allowed, min(wanted, len(allowed)))

    # For every attribute: first draw how many values, then sample them.
    return {
        attr: sample_vals(attr, kind, random.randint(*val_range))
        for attr in chosen
    }

# ----------------------------
# Quick test
# ----------------------------
# Build the printed label from the same kind/shape that is generated, so the
# output cannot be mislabeled (previously a rich UR was printed as "UR_sparse_deep").
QUICK_KIND, QUICK_SHAPE = "rich", "deep"

random.seed(48)
ur = gen_ur(df, kind=QUICK_KIND, shape=QUICK_SHAPE)
print_ur(f"UR_{QUICK_KIND}_{QUICK_SHAPE}", ur)





UR_sparse_deep = {
        "Occupation": [
            7,
            4,
            0,
        ],
        "Genres": [
            "Comedy",
            "Comedy|Romance",
            "Drama",
        ],
        "Age": [
            25,
            18,
            35,
        ],
}


In [76]:
# One UR per (shape, kind) combination for seeds 1..5, keyed "{shape}_{kind}_{seed}".
# The RNG is re-seeded once per seed and the four URs are then drawn in a fixed
# order (deep/rich, deep/sparse, shallow/rich, shallow/sparse), so each UR
# depends on the ones generated before it under the same seed.
ALL_URS = {}

for seed in range(1, 6):
    random.seed(seed)

    ALL_URS.update({
        f"{shape}_{kind}_{seed}": gen_ur(df, kind=kind, shape=shape)
        for shape in ("deep", "shallow")
        for kind in ("rich", "sparse")
    })


In [77]:
# Print every generated UR in the `name = {...}` layout produced by print_ur,
# with a blank line between consecutive URs.
for name, ur in ALL_URS.items():
    print_ur(name, ur)
    print()



deep_rich_1 = {
        "Age": [
            35,
            25,
            18,
        ],
}

deep_sparse_1 = {
        "Zip-code": [
            "85210",
            "27510",
            "06880",
            "76707",
        ],
        "Age": [
            1,
            56,
            45,
            50,
        ],
}

shallow_rich_1 = {
        "Genres": [
            "Comedy|Romance",
        ],
        "Title": [
            "Star Wars: Episode IV - A New Hope (1977)",
        ],
        "Age": [
            35,
        ],
        "Zip-code": [
            "98103",
        ],
        "Occupation": [
            0,
        ],
}

shallow_sparse_1 = {
        "Age": [
            56,
            45,
        ],
        "Genres": [
            "Romance|Western",
            "Action|Crime|Thriller",
        ],
        "Zip-code": [
            "98632",
        ],
        "Title": [
            "Whole Nine Yards, The (2000)",
            "Ballad of Narayama, The (Narayama Bushiko) (1982

In [65]:
def load_fixed_movielens_case(self, csv_path=None):
    """
    MovieLens: auto-generate URs (rich/sparse × deep/shallow) for seeds 1..5,
    and register them in self.cases starting from case_id = 1.

    Definitions (as agreed):
      - Pools are rank-based per column (top-K for rich, tail for sparse)
      - Exclude Gender/Rating from RICH (not meaningful)
      - Also exclude Gender/Rating from SPARSE (avoid "Gender is sparse" nonsense)
      - Deep:    1–3 attributes, 3–6 values per attribute
      - Shallow: 4–7 attributes requested, 1–2 values per attribute
        (only 5 columns are allowed, so a shallow UR has 4 or 5 attributes)
      - The number of values is capped at the pool size (rich pools hold K_RICH values)
      - No repeats within a single attribute list
      - Naming:  {kind}_{shape}_{seed}  e.g., sparse_deep_3

    Args:
        csv_path: path to the MovieLens CSV. It must contain the columns
            UserID, MovieID, Title, Genres, Occupation, Zip-code, Gender,
            Age and Rating. When None, a machine-specific default path is used.

    Side effects:
        Replaces ``self.cases`` with {case_id: (T, UR)} and ``self.case_names``
        with {case_id: name}. ``T`` is a copy of the UR's attribute columns plus
        a synthetic "<UserID>_<MovieID>" id column; ``UR`` is whatever
        ``self.create_flexible_dataframe`` returns for the UR dict. Case ids
        follow the sorted UR names (rich_deep_*, rich_shallow_*, sparse_deep_*,
        sparse_shallow_*).

    The generator uses a private ``random.Random(seed)`` per seed, which yields
    the same draws as ``random.seed(seed)`` but leaves the global RNG untouched.

    Raises:
        ValueError: if a pool needed for sampling is empty.
    """
    import random
    import pandas as pd

    # -------- load data --------
    if csv_path is None:
        csv_path = "/Users/faresa/Desktop/TVD/data/Movie_Lens/movielens-1m-full.csv"
    df = pd.read_csv(csv_path)
    df["synthetic_id"] = df["UserID"].astype(str) + "_" + df["MovieID"].astype(str)
    id_col = "synthetic_id"

    # -------- config --------
    K_RICH = 3
    COLS = ["Title", "Genres", "Occupation", "Zip-code", "Gender", "Age", "Rating"]

    # Exclude low-cardinality / misleading columns from rich & sparse URs
    RICH_ALLOWED_COLS = ["Title", "Genres", "Occupation", "Zip-code", "Age"]
    SPARSE_ALLOWED_COLS = ["Title", "Genres", "Occupation", "Zip-code", "Age"]

    # -------- build pools (rank-based) --------
    def build_pools_topk(df_, cols_, k=3):
        """Return {column: {"rich": top-k values, "sparse": remaining values}}."""
        pools_ = {}
        for c in cols_:
            vc = df_[c].value_counts(dropna=False)
            k_eff = min(k, len(vc))

            rich_vals = vc.head(k_eff).index.tolist()
            sparse_vals = vc.index[k_eff:].tolist()  # tail only; may be empty for low-card columns

            pools_[c] = {"rich": rich_vals, "sparse": sparse_vals}
        return pools_

    POOLS = build_pools_topk(df, COLS, k=K_RICH)

    # -------- helpers --------
    def sample_vals(rng, col, kind, k):
        """Draw up to k distinct values from POOLS[col][kind] using rng."""
        vals = POOLS[col][kind]
        if len(vals) == 0:
            # For sparse, some columns may have no tail (e.g., Gender/Rating) but we exclude them anyway.
            raise ValueError(f"No {kind} values available for column '{col}' (pool empty).")
        k = min(k, len(vals))       # cap, still unique
        return rng.sample(vals, k)  # no repeats within one attribute list

    def gen_ur(rng, kind="rich", shape="deep"):
        """Generate one UR dict {attribute: [values]} using rng."""
        cols_pool = RICH_ALLOWED_COLS if kind == "rich" else SPARSE_ALLOWED_COLS

        if shape == "deep":
            n_cols = rng.randint(1, 3)
            k_low, k_high = 3, 6
        else:  # shallow
            n_cols = rng.randint(4, 7)
            k_low, k_high = 1, 2

        n_cols = min(n_cols, len(cols_pool))
        cols = rng.sample(cols_pool, n_cols)

        ur = {}
        for c in cols:
            k = rng.randint(k_low, k_high)
            ur[c] = sample_vals(rng, c, kind, k)
        return ur

    # -------- generate URs over seeds --------
    ALL_URS = {}
    for seed in range(1, 6):
        # Same stream as random.seed(seed), but the caller's global RNG state is preserved.
        rng = random.Random(seed)

        # naming: {kind}_{shape}_{seed}; the draw order below determines the generated URs
        ALL_URS[f"rich_deep_{seed}"] = gen_ur(rng, kind="rich", shape="deep")
        ALL_URS[f"rich_shallow_{seed}"] = gen_ur(rng, kind="rich", shape="shallow")
        ALL_URS[f"sparse_deep_{seed}"] = gen_ur(rng, kind="sparse", shape="deep")
        ALL_URS[f"sparse_shallow_{seed}"] = gen_ur(rng, kind="sparse", shape="shallow")

    # -------- register into self.cases starting at 1 --------
    self.cases = {}
    self.case_names = {}

    # sorted for deterministic IDs
    for case_id, name in enumerate(sorted(ALL_URS), start=1):
        ur_dict = ALL_URS[name]

        UR = self.create_flexible_dataframe(ur_dict)
        base_cols = list(ur_dict.keys()) + [id_col]
        T = df[base_cols].copy()

        self.cases[case_id] = (T, UR)
        self.case_names[case_id] = name


ModuleNotFoundError: No module named 'helpers'