In [None]:
import pandas as pd
import numpy as np
from ucimlrepo import fetch_ucirepo
import re

# Fetch UCI dataset id=14.
# NOTE(review): the columns this cell prints (age, menopause, tumor_size, ...)
# and the target labels (no-recurrence-events / recurrence-events) look like the
# UCI "Breast Cancer" recurrence data rather than "Breast Cancer Wisconsin
# (Original)" -- confirm the id if the Wisconsin data was intended.
bc = fetch_ucirepo(id=14)

# Assemble: give both parts a fresh 0..n-1 index so they line up row by row.
features = bc.data.features.reset_index(drop=True)
targets = bc.data.targets.reset_index(drop=True)

# A one-column target frame is collapsed into a Series called 'target'.
if isinstance(targets, pd.DataFrame) and targets.shape[1] == 1:
    targets = targets.iloc[:, 0].rename("target")

# Side-by-side concat, then a synthetic 1-based row identifier.
df = pd.concat([features, targets], axis="columns")
df["patient_id"] = np.arange(len(df)) + 1

# normalize headers to snake_case 
def norm(s: str) -> str:
    """Return *s* as a lowercase, underscore-separated header name.

    Surrounding whitespace is trimmed, every character that is not a word
    character, whitespace or a hyphen is dropped, hyphens are treated as
    spaces, and each run of whitespace becomes a single underscore.
    """
    # Drop punctuation, keeping underscores (part of \w) and hyphens.
    cleaned = re.sub(r"[^\w\s-]", "", s.strip())
    # Hyphens act as word separators, exactly like spaces.
    spaced = cleaned.replace("-", " ").lower()
    # Collapse each whitespace run into one underscore.
    return re.sub(r"\s+", "_", spaced)

# Rewrite every column label through the header normalizer.
df.columns = [norm(col) for col in df.columns]


print("Columns now:", df.columns.tolist())

# Build the melt arguments from the columns that actually exist, so a renamed
# or missing header cannot cause a name mismatch inside melt.
id_vars = [c for c in ("patient_id", "target") if c in df.columns]
value_vars = [c for c in df.columns if c not in id_vars]

# Safety check. Explicit raises are used instead of `assert` because assert
# statements are stripped when Python runs with -O, which would silently
# disable the validation.
if not id_vars:
    raise ValueError(f"No id_vars found among: {df.columns.tolist()}")
if not value_vars:
    raise ValueError("No value_vars found to melt.")

# Wide -> long: one row per (patient_id, target, feature) with that feature's value.
long_bc = df.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name="feature",
    value_name="value",
)

print("answer to running the melt function:", long_bc.head())

# Pivot the long table: for each target class (the output shows
# no-recurrence-events / recurrence-events) count the patients that fall in
# each category of each feature; combinations that never occur become 0.
pt = long_bc.pivot_table(
    values="patient_id",
    index="target",
    columns=["feature", "value"],  # one column per category, grouped under its feature
    aggfunc="count",
    fill_value=0,
)
print("answer to running the pivot table function:",pt)

# Aggregation: summary statistics for the numeric feature columns.
# patient_id is a synthetic 1..n row counter added when the frame was built,
# so it is excluded here; errors="ignore" keeps this working if the column is
# absent.
num_cols = df.select_dtypes(include="number").columns.drop(
    "patient_id", errors="ignore"
)

agg_stats = df[num_cols].agg(["mean", "std", "min", "max"])
print("answer to running the aggregation function:",agg_stats)

# GroupBy: per menopause category, the mean deg_malig and the number of
# distinct patients.
group_summary = df.groupby("menopause").agg(
    mean_deg_malig=("deg_malig", "mean"),
    count=("patient_id", "nunique"),
)
# Categories with the highest mean deg_malig come first.
group_summary = group_summary.sort_values(by="mean_deg_malig", ascending=False)
print("answer to running the mgroup summary function:",group_summary)

# Iteration example: walk the rows one at a time and flag those whose
# deg_malig is 3 or higher.
flags = [row.deg_malig >= 3 for row in df.itertuples(index=False)]

df = df.assign(high_risk=flags)
# Print the preview explicitly: as a bare expression in the middle of a cell
# (or in a plain script) its result was discarded and never shown.
print(df[["patient_id", "deg_malig", "high_risk"]].head())
print("answer to running an interation function:",flags)




Columns now: ['age', 'menopause', 'tumor_size', 'inv_nodes', 'node_caps', 'deg_malig', 'breast', 'breast_quad', 'irradiat', 'target', 'patient_id']
answer to running the melt function:    patient_id                target feature  value
0           1  no-recurrence-events     age  30-39
1           2  no-recurrence-events     age  40-49
2           3  no-recurrence-events     age  40-49
3           4  no-recurrence-events     age  60-69
4           5  no-recurrence-events     age  40-49
answer to running the pivot table function: feature                age                               breast        \
value                20-29 30-39 40-49 50-59 60-69 70-79   left right   
target                                                                  
no-recurrence-events     1    21    63    71    40     5    103    98   
recurrence-events        0    15    27    25    17     1     49    36   

feature              breast_quad           ... tumor_size                    \
value               