# 🗞️ User Category Splits Report

This report is supposed to normalize columns that will be used for profiling users according to their behaviours.

It expects an export of the User Visit Frequency Report and cleans its colums.

<img src="https://img.lemde.fr/2024/03/20/354/0/4064/2032/1342/671/60/0/8672e76_1710973858299-000-nic6404406.jpg" alt="Bike Store" width="200" align="right">


# Step 1: This expects you to upload raw files

In [None]:
!rm -rf ./*


In [None]:
# === Clean User Category Split Report (robust loop → save in /content/cleaned) ===
import os
import pandas as pd
from google.colab import files
from datetime import date
from IPython.display import display

# ---------- Safe base and output directory ----------
BASE_DIR = "/content" if os.path.isdir("/content") else os.getcwd()
CLEAN_DIR = os.path.join(BASE_DIR, "cleaned")
os.makedirs(CLEAN_DIR, exist_ok=True)
print("Saving outputs to:", CLEAN_DIR)

# ---------- Upload files ----------
uploaded = files.upload()                 # multi-select ok
names = sorted(uploaded.keys())
today = date.today().isoformat()

# ---------- Helpers ----------
def read_csv_robust(path):
    """Try utf-8-sig, then utf-16; if still 1 col, manual 'quoted line' split."""
    last_err = None
    for enc in ("utf-8-sig", "utf-16"):
        try:
            tmp = pd.read_csv(path, encoding=enc, sep=",", engine="python")
            if tmp.shape[1] > 1:
                return tmp
        except Exception as e:
            last_err = e
    # Manual fallback for all-in-one quoted column
    with open(path, "r", encoding="utf-8-sig", errors="replace") as f:
        lines = [ln.strip() for ln in f if ln.strip()]
    if not lines:
        raise ValueError(f"Empty file: {path} (last pandas error: {last_err})")
    rows = []
    for ln in lines:
        if ln.startswith('"') and ln.endswith('"'):
            ln = ln[1:-1]
        rows.append(ln.split(","))
    header = [h.strip().strip('"').replace("\ufeff","") for h in rows[0]]
    data = rows[1:]
    return pd.DataFrame(data, columns=header)

def seconds_to_hr_min(seconds):
    try:
        seconds = float(seconds)
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        return f"{hours} hr {minutes} min"
    except Exception:
        return ""

def is_blank(x):
    return (pd.isna(x)) or (str(x).strip() == "")

# ---------- Process each uploaded file ----------
for idx, SRC in enumerate(names, start=1):
    print(f"\nProcessing {SRC} as week{idx}...")
    try:
        df = read_csv_robust(SRC)

        # (1) Standardize column names if present
        df = df.rename(columns={
            "Label": "UserId-Category",
            "sum_corehome_visittotaltime": "Visit Total Time (seconds)",
            "Metadata: CoreHome_UserId": "UserId",
            "Metadata: CustomDimension_CustomDimension1": "CategoryId"

        })

        # (2) Drop unwanted columns if they exist (case/space tolerant)
        to_drop = [c for c in df.columns if str(c).lower().strip() in {"level", "i", "metadata: is_summary", "metadata: segment"}]
        df = df.drop(columns=to_drop, errors="ignore")

        # (3) Remove rows containing "others" or "undefined" (any column, case-insensitive)
        lower_str = df.applymap(lambda v: str(v).strip().lower() if not pd.isna(v) else v)
        mask_bad_words = lower_str.apply(lambda row: row.isin(["others", "undefined",  "", "-", "nan", "none", "null", "na", "n/a", "not available"]).any(), axis=1)
        df = df.loc[~mask_bad_words].copy()

        # (4) Drop rows with any NaN anywhere (as requested)
        df = df.dropna(how="any").copy()

        # (5) Drop rows where CategoryId is missing/blank but UserId exists
        if "UserId" in df.columns and "CategoryId" in df.columns:
            mask_us = df["UserId"].apply(lambda x: not is_blank(x))
            mask_cat_blank = df["CategoryId"].apply(is_blank)
            df = df.loc[~(mask_us & mask_cat_blank)].copy()

        # (6) Add pretty time columns if present
        if "Visit Total Time (seconds)" in df.columns:
            df["Visit Total Time (hr/min)"] = df["Visit Total Time (seconds)"].apply(seconds_to_hr_min)

        avg_dur_col = None
        for cand in ["Avg. Visit Duration (in seconds) per Visit", "Avg.VisitDuration(inseconds)perVisit"]:
            if cand in df.columns:
                avg_dur_col = cand
                break
        if avg_dur_col:
            df["Avg. Visit Duration per Visit (hr/min)"] = df[avg_dur_col].apply(seconds_to_hr_min)


        # (7) Reorganize → tidy by UserId, CategoryId when both exist
        if "UserId" in df.columns and "CategoryId" in df.columns:
          # convert to integers safely
          df["UserId"] = pd.to_numeric(df["UserId"], errors="coerce").astype("Int64")
          df["CategoryId"] = pd.to_numeric(df["CategoryId"], errors="coerce").astype("Int64")

          # drop any rows that failed conversion (NaN after coercion)
          df = df.dropna(subset=["UserId", "CategoryId"])

          df_sorted = df.sort_values(["UserId", "CategoryId"])
          df_tidy = df_sorted.set_index(["UserId", "CategoryId"])
        else:
          df_tidy = df

        # (8) Save to /content/cleaned (or ./cleaned)
        out_xlsx = os.path.join(CLEAN_DIR, f"user_category_split_report_week{idx}_{today}.xlsx")

        df.to_excel(out_xlsx, index=False)

        # Optional: auto-download
        files.download(out_xlsx)

        print(f"  ✓ Saved {len(df):,} rows → {out_xlsx}")
        display(df_tidy.head(20))

    except Exception as e:
        print(f"  ✗ Skipped {SRC} (week{idx}) due to {type(e).__name__}: {e}")
        continue



Saving outputs to: /content/cleaned


Saving Export _ User Visit Frequency Report _ week August 4 – 10, 2025.csv to Export _ User Visit Frequency Report _ week August 4 – 10, 2025.csv
Saving Export _ User Visit Frequency Report _ week August 11 – 17, 2025.csv to Export _ User Visit Frequency Report _ week August 11 – 17, 2025.csv
Saving Export _ User Visit Frequency Report _ week August 18 – 24, 2025.csv to Export _ User Visit Frequency Report _ week August 18 – 24, 2025.csv
Saving Export _ User Visit Frequency Report _ week August 25 – 31, 2025.csv to Export _ User Visit Frequency Report _ week August 25 – 31, 2025.csv
Saving Export _ User Visit Frequency Report _ week July 7 – 13, 2025.csv to Export _ User Visit Frequency Report _ week July 7 – 13, 2025.csv
Saving Export _ User Visit Frequency Report _ week July 14 – 20, 2025.csv to Export _ User Visit Frequency Report _ week July 14 – 20, 2025.csv
Saving Export _ User Visit Frequency Report _ week July 21 – 27, 2025.csv to Export _ User Visit Frequency Report _ week Jul

  lower_str = df.applymap(lambda v: str(v).strip().lower() if not pd.isna(v) else v)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  ✓ Saved 41,552 rows → /content/cleaned/user_category_split_report_week1_2025-10-02.xlsx


Unnamed: 0_level_0,Unnamed: 1_level_0,UserId-Category,Visits,Total Days since last visit,Visit Total Time (seconds),Max Days since last visit,Avg. Days since last visit per Visit,Avg. Visit Duration (in seconds) per Visit,Visit Total Time (hr/min),Avg. Visit Duration per Visit (hr/min)
UserId,CategoryId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,5,0 - 5,2,11,360,11,5.5,180.0,0 hr 6 min,0 hr 3 min
0,19,0 - 19,1,0,1435,0,0.0,1435.0,0 hr 23 min,0 hr 23 min
0,31,0 - 31,2,0,115,0,0.0,57.5,0 hr 1 min,0 hr 0 min
0,33,0 - 33,4,0,1245,0,0.0,311.25,0 hr 20 min,0 hr 5 min
0,742,0 - 742,3,0,667,0,0.0,222.33,0 hr 11 min,0 hr 3 min
0,744,0 - 744,1,0,839,0,0.0,839.0,0 hr 13 min,0 hr 13 min
0,747,0 - 747,5,0,3557,0,0.0,711.4,0 hr 59 min,0 hr 11 min
0,755,0 - 755,4,0,384,0,0.0,96.0,0 hr 6 min,0 hr 1 min
0,760,0 - 760,1,180,144,180,180.0,144.0,0 hr 2 min,0 hr 2 min
0,804,0 - 804,1,0,330,0,0.0,330.0,0 hr 5 min,0 hr 5 min



Processing Export _ User Visit Frequency Report _ week August 18 – 24, 2025.csv as week2...


  lower_str = df.applymap(lambda v: str(v).strip().lower() if not pd.isna(v) else v)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  ✓ Saved 42,081 rows → /content/cleaned/user_category_split_report_week2_2025-10-02.xlsx


Unnamed: 0_level_0,Unnamed: 1_level_0,UserId-Category,Visits,Total Days since last visit,Visit Total Time (seconds),Max Days since last visit,Avg. Days since last visit per Visit,Avg. Visit Duration (in seconds) per Visit,Visit Total Time (hr/min),Avg. Visit Duration per Visit (hr/min)
UserId,CategoryId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2,0 - 2,4,0,2458,0,0.0,614.5,0 hr 40 min,0 hr 10 min
0,5,0 - 5,1,1,686,1,1.0,686.0,0 hr 11 min,0 hr 11 min
0,19,0 - 19,4,4,3904,3,1.0,976.0,1 hr 5 min,0 hr 16 min
0,25,0 - 25,1,0,1,0,0.0,1.0,0 hr 0 min,0 hr 0 min
0,30,0 - 30,1,0,12471,0,0.0,12471.0,3 hr 27 min,3 hr 27 min
0,31,0 - 31,2,0,489,0,0.0,244.5,0 hr 8 min,0 hr 4 min
0,33,0 - 33,4,0,13162,0,0.0,3290.5,3 hr 39 min,0 hr 54 min
0,168,0 - 168,1,3,1297,3,3.0,1297.0,0 hr 21 min,0 hr 21 min
0,742,0 - 742,5,21,21212,21,4.2,4242.4,5 hr 53 min,1 hr 10 min
0,743,0 - 743,1,0,12471,0,0.0,12471.0,3 hr 27 min,3 hr 27 min



Processing Export _ User Visit Frequency Report _ week August 25 – 31, 2025.csv as week3...


  lower_str = df.applymap(lambda v: str(v).strip().lower() if not pd.isna(v) else v)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  ✓ Saved 40,244 rows → /content/cleaned/user_category_split_report_week3_2025-10-02.xlsx


Unnamed: 0_level_0,Unnamed: 1_level_0,UserId-Category,Visits,Total Days since last visit,Visit Total Time (seconds),Max Days since last visit,Avg. Days since last visit per Visit,Avg. Visit Duration (in seconds) per Visit,Visit Total Time (hr/min),Avg. Visit Duration per Visit (hr/min)
UserId,CategoryId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2,0 - 2,1,0,375,0,0.0,375.0,0 hr 6 min,0 hr 6 min
0,5,0 - 5,4,0,3474,0,0.0,868.5,0 hr 57 min,0 hr 14 min
0,8,0 - 8,1,0,375,0,0.0,375.0,0 hr 6 min,0 hr 6 min
0,18,0 - 18,2,0,694,0,0.0,347.0,0 hr 11 min,0 hr 5 min
0,19,0 - 19,6,0,6894,0,0.0,1149.0,1 hr 54 min,0 hr 19 min
0,31,0 - 31,4,0,1651,0,0.0,412.75,0 hr 27 min,0 hr 6 min
0,33,0 - 33,1,0,375,0,0.0,375.0,0 hr 6 min,0 hr 6 min
0,168,0 - 168,1,0,1066,0,0.0,1066.0,0 hr 17 min,0 hr 17 min
0,178,0 - 178,1,0,375,0,0.0,375.0,0 hr 6 min,0 hr 6 min
0,182,0 - 182,1,0,375,0,0.0,375.0,0 hr 6 min,0 hr 6 min



Processing Export _ User Visit Frequency Report _ week August 4 – 10, 2025.csv as week4...


  lower_str = df.applymap(lambda v: str(v).strip().lower() if not pd.isna(v) else v)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  ✓ Saved 43,924 rows → /content/cleaned/user_category_split_report_week4_2025-10-02.xlsx


Unnamed: 0_level_0,Unnamed: 1_level_0,UserId-Category,Visits,Total Days since last visit,Visit Total Time (seconds),Max Days since last visit,Avg. Days since last visit per Visit,Avg. Visit Duration (in seconds) per Visit,Visit Total Time (hr/min),Avg. Visit Duration per Visit (hr/min)
UserId,CategoryId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,5,0 - 5,3,0,3193,0,0.0,1064.33,0 hr 53 min,0 hr 17 min
0,19,0 - 19,1,14,500,14,14.0,500.0,0 hr 8 min,0 hr 8 min
0,24,0 - 24,1,0,351,0,0.0,351.0,0 hr 5 min,0 hr 5 min
0,31,0 - 31,8,0,3045,0,0.0,380.63,0 hr 50 min,0 hr 6 min
0,33,0 - 33,3,0,1087,0,0.0,362.33,0 hr 18 min,0 hr 6 min
0,573,0 - 573,1,71,490,71,71.0,490.0,0 hr 8 min,0 hr 8 min
0,742,0 - 742,6,30,4999,14,5.0,833.17,1 hr 23 min,0 hr 13 min
0,743,0 - 743,6,2,3986,2,0.33,664.33,1 hr 6 min,0 hr 11 min
0,747,0 - 747,4,73,1717,71,18.25,429.25,0 hr 28 min,0 hr 7 min
0,751,0 - 751,2,0,721,0,0.0,360.5,0 hr 12 min,0 hr 6 min



Processing Export _ User Visit Frequency Report _ week July 14 – 20, 2025.csv as week5...


  lower_str = df.applymap(lambda v: str(v).strip().lower() if not pd.isna(v) else v)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  ✓ Saved 39,773 rows → /content/cleaned/user_category_split_report_week5_2025-10-02.xlsx


Unnamed: 0_level_0,Unnamed: 1_level_0,UserId-Category,Visits,Total Days since last visit,Visit Total Time (seconds),Max Days since last visit,Avg. Days since last visit per Visit,Avg. Visit Duration (in seconds) per Visit,Visit Total Time (hr/min),Avg. Visit Duration per Visit (hr/min)
UserId,CategoryId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1,0 - 1,1,0,217,0,0.0,217.0,0 hr 3 min,0 hr 3 min
0,2,0 - 2,2,0,1258,0,0.0,629.0,0 hr 20 min,0 hr 10 min
0,5,0 - 5,1,0,217,0,0.0,217.0,0 hr 3 min,0 hr 3 min
0,7,0 - 7,1,0,217,0,0.0,217.0,0 hr 3 min,0 hr 3 min
0,18,0 - 18,1,0,1612,0,0.0,1612.0,0 hr 26 min,0 hr 26 min
0,19,0 - 19,6,36,3685,25,6.0,614.17,1 hr 1 min,0 hr 10 min
0,24,0 - 24,1,0,2827,0,0.0,2827.0,0 hr 47 min,0 hr 47 min
0,31,0 - 31,3,0,1312,0,0.0,437.33,0 hr 21 min,0 hr 7 min
0,180,0 - 180,1,0,1002,0,0.0,1002.0,0 hr 16 min,0 hr 16 min
0,196,0 - 196,1,24,2,24,24.0,2.0,0 hr 0 min,0 hr 0 min



Processing Export _ User Visit Frequency Report _ week July 21 – 27, 2025.csv as week6...


  lower_str = df.applymap(lambda v: str(v).strip().lower() if not pd.isna(v) else v)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  ✓ Saved 37,645 rows → /content/cleaned/user_category_split_report_week6_2025-10-02.xlsx


Unnamed: 0_level_0,Unnamed: 1_level_0,UserId-Category,Visits,Total Days since last visit,Visit Total Time (seconds),Max Days since last visit,Avg. Days since last visit per Visit,Avg. Visit Duration (in seconds) per Visit,Visit Total Time (hr/min),Avg. Visit Duration per Visit (hr/min)
UserId,CategoryId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2,0 - 2,8,11,3517,7,1.38,439.63,0 hr 58 min,0 hr 7 min
0,5,0 - 5,1,22,106,22,22.0,106.0,0 hr 1 min,0 hr 1 min
0,18,0 - 18,1,42,748,42,42.0,748.0,0 hr 12 min,0 hr 12 min
0,19,0 - 19,1,0,2,0,0.0,2.0,0 hr 0 min,0 hr 0 min
0,31,0 - 31,5,14,3022,14,2.8,604.4,0 hr 50 min,0 hr 10 min
0,33,0 - 33,1,0,216,0,0.0,216.0,0 hr 3 min,0 hr 3 min
0,742,0 - 742,1,4,337,4,4.0,337.0,0 hr 5 min,0 hr 5 min
0,743,0 - 743,1,1,272,1,1.0,272.0,0 hr 4 min,0 hr 4 min
0,747,0 - 747,1,0,156,0,0.0,156.0,0 hr 2 min,0 hr 2 min
0,752,0 - 752,1,9,679,9,9.0,679.0,0 hr 11 min,0 hr 11 min



Processing Export _ User Visit Frequency Report _ week July 28 – August 3, 2025.csv as week7...


  lower_str = df.applymap(lambda v: str(v).strip().lower() if not pd.isna(v) else v)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  ✓ Saved 19 rows → /content/cleaned/user_category_split_report_week7_2025-10-02.xlsx


Unnamed: 0_level_0,Unnamed: 1_level_0,UserId-Category,Visits,Total Days since last visit,Visit Total Time (seconds),Max Days since last visit,Avg. Days since last visit per Visit,Avg. Visit Duration (in seconds) per Visit,Visit Total Time (hr/min),Avg. Visit Duration per Visit (hr/min)
UserId,CategoryId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7223,742,7223 - 742,27,0,47742,0,0.0,1768.22,13 hr 15 min,0 hr 29 min
9405,747,9405 - 747,61,0,50673,0,0.0,830.7,14 hr 4 min,0 hr 13 min
9704,742,9704 - 742,29,0,23331,0,0.0,804.52,6 hr 28 min,0 hr 13 min
27508,742,27508 - 742,33,0,66144,0,0.0,2004.36,18 hr 22 min,0 hr 33 min
32132,742,32132 - 742,27,0,78320,0,0.0,2900.74,21 hr 45 min,0 hr 48 min
427464,742,427464 - 742,29,0,49292,0,0.0,1699.72,13 hr 41 min,0 hr 28 min
434128,19,434128 - 19,28,0,22392,0,0.0,799.71,6 hr 13 min,0 hr 13 min
434677,19,434677 - 19,60,0,174719,0,0.0,2911.98,48 hr 31 min,0 hr 48 min
435044,742,435044 - 742,31,0,7551,0,0.0,243.58,2 hr 5 min,0 hr 4 min
435770,19,435770 - 19,222,0,221150,0,0.0,996.17,61 hr 25 min,0 hr 16 min



Processing Export _ User Visit Frequency Report _ week July 7 – 13, 2025.csv as week8...


  lower_str = df.applymap(lambda v: str(v).strip().lower() if not pd.isna(v) else v)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  ✓ Saved 9,998 rows → /content/cleaned/user_category_split_report_week8_2025-10-02.xlsx


Unnamed: 0,UserId-Category,Visits,Total Days since last visit,Visit Total Time (seconds),Max Days since last visit,Avg. Days since last visit per Visit,Avg. Visit Duration (in seconds) per Visit,Visit Total Time (hr/min),Avg. Visit Duration per Visit (hr/min)
0,559303,278,0,126778,0,0,456.04,35 hr 12 min,0 hr 7 min
1,435770,237,0,348425,0,0,1470.15,96 hr 47 min,0 hr 24 min
2,434677,117,0,171660,0,0,1467.18,47 hr 41 min,0 hr 24 min
3,537581,107,0,173249,0,0,1619.15,48 hr 7 min,0 hr 26 min
4,29340,85,0,306549,0,0,3606.46,85 hr 9 min,1 hr 0 min
5,18868,79,0,87463,0,0,1107.13,24 hr 17 min,0 hr 18 min
6,469537,77,0,68715,0,0,892.4,19 hr 5 min,0 hr 14 min
7,502191,77,0,28587,0,0,371.26,7 hr 56 min,0 hr 6 min
8,6412,76,0,69417,0,0,913.38,19 hr 16 min,0 hr 15 min
9,473566,74,0,83273,0,0,1125.31,23 hr 7 min,0 hr 18 min



Processing Export _ User Visit Frequency Report _ week June 30 – July 6, 2025.csv as week9...


  lower_str = df.applymap(lambda v: str(v).strip().lower() if not pd.isna(v) else v)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  ✓ Saved 39,571 rows → /content/cleaned/user_category_split_report_week9_2025-10-02.xlsx


Unnamed: 0_level_0,Unnamed: 1_level_0,UserId-Category,Visits,Total Days since last visit,Visit Total Time (seconds),Max Days since last visit,Avg. Days since last visit per Visit,Avg. Visit Duration (in seconds) per Visit,Visit Total Time (hr/min),Avg. Visit Duration per Visit (hr/min)
UserId,CategoryId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1,0 - 1,2,0,70,0,0.0,35.0,0 hr 1 min,0 hr 0 min
0,2,0 - 2,2,12,949,12,6.0,474.5,0 hr 15 min,0 hr 7 min
0,5,0 - 5,2,0,903,0,0.0,451.5,0 hr 15 min,0 hr 7 min
0,30,0 - 30,1,0,68,0,0.0,68.0,0 hr 1 min,0 hr 1 min
0,31,0 - 31,3,80,1067,51,26.67,355.67,0 hr 17 min,0 hr 5 min
0,33,0 - 33,2,12,1088,12,6.0,544.0,0 hr 18 min,0 hr 9 min
0,742,0 - 742,1,1,142,1,1.0,142.0,0 hr 2 min,0 hr 2 min
0,743,0 - 743,2,0,254,0,0.0,127.0,0 hr 4 min,0 hr 2 min
0,747,0 - 747,7,37,4512,25,5.29,644.57,1 hr 15 min,0 hr 10 min
0,751,0 - 751,1,2,13,2,2.0,13.0,0 hr 0 min,0 hr 0 min



Processing Export _ User Visit Frequency Report _ week September 1 – 7, 2025.csv as week10...


  lower_str = df.applymap(lambda v: str(v).strip().lower() if not pd.isna(v) else v)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  ✓ Saved 23 rows → /content/cleaned/user_category_split_report_week10_2025-10-02.xlsx


Unnamed: 0_level_0,Unnamed: 1_level_0,UserId-Category,Visits,Total Days since last visit,Visit Total Time (seconds),Max Days since last visit,Avg. Days since last visit per Visit,Avg. Visit Duration (in seconds) per Visit,Visit Total Time (hr/min),Avg. Visit Duration per Visit (hr/min)
UserId,CategoryId,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
7223,742,7223 - 742,47,0,34224,0,0.0,728.17,9 hr 30 min,0 hr 12 min
9405,747,9405 - 747,62,0,42013,0,0.0,677.63,11 hr 40 min,0 hr 11 min
33053,742,33053 - 742,36,0,35166,0,0.0,976.83,9 hr 46 min,0 hr 16 min
34257,742,34257 - 742,34,0,26736,0,0.0,786.35,7 hr 25 min,0 hr 13 min
294246,1032,294246 - 1032,36,0,6821,0,0.0,189.47,1 hr 53 min,0 hr 3 min
427980,742,427980 - 742,37,0,71616,0,0.0,1935.57,19 hr 53 min,0 hr 32 min
434677,19,434677 - 19,40,1,163802,1,0.03,4095.05,45 hr 30 min,1 hr 8 min
435770,19,435770 - 19,206,0,415986,0,0.0,2019.35,115 hr 33 min,0 hr 33 min
435770,742,435770 - 742,42,0,114296,0,0.0,2721.33,31 hr 44 min,0 hr 45 min
435770,747,435770 - 747,36,0,72198,0,0.0,2005.5,20 hr 3 min,0 hr 33 min


In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Step 2: This is the data modeling phase

In [2]:
import os, glob, re
import pandas as pd

# Look for your cleaned exports inside /content/cleaned
xlsx_files = sorted(
    glob.glob("/content/cleaned/user_category_split_report_week*_*.xlsx"),
    key=lambda f: int(re.search(r"week(\d+)", os.path.basename(f)).group(1))
)
csv_files = sorted(
    glob.glob("/content/cleaned/user_category_split_report_week*_*.csv"),
    key=lambda f: int(re.search(r"week(\d+)", os.path.basename(f)).group(1))
)

# Prefer the Excel outputs; fall back to CSV if none
files = xlsx_files if xlsx_files else csv_files
if not files:
    raise FileNotFoundError("No cleaned exports found in /content/cleaned. Expected files like 'user_category_split_report_weekN_YYYY-MM-DD.xlsx'.")

dfs = []
for f in files:
    # Extract the week number from the filename
    m = re.search(r"week(\d+)", os.path.basename(f))
    week_num = int(m.group(1)) if m else None

    # Read Excel or CSV
    if f.lower().endswith(".xlsx"):
        df_week = pd.read_excel(f)
    else:
        df_week = pd.read_csv(f)

    # Ensure numeric IDs (nullable Int64 so we don't get floats)
    for col in ("UserId", "CategoryId"):
        if col in df_week.columns:
            df_week[col] = pd.to_numeric(df_week[col], errors="coerce").astype("Int64")

    # Tag week from filename (more reliable than enumerate)
    if week_num is not None:
        df_week["Week"] = week_num

    dfs.append(df_week)

# Combine all weeks
all_weeks_df = pd.concat(dfs, ignore_index=True)

# Count distinct weeks per user
user_weeks_active = (
    all_weeks_df[["UserId", "Week"]]
    .dropna(subset=["UserId", "Week"])
    .drop_duplicates()
    .groupby("UserId", as_index=False)["Week"]
    .nunique()
    .rename(columns={"Week": "WeeksActive"})
)

# Turn WeeksActive to Int
user_weeks_active["WeeksActive"] = (
    user_weeks_active["WeeksActive"].astype("Int64")
)



# Merge back for convenience
all_weeks_df = all_weeks_df.merge(user_weeks_active, on="UserId", how="left")

# Quick preview with Week + WeeksActive highlighted
styled = (
    all_weeks_df.head()
    .style
    .apply(
        lambda row: [
            'background-color: lightyellow' if col in ["Week", "WeeksActive"] else ''
            for col in row.index
        ],
        axis=1
    )
)
styled


Unnamed: 0,UserId-Category,Visits,Total Days since last visit,Visit Total Time (seconds),Max Days since last visit,Avg. Days since last visit per Visit,Avg. Visit Duration (in seconds) per Visit,UserId,CategoryId,Visit Total Time (hr/min),Avg. Visit Duration per Visit (hr/min),Week,WeeksActive
0,435770 - 19,219,0,248020,0,0.0,1132.51,435770,19,68 hr 53 min,0 hr 18 min,1,9
1,9405 - 747,83,0,79666,0,0.0,959.83,9405,747,22 hr 7 min,0 hr 15 min,1,9
2,434677 - 19,67,0,197890,0,0.0,2953.58,434677,19,54 hr 58 min,0 hr 49 min,1,9
3,507625 - 742,62,0,98777,0,0.0,1593.18,507625,742,27 hr 26 min,0 hr 26 min,1,9
4,435770 - 742,58,0,78231,0,0.0,1348.81,435770,742,21 hr 43 min,0 hr 22 min,1,9


In [3]:
import numpy as np
import pandas as pd

# ==== Column names (adjust if needed) ====
COL_USER = "UserId"
COL_CAT  = "CategoryId"
COL_WEEK = "Week"
COL_VIS  = "Visits"
COL_TIME = "Visit Total Time (seconds)"
COL_AVG  = "Avg. Visit Duration (in seconds) per Visit"




# -------- Setup: work on a copy + numeric coercion --------
all_weeks_copy = all_weeks_df.copy()

# Drop rows with missing CategoryId (these are likely aggregates across categories)
all_weeks_copy = all_weeks_copy.dropna(subset=[COL_CAT])

for col in [COL_VIS, COL_TIME, COL_AVG]:
    all_weeks_copy[col] = pd.to_numeric(all_weeks_copy[col], errors="coerce")


In [4]:
import numpy as np
import pandas as pd


# Step1: simple reliability tag from visit count
def _reliability(v):
    if pd.isna(v) or v == 1: return "Low"
    if v <= 3: return "Medium"
    return "High"
all_weeks_copy["duration_reliability"] = all_weeks_copy[COL_VIS].apply(_reliability)

In [5]:
import numpy as np
import pandas as pd

#flag with MAD, replace with q25” pipeline
#Step2: Flag suspicious rows based on calculated z-scores and replace Avg. Visit time with lower q25%. Calculate Visit total time accordingly

# -------- Robust z helper (median & MAD) --------
def robust_z_series(s: pd.Series) -> pd.Series:
    med = s.median()
    mad = (s - med).abs().median()
    if mad == 0 or np.isnan(mad):
        return pd.Series(0.0, index=s.index)  # flat group -> no outliers
    return 0.6745 * (s - med) / mad

# -------- Step 1 — Robust z per Category×Week --------
Z_THR = 3.0  # flag if |z| >= 3 (tune to 2.5/2.0 for more sensitivity)
all_weeks_copy["z_idx_catweek"] = (
    all_weeks_copy
      .groupby([COL_CAT, COL_WEEK])[COL_AVG]
      .transform(robust_z_series)
)
all_weeks_copy["flag_z_idx"] = all_weeks_copy["z_idx_catweek"].abs() >= Z_THR

# -------- Step 2 — Per-group 25th percentile (q25) --------
# We’ll replace suspiciously high averages with this lower-end, plausible value.
p25 = (
    all_weeks_copy
      .groupby([COL_CAT, COL_WEEK])[COL_AVG]
      .quantile(0.25)
      .reset_index()
      .rename(columns={COL_AVG: "p25"})
)


# Join group stats back
all_weeks_copy = all_weeks_copy.merge(p25, on=[COL_CAT, COL_WEEK], how="left")


# -------- Step 3 — Adjust suspicious values using q25 --------
# Only adjust rows that are flagged *and* above q25 (never increase a value).
mask_adjust = (
    all_weeks_copy["flag_z_idx"].fillna(False)
    & all_weeks_copy["p25"].notna()
    & (all_weeks_copy[COL_AVG] > all_weeks_copy["p25"])
)

# Replace Avg Duration with q25 for flagged rows
all_weeks_copy[COL_AVG] = np.where(
    mask_adjust,
    all_weeks_copy["p25"],
    all_weeks_copy[COL_AVG]
)

# Recompute Total Time = Visits × adjusted Avg
all_weeks_copy[COL_TIME] = np.where(
    all_weeks_copy[COL_VIS].notna() & all_weeks_copy[COL_AVG].notna(),
    (all_weeks_copy[COL_VIS] * all_weeks_copy[COL_AVG]).round(0),
    all_weeks_copy[COL_TIME]
)

# Mark adjustments
all_weeks_copy["flagged_and_changed"] = mask_adjust

# Recompute human-readable hr/min columns from updated seconds values
all_weeks_copy["Visit Total Time (hr/min)"] = pd.to_timedelta(
    all_weeks_copy[COL_TIME], unit="s"
)

all_weeks_copy["Avg. Visit Duration per Visit (hr/min)"] = pd.to_timedelta(
    all_weeks_copy[COL_AVG], unit="s"
)



# Make a copy for reference / auditing that keeps z-scores
visit_time_audits = all_weeks_copy.copy()


all_weeks_copy.rename(
    columns={"flagged_and_changed": "flagged_and_changed_visit_time"},
    inplace=True
)


# Drop all other columns
all_weeks_copy.drop(columns=["z_idx_catweek", "flag_z_idx", "p25"], inplace=True)


# -------- Summary --------
print(f"Rows input:                {len(all_weeks_copy)}")
print(f"Flagged by robust z (|z|≥{Z_THR}): {int(visit_time_audits['flag_z_idx'].sum())}")
print(f"Adjusted rows (replaced w/ q25):   {int(visit_time_audits['flagged_and_changed'].sum())}")
print("Replacement rule: AvgDuration := group q25; TotalTime := Visits × AvgDuration")

# How many groups have missing p25?
print("Number of NaN p25 values:", visit_time_audits["p25"].isna().sum())





Rows input:                284832
Flagged by robust z (|z|≥3.0): 28694
Adjusted rows (replaced w/ q25):   28687
Replacement rule: AvgDuration := group q25; TotalTime := Visits × AvgDuration
Number of NaN p25 values: 0


In [6]:
# Show all columns, but mark the new one for visibility
df_preview = all_weeks_copy.head(10)

# Print normally
print(df_preview)

# OR, if you are in Jupyter / notebook, use style to highlight the new column
df_preview.style.set_properties(
    subset=["flagged_and_changed_visit_time"],
    **{"background-color": "orange", "font-weight": "bold"}
)


  UserId-Category  Visits  Total Days since last visit  \
0     435770 - 19     219                            0   
1      9405 - 747      83                            0   
2     434677 - 19      67                            0   
3    507625 - 742      62                            0   
4    435770 - 742      58                            0   
5     26324 - 747      46                            0   
6    539023 - 742      45                            0   
7     17834 - 742      44                            0   
8     19327 - 742      44                            0   
9    435770 - 747      41                            0   

   Visit Total Time (seconds)  Max Days since last visit  \
0                    248020.0                          0   
1                     79666.0                          0   
2                     12395.0                          0   
3                     98777.0                          0   
4                     78231.0                          0   
5

Unnamed: 0,UserId-Category,Visits,Total Days since last visit,Visit Total Time (seconds),Max Days since last visit,Avg. Days since last visit per Visit,Avg. Visit Duration (in seconds) per Visit,UserId,CategoryId,Visit Total Time (hr/min),Avg. Visit Duration per Visit (hr/min),Week,WeeksActive,duration_reliability,flagged_and_changed_visit_time
0,435770 - 19,219,0,248020.0,0,0.0,1132.51,435770,19,2 days 20:53:40,0 days 00:18:52.510000,1,9,High,False
1,9405 - 747,83,0,79666.0,0,0.0,959.83,9405,747,0 days 22:07:46,0 days 00:15:59.830000,1,9,High,False
2,434677 - 19,67,0,12395.0,0,0.0,185.0,434677,19,0 days 03:26:35,0 days 00:03:05,1,9,High,True
3,507625 - 742,62,0,98777.0,0,0.0,1593.18,507625,742,1 days 03:26:17,0 days 00:26:33.180000,1,9,High,False
4,435770 - 742,58,0,78231.0,0,0.0,1348.81,435770,742,0 days 21:43:51,0 days 00:22:28.810000,1,9,High,False
5,26324 - 747,46,0,35710.0,0,0.0,776.3,26324,747,0 days 09:55:10,0 days 00:12:56.300000,1,7,High,False
6,539023 - 742,45,0,9315.0,0,0.0,207.0,539023,742,0 days 02:35:15,0 days 00:03:27,1,8,High,True
7,17834 - 742,44,0,44844.0,0,0.0,1019.18,17834,742,0 days 12:27:24,0 days 00:16:59.180000,1,7,High,False
8,19327 - 742,44,0,53507.0,0,0.0,1216.07,19327,742,0 days 14:51:47,0 days 00:20:16.070000,1,7,High,False
9,435770 - 747,41,0,59677.0,0,0.0,1455.54,435770,747,0 days 16:34:37,0 days 00:24:15.540000,1,9,High,False


In [None]:
#User Preferences 🎃🎃

# === Integrated pipeline: % splits, entropy/focus, weekly quartiles ===
# Requires: all_weeks_copy with columns:
#   ["UserId", "CategoryId", "Visit Total Time (seconds)", "Week"]

import numpy as np
import pandas as pd

# ---------- Helpers ----------
def _shannon_entropy(pcts: pd.Series) -> float:
    p = (pcts / 100.0).to_numpy(dtype=float)
    p = p[p > 0]  # ignore zeros
    if p.size == 0:
        return 0.0
    return float(-(p * np.log2(p)).sum())

def _quartiles_within_group(x: pd.Series) -> pd.Series:
    # Robust quartiles via percent rank → 1..4
    pct_rank = x.rank(pct=True, method="average")
    q = np.ceil(pct_rank * 4).astype(int)
    return q.clip(1, 4)

def highlight_new_cols(df: pd.DataFrame, color="#fff3cd"):
    """Return a Styler that highlights columns starting with NEW_."""
    new_cols = [c for c in df.columns if c.startswith("NEW_")]
    styler = df.style
    if new_cols:
        styler = styler.set_properties(
            **{"background-color": color},
            subset=pd.IndexSlice[:, new_cols]
        )
    return styler

# ---------- A) All-weeks aggregation: % splits + entropy + focus ----------
# A1) Aggregate time per User x Category
user_category_time = (
    all_weeks_copy
    .groupby(["UserId", "CategoryId"], as_index=False)
    .agg(**{"Visit Total Time (seconds)": ("Visit Total Time (seconds)", "sum")})
)

# A2) Total time per user + % split across categories
user_category_time["NEW_TotalUserTime"] = (
    user_category_time.groupby("UserId")["Visit Total Time (seconds)"].transform("sum")
)
user_category_time["NEW_PctTime"] = 0.0
nz = user_category_time["NEW_TotalUserTime"] > 0
user_category_time.loc[nz, "NEW_PctTime"] = (
    user_category_time.loc[nz, "Visit Total Time (seconds)"] /
    user_category_time.loc[nz, "NEW_TotalUserTime"] * 100.0
)

# A3) Entropy per user
user_entropy = (
    user_category_time
    .groupby("UserId")["NEW_PctTime"]
    .apply(_shannon_entropy)
    .reset_index(name="NEW_EntropyScore")
)

# A4) Top-category % and user type
top_share = (
    user_category_time
    .sort_values(["UserId", "NEW_PctTime"], ascending=[True, False])
    .groupby("UserId", as_index=False)
    .agg(NEW_TopCategoryPct=("NEW_PctTime", "max"))
)

FOCUSED_THRESHOLD = 70.0  # tweak if needed
top_share["Interest_Spread"] = np.where(
    top_share["NEW_TopCategoryPct"] > FOCUSED_THRESHOLD, "Focused", "Multi-explorer"
)

# A5) Final per-user preferred category profile
user_preferred_category = (
    user_entropy
    .merge(top_share, on="UserId", how="left")
    .sort_values(["Interest_Spread", "NEW_EntropyScore"], ascending=[True, False])
    .reset_index(drop=True)
)


# ============================================================
# B) Per-user, per-category: percentile ranks + composite quartile
# ============================================================

# 1. Aggregate visits + total time per User × Category
user_category_time = (
    all_weeks_copy
    .groupby(["UserId", "CategoryId"], as_index=False)
    .agg(
        NEW_Visits=("Visits", "sum"),
        VisitTotalTime=("Visit Total Time (seconds)", "sum")
    )
)

# 2. Total time per user + % split across categories
user_category_time["NEW_TotalUserTime"] = (
    user_category_time.groupby("UserId")["VisitTotalTime"].transform("sum")
)
user_category_time["NEW_PctTime"] = 0.0
nz = user_category_time["NEW_TotalUserTime"] > 0
user_category_time.loc[nz, "NEW_PctTime"] = (
    user_category_time.loc[nz, "VisitTotalTime"] /
    user_category_time.loc[nz, "NEW_TotalUserTime"] * 100.0
)

# 3. Percentile ranks within each CategoryId
user_category_time["NEW_PctRank_Visits"] = (
    user_category_time
    .groupby("CategoryId")["NEW_Visits"]
    .rank(pct=True, method="average")
)

user_category_time["NEW_PctRank_Time"] = (
    user_category_time
    .groupby("CategoryId")["VisitTotalTime"]
    .rank(pct=True, method="average")
)

# 4. Composite score (blend of visits + time ranks)
W_VISITS = 0.5
W_TIME = 0.5
user_category_time["NEW_Composite"] = (
    W_VISITS * user_category_time["NEW_PctRank_Visits"] +
    W_TIME   * user_category_time["NEW_PctRank_Time"]
)

# 5. Quartiles within each CategoryId (1 = lowest, 4 = highest)
user_category_time["NEW_Quartile"] = (
    user_category_time
    .groupby("CategoryId")["NEW_Composite"]
    .transform(_quartiles_within_group)  # from your helpers
    .astype(int)
)

# 6. Final category mix table (User, Category, % time, Quartile)
user_category_mix = (
    user_category_time[["UserId", "CategoryId", "NEW_PctTime", "NEW_Quartile"]]
    .sort_values(["UserId", "NEW_PctTime"], ascending=[True, False])
    .reset_index(drop=True)
)
user_category_mix["NEW_PctTime"] = user_category_mix["NEW_PctTime"].round(2)

# Optional: style to highlight the new quartile col
highlight_new_cols(user_category_mix)



In [None]:
from openpyxl import Workbook
import pandas as pd
from google.colab import files  # for autodownload in Colab

# === Export top_share and user_category_mix ===
output_path = "user_preferences.xlsx"

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    # A4 result: top_share (user-level profile)
    top_share.to_excel(
        writer, sheet_name="UserProfiles", index=False
    )

    # B6 result: user_category_mix (user-category mix)
    user_category_mix.to_excel(
        writer, sheet_name="CategoryMix", index=False
    )

print(f"✅ Exported top_share and category_mix to {output_path}")

# === Trigger download to local machine ===
files.download(output_path)


✅ Exported top_share and category_mix to user_preferences.xlsx


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>