# 1. Imports

In [6]:
# === Cell 1: Imports ===
import os
import csv
import pandas as pd
from collections import defaultdict


# 2. Load user profiles and keep first 500 users

In [7]:
# === Cell 2: Load user profiles and keep first 500 users ===
# Paths
profiles_path = "Original-lastfm-dataset-1K/userid-profile.tsv"

# Load profiles (small file)
profiles_df = pd.read_csv(
    profiles_path,
    sep="\t",
    names=["user_id", "gender", "age", "country", "signup"],
    header=None,
    dtype=str
)

# Keep only the first 500 users
users_keep = profiles_df["user_id"].head(500).tolist()
profiles_df = profiles_df.head(500)                     

print("Profiles shape (reduced):", profiles_df.shape)
display(profiles_df.head())


Profiles shape (reduced): (500, 5)


Unnamed: 0,user_id,gender,age,country,signup
0,#id,gender,age,country,registered
1,user_000001,m,,Japan,"Aug 13, 2006"
2,user_000002,f,,Peru,"Feb 24, 2006"
3,user_000003,m,22,United States,"Oct 30, 2005"
4,user_000004,f,,,"Apr 26, 2006"


# 3. Stream-reduce listening history while keeping 500 users

In [8]:
# === Cell 3: Stream-reduce listening history while keeping 500 users ===
# Goal:
# - Keep exactly your selected 500 users (users_keep)
# - Shrink the listening history via CAP_PER_USER (primary knob)
#
# This streams the big TSV in chunks and writes out a reduced CSV
# without loading all 19M rows into RAM.

# Input/Output paths
DATA_DIR = "Original-lastfm-dataset-1K"
SRC = os.path.join(DATA_DIR, "userid-timestamp-artid-artname-traid-traname.tsv")

OUT_DIR = "Datasets-lastfm-reduced"
os.makedirs(OUT_DIR, exist_ok=True)

OUT_LISTENS = os.path.join(OUT_DIR, "listens_first500_CAP.csv")  # reduced listens CSV

# Schema & reader options
COLS = ["user_id", "timestamp", "artist_id", "artist_name", "track_id", "track_name"]
CHUNKSIZE = 500_000  # tune RAM 

# Reduction knobs 
CAP_PER_USER = 5000   

users_keep_set = set(users_keep)

# Prepare output with header
with open(OUT_LISTENS, "w", encoding="utf-8", newline="") as f:
    f.write(",".join(COLS) + "\n")

kept_counts = defaultdict(int)
rows_written = 0

reader = pd.read_csv(
    SRC,
    sep="\t",
    header=None,
    engine="python",        
    dtype=str,
    na_filter=False,
    quoting=csv.QUOTE_NONE,
    on_bad_lines="skip",    # skip malformed lines
    encoding="utf-8",
    chunksize=CHUNKSIZE,
)

for chunk in reader:
    # Normalize to exactly 6 columns (some rows may have extra tabs)
    if chunk.shape[1] < 6:
        continue
    if chunk.shape[1] > 6:
        chunk = chunk.iloc[:, :6]
    chunk.columns = COLS

    # Keep only the selected 500 users
    sub = chunk[chunk["user_id"].isin(users_keep_set)]
    if sub.empty:
        continue

    # Enforce per-user cap across chunks
    parts = []
    for u, grp in sub.groupby("user_id", sort=False):
        remain = CAP_PER_USER - kept_counts[u]
        if remain > 0:
            take = grp.iloc[:remain]
            if not take.empty:
                parts.append(take)
                kept_counts[u] += len(take)

    if parts:
        out = pd.concat(parts, ignore_index=True)
        out.to_csv(OUT_LISTENS, mode="a", index=False, header=False, encoding="utf-8")
        rows_written += len(out)

print(f"[Done] Wrote {rows_written:,} rows to {OUT_LISTENS}")


[Done] Wrote 2,057,492 rows to Datasets-lastfm-reduced\listens_first500_CAP.csv


# 4. Save reduced profiles (aligned with the 500 users)

In [9]:
# === Cell 4: Save reduced profiles (aligned with the 500 users) ===
OUT_PROFILES = os.path.join(OUT_DIR, "profiles_first500.csv") 
profiles_df.to_csv(OUT_PROFILES, index=False, encoding="utf-8")

print("Saved reduced datasets:")
print("Profiles ->", OUT_PROFILES)
print("Listens  ->", OUT_LISTENS)


Saved reduced datasets:
Profiles -> Datasets-lastfm-reduced\profiles_first500.csv
Listens  -> Datasets-lastfm-reduced\listens_first500_CAP.csv


# 5. Load reduced listens into a DataFrame for immediate use

In [10]:
# === Cell 5: Load reduced listens into a DataFrame for immediate use ===

listens_df = pd.read_csv(OUT_LISTENS)
print("Reduced listens_df shape:", listens_df.shape)
display(listens_df.head())

Reduced listens_df shape: (2057492, 6)


Unnamed: 0,user_id,timestamp,artist_id,artist_name,track_id,track_name
0,user_000001,2009-05-04T23:08:57Z,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04T13:54:10Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04T13:52:04Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04T13:42:52Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04T13:42:11Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)
