# Apache Beam mini-project (Task 1 & Task 2)
We will (1) format rows, (2) count by gender, (3) count per day, (4) count per state.


In [1]:
!pip -q install "apache-beam>=2.56,<3"
import apache_beam as beam, os, sys, csv, re
print("Beam version:", beam.__version__)


Beam version: 2.67.0


In [2]:
from google.colab import files
up = files.upload()   # pick users_v.csv


Saving users_v.csv to users_v (1).csv


In [3]:
INPUT_PATH = "/content/users_v.csv"
OUT_DIR    = "/content/output"
os.makedirs(OUT_DIR, exist_ok=True)
print("CSV exists:", os.path.exists(INPUT_PATH), " → ", INPUT_PATH)


CSV exists: True  →  /content/users_v.csv


In [4]:
from datetime import datetime
from typing import Dict

def parse_date_any(s: str) -> str:
    s = (s or "").strip()
    for fmt in ("%Y-%m-%d", "%d/%m/%Y", "%m/%d/%Y", "%Y/%m/%d"):
        try:
            return datetime.strptime(s, fmt).strftime("%Y-%m-%d")
        except ValueError:
            pass
    return s

def normalize_name(name: str) -> str:
    name = (name or "").strip()
    if "," in name:
        last, first = [x.strip() for x in name.split(",", 1)]
        return f"{first} {last}".title()
    return re.sub(r"\s+", " ", name).title()

def normalize_gender(g: str) -> str:
    g = (g or "").strip().lower()
    if g in {"m","male"}: return "Male"
    if g in {"f","female"}: return "Female"
    return g.title() or "Unknown"

def normalize_address(addr: str) -> str:
    # aim for "City,ST,ZIP" if possible; otherwise keep trimmed
    a = " ".join((addr or "").strip().split())
    m = re.match(r"^([A-Za-z .'-]+)[, ]+([A-Za-z]{2})[, ]+(\d{5})(?:-\d{4})?$", a)
    if m: return f"{m.group(1)},{m.group(2).upper()},{m.group(3)}"
    m = re.match(r"^([A-Za-z .'-]+)[, ]+([A-Za-z]{2})$", a)
    if m: return f"{m.group(1)},{m.group(2).upper()},"
    return a

def parse_csv_row(line: str) -> Dict[str,str]:
    cols = next(csv.reader([line], delimiter=';'))
    keys = ["user_id","name","gender","age","address","date_joined"]
    vals = [cols[i].strip() if i < len(cols) else "" for i in range(len(keys))]
    return dict(zip(keys, vals))


In [5]:
from apache_beam.options.pipeline_options import PipelineOptions

MARKETING_PREFIX = f"{OUT_DIR}/marketing_format"

class ToMarketingRow(beam.DoFn):
    def process(self, rec):
        uid    = rec.get("user_id","").strip()
        name   = normalize_name(rec.get("name",""))
        gender = normalize_gender(rec.get("gender",""))
        age    = re.sub(r"[^\d]","", rec.get("age",""))
        addr   = normalize_address(rec.get("address",""))
        date   = parse_date_any(rec.get("date_joined",""))
        # keep a single space before address to match the example exactly
        yield f"{uid};{name};{gender};{age}; {addr};{date}"

with beam.Pipeline(options=PipelineOptions(["--runner=DirectRunner"])) as p:
    (p
     | "ReadCSV"   >> beam.io.ReadFromText(INPUT_PATH, skip_header_lines=1)
     | "Parse"     >> beam.Map(parse_csv_row)
     | "Format"    >> beam.ParDo(ToMarketingRow())
     | "WriteOut"  >> beam.io.WriteToText(MARKETING_PREFIX, file_name_suffix=".csv", shard_name_template="-SSSS")
    )

# peek a few lines for your screenshot
import glob
shards = sorted(glob.glob(f"{MARKETING_PREFIX}-*.csv"))
print("Marketing shards:", shards[:3])
if shards:
    print("\nSample:")
    print("\n".join(open(shards[0]).read().splitlines()[:3]))






Marketing shards: ['/content/output/marketing_format-0000.csv']

Sample:
1,Anthony Wolf,male,73,New Rachelburgh-VA-49583,2019/03/13;;Unknown;; ;
2,James Armstrong,male,56,North Jillianfort-UT-86454,2020/11/06;;Unknown;; ;
3,Cody Shaw,male,75,North Anne-SC-53799,2004/05/29;;Unknown;; ;


In [6]:
GENDER_PREFIX = f"{OUT_DIR}/gender_split"

with beam.Pipeline(options=PipelineOptions(["--runner=DirectRunner"])) as p:
    out = (p
      | "Read"  >> beam.io.ReadFromText(INPUT_PATH, skip_header_lines=1)
      | "Parse" >> beam.Map(parse_csv_row)
      | "Pair"  >> beam.Map(lambda r: (normalize_gender(r["gender"]), 1))
      | "Sum"   >> beam.CombinePerKey(sum)
      | "Keep"  >> beam.Filter(lambda kv: kv[0] in {"Female","Male"})
      | "Fmt"   >> beam.Map(lambda kv: f"{kv[0]};{kv[1]}")
      | "Write" >> beam.io.WriteToText(GENDER_PREFIX, file_name_suffix=".csv", shard_name_template="-SSSS")
    )

g_files = sorted(glob.glob(f"{GENDER_PREFIX}-*.csv"))
print("Gender shards:", g_files[:3])
if g_files:
    print(open(g_files[0]).read())




Gender shards: ['/content/output/gender_split-0000.csv']



In [7]:
DAILY_PREFIX = f"{OUT_DIR}/daily_counts"

with beam.Pipeline(options=PipelineOptions(["--runner=DirectRunner"])) as p:
    (p
     | "Read"  >> beam.io.ReadFromText(INPUT_PATH, skip_header_lines=1)
     | "Parse" >> beam.Map(parse_csv_row)
     | "Pair"  >> beam.Map(lambda r: (parse_date_any(r["date_joined"]), 1))
     | "Sum"   >> beam.CombinePerKey(sum)
     | "Fmt"   >> beam.Map(lambda kv: f"{kv[0]};{kv[1]}")
     | "Write" >> beam.io.WriteToText(DAILY_PREFIX, file_name_suffix=".csv", shard_name_template="-SSSS")
    )

d_files = sorted(glob.glob(f"{DAILY_PREFIX}-*.csv"))
print("Daily shards:", d_files[:3])
if d_files:
    print("\n".join(open(d_files[0]).read().splitlines()[:5]))




Daily shards: ['/content/output/daily_counts-0000.csv']
;2357


In [8]:
STATE_PREFIX = f"{OUT_DIR}/state_counts"

def to_state(rec):
    addr = normalize_address(rec.get("address",""))
    parts = [p.strip() for p in addr.split(",")]
    st = parts[1].upper() if len(parts)>=2 else "UNK"
    if not re.fullmatch(r"[A-Z]{2}", st): st = "UNK"
    return (st, 1)

with beam.Pipeline(options=PipelineOptions(["--runner=DirectRunner"])) as p:
    (p
     | "Read"   >> beam.io.ReadFromText(INPUT_PATH, skip_header_lines=1)
     | "Parse"  >> beam.Map(parse_csv_row)
     | "Pair"   >> beam.Map(to_state)
     | "Sum"    >> beam.CombinePerKey(sum)
     | "Drop"   >> beam.Filter(lambda kv: kv[0] != "UNK")
     | "Fmt"    >> beam.Map(lambda kv: f"{kv[0]};{kv[1]}")
     | "Write"  >> beam.io.WriteToText(STATE_PREFIX, file_name_suffix=".csv", shard_name_template="-SSSS")
    )

s_files = sorted(glob.glob(f"{STATE_PREFIX}-*.csv"))
print("State shards:", s_files[:3])
if s_files:
    print("\n".join(open(s_files[0]).read().splitlines()[:10]))




State shards: ['/content/output/state_counts-0000.csv']

