# In [4]:

import pandas as pd
import numpy as np
from pathlib import Path
from typing import List


# Input location passed to load_inputs(): either a single CSV file or a folder
# whose *.csv files are merged. The cleaned output is written next to it.
INPUT_PATH = Path(r"C:\Users\Admin\Downloads\events merged.csv")

def read_any_csv(path: Path) -> pd.DataFrame:
    """Read a CSV file, trying several common text encodings in turn.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed DataFrame from the first encoding that decodes the file.

    Raises:
        FileNotFoundError, pandas.errors.ParserError, ...: any non-encoding
            failure from ``pd.read_csv`` is raised immediately instead of
            being retried once per encoding.
    """
    # Ordered from strictest to most permissive; latin1 maps every byte to a
    # character, so it acts as the catch-all.
    for enc in ("utf-8", "utf-8-sig", "cp1252", "latin1"):
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeError:
            # Wrong encoding guess only: move on to the next candidate.
            continue
    # Kept as a last resort in case every candidate failed to decode.
    return pd.read_csv(path)

def load_inputs(
    input_path: Path,
    exclude: tuple = ("events_clean_dedup.csv",),
) -> pd.DataFrame:
    """Load one CSV file, or merge every CSV found in a folder.

    Each loaded frame gets a ``_source_file`` column holding the name of the
    file it came from.

    Args:
        input_path: A CSV file, or a directory containing ``*.csv`` files.
        exclude: File names (compared case-insensitively) to ignore when
            ``input_path`` is a directory. Defaults to the name of the cleaned
            output file, which is written into the same folder and would
            otherwise be re-ingested as input on the next run. Has no effect
            when ``input_path`` points at a single file.

    Returns:
        The loaded DataFrame; for a directory, the row-wise concatenation of
        all readable CSVs with the union of their columns (sorted by name).

    Raises:
        FileNotFoundError: The path does not exist, or the directory contains
            no CSV files to load.
        RuntimeError: The directory has CSV files but none could be read.
    """
    if input_path.is_file():
        df = read_any_csv(input_path)
        df["_source_file"] = input_path.name
        print(f"Loaded single file: {input_path}")
        return df

    if input_path.is_dir():
        skip = {name.lower() for name in exclude}
        csvs = []
        for p in sorted(input_path.glob("*.csv")):
            if p.name.lower() in skip:
                print(f"Ignoring excluded file: {p.name}")
                continue
            csvs.append(p)
        if not csvs:
            raise FileNotFoundError(f"No CSV files found in folder: {input_path}")

        frames: List[pd.DataFrame] = []
        for p in csvs:
            # Best effort: one unreadable file must not abort the whole merge.
            try:
                df = read_any_csv(p)
                df["_source_file"] = p.name
                frames.append(df)
            except Exception as e:
                print(f"Skipped unreadable file: {p.name} ({e})")
        if not frames:
            raise RuntimeError("No readable CSVs found.")
        print(f"Merging {len(frames)} CSV files from: {input_path}")
        return pd.concat(frames, ignore_index=True, sort=True)

    raise FileNotFoundError(f"Path does not exist: {input_path}")

def _coalesce_aliases(df: pd.DataFrame, target: str, candidates: list) -> pd.Series:
    """Return one Series for *target*, taking per row the first non-missing alias."""
    ordered = [target] + [c for c in candidates if c != target]
    present = [c for c in ordered if c in df.columns]
    if not present:
        return pd.Series(np.nan, index=df.index)
    merged = df[present[0]]
    for alias in present[1:]:
        merged = merged.combine_first(df[alias])
    return merged


def _parse_price(text) -> float:
    """Convert a string of digits, dots and minus signs to float (NaN if it is not one number)."""
    if pd.isna(text):
        return np.nan
    text = str(text)
    parts = text.split(".")
    if len(parts) > 2:
        # Several dots: treat all but the last as thousands separators.
        text = "".join(parts[:-1]) + "." + parts[-1]
    try:
        return float(text)
    except ValueError:
        return np.nan


def _dedup_text(series: pd.Series) -> pd.Series:
    """Lower-case and whitespace-normalize text while keeping missing values missing."""
    s = series.astype("string")
    return s.str.lower().str.replace(r"\s+", " ", regex=True).str.strip()


def normalize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of *df* with a canonical set of event columns.

    Steps, in order:
      1. Column names are stripped, lower-cased, whitespace/hyphens become
         underscores and other non-word characters are removed.
      2. Text columns have whitespace collapsed and placeholder strings
         ("nan", "none", "null", "na", "") turned into missing values.
      3. Every canonical column (title, start, url, ...) is filled per row from
         the first alias column that has a value; if no alias exists the
         column is created as all-NaN.
      4. ``start``/``end`` are parsed to UTC datetimes (unparseable -> NaT).
      5. ``url`` is stripped of trailing slashes and lower-cased.
      6. Location-like columns are title-cased, ``currency`` upper-cased.
      7. ``price_raw`` keeps the original price and ``price_value`` holds the
         parsed float.
      8. ``std_title``, ``std_city`` and ``start_date`` are added as helper
         columns for de-duplication.

    Args:
        df: Raw events frame; it is not modified.

    Returns:
        A new DataFrame with the original (renamed) columns plus the canonical
        and helper columns described above.
    """
    df = df.copy()
    df.columns = (
        df.columns
        .str.strip()
        .str.replace(r"[\s\-]+", "_", regex=True)
        .str.replace(r"[^\w_]", "", regex=True)
        .str.lower()
    )

    colmap = {
        "title": ["title","event_title","name","eventname","event_name"],
        "description": ["description","details","summary"],
        "start": ["start","start_time","start_datetime","startdate","start_date","event_start","begins"],
        "end": ["end","end_time","end_datetime","enddate","end_date","event_end","ends"],
        "venue": ["venue","venue_name","place"],
        "city": ["city","town"],
        "country": ["country"],
        "address": ["address","location","address_line"],
        "url": ["url","link","event_url","permalink"],
        "source": ["source","source_site","origin"],
        "price": ["price","cost","amount"],
        "currency": ["currency","price_currency"],
        "category": ["category","type","event_type"],
        "organizer": ["organizer","host"],
        "latitude": ["latitude","lat"],
        "longitude": ["longitude","lon","lng"],
    }

    # Clean text & nulls first, so placeholder strings already count as
    # missing when alias columns are coalesced below.
    null_tokens = ["nan", "none", "null", "na", ""]
    text_cols = [
        c for c in df.columns
        if pd.api.types.is_object_dtype(df[c]) or pd.api.types.is_string_dtype(df[c])
    ]
    for c in text_cols:
        was_missing = df[c].isna()
        s = df[c].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
        df[c] = s.mask(was_missing | s.str.lower().isin(null_tokens))

    # Merged inputs can carry both a canonical column and one of its aliases
    # (e.g. "title" from one file, "event_title" from another), so fill gaps
    # row by row instead of using an alias only when the canonical is absent.
    for target, cands in colmap.items():
        df[target] = _coalesce_aliases(df, target, cands)

    # Datetimes to UTC
    for t in ("start", "end"):
        df[t] = pd.to_datetime(df[t], errors="coerce", utc=True)

    # NOTE: lower-casing applies to the whole URL, including path and query.
    s = df["url"].astype(str).str.strip()
    s = s.str.replace(r"/+$", "", regex=True).str.lower()
    df["url"] = s.mask(df["url"].isna() | s.isin(["", "nan"]))

    for col in ["venue","city","country","address","category","organizer"]:
        s = df[col].astype("string")
        df[col] = s.where(s.isna(), s.str.title())

    s = df["currency"].astype("string")
    df["currency"] = s.where(s.isna(), s.str.upper())

    # Numeric price extraction: keep digits, separators and minus, then treat
    # commas as dots and let _parse_price resolve thousands separators.
    df["price_raw"] = df["price"]
    cleaned = (
        df["price"].astype(str)
        .str.replace(r"[^\d\.,\-]", "", regex=True)
        .str.replace(",", ".", regex=False)
    )
    df["price_value"] = cleaned.apply(_parse_price)

    # Helper columns for dedup; missing title/city stay missing rather than
    # becoming the literal text "nan".
    df["std_title"] = _dedup_text(df["title"])
    df["std_city"] = _dedup_text(df["city"])
    df["start_date"] = df["start"].dt.date
    return df

def _blank_missing(series: pd.Series) -> pd.Series:
    """Stringify *series*, mapping missing values and their text forms to ""."""
    missing = series.isna()
    text = series.astype(str)
    # astype(str) turns NaN/NaT/NA into these literals; treat them as missing too.
    placeholder = text.str.strip().str.lower().isin(["", "nan", "nat", "none", "<na>"])
    return text.where(~(missing | placeholder), "")


def build_dedup_key(df: pd.DataFrame) -> pd.Series:
    """Build the grouping key used to detect duplicate events.

    Reads the ``url``, ``std_title``, ``start_date`` and ``std_city`` columns.

    * Rows with a URL get ``"url::<url>"``.
    * Rows without a URL but with a title get
      ``"tsc::<title> | <date> | <city>"``, where a missing date or city
      contributes an empty string.
    * Rows with neither URL nor title get a unique ``"row::<position>"`` key,
      so unrelated records that merely lack identifying data are not merged
      into one.

    Args:
        df: Frame holding the four columns listed above.

    Returns:
        A Series of string keys aligned to ``df.index``.
    """
    url = df["url"]
    title = _blank_missing(df["std_title"])
    date = _blank_missing(df["start_date"])
    city = _blank_missing(df["std_city"])

    url_key = ("url::" + url.astype(str)).to_numpy(dtype=object)
    tsc_key = ("tsc::" + title + " | " + date + " | " + city).to_numpy(dtype=object)
    # Positional, not label based, so keys stay unique with a non-unique index.
    row_key = np.array([f"row::{i}" for i in range(len(df))], dtype=object)

    has_url = url.notna().to_numpy()
    has_title = (title != "").to_numpy()
    keys = np.where(has_url, url_key, np.where(has_title, tsc_key, row_key))
    return pd.Series(keys, index=df.index)

def pick_best_record(df: pd.DataFrame) -> pd.DataFrame:
    """Keep the most complete row for each ``dedup_key``.

    A row's score is the number of non-missing values among the descriptive
    columns below, plus one if ``start`` is present. Within each key the
    highest-scoring row wins; ties go to the first occurrence.

    Args:
        df: Frame with a ``dedup_key`` column. It is not modified.

    Returns:
        A new DataFrame with one row per key, sorted by index. Any scoring
        column absent from the input appears in the result filled with NaN.
    """
    score_cols = ["description","venue","address","city","country","price_value","category","organizer"]
    # Work on a copy so the caller's frame does not silently gain columns.
    work = df.copy()
    for c in score_cols:
        if c not in work.columns:
            work[c] = np.nan

    score = work[score_cols].notna().sum(axis=1)
    if "start" in work.columns:
        score = score + work["start"].notna().astype(int)
    work["_score"] = score

    keep_idx = work.groupby("dedup_key")["_score"].idxmax()
    return work.loc[keep_idx].drop(columns=["_score"]).sort_index()

# Pipeline: load -> normalize -> key -> dedup
raw = load_inputs(INPUT_PATH)
print("Rows before merge:", len(raw))

norm = normalize_columns(raw)
norm = norm.assign(dedup_key=build_dedup_key(norm))

dedup = pick_best_record(norm)
print("Rows after dedup: ", len(dedup))

# Preferred columns come first (in this order); everything else follows
# in its existing order.
preferred = [
    "title", "description", "start", "end",
    "venue", "address", "city", "country",
    "category", "organizer",
    "price_raw", "price_value", "currency",
    "url", "source",
    "latitude", "longitude",
    "start_date", "dedup_key", "_source_file",
]
cols = [name for name in preferred if name in dedup.columns]
cols += [name for name in dedup.columns if name not in preferred]

# Write the result beside the input file, or into the input folder.
if INPUT_PATH.is_file():
    out_dir = INPUT_PATH.parent
else:
    out_dir = INPUT_PATH
out_path = out_dir / "events_clean_dedup.csv"
dedup.loc[:, cols].to_csv(out_path, index=False)
print("Saved:", out_path)


# Loaded single file: C:\Users\Admin\Downloads\events merged.csv
# Rows before merge: 5034
# Rows after dedup:  3559
# Saved: C:\Users\Admin\Downloads\events_clean_dedup.csv
