**Imports**

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
import simplejson
import matplotlib.pyplot as plt
from pathlib import Path
from typing import List, Dict, Tuple
from sklearn.linear_model import LinearRegression

**Code**

In [16]:
# Raw data
df12 = pd.DataFrame({
    "raw_city": [" new york", "New york ", "NEW YORK", "nyc", "NYC"]
})

# Basic cleaning
df12["city_clean_basic"] = (
    df12["raw_city"]
        .str.strip()      # remove leading/trailing spaces
        .str.lower()      # lowercase
)

print(df12)

    raw_city city_clean_basic
0   new york         new york
1  New york          new york
2   NEW YORK         new york
3        nyc              nyc
4        NYC              nyc


In [None]:
df12 = pd.DataFrame({"raw_city": [" new york", "New york ", "NEW YORK", "nyc", "NYC"]})

df12["city_clean_sep"] = (
    df12["raw_city"]
        .str.lower()
        .str.replace("-", " ", regex=False)
        .str.replace(r"[^a-z\s]", "", regex=True)
        .str.replace(r"\s+", " ", regex=True)
        .str.strip()
)

print(df12)

    raw_city city_clean_sep
0   new york       new york
1  New york        new york
2   NEW YORK       new york
3        nyc            nyc
4        NYC            nyc


In [None]:
# Canonical city mapping
canonical_map = {
    "new york": "new york",
    "nyc": "new york",
    "ny": "new york",
    "san francisco": "san francisco",
    "sanfrancisco": "san francisco",
}

# Normalize city tokens (remove spaces)
df12["city_token"] = (
    df12["city_clean_sep"]
    .str.lower()
    .str.replace(" ", "", regex=False)
)

# Map to canonical city names
df12["city_canonical"] = (
    df12["city_token"]
    .map(canonical_map)
    .fillna(df12["city_clean_sep"])
)

print(df12)

    raw_city city_clean_basic city_clean_sep city_token city_canonical
0   new york         new york       new york    newyork       new york
1  New york          new york       new york    newyork       new york
2   NEW YORK         new york       new york    newyork       new york
3        nyc              nyc            nyc        nyc       new york
4        NYC              nyc            nyc        nyc       new york


In [10]:
df12 = pd.DataFrame({
    "raw_signup": [
        "2024-01-01 10:00",
        "01/01/2024 15:00",
        "2024/01/01"
    ]
})

# normalize separators + parse safely
df12["signup_dt_raw"] = pd.to_datetime(
    df12["raw_signup"].str.replace("/", "-", regex=False),
    errors="coerce",
    dayfirst=True
)

print(df12[["raw_signup", "signup_dt_raw"]])
print("NaT count:", df12["signup_dt_raw"].isna().sum())

         raw_signup       signup_dt_raw
0  2024-01-01 10:00 2024-01-01 10:00:00
1  01/01/2024 15:00                 NaT
2        2024/01/01                 NaT
NaT count: 2
