<a href="https://colab.research.google.com/github/GQ131/24-Frames-in-Data-Film-Data-Set-Analysis/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

In [2]:
# Upload  Excel file
from google.colab import files
uploaded = files.upload()

Saving LWDA_Farmworker_Engagement_MASTER_3_28_23_Tableau .xlsx to LWDA_Farmworker_Engagement_MASTER_3_28_23_Tableau .xlsx


In [3]:
# After uploading, `uploaded` is a dict like {'agencies.xlsx': b'...bytes...'}
# Grab the filename:
excel_filename = list(uploaded.keys())[0]
print(f"✔️  Uploaded: {excel_filename}\n")

✔️  Uploaded: LWDA_Farmworker_Engagement_MASTER_3_28_23_Tableau .xlsx



## Clean and Standardize

In [4]:
# 3) Read Each Sheet into a pandas DataFrame
# -------------------------------------------
# Adjust sheet_name= accordingly if your tab names differ exactly.
#
# Per your description:
# - Sheet 1 contains CBOs
# - Sheet 2 contains LEAs locations
# - Sheet 3 contains LWDA locations
#
# If your workbook tabs are named differently, replace with your actual tab names.

cbos_raw = pd.read_excel(excel_filename, sheet_name='CBOs')    # first sheet → CBOs
leas_raw = pd.read_excel(excel_filename, sheet_name='LEAs')    # second sheet → LEAs
lwdas_raw = pd.read_excel(excel_filename, sheet_name= 'LWDAs')   # third sheet → LWDAs

In [5]:
# 4) Define a Utility Function to Standardize Column Names & Trim Text
# ---------------------------------------------------------------------
def normalize_and_trim(df):
    """
    - Strips whitespace from column names
    - Replaces spaces and hyphens with underscores
    - Converts to lowercase
    - Trims leading/trailing whitespace in every text column
    """
    df = df.copy()
    # 4a) Normalize column headers
    df.columns = (
        df.columns
          .str.strip()
          .str.replace(" ", "_")
          .str.replace("-", "_")
          .str.lower()
    )
    # 4b) Trim whitespace in every object‐dtype (string) column
    for col in df.select_dtypes(include="object"):
        df[col] = df[col].astype(str).str.strip()
    return df

# Apply normalization to each “raw” DataFrame
cbos = normalize_and_trim(cbos_raw)
leas = normalize_and_trim(leas_raw)
lwdas = normalize_and_trim(lwdas_raw)

# At this point:
# - cbos.columns might look like:
#   [ "name", "counties", "lwda", "dir", "lco", "dosh", "edd", "alrb",
#     "street_address", "city", "zip_code", "phone_number", "email", "contact_person" ]
#
# - leas.columns might look like:
#   [ "lea_name", "street_address", "city", "zipcode", "area_covered", "contact", "phone", "email" ]
#
# - lwdas.columns might look like:
#   [ "street_address", "city", "zipcode", "partner", "location_type", "phone", "email" ]


## Clean LWDAs

In [6]:
# 5) CLEAN THE LWDA SHEET
# ------------------------
# Raw columns (after normalization) should be:
#   "street_address", "city", "zipcode", "partner", "location_type", "phone", "email"
#
# Desired final columns:
#   type, name, county, affiliated_agency, street_address, city, zipcode, phone, email, contact, location_type

# 5a) Rename to our target schema
lwdas = lwdas.rename(columns={
    "partner": "name",
    "street_address": "street_address",
    "city": "city",
    "zipcode": "zipcode",
    "phone": "phone",
    "email": "email",
    "location_type": "location_type"
})


In [7]:
# 5b) Create the “Type” column = "LWDA"
lwdas['type'] = "LWDA"

# 5c) Create “Affiliated_Agency” = same as “name”
lwdas['affiliated_agency'] = lwdas['name']

# 5d) If you have a separate mapping of Partner → County, you can insert it here.
#     Otherwise, leave county blank (NaN). You can always fill in manually later in Excel/Tableau.
lwdas['county'] = pd.NA

# 5e) Add a “contact” column, even if blank, to match the final schema
lwdas['contact'] = pd.NA


In [8]:
# 5f) Reorder columns exactly as we want for “LWDA” entries
lwdas_clean = lwdas[[
    'type',
    'name',
    'county',
    'affiliated_agency',
    'street_address',
    'city',
    'zipcode',
    'phone',
    'email',
    'contact',
    'location_type'
]]

print("LWDA cleaned schema:\n", lwdas_clean.head(), "\n")

LWDA cleaned schema:
    type name county affiliated_agency           street_address           city  \
0  LWDA  LCO   <NA>               LCO           7718 Meany Ave    Bakersfield   
1  LWDA  LCO   <NA>               LCO  7575 Metropolitan Drive      San Diego   
2  LWDA  LCO   <NA>               LCO          1550 W. Main St      El Centro   
3  LWDA  LCO   <NA>               LCO      455 Golden Gate Ave  San Francisco   
4  LWDA  LCO   <NA>               LCO  770 E. Shaw Ave Ste 222         Fresno   

   zipcode         phone                         email contact location_type  
0    93308  661 587 3060  LaborComm.WCA.BAK@dir.ca.gov    <NA>  State Office  
1    92108  619-220-5451  LaborComm.WCA.SDO@dir.ca.gov    <NA>  State Office  
2    92243  760-353-0607                           nan    <NA>  State Office  
3    94102  415-703-5300  LaborComm.WCA.SFO@dir.ca.gov    <NA>  State Office  
4    93710  559-244-5340  LaborComm.WCA.FRE@dir.ca.gov    <NA>  State Office   



## Clean LEAs

In [9]:
# 6) CLEAN THE LEA SHEET
# -----------------------
# Raw columns (after normalization) should be something like:
#   "lea_name", "street_address", "city", "zipcode", "area_covered", "contact", "phone", "email"
#
# Desired final columns:
#   type, name, county, affiliated_agency, street_address, city, zipcode, phone, email, contact, area_covered

# 6a) Rename to match our target schema
leas = leas.rename(columns={
    "lea_name": "name",
    "street_address": "street_address",
    "city": "city",
    "zipcode": "zipcode",
    "area_covered": "area_covered",
    "contact": "contact",
    "phone": "phone",
    "email": "email"
})


In [10]:
# 6b) Create “Type” = "LEA"
leas['type'] = "LEA"

# 6c) Create "Affiliated_Agency" = same as “name”
leas['affiliated_agency'] = leas['name']

# 6d) If “area_covered” is exactly the county name, set county = area_covered
#     Otherwise, you might need to parse it.
#     For now, assume “area_covered” is one county (e.g. "Alameda").
leas['county'] = leas['area_covered']

# 6e) Reorder columns to match the final schema for LEA
leas_clean = leas[[
    'type',
    'name',
    'county',
    'affiliated_agency',
    'street_address',
    'city',
    'zipcode',
    'phone',
    'email',
    'contact',
    'area_covered'
]]

print("LEA cleaned schema:\n", leas_clean.head(), "\n")


LEA cleaned schema:
   type                                         name             county  \
0  LEA                           Emergency Services   City of Gonzales   
1  LEA                Environmental Health Services   San Mateo County   
2  LEA                         Environmental Health    Monterey County   
3  LEA                Environmental Health Division  Santa Cruz County   
4  LEA  Planning, Building & Environmental Services        Napa County   

                             affiliated_agency  \
0                           Emergency Services   
1                Environmental Health Services   
2                         Environmental Health   
3                Environmental Health Division   
4  Planning, Building & Environmental Services   

                         street_address        city  zipcode         phone  \
0                        147 Fourth St.    Gonzales    93926  831-675-5000   
1  2000 Alameda de las Pulgas, Ste. 100   San Mateo    94403  650-372-6200   

## Clean CBO Sheet

In [11]:
# 7) CLEAN THE CBO SHEET
# -----------------------
# Raw columns (after normalization) likely look like:
#   "name", "counties", "lwda", "dir", "lco", "dosh", "edd", "alrb",
#   "street_address", "city", "zip_code", "phone_number", "email", "contact_person"
#
# Steps:
#   7a) Rename columns to our target schema
#   7b) Split “counties” (comma‐separated) into separate rows
#   7c) Unpivot the six agency columns → keep only rows where the flag is “X”
#   7d) Add “Type” = "CBO"
#   7e) Reorder columns to match final schema:
#        type, name, county, affiliated_agency, street_address, city, zipcode, phone, email, contact

# 7a) Rename:
cbos = cbos.rename(columns={
    "name": "name",
    "counties": "counties",
    "lwda": "lwda",
    "dir": "dir",
    "lco": "lco",
    "dosh": "dosh",
    "edd": "edd",
    "alrb": "alrb",
    "street_address": "street_address",
    "city": "city",
    "zip_code": "zipcode",
    "phone_number": "phone",
    "email": "email",
    "contact_person": "contact"
})

In [12]:
# 7b) Split “counties” string into a list and explode into separate rows
#     so each CBO‐County becomes its own row
#     (e.g. "Alameda, Contra Costa" → two rows: one with "Alameda", one with "Contra Costa")
cbos['county'] = cbos['counties'].str.split(',')    # split on comma
cbos = cbos.explode('county')                       # one row per county
# Trim whitespace around each county name, and convert to Title Case
cbos['county'] = cbos['county'].str.strip().str.title()

# 7c) Unpivot the six affiliated‐agency columns [“lwda”, “dir”, “lco”, “dosh”, “edd”, “alrb”]
agency_cols = ['lwda', 'dir', 'lco', 'dosh', 'edd', 'alrb']

cbos_melted = (
    cbos
    .melt(
        id_vars=['name', 'county', 'street_address', 'city', 'zipcode', 'phone', 'email', 'contact'],
        value_vars=agency_cols,
        var_name='affiliated_agency',
        value_name='flag'
    )
    # Keep only rows where flag == "X" or "x"
    .loc[lambda df: df['flag'].str.lower() == 'x']
    .drop(columns=['flag'])
)

In [13]:
# 7d) Add “Type” = "CBO"
cbos_melted['type'] = "CBO"

# 7e) Reorder to our final CBO schema
cbos_clean = cbos_melted[[
    'type',
    'name',
    'county',
    'affiliated_agency',
    'street_address',
    'city',
    'zipcode',
    'phone',
    'email',
    'contact'
]]

print("CBO cleaned schema (first 5 rows):\n", cbos_clean.head(), "\n")

CBO cleaned schema (first 5 rows):
     type                                               name      county  \
147  CBO       California Rural Legal Assistance Foundation      Fresno   
148  CBO       California Rural Legal Assistance Foundation  Stanislaus   
149  CBO            California Rural Legal Assistance, Inc.   Statewide   
156  CBO                      Center for Community Advocacy    Monterey   
172  CBO  Central Coast Alliance United for a Sustainabl...               

    affiliated_agency              street_address           city  zipcode  \
147               dir    2210 K Street, Suite 201     Sacramento  95816.0   
148               dir    2210 K Street, Suite 201     Sacramento  95816.0   
149               dir  1430 Franklin St Suite 103        Oakland  94612.0   
156               dir             22 W Gabilan St        Salinas  93901.0   
172               dir       126 E. Haley St. #A17  Santa Barbara  93101.0   

              phone              email           c

## Combine All

In [14]:
# 8) COMBINE ALL THREE CLEANED TABLES into ONE MASTER
# ----------------------------------------------------
# The target schema for the master table is (in order):
#   type, name, county, affiliated_agency, street_address, city, zipcode, phone, email, contact,
#   [plus any sheet-specific fields: area_covered for LEA; location_type for LWDA]
#
# Since LEAs have “area_covered” and LWDAs have “location_type”, those columns will appear as NaN for the other types.
# That’s fine—Tableau can handle blank values. If you want to drop them later, you can.

master = pd.concat(
    [
        lwdas_clean,
        leas_clean,
        cbos_clean
    ],
    axis=0,
    ignore_index=True,
    sort=False  # keep columns in the order we specified
)

# Optional: Drop exact duplicates if any
master = master.drop_duplicates()

## Tableau Formatting

In [15]:

# 9) Add “State” and/or Build a “Full_Address” Column
# ---------------------------------------------------------------
# If you want to let Tableau geocode by county + state, add:
master['state'] = "California"

# If you want Tableau to geocode street‐level pins, build a Full_Address:
master['full_address'] = (
    master['street_address'].fillna('') + ", " +
    master['city'].fillna('') + ", CA " +
    master['zipcode'].fillna('').astype(str)
)


## Preview and Save to csv

In [16]:

# 10) SAVE THE MASTER TABLE as CSV (for Tableau)
# ------------------------------------------------
# This will create “All_Agencies_Master.csv” in your Colab environment,
# which you can then download and point Tableau to.
output_filename = "All_Agencies_Master.csv"
master.to_csv(output_filename, index=False)
print(f"✔️  Master file written to: {output_filename}")

# 11) (Optional) Download the CSV directly from Colab to your local machine
files.download(output_filename)

# 12) QUICK SANITY CHECK: print out column names + row counts
print("\n--- Master Schema & Row Counts ---")
print("Columns:", master.columns.tolist())
print("Total Rows:", len(master))
print("\nSample rows:\n", master.head(10))


✔️  Master file written to: All_Agencies_Master.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


--- Master Schema & Row Counts ---
Columns: ['type', 'name', 'county', 'affiliated_agency', 'street_address', 'city', 'zipcode', 'phone', 'email', 'contact', 'location_type', 'area_covered', 'state', 'full_address']
Total Rows: 312

Sample rows:
    type name county affiliated_agency                     street_address  \
0  LWDA  LCO    NaN               LCO                     7718 Meany Ave   
1  LWDA  LCO    NaN               LCO            7575 Metropolitan Drive   
2  LWDA  LCO    NaN               LCO                    1550 W. Main St   
3  LWDA  LCO    NaN               LCO                455 Golden Gate Ave   
4  LWDA  LCO    NaN               LCO            770 E. Shaw Ave Ste 222   
5  LWDA  LCO    NaN               LCO  100 Paseo De San Antonio Room 120   
6  LWDA  LCO    NaN               LCO              1500 Hughes Way C-202   
7  LWDA  LCO    NaN               LCO          2 Macarthur Place Ste 800   
8  LWDA  LCO    NaN               LCO           320 W. Fourth St Ste

In [18]:
import re
csv_filename = "All_Agencies_Master.csv"

# Read it into pandas
df = pd.read_csv(csv_filename, dtype=str)  # read everything as str to avoid surprises
print("Original row count:", len(df))
print("Columns:", df.columns.tolist())

# 3) Define a Function to Clean “Street_Address”
# ------------------------------------------------
def clean_address(addr: str) -> str:
    """
    - Remove any non-printable/control characters
    - Remove odd Unicode characters outside basic ASCII
    - Collapse multiple spaces into one
    - Strip leading/trailing whitespace
    """
    if pd.isna(addr):
        return ""
    # 3a) Ensure it's a Python str
    s = str(addr)
    # 3b) Remove control characters: keep only printable ASCII (U+0020 to U+007E) plus common punctuation
    #     [\x20-\x7E] covers space through tilde. We’ll allow commas, periods, #, -, /, and letters/numbers.
    s = re.sub(r"[^\x20-\x7E]", " ", s)
    # 3c) Remove any stray multiple spaces
    s = re.sub(r"\s+", " ", s)
    return s.strip()

# 4) Define a Function to Standardize “Phone”
# ---------------------------------------------
def clean_phone_number(phone: str) -> str:
    """
    - Strip out all non-digit characters
    - If 10 digits remain, format as (XXX) XXX-XXXX
    - If 11 digits and leading '1', drop the '1' and format the remaining 10 digits
    - Otherwise, return the digits string unformatted
    """
    if pd.isna(phone):
        return ""
    s = str(phone)
    # 4a) Extract digits only
    digits = re.sub(r"\D", "", s)
    # 4b) If leading country code “1” + 10 digits, drop leading 1
    if len(digits) == 11 and digits.startswith("1"):
        digits = digits[1:]
    # 4c) If exactly 10 digits, format as (XXX) XXX-XXXX
    if len(digits) == 10:
        return f"({digits[0:3]}) {digits[3:6]}-{digits[6:10]}"
    # 4d) Otherwise, just return whatever digits are left (so you can spot anomalies)
    return digits

# 5) Apply Cleaning Functions to the DataFrame
# ----------------------------------------------
# Make copies of the original columns, just in case
df["Street_Address_Original"] = df.get("street_address", "").fillna("")
df["Phone_Original"] = df.get("phone", "").fillna("")

# 5a) Clean “street_address”
df["street_address"] = df["street_address"].apply(clean_address)

# 5b) Clean “phone”
df["phone"] = df["phone"].apply(clean_phone_number)

# 5c) (Optional) If you want to similarly clean “City” or “Email”, you can apply .str.strip() or lowercasing:
# df["city"] = df["city"].str.strip().str.title()
# df["email"] = df["email"].str.strip().str.lower()

# 6) Inspect a Few Examples to Confirm
# --------------------------------------
print("\n--- Sample Before & After Cleaning ---")
preview = df[[
    "Street_Address_Original", "street_address",
    "Phone_Original", "phone"
]].head(10)
print(preview.to_string(index=False))

# 7) Write Out a New “Cleaned” CSV
# ---------------------------------
output_filename = "All_Agencies_Master_Cleaned.csv"
df.to_csv(output_filename, index=False)
print(f"\n✔️  Cleaned CSV written to: {output_filename}")

# 8) Download the Cleaned CSV to Your Local Machine
# ---------------------------------------------------
files.download(output_filename)


Original row count: 312
Columns: ['type', 'name', 'county', 'affiliated_agency', 'street_address', 'city', 'zipcode', 'phone', 'email', 'contact', 'location_type', 'area_covered', 'state', 'full_address']

--- Sample Before & After Cleaning ---
          Street_Address_Original                    street_address Phone_Original          phone
                   7718 Meany Ave                    7718 Meany Ave   661 587 3060 (661) 587-3060
          7575 Metropolitan Drive           7575 Metropolitan Drive   619-220-5451 (619) 220-5451
                  1550 W. Main St                   1550 W. Main St   760-353-0607 (760) 353-0607
              455 Golden Gate Ave               455 Golden Gate Ave   415-703-5300 (415) 703-5300
          770 E. Shaw Ave Ste 222           770 E. Shaw Ave Ste 222   559-244-5340 (559) 244-5340
100 Paseo De San Antonio Room 120 100 Paseo De San Antonio Room 120   408-277-1266 (408) 277-1266
            1500 Hughes Way C-202             1500 Hughes Way C-202  

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>