In [7]:
import pandas as pd
import inflection
import re



# Location of the pledge export. This is an absolute, machine-specific path:
# update it with your actual CSV path before running.
pledge_data_path = "/Users/borismartinez/Documents/GitHub/engage/data/pledge_data.csv"

# Read the full CSV in one pass (low_memory=False) and report how many rows came in
pledge_df = pd.read_csv(pledge_data_path, low_memory=False)
print(f"Total pledge records loaded: {len(pledge_df)}")


def _normalize_column_name(raw_name):
    """
    Map one raw CSV header to the column name used by the rest of the notebook.

    Steps, in order:
      1. Strip surrounding whitespace and run the header through
         inflection.underscore.
         NOTE(review): the literal matches below contain spaces, so this step
         is presumably not expected to turn spaces into underscores - confirm
         against the actual headers.
      2. Collapse any header starting with the long "name (...)" label to 'name'.
      3. Drop a trailing '_(engage_miami)' suffix.
      4. Apply the exact-match renames for date created, zip and state.
    """
    name = inflection.underscore(raw_name.strip())

    if name.startswith("name (first last or common name)"):
        name = "name"

    # Remove any trailing _(engage_miami) pattern
    name = re.sub(r'_\((engage_miami)\)$', '', name)

    # Exact-match renames; anything not listed is returned unchanged
    exact_renames = {
        'date created': 'date_created',
        'zip/postal': 'zip',
        'state/province': 'state',
    }
    return exact_renames.get(name, name)


# 1. Rename columns by applying the normalisation to every header
new_columns = [_normalize_column_name(raw_name) for raw_name in pledge_df.columns]
pledge_df.columns = new_columns

# 1.1 Add 'year' and 'month' columns derived from date_contacted
if 'date_contacted' not in pledge_df.columns:
    # Nothing to derive from; 'year' and 'month' are NOT created in this case
    print("Warning: 'date_contacted' column not found in the pledge data.")
else:
    # Unparseable dates become NaT (errors='coerce') instead of raising
    contacted_at = pd.to_datetime(pledge_df['date_contacted'], errors='coerce')
    pledge_df['date_contacted'] = contacted_at
    pledge_df['year'] = contacted_at.dt.year
    pledge_df['month'] = contacted_at.dt.month



# 2. Lower-case and trim every column that pandas reports as string dtype
#    (columns that fail the check, e.g. numeric ones, are left untouched).
# NOTE(review): whether an object column that contains NaN passes
# is_string_dtype appears to depend on the pandas version - confirm which
# columns actually get normalised here. Skipped columns keep their original
# casing, and a later cell compares 'city' against a mixed-case literal.
for column_name in pledge_df.columns:
    column_values = pledge_df[column_name]
    if not pd.api.types.is_string_dtype(column_values):
        continue
    pledge_df[column_name] = column_values.astype(str).str.lower().str.strip()

# 3. Row counts: all rows, and rows that remain after dropping exact
#    duplicates (every column compared)
total_rows = pledge_df.shape[0]
distinct_rows = len(pledge_df.drop_duplicates())

print(f"Total pledge records: {total_rows}")
print(f"Distinct pledge records (all columns): {distinct_rows}")


# Preview of the first rows for preliminary understanding.
# NOTE: this bare expression is not the last statement of the cell, so the
# notebook does not render it.
pledge_df.head()


# 4. Usable & missing values count and percent per key column(s)
# Columns that must all be populated for a record to count as usable
# (example names, adjust as needed)
key_cols = [
    'name',
    'city',
    'county',
    'address',
]

# Text values that count as missing once trimmed and lower-cased
null_equivs = [
    '',
    'na',
    'nan',
    'null',
    'none',
    'undefined',
]


def is_missing_strict(series):
    """
    Flag the missing entries of a Series.

    An entry is missing when it is NaN/None, or when its string form, after
    trimming whitespace and lower-casing, is one of `null_equivs`.

    Args:
        series (pd.Series): Values to inspect.

    Returns:
        pd.Series: Boolean Series aligned with `series`; True marks missing.
    """
    normalized_text = series.astype(str).str.strip().str.lower()
    looks_null = normalized_text.isin(null_equivs)
    return series.isna() | looks_null


# Build one missing-value mask per key column; both the per-column counts and
# the row-level usable flag are derived from these masks
missing_masks = {col: is_missing_strict(pledge_df[col]) for col in key_cols}

# Missing counts and percentages per key column (percent of all rows)
missing_counts = {col: mask.sum() for col, mask in missing_masks.items()}
missing_df = pd.DataFrame.from_dict(missing_counts, orient='index', columns=['missing_count'])
missing_df['missing_pct'] = (missing_df['missing_count'] / total_rows) * 100

# A row is usable only when none of its key columns is missing
any_key_missing = pd.concat(list(missing_masks.values()), axis=1).any(axis=1)
usable_mask = ~any_key_missing

# Store the flag on the dataframe so later cells can filter on it
pledge_df['usable'] = usable_mask

print("\nMissing counts and percentages per key column:")
print(missing_df)

print(f"\nTotal usable records based on keys {key_cols}: {pledge_df['usable'].sum()} ({pledge_df['usable'].mean()*100:.2f}%)")


Total pledge records loaded: 10698
Total pledge records: 10698
Distinct pledge records (all columns): 10698

Missing counts and percentages per key column:
         missing_count  missing_pct
name                 0         0.00
city               943         8.81
county            1005         9.39
address           5863        54.80

Total usable records based on keys ['name', 'city', 'county', 'address']: 4803 (44.90%)


In [11]:
import pandas as pd

# Assuming pledge_df loaded and columns renamed earlier

# Issue columns are selected by position: zero-indexed columns 15 through 30
# (the slice end, 31, is exclusive), i.e. 16 columns.
issue_cols = pledge_df.columns[15:31]

# Normalise each issue column to trimmed, lower-case text and add a 0/1
# companion column "<issue>_flag" that is 1 where the cell is exactly 'x'
for issue in issue_cols:
    normalized = pledge_df[issue].astype(str).str.lower().str.strip()
    pledge_df[issue] = normalized
    pledge_df[issue + '_flag'] = (normalized == 'x').astype(int)

# One summary record per issue: how many pledges marked it and what share
# of all pledges that is
total_pledges = len(pledge_df)
summary_list = []
for issue in issue_cols:
    marked = pledge_df[issue + '_flag'].sum()
    summary_list.append({
        'issue': issue,
        'total_marked': marked,
        'pct_marked': marked / total_pledges * 100,
    })

# Most frequently marked issues first
issues_summary_df = pd.DataFrame(summary_list).sort_values(by='pct_marked', ascending=False)

print("Issue coverage summary:")
print(issues_summary_df)

Issue coverage summary:
                    issue  total_marked  pct_marked
7   housing_affordability          4619       43.18
1   affordable_healthcare          4100       38.32
0         abortion_access          4065       38.00
6    gun_violence_prevent          3987       37.27
5      education_(public)          3915       36.60
11          racial_equity          3122       29.18
14       updates_interest          3121       29.17
4       economic_mobility          2580       24.12
2         climate_and_env          2373       22.18
15     volunteer_interest          2292       21.42
8         lgbtqia+_rights          2288       21.39
10         public_transit          2287       21.38
12          voting_rights          1900       17.76
9    pro_immigrant_policy          1630       15.24
3        decrim_marijuana          1537       14.37
13        member_interest           532        4.97


In [10]:
# Order issues by how many pledges marked them (largest first) and number
# them 1..N in that order
ranked = issues_summary_df.sort_values(by='total_marked', ascending=False)
ranked = ranked.assign(rank=range(1, len(ranked) + 1))

# Put the rank first for readability
issues_summary_df = ranked.loc[:, ['rank', 'issue', 'total_marked', 'pct_marked']]

print("Ranked Issue Coverage Summary:")
print(issues_summary_df)

Ranked Issue Coverage Summary:
    rank                  issue  total_marked  pct_marked
7      1  housing_affordability          4619       43.18
1      2  affordable_healthcare          4100       38.32
0      3        abortion_access          4065       38.00
6      4   gun_violence_prevent          3987       37.27
5      5     education_(public)          3915       36.60
11     6          racial_equity          3122       29.18
14     7       updates_interest          3121       29.17
4      8      economic_mobility          2580       24.12
2      9        climate_and_env          2373       22.18
15    10     volunteer_interest          2292       21.42
8     11        lgbtqia+_rights          2288       21.39
10    12         public_transit          2287       21.38
12    13          voting_rights          1900       17.76
9     14   pro_immigrant_policy          1630       15.24
3     15       decrim_marijuana          1537       14.37
13    16        member_interest          

In [16]:
import numpy as np
import pandas as pd
# Render every float that pandas displays with exactly two decimals
pd.set_option('display.float_format', '{:.2f}'.format)

def compute_conditional_cooccurrence(co_occurrence, filter_desc="overall"):
    """
    Turn a square co-occurrence count matrix into row-conditional shares.

    Every row is divided by that row's diagonal entry, so cell (i, j) becomes
    count(i and j) / count(i). A diagonal entry of 0 is treated as 1 for the
    division, so such a row keeps its original values instead of producing
    NaN or inf.

    The numeric result is printed under a header containing `filter_desc`;
    how the printed floats look depends on the active pandas display options.
    The returned frame holds the same values as two-decimal strings
    (trailing zeros included).

    Args:
        co_occurrence (pd.DataFrame): Square co-occurrence counts matrix.
        filter_desc (str): Description printed in the header for context.

    Returns:
        pd.DataFrame: Conditional co-occurrence matrix formatted as strings.
    """
    # Diagonal = how often each issue was marked at all
    diag = np.diag(co_occurrence.values)

    # Replace zeros on the diagonal with 1 to avoid division by zero
    diag_safe = np.where(diag == 0, 1, diag)

    # Normalise each row by its own diagonal entry
    conditional_cooccurrence = co_occurrence.div(diag_safe, axis=0)

    def _two_decimals(value):
        return f"{value:.2f}"

    # DataFrame.applymap is deprecated in favour of DataFrame.map; prefer the
    # new name when this pandas provides it, and fall back otherwise.
    if hasattr(pd.DataFrame, "map"):
        conditional_cooccurrence_str = conditional_cooccurrence.map(_two_decimals)
    else:
        conditional_cooccurrence_str = conditional_cooccurrence.applymap(_two_decimals)

    print(f"\nConditional co-occurrence matrix ({filter_desc}):")
    print(conditional_cooccurrence)

    return conditional_cooccurrence_str


def compute_yearly_cooccurrence(df, year_column, year, issue_cols):
    """
    Build the conditional co-occurrence matrix for a single year.

    Keeps the rows whose `year_column` equals `year`, multiplies the selected
    flag columns with their own transpose to get pairwise co-occurrence
    counts, and passes those counts to compute_conditional_cooccurrence
    (which also prints the matrix, labelled "year <year>").

    Args:
        df (pd.DataFrame): Full dataframe including flag columns and year column.
        year_column (str): Name of the column holding the year to filter on.
        year (int): The year value to filter on.
        issue_cols (list): List of binary flag columns.

    Returns:
        pd.DataFrame: Conditional co-occurrence matrix formatted as strings.
    """
    rows_for_year = df.loc[df[year_column] == year]

    # Only the flag columns take part in the co-occurrence product
    flags = rows_for_year[issue_cols]
    pair_counts = flags.T.dot(flags)

    return compute_conditional_cooccurrence(pair_counts, filter_desc=f"year {year}")

In [18]:
# Every column whose name ends in '_flag' is treated as an issue flag
issue_flag_cols = list(filter(lambda name: name.endswith('_flag'), pledge_df.columns))

# Conditional co-occurrence for 2023, filtering on the 'year' column
formatted_2023 = compute_yearly_cooccurrence(pledge_df, year_column='year', year=2023, issue_cols=issue_flag_cols)


Conditional co-occurrence matrix (year 2023):
                            abortion_access_flag  affordable_healthcare_flag  \
abortion_access_flag                        1.00                        0.47   
affordable_healthcare_flag                  0.44                        1.00   
climate_and_env_flag                        0.51                        0.52   
decrim_marijuana_flag                       0.51                        0.48   
economic_mobility_flag                      0.41                        0.51   
education_(public)_flag                     0.39                        0.46   
gun_violence_prevent_flag                   0.47                        0.43   
housing_affordability_flag                  0.38                        0.51   
lgbtqia+_rights_flag                        0.65                        0.45   
pro_immigrant_policy_flag                   0.50                        0.51   
public_transit_flag                         0.42                        0

  conditional_cooccurrence_str = conditional_cooccurrence.applymap(lambda x: f"{x:.2f}")


In [20]:
# Restrict to Miami pledges contacted in 2023.
# The city is compared on a trimmed, lower-cased copy so the filter matches
# whether or not 'city' was lower-cased by the notebook's earlier clean-up
# step; an exact comparison against 'Miami' would silently select nothing
# once the column has been lower-cased.
city_normalized = pledge_df['city'].astype(str).str.strip().str.lower()
city_year_filtered = pledge_df[(city_normalized == 'miami') & (pledge_df['year'] == 2023)]

# compute_yearly_cooccurrence filters on the year again (harmless here), and
# the header it prints mentions only the year, not the city.
miami_ls = compute_yearly_cooccurrence(
    city_year_filtered,
    year_column='year',
    year=2023,
    issue_cols=issue_flag_cols
)


Conditional co-occurrence matrix (year 2023):
                            abortion_access_flag  affordable_healthcare_flag  \
abortion_access_flag                        1.00                        0.44   
affordable_healthcare_flag                  0.43                        1.00   
climate_and_env_flag                        0.44                        0.52   
decrim_marijuana_flag                       0.51                        0.48   
economic_mobility_flag                      0.38                        0.51   
education_(public)_flag                     0.38                        0.45   
gun_violence_prevent_flag                   0.43                        0.41   
housing_affordability_flag                  0.35                        0.48   
lgbtqia+_rights_flag                        0.68                        0.44   
pro_immigrant_policy_flag                   0.48                        0.49   
public_transit_flag                         0.39                        0

  conditional_cooccurrence_str = conditional_cooccurrence.applymap(lambda x: f"{x:.2f}")
