In [2]:
import pandas as pd

# Load the input CSV into a DataFrame.
# The path is hard-coded (it looks like a Colab '/content' upload); point it at
# your own file's location before running.
input_csv_path = '/content/llm-human-readable__content_perfect2_data_part-00000-e392e962-1bf5-400a-be27-ee691b69834e-c000.csv'
df = pd.read_csv(input_csv_path)

# Full state/territory name -> two-letter postal abbreviation.
# Covers the 50 states, the District of Columbia, and five U.S. territories.
state_code_mapping = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY',
    # Federal district
    'District of Columbia': 'DC',
    # U.S. territories
    'Puerto Rico': 'PR',
    'Guam': 'GU',
    'U.S. Virgin Islands': 'VI',
    'American Samoa': 'AS',
    'Northern Mariana Islands': 'MP',
}

# Broadband type label -> display name used in the summary text.
# Each known label currently maps to itself; labels not listed here are
# handled by the caller's own fallback.
bb_type_mapping = {
    label: label
    for label in ('Fixed Broadband', 'Fixed Wireless Access')
}

# Render one data row as a single templated English sentence.
def create_uniform_summary(row):
    """Return the market-share summary sentence for one row.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) providing the keys
            'State level', 'hh_bb_carrier', 'bb_type',
            'distinct_hhid_count', 'total_hhids' and 'carrier_share'.

    Returns:
        The formatted summary string. If anything raises while reading or
        converting the fields, the exception is not propagated; instead the
        string "Error processing row: <exception text>" is returned so the
        failing rows can be found afterwards by searching for that prefix.
    """
    try:
        state_name = row['State level']
        # States missing from the lookup are labelled 'Unknown', not rejected.
        abbreviation = state_code_mapping.get(state_name, 'Unknown')
        provider = row['hh_bb_carrier']
        # Unmapped broadband types pass through unchanged.
        service = bb_type_mapping.get(row['bb_type'], row['bb_type'])
        served = int(row['distinct_hhid_count'])
        overall = int(row['total_hhids'])
        # carrier_share is stored as a fraction; scale it to a percentage.
        share_pct = float(row['carrier_share']) * 100

        return (
            f"In the state of {state_name} ({abbreviation}) in USA, during 2024, July, "
            f"{provider} providing {service} services served {served:,} households "
            f"out of a total of {overall:,} in the country, resulting in a market share of {share_pct:.2f}%."
        )
    except Exception as exc:
        return f"Error processing row: {exc}"

# Build the summary sentence for every row and store it in the
# Human_Readable_Summary column.
df['Human_Readable_Summary'] = df.apply(create_uniform_summary, axis=1)

# Write the DataFrame, including the new column, to 'updated_file.csv'
# (change the name to your desired output file); the index is not written.
df.to_csv('updated_file.csv', index=False)

# Show the generated summaries for a quick visual check.
print(df[['Human_Readable_Summary']])

# create_uniform_summary reports failures as text rather than raising, so
# failed rows are found by searching the column for the error marker.
failed_mask = df['Human_Readable_Summary'].str.contains("Error processing row")
errors = df[failed_mask]
if errors.empty:
    print("\nNo errors found in the summaries.")
else:
    print("\nRows with errors:")
    print(errors)

                                 Human_Readable_Summary
0     In the state of California (CA) in USA, during...
1     In the state of California (CA) in USA, during...
2     In the state of Florida (FL) in USA, during 20...
3     In the state of Texas (TX) in USA, during 2024...
4     In the state of Texas (TX) in USA, during 2024...
...                                                 ...
1596  In the state of Ohio (OH) in USA, during 2024,...
1597  In the state of Florida (FL) in USA, during 20...
1598  In the state of Alaska (AK) in USA, during 202...
1599  In the state of Georgia (GA) in USA, during 20...
1600  In the state of Rhode Island (RI) in USA, duri...

[1601 rows x 1 columns]

No errors found in the summaries.
