# Preprocessing

In [None]:
# Normalise zipcodes to 5-character strings: missing values become 0, the
# values are cast to int and then to str, and each string is left-padded
# with zeros (so a missing zipcode ends up as the placeholder "00000").
df['location_zipcode'] = (
    df['location_zipcode']
    .fillna(0)
    .astype(int)
    .astype(str)
    .apply(lambda code: code.zfill(5))
)

# Fill in the missing zipcodes by longitude and latitude
def find_zipcode_if_missing(row):
    """Return the row's zipcode, resolving the "00000" placeholder by location.

    Rows whose 'location_zipcode' is anything other than "00000" are returned
    unchanged. For placeholder rows a point is built from the row's
    'longitude' and 'latitude' and every feature in ``geojson['features']`` is
    tested in order; the 'ZIP5' property of the first feature whose geometry
    contains the point is returned.

    If no feature contains the point, the placeholder "00000" itself is
    returned (not NaN).

    NOTE(review): ``Point``, ``shape`` and ``geojson`` are not defined in this
    cell -- presumably shapely's constructors and a GeoJSON dict loaded
    elsewhere; confirm they are in scope before this runs.
    """
    current = row['location_zipcode']
    if current != "00000":
        # A real zipcode is already present; nothing to look up.
        return current

    lat, lon = row['latitude'], row['longitude']
    location = Point(lon, lat)  # x = longitude, y = latitude

    # Lazily scan the features; the first containing polygon wins.
    matches = (
        feature['properties']['ZIP5']
        for feature in geojson['features']
        if shape(feature['geometry']).contains(location)
    )
    return next(matches, current)

# Resolve placeholder zipcodes one row at a time via find_zipcode_if_missing
df['location_zipcode'] = df.apply(find_zipcode_if_missing, axis=1)

# Persist the updated table; the row index is not written
df.to_csv('../Datasets/animal_cases.csv', index=False)

# Print, without column or width truncation, the rows that still carry the
# "00000" placeholder after the lookup
with pd.option_context('display.max_columns', None, 'display.width', None):
    print(df.loc[df['location_zipcode'] == "00000"])

# Last expression of the cell: number of rows per zipcode
df['location_zipcode'].value_counts()

## Combine the animal columns

In [None]:
# Load the table that carries the `animal` column
animal_df = pd.read_csv('../Datasets/req_with_animals.csv')
# Copy that column onto df.
# NOTE(review): assigning a Series aligns on index labels, so this presumably
# relies on animal_df holding the same rows in the same order as df -- confirm.
df['animal']=animal_df['animal']
# Write the combined table back to animal_cases.csv (row index omitted)
df.to_csv('../Datasets/animal_cases.csv', index=False)

# GeoJSON Processing

In [None]:
import json

def print_structure(obj, indent=0):
    """Print an outline of a parsed JSON value, two spaces per nesting level.

    - dict: each key is printed on its own line, followed by the outline of
      its value one level deeper.
    - list: a ``List - N items`` line is printed; if the list is non-empty,
      only its first element is outlined (one level deeper).
    - anything else: a ``Value type: <type name>`` line is printed.

    Args:
        obj: The value to describe.
        indent: Current nesting depth; controls the leading whitespace.
    """
    prefix = '  ' * indent
    deeper = indent + 1

    if isinstance(obj, dict):
        for name, child in obj.items():
            print(prefix + str(name))
            print_structure(child, deeper)
        return

    if isinstance(obj, list):
        size = len(obj)
        print(prefix + f'List - {size} items')
        # The first element stands in for the rest of the list.
        if size > 0:
            print_structure(obj[0], deeper)
        return

    # Leaf value: report only its type name.
    print(prefix + f'Value type: {type(obj).__name__}')

# Load the ZIP-code boundary GeoJSON.
# The encoding is passed explicitly: GeoJSON is UTF-8 by specification, while
# open() would otherwise fall back to the platform's locale encoding.
filename = '../Datasets/USA_ZIP_Code_Boundaries.geojson'
with open(filename, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Print the structure of the loaded JSON data
print("JSON Structure:")
print_structure(data)


In [None]:
import json

def extract_zipcode_data(json_data, selected_zipcodes):
    """Build a feature collection restricted to the given zip codes.

    Args:
        json_data: Parsed GeoJSON-like dict with a 'type' entry and a
            'features' list; every feature must have
            ``feature['properties']['ZIP_CODE']``.
        selected_zipcodes: Container of zip code values to keep (tested with
            ``in``).

    Returns:
        A new dict with keys in the order 'type', 'name', 'crs', 'features'.
        'name' and 'crs' are copied from ``json_data`` only when present;
        'features' holds the matching feature objects (not copies) in their
        original order.

    Raises:
        KeyError: If 'type' or 'features' is missing, or a feature lacks
            'properties' / 'ZIP_CODE'. The property lookup is deliberately
            strict so that a wrong property name fails loudly instead of
            silently producing an empty result.
    """
    filtered_features = [
        feature for feature in json_data['features']
        if feature['properties']['ZIP_CODE'] in selected_zipcodes
    ]

    result = {"type": json_data['type']}
    # 'name' and 'crs' are optional members of the input; carry them over only
    # when they exist instead of raising KeyError on input that omits them.
    for optional_key in ("name", "crs"):
        if optional_key in json_data:
            result[optional_key] = json_data[optional_key]
    result["features"] = filtered_features
    return result

# Load the nationwide ZIP-code boundary GeoJSON.
# The encoding is passed explicitly: GeoJSON is UTF-8 by specification, while
# open() would otherwise fall back to the platform's locale encoding.
filename = '../Datasets/USA_ZIP_Code_Boundaries.geojson'
with open(filename, 'r', encoding='utf-8') as file:
    data = json.load(file)

# List of zip codes you are interested in
selected_zipcodes = [
    "02108", "02109", "02110", "02111", "02113", "02114", "02115",
    "02116", "02118", "02119", "02120", "02121", "02122", "02124",
    "02125", "02126", "02127", "02128", "02129", "02130", "02131",
    "02132", "02133", "02134", "02135", "02136", "02151", "02163",
    "02199", "02201", "02203", "02210", "02215", "02446", "02467",
]

# Extract the data for selected zip codes
filtered_data = extract_zipcode_data(data, selected_zipcodes)

# Save the filtered data to a new GeoJSON file (UTF-8, pretty-printed)
filtered_filename = '../Datasets/boston_zipcodes.geojson'
with open(filtered_filename, 'w', encoding='utf-8') as outfile:
    json.dump(filtered_data, outfile, indent=2)

# filtered_data remains available in memory for direct use in later cells
