In [None]:
pip install googlemaps

Collecting googlemaps
  Downloading googlemaps-4.10.0.tar.gz (33 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: googlemaps
  Building wheel for googlemaps (setup.py) ... [?25l[?25hdone
  Created wheel for googlemaps: filename=googlemaps-4.10.0-py3-none-any.whl size=40715 sha256=4ecbf23dc05314a9020146220ebde4153ce025dee1671cddcd193111c5b3e64c
  Stored in directory: /root/.cache/pip/wheels/17/f8/79/999d5d37118fd35d7219ef57933eb9d09886c4c4503a800f84
Successfully built googlemaps
Installing collected packages: googlemaps
Successfully installed googlemaps-4.10.0


In [None]:
import pandas as pd
import googlemaps
import time

# **Import the Data Set**

In [None]:
# Load the source CSV into `df`. The file is not UTF-8 encoded, so the
# ISO-8859-1 encoding is requested explicitly.
df = pd.read_csv(
    filepath_or_buffer='/content/comprehensive5.csv',
    encoding='ISO-8859-1',
)

# Reverse Geocode Addresses based on Lon/Lat Values




In [None]:
import pandas as pd
import googlemaps
import time

# Initialize Google Maps client.
# The key is taken from the GOOGLE_MAPS_API_KEY environment variable, falling
# back to an interactive prompt, so it is never stored in the notebook itself.
import os
from getpass import getpass

api_key = os.environ.get("GOOGLE_MAPS_API_KEY") or getpass("Google Maps API key: ")
gmaps = googlemaps.Client(key=api_key)

# Add 'full_address' column if it doesn't exist
if 'full_address' not in df.columns:
    df['full_address'] = None

# Function to check if a value can be converted to float
def is_float(value):
    """Return True if ``float(value)`` succeeds, False otherwise.

    Args:
        value: Any object (string, number, None, ...).

    Returns:
        bool: True when the value is float-convertible. Note that NaN and
        numeric strings such as "1e3" count as convertible.
    """
    try:
        float(value)
    except (TypeError, ValueError, OverflowError):
        # ValueError: unparseable string; TypeError: None or a container;
        # OverflowError: an int too large to represent as a float.
        return False
    return True

# Flag rows whose coordinate cells cannot be converted to float
lat_is_numeric = df['Latitude'].apply(is_float)
lon_is_numeric = df['Longitude'].apply(is_float)
invalid_lat = df[~lat_is_numeric]
invalid_lon = df[~lon_is_numeric]

# Report the offending cells, one coordinate column at a time
for heading, column, bad_rows in (
    ("Invalid Latitude values found:", 'Latitude', invalid_lat),
    ("Invalid Longitude values found:", 'Longitude', invalid_lon),
):
    if bad_rows.empty:
        continue
    print(heading)
    print(bad_rows[[column]])

# Coerce both coordinate columns to numeric; unparseable cells become NaN
for column in ('Latitude', 'Longitude'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Reverse geocoding function
def reverse_geocode(lat, lon):
    """Look up the address for a latitude/longitude pair.

    Queries the module-level ``gmaps`` client and returns the
    'formatted_address' of the first result, or None when the lookup
    yields no results.
    """
    matches = gmaps.reverse_geocode((lat, lon))
    if not matches:
        return None
    return matches[0]['formatted_address']

# Identify rows with valid coordinates
valid_coords = df[df['Latitude'].notnull() & df['Longitude'].notnull()]

# Populate addresses for rows with valid coordinates.
# Rows that already carry an address are skipped so a re-run does not spend
# API quota on them, and the CSV is written in a `finally` block so addresses
# fetched before an API error (or a manual interrupt) are not lost.
try:
    for index, row in valid_coords.iterrows():
        if pd.notnull(row['full_address']):
            continue  # address already present from an earlier run
        lat, lon = row['Latitude'], row['Longitude']
        address = reverse_geocode(lat, lon)
        if address:
            df.at[index, 'full_address'] = address
            print(f"Row {index + 1} address populated: {address}")
        else:
            print(f"Row {index + 1} address not found.")
        time.sleep(0.2)  # Slight delay to avoid hitting API rate limits
finally:
    # Save the updated dataset (also runs when the loop is cut short)
    df.to_csv('updated_dataset.csv', index=False)


Row 1 address populated: 10 Clay St, Oakland, CA 94607, USA
Row 2 address populated: Main Street Alameda Ferry Terminal, 2990 Main St, Alameda, CA 94501, USA
Row 3 address populated: 1 San Francisco Bay Trail, San Francisco, CA 94111, USA
Row 4 address populated: San Francisco Pier 41 Ferry Terminal, San Francisco, CA 94133, USA
Row 5 address populated: 830 E 14th St #1, San Leandro, CA 94577, USA
Row 6 address populated: Davis St & Clarke St, San Leandro, CA 94577, USA
Row 7 address populated: San Leandro, 1401 San Leandro Blvd, San Leandro, CA 94577, USA
Row 8 address populated: 530 E 14th St, San Leandro, CA 94577, USA
Row 9 address populated: Durant Avenue, San Leandro, CA 94577, USA
Row 10 address populated: 103rd Avenue, Oakland, CA 94603, USA
Row 11 address populated: 95th Avenue, Oakland, CA 94603, USA
Row 12 address populated: 8940 International Blvd, Oakland, CA 94621, USA
Row 13 address populated: 8525 International Blvd, Oakland, CA 94621, USA
Row 14 address populated: 98th

# **Extract City Information**

In [None]:
# Import necessary libraries
import pandas as pd

# Read the dataset expected to hold the 'full_address' column.
# NOTE(review): this file name differs from the 'updated_dataset.csv' written
# by the reverse-geocoding step — confirm it is the intended input.
df = pd.read_csv(filepath_or_buffer='/content/comprehesive5_w_address.csv')

# Function to extract city name from the address
def extract_city(address):
    """Return the city component of a comma-separated formatted address.

    The addresses in this dataset end with ``<city>, <state> <zip>, <country>``
    but start with a variable number of components (place name, street), e.g.
    ``"Main Street Alameda Ferry Terminal, 2990 Main St, Alameda, CA 94501, USA"``.
    The city is therefore taken as the third component counted from the END;
    counting from the start would return the street for such addresses.

    Args:
        address: Formatted address string. Non-string values (None, NaN) are
            tolerated.

    Returns:
        The city name, or None if the address is not a string or has fewer
        than three comma-separated components.
    """
    if not isinstance(address, str):
        return None  # missing address (None / NaN)

    parts = [part.strip() for part in address.split(',')]
    if len(parts) < 3:
        return None  # no "<city>, <state zip>, <country>" tail to anchor on

    return parts[-3] or None

# Derive the 'City' column by running the extractor over every address
city_names = df['full_address'].apply(extract_city)
df['City'] = city_names

# Write the result to disk (row index omitted) and confirm
df.to_csv(path_or_buf='updated_dataset_with_city.csv', index=False)
print("City names extracted and saved to 'updated_dataset_with_city.csv'")


City names extracted and saved to 'updated_dataset_with_city.csv'


# **Extract State Information**

In [None]:
# Read the dataset that includes the 'City' column
# (presumably the file written by the city-extraction step).
df = pd.read_csv(filepath_or_buffer='/content/updated_dataset_with_city.csv')

# Function to extract state abbreviation from the address
def extract_state(address):
    """Return the state abbreviation from a comma-separated formatted address.

    The addresses in this dataset end with ``<city>, <state> <zip>, <country>``
    but start with a variable number of components, so the "<state> <zip>"
    part is located as the second component counted from the END. Its first
    whitespace-separated token is the state (e.g. "CA" from "CA 94607").

    Args:
        address: Formatted address string. Non-string values (None, NaN) are
            tolerated.

    Returns:
        The state token, or None if the address is not a string, has fewer
        than three comma-separated components, or the state part is empty.
    """
    if not isinstance(address, str):
        return None  # missing address (None / NaN)

    parts = [part.strip() for part in address.split(',')]
    if len(parts) < 3:
        return None  # no "<city>, <state zip>, <country>" tail to anchor on

    tokens = parts[-2].split()
    return tokens[0] if tokens else None

# Derive the 'State' column by running the extractor over every address
state_codes = df['full_address'].apply(extract_state)
df['State'] = state_codes

# Write the result to disk (row index omitted) and confirm
df.to_csv(path_or_buf='updated_dataset_with_state.csv', index=False)
print("State abbreviations extracted and saved to 'updated_dataset_with_state.csv'")


State abbreviations extracted and saved to 'updated_dataset_with_state.csv'


## **Generate County Information from the Reverse-Geocoded Addresses**

In [None]:
# Import necessary libraries
import pandas as pd
import googlemaps
import time

# Load your dataset
df = pd.read_csv('/content/updated_dataset_with_state.csv')

# Initialize Google Maps client.
# SECURITY: never paste the API key into the notebook — it is saved with the
# file and leaks whenever the notebook is shared. Any key that was committed
# here in the past should be rotated. The key is taken from the
# GOOGLE_MAPS_API_KEY environment variable, falling back to a hidden prompt.
import os
from getpass import getpass

api_key = os.environ.get("GOOGLE_MAPS_API_KEY") or getpass("Google Maps API key: ")
gmaps = googlemaps.Client(key=api_key)

# Function to get county from address
def get_county(address):
    """Geocode an address and return the name of its county.

    Uses the module-level ``gmaps`` client and reads the first geocoding
    result, returning the 'long_name' of the component typed
    'administrative_area_level_2'.

    Args:
        address: Address string to geocode. Non-string or blank values
            (None, NaN, "") are treated as missing.

    Returns:
        The county name, or None if the address is missing, no result or
        county component is found, or the lookup raises.
    """
    # Rows without an address hold None/NaN; passing those on would send a
    # junk query to the API, so bail out before making a request.
    if not isinstance(address, str) or not address.strip():
        return None

    try:
        # Geocode the address
        geocode_result = gmaps.geocode(address)
        if geocode_result:
            for component in geocode_result[0]['address_components']:
                # Check for "administrative_area_level_2" (county)
                if 'administrative_area_level_2' in component['types']:
                    return component['long_name']
        return None  # Return None if no county is found
    except Exception as e:
        # Best effort: report the failure and let the caller move on.
        print(f"Error for address {address}: {e}")
        return None

# Make sure there is a 'County' column to fill in
if 'County' not in df.columns:
    df['County'] = None

# Look up the county for every row that does not have one yet,
# printing each lookup as it happens
for index, row in df.iterrows():
    if not pd.isnull(row['County']):
        continue  # county already filled in
    address = row['full_address']
    county = get_county(address)
    df.at[index, 'County'] = county
    print(f"Row {index + 1}: Address = {address}, County = {county}")
    time.sleep(0.2)  # brief pause between API requests

# Write the result to disk (row index omitted) and confirm
df.to_csv(path_or_buf='updated_dataset_with_county.csv', index=False)

print("County information extracted and saved to 'updated_dataset_with_county.csv'")


Row 1: Address = 10 Clay St, Oakland, CA 94607, USA, County = Alameda County
Row 2: Address = Main Street Alameda Ferry Terminal, 2990 Main St, Alameda, CA 94501, USA, County = Alameda County
Row 3: Address = 1 San Francisco Bay Trail, San Francisco, CA 94111, USA, County = San Francisco County
Row 4: Address = San Francisco Pier 41 Ferry Terminal, San Francisco, CA 94133, USA, County = San Francisco County
Row 5: Address = 830 E 14th St #1, San Leandro, CA 94577, USA, County = Alameda County
Row 6: Address = Davis St & Clarke St, San Leandro, CA 94577, USA, County = Alameda County
Row 7: Address = San Leandro, 1401 San Leandro Blvd, San Leandro, CA 94577, USA, County = Alameda County
Row 8: Address = 530 E 14th St, San Leandro, CA 94577, USA, County = Alameda County
Row 9: Address = Durant Avenue, San Leandro, CA 94577, USA, County = Alameda County
Row 10: Address = 103rd Avenue, Oakland, CA 94603, USA, County = Alameda County
Row 11: Address = 95th Avenue, Oakland, CA 94603, USA, Cou