# In [None]:
# Import libraries
import requests
import pandas as pd
from geopy.geocoders import GoogleV3
import numpy as np
import os

# Load the cleaned address list (the file is latin-1 encoded)
df = pd.read_csv('Cleaned_Data.csv', encoding='latin-1')

# Get the dataframe ready for the API calls.
# The province "murcia" is renamed to "Region de Murcia" and stored as text.
df['prov'] = df['province'].map(
    lambda name: "Region de Murcia" if name == "murcia" else name
).astype(str)
# Missing street types are replaced with an empty string
df['type_of_street'] = df['type_of_street'].map(
    lambda kind: '' if pd.isnull(kind) else kind
)
# Street numbers become whole numbers; missing ones become an empty string
df['street_number'] = df['street_number'].map(
    lambda number: '' if pd.isnull(number) else int(number)
)

# Assemble the full address string that is sent to the API
street_part = df['type_of_street'] + ' ' + df['street_name2']
number_part = df['street_number'].astype(str)
df['full'] = street_part + ', ' + number_part + ', ' + df['prov'] + ', España'

# API part, beginning with the Google Places API.

# The Google API key is read from the "api_key" environment variable
# so that the key itself never appears in the source.
API_KEY = os.getenv("api_key")

def validate_address(address, timeout=10):
    """
    Validate an address using the Google Places Autocomplete API.

    Parameters
    ----------
    address : str
        Free-text address to look up.
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up (default 10).

    Returns
    -------
    str or None
        The description of the first autocomplete prediction, or None when
        the address is missing or blank, the API status is not 'OK', there
        are no predictions, or the request fails.
    """
    # Missing values (None/NaN) and blank strings cannot be looked up;
    # skip them instead of spending an API call on them.
    if not isinstance(address, str) or not address.strip():
        return None

    try:
        # Prepare and send the API request. A timeout is passed because
        # requests otherwise waits indefinitely on an unresponsive server.
        response = requests.get(
            'https://maps.googleapis.com/maps/api/place/autocomplete/json',
            params={'input': address, 'key': API_KEY},
            timeout=timeout,
        )

        # Parse the response
        data = response.json()
        if data.get('status') != 'OK':
            return None

        # Return the first autocomplete prediction if available
        predictions = data.get('predictions') or []
        if not predictions:
            return None
        return predictions[0]['description']

    except Exception as e:
        # Best effort: one failing row must not abort the whole dataframe run.
        print("Didn't Work: ", e)
        return None

# Run every full address through the Places validation
df['FORMATED_ADDRESS'] = df['full'].map(validate_address)

# The formatted address column now exists; the coordinates are still missing.
# They are obtained with the Google Geocoding API.

# Build the geocoder client
geolocator2 = GoogleV3(api_key=API_KEY)

def _parse_geocode_result(data):
    """
    Flatten one raw geocoding result dictionary into the eleven output fields.

    Returns a tuple in the order (type_street, street, streetnumber,
    locality, province, region, country, postal_code, neighborhood, lat,
    long). Fields that are not present in the result stay as empty strings.
    """
    # Component "types" that map one-to-one onto an output field
    field_by_types = {
        ('locality', 'political'): 'locality',
        ('administrative_area_level_2', 'political'): 'province',
        ('administrative_area_level_1', 'political'): 'region',
        ('country', 'political'): 'country',
        ('postal_code',): 'postal_code',
        ('street_number',): 'streetnumber',
        ('neighborhood', 'political'): 'neighborhood',
    }
    fields = dict.fromkeys(field_by_types.values(), '')
    type_street = ''
    street = ''

    for component in data.get('address_components', []):
        types = tuple(component.get('types', ()))
        name = component.get('long_name', '')
        if types == ('route',):
            # Split at the first space to separate the street type from the name
            prefix, sep, rest = name.partition(' ')
            if sep and prefix in ('Calle', 'Avenida'):
                type_street, street = prefix, rest
            else:
                # No recognised street type: keep the whole name as the street
                street = name
        elif types in field_by_types:
            fields[field_by_types[types]] = name

    # Coordinates are optional; a missing one stays as an empty string
    coordinates = data.get('geometry', {}).get('location', {})
    lat = coordinates.get('lat', '')
    long = coordinates.get('lng', '')

    return (
        type_street, street, fields['streetnumber'], fields['locality'],
        fields['province'], fields['region'], fields['country'],
        fields['postal_code'], fields['neighborhood'], lat, long,
    )


def extract_clean_address(row):
    """
    Geocode the row's FORMATED_ADDRESS and return its parts as a Series.

    The returned Series has eleven values, in order: street type, street
    name, street number, locality, province, region, country, postal code,
    neighbourhood, latitude and longitude. All eleven are None when the
    address is missing or blank, the geocoder finds nothing, or the lookup
    fails.
    """
    empty_result = pd.Series([None] * 11)
    address = row['FORMATED_ADDRESS']

    # Nothing to geocode: skip the API call for missing or blank addresses
    if not isinstance(address, str) or not address.strip():
        return empty_result

    try:
        location = geolocator2.geocode(address)
        if location is None:
            # The geocoder returned no match for this address
            print("Didn't Work")
            return empty_result
        return pd.Series(_parse_geocode_result(location.raw))
    except Exception as exc:
        # Best effort per row; unlike a bare except this still lets
        # KeyboardInterrupt stop a long run.
        print("Didn't Work: ", exc)
        return empty_result



# Geocode every row with extract_clean_address and spread the eleven
# returned values over their own columns
df[[
    'TYPE_STREET', 'STREET_NAME', 'STREET_NUMBER',
    'LOCALITY', 'PROVINCE', 'REGION', 'COUNTRY', 'POSTAL_CODE',
    'NEIGHBOURHOOD', 'LAT', 'LONG',
]] = df.apply(extract_clean_address, axis=1)


# Urbanizaciones are frequent and cause problems, so their name is stored
# as the neighbourhood to make them easier to read
def clean_urba(row):
    """
    Return the neighbourhood for a row, preferring the urbanization name.

    When the formatted address mentions an urbanization ('urbaniz') or
    'aldeas', the first comma-separated part of the address is returned.
    Otherwise the neighbourhood already stored in the row is kept (an empty
    string if there is none), so an existing value is not thrown away.
    """
    formatted = str(row['FORMATED_ADDRESS'])
    lowered = formatted.lower()
    if 'urbaniz' in lowered or 'aldeas' in lowered:
        return formatted.split(',')[0]

    # Keep whatever neighbourhood the row already has; None/NaN become ''
    existing = row.get('NEIGHBOURHOOD', '')
    return existing if isinstance(existing, str) else ''

# Fill the neighbourhood column using clean_urba
df['NEIGHBOURHOOD'] = df.apply(clean_urba, axis=1)

# OBSERVATIONS holds extra data that the API does not provide: the floor,
# or an empty string when the floor is missing.
df['OBSERVATIONS'] = df['floor'].map(lambda floor: '' if pd.isna(floor) else floor)

# Function to add floor and street number if urbanization
def add_observ_urba(row):
    """
    Return the observations text for a row.

    For urbanization rows (the neighbourhood contains 'urbaniz' or 'aldeas')
    the result is the street number followed by the floor, separated by a
    space. Missing values are left out instead of being written as the
    text 'nan' or 'None'. Any other row keeps its current OBSERVATIONS value.
    """
    neighbourhood = str(row['NEIGHBOURHOOD']).lower()
    if 'urbaniz' not in neighbourhood and 'aldeas' not in neighbourhood:
        return row['OBSERVATIONS']

    parts = []
    street_number = row['street_number']
    if pd.notna(street_number):
        # A float such as 12.0 is rendered as '12.0'; keep the part before the dot
        parts.append(str(street_number).split('.')[0])
    floor = row['floor']
    if pd.notna(floor):
        parts.append(str(floor))

    # Empty pieces are dropped so no stray spaces are produced
    return ' '.join(part for part in parts if part)

# Add observations
df['OBSERVATIONS'] = df.apply(add_observ_urba, axis=1)

# Take the 'nan' placeholders out of the observations. Only whole
# space-separated 'nan' tokens are dropped, so words that merely contain
# those letters are left intact, and non-text values are passed through
# unchanged instead of raising a TypeError.
df['OBSERVATIONS'] = df['OBSERVATIONS'].apply(
    lambda x: ' '.join(token for token in x.split(' ') if token != 'nan')
    if isinstance(x, str) else x
)

# Add street number if it's not automatically added
def add_street_number(row):
    """
    Return the street number for a row.

    The geocoded STREET_NUMBER is used when present. When it is missing
    (None, NaN, or an empty string) the street_number from the source data
    is returned instead.
    """
    geocoded = row['STREET_NUMBER']
    # None has to be checked explicitly: str(None) is 'None', not 'nan'
    if geocoded is None or str(geocoded) in ('nan', ''):
        return row['street_number']
    return geocoded

# Fill in the street numbers that are still empty
df['STREET_NUMBER'] = df.apply(add_street_number, axis=1)

# Keep only the output columns in a separate, clean dataframe
clean_df = df.loc[:, [
    'FORMATED_ADDRESS', 'TYPE_STREET', 'STREET_NAME', 'STREET_NUMBER',
    'LOCALITY', 'PROVINCE', 'REGION', 'COUNTRY', 'POSTAL_CODE', 'NEIGHBOURHOOD',
    'OBSERVATIONS', 'LAT', 'LONG',
]].copy()

# Write both the full and the clean dataframe to CSV
df.to_csv('full_df.csv', index=False, encoding='utf-8-sig')
clean_df.to_csv('clean_full_df.csv', index=False, encoding='utf-8-sig')
