# Import Libraries

In [1]:
print("Importing required libraries...")
import pandas as pd
print("Succesfully imported pandas library")
import datetime
print("Succesfully imported datetime library")
import time
print("Succesfully imported time library")
import pytz
print("Succesfully imported pytz library")
print("Installing geopy library")
%pip install geopy
print("Succesfully installed geopy library")
from geopy.geocoders import Nominatim
print("Succesfully imported Nominatim library")
print("All imports completed!")

Importing required libraries...
Succesfully imported pandas library
Succesfully imported datetime library
Succesfully imported time library
Succesfully imported pytz library
Installing geopy library
Note: you may need to restart the kernel to use updated packages.
Succesfully installed geopy library
Succesfully imported Nominatim library
All imports completed!


# Define Values

In [2]:
print("Defining values...")
uberdata_url = 'https://docs.google.com/spreadsheets/d/19DnHil6Iar2YV67hUU36VXbvKMYEQORzWHrE7LZOGpA/edit#gid=712533051'
print(f"Google Sheets URL loaded: {uberdata_url}")
uberdata = uberdata_url.replace('/edit#gid=', '/export?format=csv&gid=')
print(f"Google Sheets URL changed to: {uberdata}")
datetime_format = '%Y-%m-%d|%H:%M:%S'
print(f"Date and time format changed to {datetime_format}")
timezone = pytz.timezone('Asia/Bangkok')
print(f"Time Zone set to {timezone}")
start_row = 0 # Index number of dataframe to process
print(f"Start row set to {start_row}")
rows_per_chunk = 5000 # Number of rows to process at a time
print(f"Chunk size set to {rows_per_chunk}")
timeout = 5  # Wait duration for request timeout in seconds
print(f"Timeout duration set to {timeout}")
max_retry = 10 # Number of retries before skipping a row
print(f"Maximum retries set to {max_retry}")
geolocator = Nominatim(user_agent="MyApp") # Initialize Nominatim API
print("Geolocator defined")
print("All values defined!")

Defining values...
Google Sheets URL loaded: https://docs.google.com/spreadsheets/d/19DnHil6Iar2YV67hUU36VXbvKMYEQORzWHrE7LZOGpA/edit#gid=712533051
Google Sheets URL changed to: https://docs.google.com/spreadsheets/d/19DnHil6Iar2YV67hUU36VXbvKMYEQORzWHrE7LZOGpA/export?format=csv&gid=712533051
Date and time format changed to %Y-%m-%d|%H:%M:%S
Time Zone set to Asia/Bangkok
Start row set to 0
Chunk size set to 5000
Timeout duration set to 5
Maximum retries set to 10
Geolocator defined
All values defined!


# Run the codes

## 1st Iteration

In [3]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
#df_uberdata = pd.read_csv(uberdata)
df_uberdata = pd.read_csv('uber_dataset_reverse_geocode.csv')
df_uberdata = df_uberdata[df_uberdata["pickup_address"].isnull() | df_uberdata["dropoff_address"].isnull()]
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
#df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count+1} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} after {max_retry-timeout_count} more retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

2023-04-05|02:38:35 - Loading CSV into DataFrame...


  df_uberdata = pd.read_csv('uber_dataset_reverse_geocode.csv')


2023-04-05|02:38:37 - CSV is loaded
2023-04-05|02:38:37 - Added row index column
2023-04-05|02:38:37 - Starting processing rows 0 - 5000
2023-04-05|02:38:37 - Finding pickup address for row 1
2023-04-05|02:38:38 - Request timeout occurred, retrying after 5 seconds, skipping row 1 in 10 retries. 'NoneType' object has no attribute 'raw'
2023-04-05|02:38:43 - Request timeout occurred, retrying after 5 seconds, skipping row 1 in 9 retries. 'NoneType' object has no attribute 'raw'
2023-04-05|02:38:49 - Request timeout occurred, retrying after 5 seconds, skipping row 1 in 8 retries. 'NoneType' object has no attribute 'raw'
2023-04-05|02:38:54 - Request timeout occurred, retrying after 5 seconds, skipping row 1 in 7 retries. 'NoneType' object has no attribute 'raw'
2023-04-05|02:38:59 - Request timeout occurred, retrying after 5 seconds, skipping row 1 in 6 retries. 'NoneType' object has no attribute 'raw'
2023-04-05|02:39:04 - Request timeout occurred, retrying after 5 seconds, skipping row 

  return cls(latitude, longitude, altitude)


2023-04-05|03:07:01 - Request timeout occurred, retrying after 5 seconds, skipping row 39 in 9 retries. Must be a coordinate pair or Point
2023-04-05|03:07:06 - Request timeout occurred, retrying after 5 seconds, skipping row 39 in 8 retries. Must be a coordinate pair or Point
2023-04-05|03:07:11 - Request timeout occurred, retrying after 5 seconds, skipping row 39 in 7 retries. Must be a coordinate pair or Point
2023-04-05|03:07:16 - Request timeout occurred, retrying after 5 seconds, skipping row 39 in 6 retries. Must be a coordinate pair or Point
2023-04-05|03:07:21 - Request timeout occurred, retrying after 5 seconds, skipping row 39 in 5 retries. Must be a coordinate pair or Point
2023-04-05|03:07:26 - Request timeout occurred, retrying after 5 seconds, skipping row 39 in 4 retries. Must be a coordinate pair or Point
2023-04-05|03:07:31 - Request timeout occurred, retrying after 5 seconds, skipping row 39 in 3 retries. Must be a coordinate pair or Point
2023-04-05|03:07:36 - Reque

  return cls(latitude, longitude, altitude)


2023-04-05|03:19:22 - Request timeout occurred, retrying after 5 seconds, skipping row 54 in 9 retries. Must be a coordinate pair or Point
2023-04-05|03:19:27 - Request timeout occurred, retrying after 5 seconds, skipping row 54 in 8 retries. Must be a coordinate pair or Point
2023-04-05|03:19:32 - Request timeout occurred, retrying after 5 seconds, skipping row 54 in 7 retries. Must be a coordinate pair or Point
2023-04-05|03:19:37 - Request timeout occurred, retrying after 5 seconds, skipping row 54 in 6 retries. Must be a coordinate pair or Point
2023-04-05|03:19:42 - Request timeout occurred, retrying after 5 seconds, skipping row 54 in 5 retries. Must be a coordinate pair or Point
2023-04-05|03:19:47 - Request timeout occurred, retrying after 5 seconds, skipping row 54 in 4 retries. Must be a coordinate pair or Point
2023-04-05|03:19:52 - Request timeout occurred, retrying after 5 seconds, skipping row 54 in 3 retries. Must be a coordinate pair or Point
2023-04-05|03:19:57 - Reque

  return cls(latitude, longitude, altitude)


2023-04-05|03:28:17 - Request timeout occurred, retrying after 5 seconds, skipping row 65 in 9 retries. Must be a coordinate pair or Point
2023-04-05|03:28:22 - Request timeout occurred, retrying after 5 seconds, skipping row 65 in 8 retries. Must be a coordinate pair or Point
2023-04-05|03:28:27 - Request timeout occurred, retrying after 5 seconds, skipping row 65 in 7 retries. Must be a coordinate pair or Point
2023-04-05|03:28:32 - Request timeout occurred, retrying after 5 seconds, skipping row 65 in 6 retries. Must be a coordinate pair or Point
2023-04-05|03:28:37 - Request timeout occurred, retrying after 5 seconds, skipping row 65 in 5 retries. Must be a coordinate pair or Point
2023-04-05|03:28:42 - Request timeout occurred, retrying after 5 seconds, skipping row 65 in 4 retries. Must be a coordinate pair or Point
2023-04-05|03:28:47 - Request timeout occurred, retrying after 5 seconds, skipping row 65 in 3 retries. Must be a coordinate pair or Point
2023-04-05|03:28:52 - Reque

## 2nd Iteration

In [4]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count+1} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} after {max_retry-timeout_count} more retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

2023-03-29|19:19:54 - Loading CSV into DataFrame...
2023-03-29|19:20:13 - CSV is loaded
2023-03-29|19:20:13 - Added row index column
2023-03-29|19:20:13 - Starting processing rows 180000 - 185000
2023-03-29|19:20:13 - Finding pickup address for row 180001
2023-03-29|19:20:14 - Finding pickup address for row 180002
2023-03-29|19:20:15 - Finding pickup address for row 180003
2023-03-29|19:20:15 - Finding pickup address for row 180004
2023-03-29|19:20:16 - Finding pickup address for row 180005
2023-03-29|19:20:16 - Finding pickup address for row 180006
2023-03-29|19:20:17 - Finding pickup address for row 180007
2023-03-29|19:20:17 - Finding pickup address for row 180008
2023-03-29|19:20:18 - Finding pickup address for row 180009
2023-03-29|19:20:18 - Finding pickup address for row 180010
2023-03-29|19:20:19 - Finding pickup address for row 180011
2023-03-29|19:20:19 - Finding pickup address for row 180012
2023-03-29|19:20:20 - Finding pickup address for row 180013
2023-03-29|19:20:20 - Fi

## 3rd Iteration

In [4]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count+1} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} after {max_retry-timeout_count} more retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

2023-03-29|23:18:02 - Loading CSV into DataFrame...
2023-03-29|23:18:16 - CSV is loaded
2023-03-29|23:18:16 - Added row index column
2023-03-29|23:18:16 - Starting processing rows 170000 - 175000
2023-03-29|23:18:16 - Finding pickup address for row 170001
2023-03-29|23:18:17 - Finding pickup address for row 170002
2023-03-29|23:18:17 - Finding pickup address for row 170003
2023-03-29|23:18:18 - Finding pickup address for row 170004
2023-03-29|23:18:18 - Finding pickup address for row 170005
2023-03-29|23:18:19 - Finding pickup address for row 170006
2023-03-29|23:18:19 - Finding pickup address for row 170007
2023-03-29|23:18:20 - Finding pickup address for row 170008
2023-03-29|23:18:20 - Finding pickup address for row 170009
2023-03-29|23:18:21 - Finding pickup address for row 170010
2023-03-29|23:18:21 - Finding pickup address for row 170011
2023-03-29|23:18:22 - Finding pickup address for row 170012
2023-03-29|23:18:22 - Finding pickup address for row 170013
2023-03-29|23:18:23 - Fi

## 4th Iteration

In [3]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count+1} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} after {max_retry-timeout_count} more retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

2023-03-30|00:55:44 - Loading CSV into DataFrame...
2023-03-30|00:56:03 - CSV is loaded
2023-03-30|00:56:03 - Added row index column
2023-03-30|00:56:03 - Starting processing rows 165000 - 170000
2023-03-30|00:56:03 - Finding pickup address for row 165001
2023-03-30|00:56:07 - Finding pickup address for row 165002
2023-03-30|00:56:07 - Finding pickup address for row 165003
2023-03-30|00:56:08 - Finding pickup address for row 165004
2023-03-30|00:56:08 - Finding pickup address for row 165005
2023-03-30|00:56:09 - Finding pickup address for row 165006
2023-03-30|00:56:09 - Finding pickup address for row 165007
2023-03-30|00:56:10 - Finding pickup address for row 165008
2023-03-30|00:56:10 - Finding pickup address for row 165009
2023-03-30|00:56:11 - Finding pickup address for row 165010
2023-03-30|00:56:11 - Finding pickup address for row 165011
2023-03-30|00:56:12 - Finding pickup address for row 165012
2023-03-30|00:56:12 - Finding pickup address for row 165013
2023-03-30|00:56:13 - Fi

## 5th Iteration

In [5]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count+1} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} after {max_retry-timeout_count} more retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

2023-03-30|10:04:03 - Loading CSV into DataFrame...
2023-03-30|10:04:21 - CSV is loaded
2023-03-30|10:04:21 - Added row index column
2023-03-30|10:04:21 - Starting processing rows 165000 - 170000
2023-03-30|10:04:21 - Finding pickup address for row 165001
2023-03-30|10:04:22 - Finding pickup address for row 165002
2023-03-30|10:04:22 - Finding pickup address for row 165003
2023-03-30|10:04:23 - Finding pickup address for row 165004
2023-03-30|10:04:23 - Finding pickup address for row 165005
2023-03-30|10:04:24 - Finding pickup address for row 165006
2023-03-30|10:04:24 - Finding pickup address for row 165007
2023-03-30|10:04:25 - Finding pickup address for row 165008
2023-03-30|10:04:25 - Finding pickup address for row 165009
2023-03-30|10:04:26 - Finding pickup address for row 165010
2023-03-30|10:04:26 - Finding pickup address for row 165011
2023-03-30|10:04:27 - Finding pickup address for row 165012
2023-03-30|10:04:27 - Finding pickup address for row 165013
2023-03-30|10:04:28 - Fi

## 6th Iteration

In [3]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count+1} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} after {max_retry-timeout_count} more retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

2023-03-30|11:37:21 - Loading CSV into DataFrame...
2023-03-30|11:37:34 - CSV is loaded
2023-03-30|11:37:34 - Added row index column
2023-03-30|11:37:34 - Starting processing rows 160000 - 165000
2023-03-30|11:37:34 - Finding pickup address for row 160001
2023-03-30|11:37:35 - Finding pickup address for row 160002
2023-03-30|11:37:35 - Finding pickup address for row 160003
2023-03-30|11:37:36 - Finding pickup address for row 160004
2023-03-30|11:37:36 - Finding pickup address for row 160005
2023-03-30|11:37:37 - Finding pickup address for row 160006
2023-03-30|11:37:37 - Finding pickup address for row 160007
2023-03-30|11:37:38 - Finding pickup address for row 160008
2023-03-30|11:37:38 - Finding pickup address for row 160009
2023-03-30|11:37:39 - Finding pickup address for row 160010
2023-03-30|11:37:39 - Finding pickup address for row 160011
2023-03-30|11:37:40 - Finding pickup address for row 160012
2023-03-30|11:37:40 - Finding pickup address for row 160013
2023-03-30|11:37:41 - Fi

## 7th Iteration

In [3]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count+1} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} after {max_retry-timeout_count} more retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

2023-03-30|14:39:40 - Loading CSV into DataFrame...
2023-03-30|14:39:57 - CSV is loaded
2023-03-30|14:39:57 - Added row index column
2023-03-30|14:39:57 - Starting processing rows 155000 - 160000
2023-03-30|14:39:57 - Finding pickup address for row 155001
2023-03-30|14:39:58 - Finding pickup address for row 155002
2023-03-30|14:39:59 - Finding pickup address for row 155003
2023-03-30|14:39:59 - Finding pickup address for row 155004
2023-03-30|14:40:00 - Finding pickup address for row 155005
2023-03-30|14:40:00 - Finding pickup address for row 155006
2023-03-30|14:40:01 - Finding pickup address for row 155007
2023-03-30|14:40:01 - Finding pickup address for row 155008
2023-03-30|14:40:02 - Finding pickup address for row 155009
2023-03-30|14:40:02 - Finding pickup address for row 155010
2023-03-30|14:40:03 - Finding pickup address for row 155011
2023-03-30|14:40:03 - Finding pickup address for row 155012
2023-03-30|14:40:04 - Finding pickup address for row 155013
2023-03-30|14:40:04 - Fi

## 8th Iteration

In [3]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count+1} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} after {max_retry-timeout_count} more retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

2023-03-31|09:56:00 - Loading CSV into DataFrame...
2023-03-31|09:56:17 - CSV is loaded
2023-03-31|09:56:17 - Added row index column
2023-03-31|09:56:17 - Starting processing rows 150000 - 155000
2023-03-31|09:56:17 - Finding pickup address for row 150001
2023-03-31|09:56:17 - Finding pickup address for row 150002
2023-03-31|09:56:18 - Finding pickup address for row 150003
2023-03-31|09:56:18 - Finding pickup address for row 150004
2023-03-31|09:56:19 - Finding pickup address for row 150005
2023-03-31|09:56:19 - Finding pickup address for row 150006
2023-03-31|09:56:20 - Finding pickup address for row 150007
2023-03-31|09:56:20 - Finding pickup address for row 150008
2023-03-31|09:56:21 - Finding pickup address for row 150009
2023-03-31|09:56:21 - Finding pickup address for row 150010
2023-03-31|09:56:22 - Finding pickup address for row 150011
2023-03-31|09:56:23 - Finding pickup address for row 150012
2023-03-31|09:56:23 - Finding pickup address for row 150013
2023-03-31|09:56:23 - Fi

## 9th Iteration

In [None]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count+1} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} after {max_retry-timeout_count} more retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

## 10th Iteration

In [None]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count+1} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} after {max_retry-timeout_count} more retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

## from here it's still the old code below

## 11th Iteration

In [None]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

## 12th Iteration

In [None]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

## 13th Iteration

In [None]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

## 14th Iteration

In [None]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

## 15th Iteration

In [None]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

## 16th Iteration

In [None]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

## 17th Iteration

In [None]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

## 18th Iteration

In [None]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

## 19th Iteration

In [None]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row

## 20th Iteration

In [None]:
end_row = start_row+rows_per_chunk # Value of end_row to enable resuming from last cell

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Loading CSV into DataFrame...")
df_uberdata = pd.read_csv(uberdata)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - CSV is loaded")
df_uberdata.insert(0, 'no', df_uberdata.reset_index().index + 1)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Added row index column")
df_uberdata = df_uberdata.iloc[start_row:end_row].reset_index(drop=True)
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Starting processing rows {start_row} - {end_row}")

# Define a function to get the address components from location
def get_address_components(location, location_type):
    address = location.raw['address']
    address_components = {
        f"{location_type}_address": location.address,
        f"{location_type}_road": address.get('road', ''),
        f"{location_type}_neighbourhood": address.get('neighbourhood', ''),
        f"{location_type}_suburb": address.get('suburb', ''),
        f"{location_type}_county": address.get('county', ''),
        f"{location_type}_city": address.get('city', ''),
        f"{location_type}_state": address.get('state', ''),
        f"{location_type}_postcode": address.get('postcode', ''),
        f"{location_type}_country": address.get('country', '')
    }
    return address_components

# Define a function to get the pickup address from location
def get_pickup_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding pickup address for row {start_row+row.name+1}")
    pickup_coordinates = f"{row['pickup_latitude']} , {row['pickup_longitude']}"
    timeout_count = 0
    while True:
        try:
            pickup_location = geolocator.reverse(pickup_coordinates)
            pickup_address_components = get_address_components(pickup_location, "pickup")
            return pd.Series(pickup_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for pickup address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_pickup_address, axis=1), left_index=True, right_index=True)

# Define a function to get the dropoff address from location
def get_dropoff_address(row):
    current_time = datetime.datetime.now(timezone).strftime(datetime_format)
    print(f"{current_time} - Finding dropoff address for row {start_row+row.name+1}")
    dropoff_coordinates = f"{row['dropoff_latitude']} , {row['dropoff_longitude']}"
    timeout_count = 0
    while True:
        try:
            dropoff_location = geolocator.reverse(dropoff_coordinates)
            dropoff_address_components = get_address_components(dropoff_location, "dropoff")
            return pd.Series(dropoff_address_components)
        except Exception as e:
            if timeout_count == max_retry:
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred for {max_retry} times, skipping row {start_row+row.name+1}. {e}")
                return None
            else:
                timeout_count += 1
                current_time = datetime.datetime.now(timezone).strftime(datetime_format)
                print(f"{current_time} - Request timeout occurred, retrying after {timeout} seconds, skipping row {start_row+row.name+1} in {max_retry-timeout_count} retries. {e}")
                time.sleep(timeout)

# Create new columns in df_uberdata for dropoff address components
df_uberdata = df_uberdata.merge(df_uberdata.apply(get_dropoff_address, axis=1), left_index=True, right_index=True)

# Export the dataframe to a CSV file with the current time in the file name
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
output_file = f"uber_dataset_reverse_geocode_rows_{start_row}-{end_row}_{current_time}.csv"
df_uberdata.to_csv(output_file, index=False)

current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Dataframe exported to {output_file}")
current_time = datetime.datetime.now(timezone).strftime(datetime_format)
print(f"{current_time} - Finished processing rows {start_row}-{end_row}")
    
# Move on to the next chunk
start_row = end_row