# **Import Library**

In [1]:
# Google Drive Public File/Folder Downloader
# https://pypi.org/project/gdown/
# https://github.com/wkentaro/gdown
#
# Install gdown through a shell escape before it is imported.
#   -q  keeps pip's output quiet
#   -U  upgrades to the newest available release; no version is pinned,
#       so a later re-run may install a different gdown version

!pip install -q -U gdown

In [2]:
import math
import gdown
import pandas as pd

# **Import Dataset**

In [3]:
# Google Drive file IDs (taken from each file's share link), keyed by the local
# file name (without extension) the download is saved under.
# Entries that are commented out are skipped.
base_url = "https://drive.google.com/uc?id={id}"  # URL template, only needed by the terminal variant below
file_id = {
    # "old_users_data"              : "16ngL0bfyQqucpYIffxkKiCa3qSGhR26S",
    "new_tourism_with_id_links"   : "10vsiQ3A3kxRkPrIR0rn_qR0ySZ368Pck",
    # "new_tourism_rating_comments" : "1o-phm0eE64NulxzTLM_FWCyiMbfPHsYt",
    # "package_tourism"             : "1LtPNXAnkJde03TFHJXqhIbKvtI_FXJxC"
}

# Download every listed dataset with gdown and store it as <filename>.csv.
# The loop variable is deliberately not named `id`: at notebook top level that
# would rebind the built-in id() for every cell executed afterwards.
for filename, drive_id in file_id.items():
  gdown.download(id=drive_id, output=f"{filename}.csv")      # Using Python
  # !gdown {base_url.format(id=drive_id)} -O {filename}.csv  # Using terminal command

Downloading...
From: https://drive.google.com/uc?id=10vsiQ3A3kxRkPrIR0rn_qR0ySZ368Pck
To: /content/new_tourism_with_id_links.csv
100%|██████████| 435k/435k [00:00<00:00, 86.9MB/s]


In [4]:
# Read the downloaded CSV into a DataFrame, then print its column summary
file_path = "new_tourism_with_id_links.csv"  # Update with your csv file path
data = pd.read_csv(filepath_or_buffer=file_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Place_Id      437 non-null    int64  
 1   Place_Name    437 non-null    object 
 2   Category      437 non-null    object 
 3   Description   437 non-null    object 
 4   City          437 non-null    object 
 5   Price         437 non-null    int64  
 6   Rating        437 non-null    object 
 7   Time_Minutes  205 non-null    float64
 8   Coordinate    437 non-null    object 
 9   Lat           437 non-null    float64
 10  Long          437 non-null    float64
 11  Img_Path_0    437 non-null    object 
 12  Img_Path_1    358 non-null    object 
 13  Img_Path_2    166 non-null    object 
dtypes: float64(3), int64(2), object(9)
memory usage: 47.9+ KB


In [5]:
# Preview three randomly selected rows; no random_state is passed, so the rows differ between runs
data.sample(3)

Unnamed: 0,Place_Id,Place_Name,Category,Description,City,Price,Rating,Time_Minutes,Coordinate,Lat,Long,Img_Path_0,Img_Path_1,Img_Path_2
280,281,Tektona Waterpark,"Taman Hiburan, Taman Rekreasi Anak",Tektona Waterpark sebuah wahana yang memberika...,Bandung,60000,38,,"{'lat': -7.009602199999999, 'lng': 107.6062161}",-7.009602,107.606216,images_output\Tektona Waterpark\Tektona Waterp...,,
295,296,Batununggal Indah Club,Taman Hiburan,Kolam renang Batununggal merupakan salah satu ...,Bandung,70000,44,,"{'lat': -6.963229999999999, 'lng': 107.626416}",-6.96323,107.626416,images_output\Batununggal Indah Club\Batunungg...,images_output\Batununggal Indah Club\Batunungg...,images_output\Batununggal Indah Club\Batunungg...
232,233,Gunung Papandayan,"Cagar Alam, Suaka Margasatwa",Gunung Papandayan adalah gunung api strato yan...,Bandung,30000,46,,"{'lat': -7.319325300000001, 'lng': 107.7310494}",-7.319325,107.731049,images_output\Gunung Papandayan\Gunung Papanda...,images_output\Gunung Papandayan\Gunung Papanda...,images_output\Gunung Papandayan\Gunung Papanda...


# **Fix Latitude Longitude**

> The corrected `Lat` / `Long` columns have already been saved back to Google Drive > Final Dataset, so the cell below is kept for reference only and does not need to be run again.

In [38]:
# NOTE: this cell is intentionally disabled (every line is commented out).
# If re-enabled, it parses the dict-like string stored in `Coordinate`, writes its
# 'lat' / 'lng' values into the `Lat` / `Long` columns, and re-exports the dataset
# to CSV and Excel under the name 'new_tourism_with_id_links'.

# import ast

# def get_lat(coordinate: str):
#   coordinate = ast.literal_eval(coordinate)
#   return coordinate['lat']

# def get_long(coordinate: str):
#   coordinate = ast.literal_eval(coordinate)
#   return coordinate['lng']

# data['Lat'] = data['Coordinate'].apply(get_lat)
# data['Long'] = data['Coordinate'].apply(get_long)

# filename = 'new_tourism_with_id_links'
# data.to_csv(f'{filename}.csv', index=False)
# data.to_excel(f'{filename}.xlsx', index=False)

# **Calculate Haversine Distance**

In [6]:
# Haversine formula function to calculate the distance between two points
def haversine(lat1, lon1, lat2, lon2, radius=6371.0):
    """Return the great-circle distance between two points on a sphere.

    Args:
        lat1: Latitude of the first point, in decimal degrees.
        lon1: Longitude of the first point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.
        radius: Radius of the sphere. The result is expressed in the same
            unit. Defaults to 6371.0, Earth's radius in kilometers.

    Returns:
        The distance along the surface of the sphere as a float, in the unit
        of ``radius`` (kilometers by default).
    """
    # Convert degrees to radians
    lat1, lon1, lat2, lon2 = map(math.radians, (lat1, lon1, lat2, lon2))

    # Haversine formula
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2

    # Mathematically a <= 1, but floating-point rounding can push it marginally
    # above 1 for (nearly) antipodal points, which would make sqrt(1 - a) raise
    # a "math domain error". Clamp it so those inputs still return a distance.
    a = min(1.0, a)

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return radius * c

In [7]:
# Build the distance table for every ordered pair of places.
# Both directions (A -> B and B -> A) are stored as separate rows, and a place
# is never paired with itself.

# Pull the three needed columns out of the DataFrame once, as plain Python
# tuples. Nesting data.iterrows() would rebuild a pandas Series for every row
# on every pass of the outer loop, which dominates the runtime.
places = list(zip(
    data['Place_Id'].tolist(),
    data['Lat'].tolist(),
    data['Long'].tolist(),
))

# Create an empty list to store the results
results = []

# Iterate over all ordered combinations of places (row order of `data` is kept)
for i, (place_a_id, lat1, lon1) in enumerate(places):
    for j, (place_b_id, lat2, lon2) in enumerate(places):
        if i == j:  # Skip pairing a row with itself
            continue

        # Calculate distance
        distance = haversine(lat1, lon1, lat2, lon2)

        # Add the result to the list
        results.append({
            'Place_Id_Source': place_a_id,
            'Place_Id_Target': place_b_id,
            'Distance': round(distance, 2)
        })

# Convert the results to a DataFrame; naming the columns explicitly keeps the
# headers in place even when there are no pairs (fewer than two places)
result_df = pd.DataFrame(
    results,
    columns=['Place_Id_Source', 'Place_Id_Target', 'Distance'],
)
result_df

Unnamed: 0,Place_Id_Source,Place_Id_Target,Distance
0,1,2,4.34
1,1,3,5.61
2,1,4,16.00
3,1,5,5.85
4,1,6,7.30
...,...,...,...
190527,437,432,2.23
190528,437,433,21.41
190529,437,434,5.49
190530,437,435,4.20


In [9]:
# Write the distance table to disk twice, as an Excel workbook and as a CSV file;
# both share the same file-name stem and omit the DataFrame index
output_file_path = 'tourist_spots_distance'
excel_file = f'{output_file_path}.xlsx'
csv_file = f'{output_file_path}.csv'
result_df.to_excel(excel_file, index=False)
result_df.to_csv(csv_file, index=False)

# Confirm where the files were written
print(
    f'Successfully saved the distance to the tourist spot to {output_file_path}.csv|.xlsx',
    'Please download the results',
    sep='\n',
)

Successfully saved the distance to the tourist spot to tourist_spots_distance.csv|.xlsx
Please download the results
