In [1]:
import pyodbc
import pandas as pd
from geopy.distance import geodesic
import folium
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')
# Database Connection
server = 'EFGSVR050\\NOVY'
database = 'Olist Case Study'
conn = pyodbc.connect(f'DRIVER={{SQL Server}};SERVER={server};DATABASE={database}')

In [2]:
# Query to fetch geolocation data
geo_query = """
-- Fetch customer geolocation data
SELECT 
    c.customer_unique_id AS unique_id,
    'customer' AS entity_type,
    c.customer_zip_code_prefix,
    g1.geolocation_lat AS latitude,
    g1.geolocation_lng AS longitude
FROM 
    olist_customers_dataset c
JOIN 
    olist_geolocation_dataset g1 ON c.customer_zip_code_prefix = g1.geolocation_zip_code_prefix

UNION

-- Fetch seller geolocation data
SELECT 
    s.seller_id AS unique_id,
    'seller' AS entity_type,
    s.seller_zip_code_prefix,
    g2.geolocation_lat AS latitude,
    g2.geolocation_lng AS longitude
FROM 
    olist_sellers_dataset s
JOIN 
    olist_geolocation_dataset g2 ON s.seller_zip_code_prefix = g2.geolocation_zip_code_prefix

"""

# Fetching geolocation data into DataFrame
geo_data = pd.read_sql(geo_query, conn)
conn.close()

In [3]:
# Separate customer and seller data from geo_data
customer_data = geo_data[geo_data['entity_type'] == 'customer']
seller_data = geo_data[geo_data['entity_type'] == 'seller']

In [4]:
# Ensure no duplicate records
customers = customer_data.drop_duplicates(subset=['unique_id'])
sellers = seller_data.drop_duplicates(subset=['unique_id'])

In [5]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

# Example DataFrames for customers and sellers
# Ensure your DataFrames have columns: 'unique_id', 'latitude', 'longitude'
# customers = pd.DataFrame(...)
# sellers = pd.DataFrame(...)

# Extract coordinates
customer_coords = customers[['latitude', 'longitude']].to_numpy()
seller_coords = sellers[['latitude', 'longitude']].to_numpy()

# Calculate pairwise distances (Euclidean, not geodesic)
distance_matrix = cdist(customer_coords, seller_coords, metric='euclidean')

# Convert distances to geodesic (approximation for short distances)
# Use a scaling factor to adjust if coordinates are degrees
earth_radius_km = 6371  # Approximate radius of Earth
distance_matrix *= (np.pi / 180) * earth_radius_km  # Convert degrees to radians, then to km

# Flatten distance matrix and prepare DataFrame
customer_ids = np.repeat(customers['unique_id'].values, len(sellers))
seller_ids = np.tile(sellers['unique_id'].values, len(customers))

distances_df = pd.DataFrame({
    'customer_id': customer_ids,
    'seller_id': seller_ids,
    'distance_km': distance_matrix.flatten()
})

# Optional: Sort by distance for easy lookup
distances_df.sort_values(by=['customer_id', 'distance_km'], inplace=True)

# Display or save the output
print(distances_df.head())

                           customer_id                         seller_id  \
2304  0000366f3b9a7992bf8c76cfdf3221e2  c003204e1ab016dfa150abc119207b24   
2258  0000366f3b9a7992bf8c76cfdf3221e2  bc39d8938f90a3a2b98193723ed59774   
126   0000366f3b9a7992bf8c76cfdf3221e2  0a85ebe4e328db81ac9109781205e2f7   
169   0000366f3b9a7992bf8c76cfdf3221e2  0e44d110fa6a54e121cb2c095a77762f   
1906  0000366f3b9a7992bf8c76cfdf3221e2  999f6d9a1c7c81f43be44c6b7d076210   

      distance_km  
2304     8.049073  
2258     9.245595  
126     10.021591  
169     10.684016  
1906    11.458320  


In [7]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# Prepare customer and seller coordinates
customer_coords = customers[['latitude', 'longitude']].to_numpy()
seller_coords = sellers[['latitude', 'longitude']].to_numpy()

# Fit Nearest Neighbors model
nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(seller_coords)

# Find the nearest seller for each customer
distances, indices = nbrs.kneighbors(customer_coords)

# Create output DataFrame
distances_df2 = pd.DataFrame({
    'customer_id': customers['unique_id'],
    'seller_id': sellers.iloc[indices.flatten()]['unique_id'].values,
    'distance_km': distances.flatten()
})

# Display or save results
print(distances_df2.head())

                         customer_id                         seller_id  \
0   0000366f3b9a7992bf8c76cfdf3221e2  c003204e1ab016dfa150abc119207b24   
6   0000b849f77a49e4a4ce2b2a4ca5be3f  55f7a3319d80f7fdf078b8f03e6725fe   
19  0000f46a3911fa3c0805444483337064  aafe36600ce604f205b86b5084d3d767   
32  0000f6ccb0745a6a4b88665a16c9f078  06a2c3af7b3aee5d69171b0e14f0ee87   
36  0004aac84e0df4da2b147fca70cf8255  3e46a38ee171c503c3b4a3f23fe3bf0c   

    distance_km  
0      0.072387  
6      0.002488  
19     0.014139  
32     4.404898  
36     0.002029  


In [8]:
# Save the DataFrame to an Excel file
distances_df2.to_excel('nearest_seller_distances.xlsx', index=False)

print("Data saved to 'nearest_seller_distances.xlsx'")

Data saved to 'nearest_seller_distances.xlsx'


In [10]:
from folium.plugins import MarkerCluster
# Initialize a folium map centered at an approximate midpoint
map_center = [(customers['latitude'].mean() + sellers['latitude'].mean()) / 2,
              (customers['longitude'].mean() + sellers['longitude'].mean()) / 2]
map_plot = folium.Map(location=map_center, zoom_start=6)

# Add sellers to the map
seller_cluster = MarkerCluster(name="Sellers").add_to(map_plot)
for _, row in sellers.iterrows():
    folium.Marker(
        location=(row['latitude'], row['longitude']),
        popup=f"Seller: {row['unique_id']}",
        icon=folium.Icon(color='blue', icon='store')
    ).add_to(seller_cluster)

# Add customers to the map
customer_cluster = MarkerCluster(name="Customers").add_to(map_plot)
for _, row in customers.iterrows():
    folium.Marker(
        location=(row['latitude'], row['longitude']),
        popup=f"Customer: {row['unique_id']}",
        icon=folium.Icon(color='green', icon='user')
    ).add_to(customer_cluster)

# Add lines connecting customers to their nearest sellers
for _, row in distances_df2.iterrows():
    customer_row = customers[customers['unique_id'] == row['customer_id']].iloc[0]
    seller_row = sellers[sellers['unique_id'] == row['seller_id']].iloc[0]
    folium.PolyLine(
        locations=[(customer_row['latitude'], customer_row['longitude']),
                   (seller_row['latitude'], seller_row['longitude'])],
        color='red',
        weight=2
    ).add_to(map_plot)

# Add layer control
folium.LayerControl().add_to(map_plot)

# Save map to an HTML file
map_plot.save('customer_seller_map.html')

print("Map saved as 'customer_seller_map.html'")

Map saved as 'customer_seller_map.html'
