In [2]:
import pandas as pd
import numpy as np
import statistics
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering, KMeans, DBSCAN, OPTICS, Birch
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from numba import njit, prange
import itertools
import math
import haversine as hs
import requests
import json
import time

In [3]:
# Load the cargo sample and discard every row that has a missing value.
df = pd.read_excel('CargoSample.xlsx').dropna()

# Distinct cargo dates that remain after the incomplete rows were dropped.
dates = df['CargoDate'].unique()

In [4]:
# Preview the first rows to confirm which columns were loaded.
df.head()

Unnamed: 0,Cargo_ID,DepartmentInfo_ID,CargoDate,SaleInvoice_ID,MainAccount_ID,LatitudePoint,LongitudePoint
0,140218461,18,2023-07-03,1402189460,330623,38.475766,47.063073
1,140218504,18,2023-07-11,14021810299,330623,38.475766,47.063073
2,140218504,18,2023-07-11,14021810493,330623,38.475766,47.063073
3,140218465,18,2023-07-04,1402189463,330627,38.061419,46.322348
4,140218492,18,2023-07-10,14021810212,330627,38.061419,46.322348


In [5]:
# Summary statistics of the numeric columns. Check the `min` row for
# LatitudePoint / LongitudePoint: a value of 0.0 there presumably marks
# missing coordinates rather than a real location (TODO confirm with the data owner).
df.describe()

Unnamed: 0,Cargo_ID,DepartmentInfo_ID,SaleInvoice_ID,MainAccount_ID,LatitudePoint,LongitudePoint
count,19851.0,19851.0,19851.0,19851.0,19851.0,19851.0
mean,465059700.0,29.075261,7439222000.0,79387560.0,34.532162,51.710861
std,551866500.0,15.734324,6305245000.0,303856400.0,3.050551,4.190524
min,14024110.0,8.0,14024120.0,328391.0,0.0,0.0
25%,140214300.0,14.0,1402326000.0,338638.0,32.647165,49.580341
50%,140233200.0,32.0,1402401000.0,347613.0,35.70135,51.395628
75%,1402081000.0,49.0,14021810000.0,371865.0,36.5141,52.712001
max,1402512000.0,51.0,14025130000.0,1401002000.0,39.6527,62.708922


In [6]:
def compare_machine_to_human_clustering(df):
    """Score how closely spatial clustering reproduces the Cargo_ID grouping.

    For every distinct ``CargoDate`` in ``df`` the rows with valid coordinates
    are clustered on (LatitudePoint, LongitudePoint) with Ward-linkage
    agglomerative clustering, asking for as many clusters as there are
    distinct ``Cargo_ID`` values on that date. Every (cluster, Cargo_ID) pair
    that occurs earns the fraction of the cluster's rows carrying that
    Cargo_ID; the date's score is ``earned / number_of_pairs * 100``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``CargoDate``, ``Cargo_ID``,
        ``LatitudePoint`` and ``LongitudePoint``.

    Returns
    -------
    float
        Mean of the per-date scores over all dates that have at least one row
        with valid coordinates; ``nan`` when no date can be scored.
    """
    date_scores = []

    # Use the dates present in the frame that was passed in rather than a
    # module-level variable, so the function works on any frame.
    for cargo_date in df['CargoDate'].unique():
        rows_for_date = df.loc[df['CargoDate'] == cargo_date]

        # Counted before the coordinate filter, so a Cargo_ID whose rows are
        # all filtered out below still contributes one requested cluster.
        number_of_visitors = len(rows_for_date['Cargo_ID'].unique())

        valid_rows = rows_for_date.dropna()
        # Rows with a non-positive latitude or longitude are excluded.
        valid_rows = valid_rows[(valid_rows['LatitudePoint'] > 0) & (valid_rows['LongitudePoint'] > 0)]
        if valid_rows.empty:
            # Nothing to cluster for this date; leave it out of the average.
            continue

        if len(valid_rows) < 2:
            # Agglomerative clustering needs at least two samples; a single
            # point trivially forms one cluster.
            labels = [0] * len(valid_rows)
        else:
            # Never request more clusters than there are samples.
            n_clusters = min(number_of_visitors, len(valid_rows))
            # Ward linkage always uses Euclidean distance, so the deprecated
            # `affinity` argument is not passed.
            model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
            model.fit(valid_rows[['LatitudePoint', 'LongitudePoint']])
            labels = model.labels_

        # `assign` returns a new frame, avoiding a write onto a filtered slice.
        clustered = valid_rows.assign(cluster=labels)

        best_point = 0
        current_point = 0
        # sort=False keeps the groups in order of first appearance.
        for _, cluster_rows in clustered.groupby('cluster', sort=False):
            cluster_size = len(cluster_rows)
            for _, cargo_rows in cluster_rows.groupby('Cargo_ID', sort=False):
                best_point += 1
                current_point += len(cargo_rows) / cluster_size
        date_scores.append((current_point / best_point) * 100)

    if not date_scores:
        return float('nan')
    return sum(date_scores) / len(date_scores)

# Average per-date agreement (as a percentage) between the machine clusters
# and the Cargo_ID grouping, computed on the loaded sample.
compare_machine_to_human_clustering(df)

53.1612729435728

In [7]:
# Example of batch processing
def calculate_distance_batch(points, timeout=30):
    """Return the OSRM route distance through ``points`` in the given order.

    Parameters
    ----------
    points : sequence of (lat, lon) pairs
        Waypoints, visited in the order given.
    timeout : float, optional
        Seconds to wait for the routing server before giving up.

    Returns
    -------
    number
        ``distance`` of the first route in the OSRM response (metres per the
        OSRM API documentation), or 0 when fewer than two points are given.

    Raises
    ------
    RuntimeError
        If the server answers with something other than a successful route.
    requests.RequestException
        On network failures or when the timeout expires.
    """
    if len(points) < 2:
        # A route needs at least a start and an end point.
        return 0

    # OSRM expects "lon,lat" pairs separated by semicolons.
    coordinates = ";".join(f"{lon},{lat}" for lat, lon in points)
    request_url = f"http://router.project-osrm.org/route/v1/car/{coordinates}?overview=false"

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(request_url, timeout=timeout)

    try:
        payload = response.json()
    except ValueError as exc:
        raise RuntimeError(
            f"OSRM returned a non-JSON response (HTTP {response.status_code})"
        ) from exc

    # Surface the server's own error code/message instead of failing later
    # with an opaque TypeError/IndexError on a missing "routes" entry.
    if payload.get("code") != "Ok" or not payload.get("routes"):
        raise RuntimeError(
            f"OSRM routing failed (HTTP {response.status_code}): "
            f"{payload.get('code')} - {payload.get('message')}"
        )

    return payload["routes"][0]["distance"]

def build_df_with_cluster(df, target_date=None):
    """Return the rows of one cargo date with a machine ``cluster`` column.

    The rows of ``target_date`` that have valid coordinates are clustered on
    (LatitudePoint, LongitudePoint) with Ward-linkage agglomerative
    clustering, asking for as many clusters as there are distinct
    ``Cargo_ID`` values on that date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``CargoDate``, ``Cargo_ID``, ``LatitudePoint`` and
        ``LongitudePoint``.
    target_date : optional
        Value of ``CargoDate`` to select. When omitted, the module-level
        variable ``date`` is used, which is what callers passing only ``df``
        rely on.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the valid rows of that date plus a ``cluster``
        column. It is empty (but still has the column) when no row is valid.
    """
    if target_date is None:
        # Backward compatibility: existing call sites pass only `df` and set a
        # module-level loop variable named `date` before calling.
        target_date = date

    rows_for_date = df.loc[df['CargoDate'] == target_date]

    # Counted before the coordinate filter, so a Cargo_ID whose rows are all
    # filtered out below still contributes one requested cluster.
    number_of_visitors = len(rows_for_date['Cargo_ID'].unique())

    valid_rows = rows_for_date.dropna()
    # Require both coordinates to be positive, matching the filter used by
    # compare_machine_to_human_clustering.
    valid_rows = valid_rows[(valid_rows['LatitudePoint'] > 0) & (valid_rows['LongitudePoint'] > 0)]

    if len(valid_rows) < 2:
        # Agglomerative clustering needs at least two samples; zero or one
        # point trivially belongs to cluster 0.
        return valid_rows.assign(cluster=0)

    # Never request more clusters than there are samples. Ward linkage always
    # uses Euclidean distance, so the deprecated `affinity` argument is not passed.
    n_clusters = min(number_of_visitors, len(valid_rows))
    model = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
    model.fit(valid_rows[['LatitudePoint', 'LongitudePoint']])

    # `assign` returns a new frame, avoiding a write onto a filtered slice.
    return valid_rows.assign(cluster=model.labels_)

In [9]:
# Pause between dates; presumably meant to avoid overloading the public
# routing server (TODO confirm the intended rate limit).
SECONDS_BETWEEN_DATES = 50


def _total_route_distance(frame, group_column):
    """Sum calculate_distance_batch over each group of `group_column` in `frame`."""
    total_distance = 0
    for group_value in frame[group_column].unique():
        group_rows = frame.loc[frame[group_column] == group_value]
        waypoints = group_rows[['LatitudePoint', 'LongitudePoint']].values.tolist()
        total_distance += calculate_distance_batch(waypoints)
    return total_distance


distance_results = []
for position, date in enumerate(dates):
    try:
        # build_df_with_cluster(df) picks up the loop variable `date` from
        # module scope, so the loop variable must keep this name.
        target_date_df = build_df_with_cluster(df)

        # Total routed distance when the stops are grouped by machine cluster...
        machine_computed_distance = _total_route_distance(target_date_df, 'cluster')
        # ...versus grouped by the Cargo_ID assignment found in the data.
        human_computed_distance = _total_route_distance(target_date_df, 'Cargo_ID')
    except Exception as exc:
        # Report the cause instead of hiding it; KeyboardInterrupt is no longer swallowed.
        print("Error occured while calculating distance for date", date, ":", repr(exc))
    else:
        print("***************************************************")
        print("AI MAXIMUM DISTANCE RESULT MACHINE", machine_computed_distance, "HUMAN MAXIMUM DISTANCE RESULT", human_computed_distance)
        print("***************************************************")

        # Rewrite the file with every result collected so far: earlier dates
        # are no longer overwritten and progress survives an interrupted run.
        distance_results.append({'CargoDate': date, 'Machine': machine_computed_distance, 'Human': human_computed_distance})
        result_df = pd.DataFrame(distance_results, columns=['CargoDate', 'Machine', 'Human'])
        result_df.to_excel('result.xlsx', index=False, header=False)

    # Pause after failures too, so one error does not trigger a burst of
    # back-to-back requests; no pause is needed after the final date.
    if position < len(dates) - 1:
        time.sleep(SECONDS_BETWEEN_DATES)

***************************************************
AI MAXIMUM DISTANCE RESULT MACHINE 10689820.399999993 HUMAN MAXIMUM DISTANCE RESULT 33742927.79999997
***************************************************
***************************************************
AI MAXIMUM DISTANCE RESULT MACHINE 10573970.799999988 HUMAN MAXIMUM DISTANCE RESULT 38019277.19999997
***************************************************
Error occured while calculating distance for date 2023-07-04T00:00:00.000000000
Error occured while calculating distance for date 2023-07-10T00:00:00.000000000
Error occured while calculating distance for date 2023-07-08T00:00:00.000000000
Error occured while calculating distance for date 2023-07-02T00:00:00.000000000
Error occured while calculating distance for date 2023-07-06T00:00:00.000000000
Error occured while calculating distance for date 2023-07-12T00:00:00.000000000
Error occured while calculating distance for date 2023-07-13T00:00:00.000000000
Error occured while calculat