In [None]:
from custom.GeoSpatialEncoder import GeoSpatialEncoder
from custom.PC_Class import PC
import importlib
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
import os

# Columns that hold timestamps; they are parsed on read and coerced again below.
datetime_cols = ['CREATIONDATETIME', 'LAAD_DATETIME_VAN', 'LAAD_DATETIME_TOT', 'LOS_DATETIME_VAN', 'LOS_DATETIME_TOT', '15CREATIONDATETIME']
direct = os.getcwd()
# PATH TO DATA FOR TRAINING HERE: location of the training CSV. It is appended
# verbatim to the working directory, so it must start with a path separator.
data_path_suffix = ""
if not data_path_suffix:
    raise ValueError("Set data_path_suffix to the training CSV before running this cell.")
file_path = direct + data_path_suffix

# Count the lines up front so the progress bar knows how many chunks to expect.
# The context manager closes the handle again (the bare open() leaked it).
with open(file_path, 'r', encoding='utf-8') as csv_file:
    total_rows = sum(1 for _ in csv_file)
chunk_size = 10000
tqdm.pandas(desc="Reading CSV")
chunks = pd.read_csv(file_path, chunksize=chunk_size, iterator=True, index_col = 0, parse_dates=datetime_cols)

# One line is the header; ceiling division keeps the final partial chunk in
# the total so the progress bar can reach 100%.
data_rows = max(total_rows - 1, 0)
total_chunks = -(-data_rows // chunk_size)
df_orders = pd.concat(tqdm(chunks, total=total_chunks))

# Force every datetime column to datetime64; values that cannot be parsed
# become NaT instead of raising.
for column in datetime_cols:
    print(f"column: {column}")
    df_orders[column] = pd.to_datetime(df_orders[column], errors='coerce')

print("Lenght of input data:", str(len(df_orders)))
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

In [None]:
# Per client (OPDRACHTGEVERNAAM): total pallet places and number of shipments,
# restricted to orders whose AFHCODE equals 'd'.
orders_with_code_d = df_orders[df_orders["AFHCODE"] == 'd']
aggregated_df = orders_with_code_d.groupby('OPDRACHTGEVERNAAM').agg({
        'OPDRACHTGEVERNAAM': 'first',
        'PALLETPLAATSEN': 'sum',
        'SHIPMENTNUMBER': 'count'
    }).reset_index(drop=True)


# Minimum number of shipments (exclusive) a client needs to be kept.
cutoff = 365

# Clients above the cutoff, largest pallet volume first.
frequent_clients = aggregated_df[aggregated_df["SHIPMENTNUMBER"] > cutoff]
companies_list = frequent_clients.sort_values("PALLETPLAATSEN", ascending=False)["OPDRACHTGEVERNAAM"].values

# Build the membership mask once and reuse it for both volume sums and the filter.
is_included = df_orders["OPDRACHTGEVERNAAM"].isin(companies_list)
included_volume = df_orders.loc[is_included, "PALLETPLAATSEN"].sum()
excluded_volume = df_orders.loc[~is_included, "PALLETPLAATSEN"].sum()
total_volume = included_volume + excluded_volume
percentage = included_volume / total_volume * 100

# From here on df_orders only contains the selected clients.
df_orders = df_orders[is_included]
print(f"Number of companies with more than {cutoff} days of data available: {len(companies_list)}")
print(f"Total volume: {total_volume}")
print(f"Included volume: {included_volume}")
print(f"Excluded volume: {excluded_volume}")
print(f"Percentage: {percentage}")

In [None]:
# Build the PC helper (custom.PC_Class) once; it is reused for every encoder below.
PC_obj = PC()
print("PC object created")

# The encoder receives the PC helper at construction time.
GSE = GeoSpatialEncoder(PC_obj)
print("GSE object created")

In [None]:
# Hand the filtered orders to the encoder.
GSE.set_input_df(df_orders)
print("GSE input set")

# Let the encoder clean its own copy of the input before any training.
GSE.clean_input_df()
print("GSE input cleaned")

# Kmeans


## Inertia Scores Kmeans


### Inertia scores using palletplaatsen as weight

In [None]:
import time

# Elbow sweep: fit KMeans weighted by PALLETPLAATSEN for k = 1..39 and store
# the value of GSE.return_inertia() for each k (aligned with testingsizes).
testingsizes = list(range(1,40))
scores_kmeans = []
for x in testingsizes:
    tic = time.time()
    GSE.train_kmeans(x, 'PALLETPLAATSEN')
    score = GSE.return_inertia()
    scores_kmeans.append(score)
    # Wall-clock seconds spent on training and scoring this k.
    iteration_time = time.time() - tic
    print( f"Trained {x} clusters in {iteration_time} with score:  {score}")

In [None]:
# Writing lists to a text file
# with open('kmeansscoresPALLETPLAATSEN.txt', 'w') as file:
#     file.write(','.join(map(str, testingsizes)) + '\n')
#     file.write(','.join(map(str, scores_kmeans)) + '\n')

In [None]:
# Reload a saved sweep. The file has two comma-separated lines:
# the cluster counts first, the matching scores second.
with open('kmeansscoresPALLETPLAATSEN.txt', 'r') as file:
    lines = file.readlines()

sizes_field, scores_field = lines[0], lines[1]
# Scores are floats, cluster counts are ints.
scores_kmeans = [float(i) for i in scores_field.strip().split(',')]
testingsizes = [int(i) for i in sizes_field.strip().split(',')]

print(testingsizes)  
print(scores_kmeans)  

In [None]:
# import matplotlib.pyplot as plt
# plt.figure(figsize=(8, 6))
# plt.plot(testingsizes, scores_kmeans, 'bo-', linewidth=2)
# plt.xlabel('Number of clusters (k)')
# plt.ylabel('Inertia Score')
# plt.title('Elbow Graph for KMeans Clustering (Inertia Score; Weighed by number of pallets)')
# plt.grid(True)
# plt.show()

In [None]:
# Fit KMeans with 8 clusters weighted by PALLETPLAATSEN and plot the result.
# NOTE(review): k=8 is presumably read off the elbow curve above — confirm.
GSE.train_kmeans(8, 'PALLETPLAATSEN')
GSE.plot_clusters('kmeans', '8 clusters for all companies weighed by number of pallets')

### Inertia scores using ordercount as weight

#### For a single company

In [None]:
import time

# Fresh encoder fed with the orders of one hard-coded client only.
input_temp = df_orders[df_orders["OPDRACHTGEVERNAAM"] == 'PAARDEKOOPER_BV']
GSE = GeoSpatialEncoder(PC_obj)
GSE.set_input_df(input_temp)
GSE.clean_input_df()

# Elbow sweep for k = 1..39, weighted by SHIPMENT_COUNT.
testingsizes = list(range(1,40))
scores_kmeans = []
for x in testingsizes:
    tic = time.time()
    GSE.train_kmeans(x, 'SHIPMENT_COUNT')
    score = GSE.return_inertia()
    scores_kmeans.append(score)
    # Wall-clock seconds spent on training and scoring this k.
    iteration_time = time.time() - tic
    print( f"Trained {x} clusters in {iteration_time} with score:  {score}")

In [None]:
import matplotlib.pyplot as plt

# Elbow curve: score against number of clusters for the single-company sweep.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(testingsizes, scores_kmeans, 'bo-', linewidth=2)
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Inertia Score')
ax.set_title('Elbow Graph for KMeans Clustering for Company A (Inertia Score; Weighed by order count)')
ax.grid(True)

plt.show()

#### For multiple companies

In [None]:
import time

# Fresh encoder fed with the orders of all selected clients.
GSE = GeoSpatialEncoder(PC_obj)
GSE.set_input_df(df_orders)
GSE.clean_input_df()

# Elbow sweep for k = 1..39, weighted by SHIPMENT_COUNT.
testingsizes = list(range(1,40))
scores_kmeans = []
for x in testingsizes:
    tic = time.time()
    GSE.train_kmeans(x, 'SHIPMENT_COUNT')
    score = GSE.return_inertia()
    scores_kmeans.append(score)
    # Wall-clock seconds spent on training and scoring this k.
    iteration_time = time.time() - tic
    print( f"Trained {x} clusters in {iteration_time} with score:  {score}")

In [None]:
# # Writing lists to a text file
# with open('kmeansscoresSHIPMENTCOUNT.txt', 'w') as file:
#     file.write(','.join(map(str, testingsizes)) + '\n')
#     file.write(','.join(map(str, scores_kmeans)) + '\n')

In [None]:
# Reload a saved sweep. The file has two comma-separated lines:
# the cluster counts first, the matching scores second.
with open('kmeansscoresSHIPMENTCOUNT.txt', 'r') as file:
    lines = file.readlines()

sizes_field, scores_field = lines[0], lines[1]
# Scores are floats, cluster counts are ints.
scores_kmeans = [float(i) for i in scores_field.strip().split(',')]
testingsizes = [int(i) for i in sizes_field.strip().split(',')]

print(testingsizes)  
print(scores_kmeans)  

In [None]:
import matplotlib.pyplot as plt

# Elbow curve: score against number of clusters for the all-company sweep.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(testingsizes, scores_kmeans, 'bo-', linewidth=2)
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Inertia Score')
ax.set_title('Elbow Graph for KMeans Clustering (Inertia Score; Weighed by number of orders)')
ax.grid(True)

plt.show()

In [None]:
# Fit KMeans with 4 clusters weighted by SHIPMENT_COUNT and plot the result.
# The title names the weight actually used (orders); it previously said pallets.
GSE.train_kmeans(4, 'SHIPMENT_COUNT')
GSE.plot_clusters('kmeans', '4 clusters for all companies weighed by number of orders')

# Hierarchical clustering

#### For a single company


In [None]:
import time

# Fresh encoder fed with the orders of one client; fill in the client name below.
input_temp = df_orders[df_orders["OPDRACHTGEVERNAAM"] == ''] #SAMPLE COMPANY HERE
GSE = GeoSpatialEncoder(PC_obj)
GSE.set_input_df(input_temp)
GSE.clean_input_df()

# Sweep k = 2..79 with agglomerative clustering and store the value of
# GSE.return_silhouette_score() for each k (aligned with testingsizes).
testingsizes = list(range(2,80))
scores = []
for x in testingsizes:
    tic = time.time()
    GSE.train_agglomerative(x)
    score = GSE.return_silhouette_score()
    scores.append(score)
    # Wall-clock seconds spent on training and scoring this k.
    iteration_time = time.time() - tic
    print( f"Trained {x} clusters in {iteration_time} with score:  {score}")

In [None]:
import matplotlib.pyplot as plt

# Silhouette score against number of clusters for the single-company sweep.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(testingsizes, scores, 'bo-', linewidth=2)
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Silhouette Score')
ax.set_title('Elbow Graph for Company A using Agglomerative Clustering (Silhouette Score)')
ax.grid(True)
plt.show()

#### For multiple companies

In [None]:
import time

# Fresh encoder fed with the orders of all selected clients.
GSE = GeoSpatialEncoder(PC_obj)
GSE.set_input_df(df_orders)
GSE.clean_input_df()

# Sweep k = 2..79 with agglomerative clustering and store the value of
# GSE.return_silhouette_score() for each k (aligned with testingsizes).
testingsizes = list(range(2,80))
scores = []
for x in testingsizes:
    tic = time.time()
    GSE.train_agglomerative(x)
    score = GSE.return_silhouette_score()
    scores.append(score)
    # Wall-clock seconds spent on training and scoring this k.
    iteration_time = time.time() - tic
    print( f"Trained {x} clusters in {iteration_time} with score:  {score}")

In [None]:
import matplotlib.pyplot as plt

# Silhouette score against number of clusters for the all-company sweep.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(testingsizes, scores, 'bo-', linewidth=2)
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Silhouette Score')
ax.set_title('Elbow Graph for Agglomerative Clustering (Silhouette Score)')
ax.grid(True)
plt.show()

In [None]:
# Fit agglomerative clustering with 3 clusters and plot the result.
# NOTE(review): the title says "weighed by number of orders", but
# train_agglomerative is called here without any weight argument — confirm
# whether the encoder applies a weight internally.
GSE.train_agglomerative(3)
GSE.plot_clusters('agglomerative', '3 clusters for all companies weighed by number of orders')

# Balance Score

## Per customer models 

In [11]:
import warnings

# Silence the sklearn warning about missing feature names on KMeans input.
# `message` is a regular expression matched against the start of the warning text.
warnings.filterwarnings("ignore", message="X does not have valid feature names, but KMeans was fitted with feature names")

# Silence pandas' SettingWithCopyWarning.
# NOTE(review): this attribute lookup runs at cell execution time and depends
# on the installed pandas version exposing pd.errors.SettingWithCopyWarning.
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

def check_cluster_balance(df, column):
    """Return the fraction of rows in `df` whose `column` value is not zero.

    Missing values compare unequal to zero and therefore count as non-zero.

    Args:
        df: DataFrame to inspect.
        column: Name of the column to test against zero.

    Returns:
        A float between 0 and 1. An empty frame yields 0.0 (it has no
        non-zero rows) instead of raising ZeroDivisionError.
    """
    total_rows = len(df)
    if total_rows == 0:
        return 0.0
    nonzero_rows = int((df[column] != 0).sum())
    return nonzero_rows / total_rows


In [None]:
# Display the selected client names (ordered by pallet volume, largest first).
companies_list

In [None]:
def _store_region_balances(model_results, n_clusters, condensed):
    """Store the balance of every REGION_* column and return their mean.

    Balances are written to model_results[n_clusters][column]. The mean is
    taken over the REGION_* columns of `condensed`, in column order.
    """
    region_names = [name for name in condensed.columns if name.startswith('REGION_')]
    for name in region_names:
        balance = check_cluster_balance(condensed, name)
        model_results.setdefault(n_clusters, {})[name] = balance
    return np.mean([model_results[n_clusters][name] for name in region_names])


GSE.set_verbose(False)

# results_dict[company][model][n_clusters][region column] -> balance
results_dict = {}
testsizes = list(range(1,150))

for company in companies_list:
    print(company)
    print("====================================")

    df_to_use = df_orders[df_orders['OPDRACHTGEVERNAAM'] == company]
    # Largest cluster count tried for this client.
    # NOTE(review): the offset of 3 below the number of distinct LOS_CPC values is unexplained — confirm.
    unique_clusters = len(df_to_use["LOS_CPC"].unique()) - 3
    GSE.set_input_df(df_to_use)
    GSE.clean_input_df()

    company_results = {}
    results_dict[company] = company_results
    for testsize in testsizes:
        if testsize > unique_clusters:
            break

        # KMeans weighted by pallet places
        pallet_results = company_results.setdefault("kmeans_pallets", {})
        GSE.train_kmeans(testsize, "PALLETPLAATSEN")
        mean_kmeans = _store_region_balances(pallet_results, testsize, GSE.condense_orders())
        print(f"{testsize} clusters kmeans pallets mean balance: {mean_kmeans}")

        # KMeans weighted by order count
        ordercount_results = company_results.setdefault("kmeans_ordercount", {})
        GSE.train_kmeans(testsize, "SHIPMENT_COUNT")
        mean_kmeans_pal = _store_region_balances(ordercount_results, testsize, GSE.condense_orders())
        print(f"{testsize} clusters kmeans ordercount mean balance: {mean_kmeans_pal}")

        # Hierarchical (agglomerative) clustering
        hierarch_results = company_results.setdefault("hierarch", {})
        GSE.train_agglomerative(n_clusters = testsize)
        mean_agg = _store_region_balances(hierarch_results, testsize, GSE.condense_orders())
        print(f"{testsize} clusters hierarch mean balance: {mean_agg}")

        # Stop early once all three models have dropped below a mean balance of 0.2.
        if all(mean < 0.2 for mean in (mean_kmeans, mean_agg, mean_kmeans_pal)):
            break

In [19]:
# import json
# # Writing to a JSON file
# with open('modellenbalanceindividual.json', 'w') as file:
#     json.dump(results_dict, file, indent=4)  # indent=4 is optional for pretty printing

In [20]:
# Imported here because no earlier active cell imports json; without it this
# cell raises NameError after a kernel restart.
import json

with open('modellenbalanceindividual.json', 'r') as file:
    loaded_results = json.load(file)

# JSON object keys are always strings; turn the cluster counts back into ints
# so the dict has the same shape as the one the training cell builds.
results_dict = {
    company: {
        model: {int(size): balances for size, balances in per_size.items()}
        for model, per_size in models.items()
    }
    for company, models in loaded_results.items()
}

## Global model calculated per customer

In [None]:
def _store_company_balances(model_results, n_clusters, company_rows, region_names):
    """Store one company's balance per region column and return their mean.

    Balances are written to model_results[n_clusters][column] for every
    column in `region_names`.
    """
    for name in region_names:
        balance = check_cluster_balance(company_rows, name)
        model_results.setdefault(n_clusters, {})[name] = balance
    return np.mean([model_results[n_clusters][name] for name in region_names])


GSE.set_verbose(False)

# results_dict2[company][model][n_clusters][region column] -> balance
results_dict2 = {}
testsizes = list(range(1,250))

# One shared model trained on all clients; balance is then measured per client.
df_to_use = df_orders
GSE.set_input_df(df_to_use)
GSE.clean_input_df()

for testsize in testsizes:
    print(testsize)

    # KMeans weighted by pallet places
    GSE.train_kmeans(testsize, "PALLETPLAATSEN")
    df_condensed = GSE.condense_orders()
    region_columns = [col for col in df_condensed.columns if col.startswith('REGION_')]
    for company in companies_list:
        company_results = results_dict2.setdefault(company, {})
        model_results = company_results.setdefault("kmeans_pallets", {})
        temp_df = df_condensed[df_condensed['OPDRACHTGEVERNAAM'] == company]
        mean_kmeans = _store_company_balances(model_results, testsize, temp_df, region_columns)
        print(f"{company} clusters kmeans pallets mean balance: {mean_kmeans}")

    # KMeans weighted by order count
    GSE.train_kmeans(testsize, "SHIPMENT_COUNT")
    df_condensed = GSE.condense_orders()
    region_columns = [col for col in df_condensed.columns if col.startswith('REGION_')]
    for company in companies_list:
        model_results = results_dict2[company].setdefault("kmeans_ordercount", {})
        temp_df = df_condensed[df_condensed['OPDRACHTGEVERNAAM'] == company]
        mean_kmeans_pal = _store_company_balances(model_results, testsize, temp_df, region_columns)
        print(f"{company} clusters kmeans ordercount mean balance: {mean_kmeans_pal}")

    # Hierarchical (agglomerative) clustering
    GSE.train_agglomerative(n_clusters = testsize)
    df_condensed = GSE.condense_orders()
    region_columns = [col for col in df_condensed.columns if col.startswith('REGION_')]
    for company in companies_list:
        model_results = results_dict2[company].setdefault("hierarch", {})
        temp_df = df_condensed[df_condensed['OPDRACHTGEVERNAAM'] == company]
        mean_agg = _store_company_balances(model_results, testsize, temp_df, region_columns)
        print(f"{company} clusters hierarch mean balance: {mean_agg}")
    

In [None]:
# import json
# # Writing to a JSON file
# with open('modellenbalancecommon_gse.json', 'w') as file:
#     json.dump(results_dict2, file, indent=4)  # indent=4 is optional for pretty printing

# Plotting of various statistics

In [23]:
import json


def _load_balance_results(path):
    """Read a saved balance dict and turn its cluster-count keys back into ints.

    The file is expected to hold company -> model -> cluster count -> balances;
    JSON stores the cluster counts as strings.
    """
    with open(path, 'r') as file:
        raw = json.load(file)
    return {
        company: {
            model: {int(size): balances for size, balances in per_size.items()}
            for model, per_size in models.items()
        }
        for company, models in raw.items()
    }


# Global-model results
results_dict2 = _load_balance_results('modellenbalancecommon_gse.json')

# Per-company-model results
results_dict = _load_balance_results('modellenbalanceindividual.json')
            
            
            

## All companies with global model

In [None]:
import matplotlib.pyplot as plt

# One line per company: mean region balance of the hierarchical model
# against the number of clusters, taken from results_dict2.
fig, ax = plt.subplots(figsize=(8, 6))
for company in results_dict2:
    per_size = results_dict2[company]["hierarch"]
    testsizes = list(per_size)
    companylist = [np.mean(list(balances.values())) for balances in per_size.values()]
    ax.plot(testsizes, companylist, '-', linewidth=1)
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Mean balance')
ax.set_title(f'Balance of clusters with a global model (hierarchical)')
ax.grid(True)
plt.show()


## All companies with individual models

In [None]:
import matplotlib.pyplot as plt

# One line per company: mean region balance of the hierarchical model
# against the number of clusters, taken from results_dict.
fig, ax = plt.subplots(figsize=(8, 6))
for company in results_dict:
    per_size = results_dict[company]["hierarch"]
    testsizes = list(per_size)
    companylist = [np.mean(list(balances.values())) for balances in per_size.values()]
    ax.plot(testsizes, companylist, '-', linewidth=1)
ax.set_xlabel('Number of clusters (k)')
ax.set_ylabel('Mean balance')
ax.set_title(f'Balance of clusters with company specific cluster models (hierarchical)')
ax.grid(True)
plt.show()

## Per company a graph for balance+variance


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Palette 0 is used for the global model, palette 1 for the per-company models.
colordict = {0: {"kmeans_pallets": "#1f77b4", "kmeans_ordercount": "#ff7f0e", "hierarch": "#2ca02c"},
             1: {"kmeans_pallets": "#9467bd", "kmeans_ordercount": "#d62728", "hierarch": "#17becf"}}
# Human-readable legend text per model key.
model_names = {"kmeans_pallets": "KMeans Weighed By Pallets",
               "kmeans_ordercount": "KMeans Weighed By Order Count",
               "hierarch": "Hierarchical"}

# final_dict supplies the anonymised company names; it is built in the
# "Define definitive number of clusters for companies" cell, which has to be
# run before this one.
for company in results_dict2:
    plt.figure(figsize=(8, 6))

    # results_dict2 comes from modellenbalancecommon_gse.json (global model),
    # results_dict from modellenbalanceindividual.json (per-company models).
    # Each line carries its own label so the legend cannot drift out of step
    # with the plotting order; the old positional legend list named the global
    # lines "Per Company Model" and vice versa.
    sources = (
        ("Global Model", colordict[0], results_dict2[company]),
        ("Per Company Model", colordict[1], results_dict[company]),
    )
    for legend_prefix, palette, company_results in sources:
        # `model_type` rather than `type`, which would shadow the builtin
        # for the rest of the notebook.
        for model_type, per_size in company_results.items():
            testsizes = list(per_size)
            companylist = [np.mean(list(balances.values())) for balances in per_size.values()]
            plt.plot(testsizes, companylist, '-', color=palette[model_type], linewidth=2,
                     label=f"{legend_prefix}-{model_names[model_type]}")

    plt.xlabel('Number of clusters (k)', fontsize=14)
    plt.ylabel('Mean balance', fontsize=14)
    plt.xlim(0, 150)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.title(f'Balance-score against number of clusters with various \n clustering models for {final_dict[company]["fakename"]}', fontsize=16)
    plt.legend(fontsize=12)
    plt.grid(True)
    plt.show()


## Single graph with mean model scores

In [None]:
# aggregated_*[model][n_clusters] -> list with one mean balance per company.
# The "2" variant aggregates results_dict2 (global model), the other one
# results_dict (per-company models).
aggregated_results_dict2 = {}
aggregated_results_dict = {}

for per_company_results, aggregated in ((results_dict2, aggregated_results_dict2),
                                        (results_dict, aggregated_results_dict)):
    for company in per_company_results:
        # `model_type` rather than `type`, which would shadow the builtin
        # for the rest of the notebook.
        for model_type, per_size in per_company_results[company].items():
            model_bucket = aggregated.setdefault(model_type, {})
            for testsize, balances in per_size.items():
                mean_agg = np.mean(list(balances.values()))
                model_bucket.setdefault(testsize, []).append(mean_agg)

# Human-readable legend text per model key.
model_names = {"kmeans_pallets": "KMeans Weighed By Pallets",
               "kmeans_ordercount": "KMeans Weighed By Order Count",
               "hierarch": "Hierarchical"}

# Create a single plot
plt.figure(figsize=(10, 8))

# Each line carries its own label so the legend always matches the data; the
# old positional legend list named the global lines "Per Company Model" and
# vice versa.
for legend_prefix, line_style, aggregated in (("Global Model", '-', aggregated_results_dict2),
                                              ("Per Company Model", '--', aggregated_results_dict)):
    for model_type, per_size in aggregated.items():
        testsizes = sorted(per_size.keys())
        mean_values = [np.mean(per_size[testsize]) for testsize in testsizes]
        plt.plot(testsizes, mean_values, line_style, linewidth=2,
                 label=f"{legend_prefix}-{model_names[model_type]}")

plt.xlabel('Number of clusters (k)', fontsize=14)
plt.ylabel('Mean balance', fontsize=14)
plt.xlim(0, 150)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.title('Balance of clusters with various cluster models (Aggregated)', fontsize=16)

plt.legend(fontsize=14)
plt.grid(True)
plt.show()

In [None]:
def _report_first_size_below(aggregated, threshold=0.5):
    """Print, per model, the smallest cluster count whose mean balance is below `threshold`.

    `aggregated` maps model -> cluster count -> list of per-company means.
    A model that never drops below the threshold is reported as such instead
    of silently printing its last (or a stale) cluster count.
    """
    for model, per_size in aggregated.items():
        for size in sorted(per_size):
            avg = np.mean(per_size[size])
            if avg < threshold:
                print(f"model: {model}, size: {size}, mean: {avg}")
                break
        else:
            print(f"model: {model}, mean balance never drops below {threshold}")


# common (global model)
print("global")
_report_first_size_below(aggregated_results_dict2)

# individual (per-company models)
print("individual")
_report_first_size_below(aggregated_results_dict)


## Per company two graphs for balance and std (anonymized)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import string

# Color dictionary (only palette 0 is used in this cell; the two model
# families are told apart by line style).
colordict = {0: {"kmeans_pallets": "#1f77b4", "kmeans_ordercount": "#ff7f0e", "hierarch": "#2ca02c"},
             1: {"kmeans_pallets": "#9467bd", "kmeans_ordercount": "#d62728", "hierarch": "#17becf"}}

# Letters used as anonymous company names; they repeat after 26 companies.
company_labels = list(string.ascii_uppercase)

for company_counter, company in enumerate(results_dict2):
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # results_dict2 comes from modellenbalancecommon_gse.json (common/global
    # model), results_dict from modellenbalanceindividual.json (individual
    # models). The label prefixes used to be the other way round.
    sources = (
        ("common", '-', results_dict2[company]),
        ("individual", '--', results_dict[company]),
    )
    for label_prefix, line_style, company_results in sources:
        # `model_type` rather than `type`, which would shadow the builtin
        # for the rest of the notebook.
        for model_type, per_size in company_results.items():
            testsizes = list(per_size)
            companylist = [np.mean(list(balances.values())) for balances in per_size.values()]
            varlist = [np.std(list(balances.values())) for balances in per_size.values()]

            # Mean balance on the left subplot
            axes[0].plot(testsizes, companylist, line_style, color=colordict[0][model_type],
                         linewidth=2, label=f"{label_prefix}_{model_type}")
            # Standard deviation of the balance on the right subplot
            axes[1].plot(testsizes, varlist, line_style, color=colordict[0][model_type],
                         linewidth=2, label=f"{label_prefix}_{model_type}_var")

    company_label = company_labels[company_counter % len(company_labels)]

    # Labeling and formatting for the left subplot
    axes[0].set_xlabel('Number of clusters (k)')
    axes[0].set_ylabel('Mean balance')
    axes[0].set_xlim(0, 150)
    axes[0].set_title(f'Balance of clusters for Company {company_label}')
    axes[0].legend()
    axes[0].grid(True)

    # Labeling and formatting for the right subplot
    axes[1].set_xlabel('Number of clusters (k)')
    axes[1].set_ylabel('Standard Deviation')
    axes[1].set_xlim(0, 150)
    axes[1].set_title(f'Variability of clusters for Company {company_label}')
    axes[1].legend()
    axes[1].grid(True)

    plt.tight_layout()
    plt.show()


## Two graphs for all companies model Balance and STD

In [None]:
# Color dictionary: palette 0 for the global model, palette 1 for the per-company models.
colordict = {0: {"kmeans_pallets": "#1f77b4", "kmeans_ordercount": "#ff7f0e", "hierarch": "#2ca02c"},
             1: {"kmeans_pallets": "#9467bd", "kmeans_ordercount": "#d62728", "hierarch": "#17becf"}}


def _aggregate_balance_stats(per_company_results):
    """Collect each company's mean and std of region balances.

    Returns two dicts shaped model -> cluster count -> list of per-company
    values: the first with means, the second with standard deviations.
    """
    means, stds = {}, {}
    for company_results in per_company_results.values():
        # `model_type` rather than `type`, which would shadow the builtin.
        for model_type, per_size in company_results.items():
            model_means = means.setdefault(model_type, {})
            model_stds = stds.setdefault(model_type, {})
            for testsize, balances in per_size.items():
                values = list(balances.values())
                model_means.setdefault(testsize, []).append(np.mean(values))
                model_stds.setdefault(testsize, []).append(np.std(values))
    return means, stds


def _plot_aggregated(ax, aggregated, line_style, palette, label_prefix, label_suffix=""):
    """Plot, per model, the across-company average of `aggregated` against cluster count."""
    for model_type, per_size in aggregated.items():
        testsizes = sorted(per_size.keys())
        averaged = [np.mean(per_size[testsize]) for testsize in testsizes]
        ax.plot(testsizes, averaged, line_style, linewidth=2, color=palette[model_type],
                label=f"{label_prefix}_{model_type}{label_suffix}")


# results_dict2 comes from modellenbalancecommon_gse.json (common/global model),
# results_dict from modellenbalanceindividual.json (individual models).
aggregated_means_dict2, aggregated_stds_dict2 = _aggregate_balance_stats(results_dict2)
aggregated_means_dict, aggregated_stds_dict = _aggregate_balance_stats(results_dict)

# Create a single figure with two subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left subplot: aggregated mean balance. The label prefixes used to be the
# other way round ("individual" on the common model's data).
_plot_aggregated(axes[0], aggregated_means_dict2, '-', colordict[0], "common")
_plot_aggregated(axes[0], aggregated_means_dict, '--', colordict[1], "individual")

axes[0].set_xlabel('Number of clusters (k)')
axes[0].set_ylabel('Mean balance in clusters')
axes[0].set_xlim(0, 150)
axes[0].set_title('Mean Balance in clusters for all companies')
axes[0].legend()
axes[0].grid(True)

# Right subplot: aggregated standard deviation of the balance.
_plot_aggregated(axes[1], aggregated_stds_dict2, '-', colordict[0], "common", "_var")
_plot_aggregated(axes[1], aggregated_stds_dict, '--', colordict[1], "individual", "_var")

axes[1].set_xlabel('Number of clusters (k)')
axes[1].set_ylabel('Standard Deviation in cluster balance')
axes[1].set_xlim(0, 150)
axes[1].set_title('Aggregated Standard Deviation for all Companies')
axes[1].legend()
axes[1].grid(True)

plt.tight_layout()
plt.show()

# Define definitive number of clusters for companies


In [None]:
import string
import custom.GeoSpatialEncoder
# Per-company results, keyed by the company name taken from companies_list.
final_dict = {}

def generate_company_labels(num_companies):
    """Return ``num_companies`` unique, spreadsheet-style uppercase labels.

    Labels are produced in the order 'A'..'Z', 'AA'..'ZZ', 'AAA'.. and so on,
    so the sequence never runs out: requesting more than 26 + 26**2 = 702
    labels simply continues with three-letter labels instead of returning a
    list that is shorter than requested.

    Parameters
    ----------
    num_companies : int
        Number of labels to generate.

    Returns
    -------
    list of str
        Exactly ``num_companies`` labels; an empty list when
        ``num_companies`` is zero or negative.
    """
    from itertools import count, islice, product

    if num_companies <= 0:
        return []

    alphabet = string.ascii_uppercase
    # Lazily enumerate all labels of width 1, then width 2, ... and take only
    # as many as are needed.
    all_labels = (
        "".join(letters)
        for width in count(1)
        for letters in product(alphabet, repeat=width)
    )
    return list(islice(all_labels, num_companies))

# One anonymised label per company, in the same order as companies_list.
company_labels = generate_company_labels(len(companies_list))

for i, company in enumerate(companies_list):
    company_label = "Company " + company_labels[i]
    entry = {}
    final_dict[company] = entry

    # Rows of this company with AFHCODE 'd'.
    is_company = df_orders['OPDRACHTGEVERNAAM'] == company
    has_code_d = df_orders['AFHCODE'] == 'd'
    df_to_use = df_orders[is_company & has_code_d]

    # Scan cluster counts 1..249 and stop at the first one whose mean over all
    # regions drops below 0.5; if none does, n ends at 249.
    # NOTE(review): n is the first count *below* the threshold, not the last
    # one above it - confirm this is the intended "definitive" cluster count.
    ordercount_results = results_dict2[company]['kmeans_ordercount']
    for n in range(1, 250):
        per_region = ordercount_results[n]
        mean_agg = np.mean([per_region[region] for region in per_region])
        if mean_agg < 0.5:
            break
    print(f"Company: {company}, Number of clusters: {n}")

    entry["nclusters"] = n
    entry["fakename"] = company_label

    # Train a k-means encoder with n clusters on this company's rows.
    encoder = GeoSpatialEncoder(PC_obj)
    entry["model"] = encoder
    encoder.set_verbose(False)
    encoder.set_input_df(df_to_use)
    encoder.clean_input_df()
    encoder.train_kmeans(n, 'SHIPMENT_COUNT')
    entry["CPC_dict"] = encoder.kmeans_dict

    print(f"Model trained for {company_label}")

In [None]:
# Ad-hoc inspection of the keys stored for one company. `company` is whatever
# value the loop variable still holds from the last loop that ran, so the
# output depends on kernel state.
results_dict2[company].keys()

In [None]:
# For every company, record per model (in the iteration order of
# results_dict2[company]) the first test size whose mean over all regions
# falls below 0.5, or the last test size visited when none does.
temp_dict = {}
for company in companies_list:
    temp_dict[company] = []
    for model in results_dict2[company]:
        for testsize in results_dict2[company][model]:
            per_region = results_dict2[company][model][testsize]
            mean_agg = np.mean([per_region[region] for region in per_region])
            if mean_agg < 0.5:
                break
        temp_dict[company].append(testsize)

# Count, per 1-based model position, how often that model reaches the largest
# recorded test size of a company (ties count for every tied position).
# Positions 1-3 are pre-seeded so they always show up in the output; further
# positions are added on demand instead of raising KeyError.
position_counts = {1: 0, 2: 0, 3: 0}
for company, values in temp_dict.items():
    if not values:
        # No models recorded for this company: nothing to rank.
        continue
    max_value = max(values)
    for position, value in enumerate(values, start=1):
        if value == max_value:
            position_counts[position] = position_counts.get(position, 0) + 1

position_counts

In [None]:
# # remove GSE object from dictionary:
# final_dict_export = {}
# for company in companies_list:
#     final_dict_export[company] = final_dict[company].copy()
#     final_dict_export[company].pop('model', None)


# # save final_dict to a json file
# import json
# with open('clusters_final_dict.json', 'w') as file:
#     json.dump(final_dict_export, file, indent=4)  # indent=4 is optional for pretty printing

In [5]:
# Load the per-company cluster settings from 'clusters_final_dict.json'
# into final_dict.
import json

with open('clusters_final_dict.json', 'r') as json_file:
    final_dict = json.load(json_file)

## Plot the clusters for each company

In [None]:
# Re-create and train a k-means encoder for every company in final_dict,
# using the stored 'nclusters' value, and attach it under the 'model' key.
for company, settings in final_dict.items():
    print(f"Company: {company}, Number of clusters: {settings['nclusters']}")

    # Rows of this company with AFHCODE 'd'.
    is_company = df_orders['OPDRACHTGEVERNAAM'] == company
    has_code_d = df_orders['AFHCODE'] == 'd'
    df_to_use = df_orders[is_company & has_code_d]

    encoder = GeoSpatialEncoder(PC_obj)
    settings["model"] = encoder
    encoder.set_verbose(False)
    encoder.set_input_df(df_to_use)
    encoder.clean_input_df()
    encoder.train_kmeans(settings['nclusters'], 'SHIPMENT_COUNT')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
# Draw one map per company: country outlines, the clustered locations coloured
# by cluster, and a red '+' on every cluster centre.
for company in companies_list:
    entry = final_dict[company]
    fakename = entry["fakename"]
    encoder = entry["model"]

    # One row per LOS_CPC, keeping the first value of each plotted column.
    first_value_columns = ['PALLETPLAATSEN', 'LOS_LAT', 'LOS_LON', 'COORDINATES', 'CLUSTER']
    df_plot = (
        encoder.df_CPC_kmeans
        .groupby('LOS_CPC')
        .agg({column: 'first' for column in first_value_columns})
        .reset_index()
    )
    cluster_info = encoder.return_cluster_kmeans_info()
    n_clusters = encoder.kmeans_n_clusters

    # Base map with the outlines of the countries
    fig, ax = plt.subplots(figsize=(10, 10))
    encoder.countries.boundary.plot(ax=ax, linewidth=1, edgecolor='black')

    # Locations coloured by cluster; the automatic legend is replaced below
    sns.scatterplot(x="LOS_LON", y="LOS_LAT", hue="CLUSTER", data=df_plot,
                    palette="tab20", ax=ax, legend=False)

    # Cluster centres
    for _, center in cluster_info.iterrows():
        ax.plot(center['LOS_LON'], center['LOS_LAT'], 'r+', markersize=10)

    ax.set_title(f"{n_clusters} clusters for {fakename}", fontsize=16)
    ax.set_xlabel("Longitude", fontsize=14)
    ax.set_ylabel("Latitude", fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)

    # Custom legend: proxy markers for the two kinds of points
    center_handle = mlines.Line2D([], [], color='red', marker='+', linestyle='None',
                                  markersize=10, label='Cluster Center')
    location_handle = mlines.Line2D([], [], color='grey', marker='o', linestyle='None',
                                    markersize=10, label='Delivery Location')
    ax.legend(handles=[center_handle, location_handle], fontsize=12, loc='upper left')

    # Fixed integer ticks: 3..7 on the x axis, 50..53 on the y axis
    ax.set_xticks(np.arange(3, 8))
    ax.set_yticks(np.arange(50, 54))

    # NOTE(review): the path is a fixed placeholder, so every iteration writes
    # to the same file - include the company label once a real path is set.
    plt.savefig(f"FILEPATH HERE", bbox_inches='tight')

    plt.show()

In [None]:
# Let each company's encoder draw its own k-means cluster plot and print the
# cluster count and anonymised name alongside it.
separator = "===================================="
for company in companies_list:
    print(f"Company: {company}")
    print(separator)

    entry = final_dict[company]
    encoder = entry['model']
    plot_title = f"{entry['nclusters']} clusters for {entry['fakename']}"
    encoder.plot_clusters("kmeans", plot_title, f"FILEPATH HERE")

    print(f"Number of clusters: {entry['nclusters']}")
    print(f"Fake name: {entry['fakename']}")
    print(separator)

In [None]:
# Ad-hoc inspection: the entry stored under key 9 of 'kmeans_ordercount' for one
# hard-coded company (presumably the per-region values for 9 clusters - confirm).
results_dict2["HUSK_MEDICAL_BV"]["kmeans_ordercount"][9]

In [None]:
# Ad-hoc inspection of one hard-coded company's 'kmeans_ordercount' entry at
# key 9; this cell repeats the lookup of the cell before it.
results_dict2["HUSK_MEDICAL_BV"]["kmeans_ordercount"][9]

In [None]:
# Display the assembled per-company dictionary as the cell output.
final_dict
