In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from tqdm import tqdm
import seaborn as sns
from datetime import time, timedelta
from custom.GeoSpatialEncoder import GeoSpatialEncoder
from custom.PC_Class import PC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer, mean_squared_error, r2_score
import xgboost as xgb
import warnings
import os
warnings.filterwarnings("ignore", category=FutureWarning, module="seaborn")

def load_data(file_path):
    """Read the orders CSV in chunks behind a progress bar and return one DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        CSV file to read. The first column becomes the index and the columns
        listed in ``datetime_cols`` are parsed as datetimes.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated in file order.
    """
    datetime_cols = ['CREATIONDATETIME', 'LAAD_DATETIME_VAN', 'LAAD_DATETIME_TOT', 'LOS_DATETIME_VAN', 'LOS_DATETIME_TOT', '15CREATIONDATETIME']
    chunk_size = 10000

    # Count physical lines only to size the progress bar. The context manager
    # closes the handle, which the previous one-liner left open.
    with open(file_path, 'r', encoding='utf-8') as handle:
        total_lines = sum(1 for _ in handle)
    # One line is the header; round up so a partial final chunk is counted.
    # The figure is an estimate: quoted fields containing newlines span several lines.
    data_rows = max(total_lines - 1, 0)
    total_chunks = -(-data_rows // chunk_size)

    # Kept for backward compatibility: registers the pandas `progress_apply` helpers.
    tqdm.pandas(desc="Reading CSV")
    chunks = pd.read_csv(file_path, chunksize=chunk_size, iterator=True, index_col=0, parse_dates=datetime_cols)

    df_orders = pd.concat(tqdm(chunks, total=total_chunks, desc="Reading CSV"))
    print("Lenght of input data:", str(len(df_orders)))
    return df_orders


In [None]:
# Resolve the training data relative to the current working directory.
# os.path.join picks the platform separator instead of hand-written slashes.
direct = os.getcwd()
file_path = os.path.join(direct, "data", "vos_input_data", "MultiHubData3_training.csv")
PC_obj = PC()
print("PC object created")
df_orders = load_data(file_path)

# Load companies information from file

In [None]:
# Load the per-company cluster configuration saved to JSON
import json
with open('clusters_final_dict.json', 'r') as file:
    clusters_dict = json.load(file)

# Build one GeoSpatialEncoder per company, fed only with that company's orders
for company, company_info in clusters_dict.items():
    print(company)
    df_to_use = df_orders[(df_orders['OPDRACHTGEVERNAAM'] == company)]
    # Store the encoder first so the entry exists even if a later step fails
    company_info["model"] = encoder_model = GeoSpatialEncoder(PC_obj)
    encoder_model.set_verbose(False)
    encoder_model.set_input_df(df_to_use)
    encoder_model.clean_input_df()
    encoder_model.train_kmeans(company_info['nclusters'], 'SHIPMENT_COUNT')
    # clusters_dict[company]["CPC_dict2"] = clusters_dict[company]['model'].kmeans_dict
    # equal = clusters_dict[company]["CPC_dict2"] == clusters_dict[company]["CPC_dict"]
    # print(f"Loaded dict equals saved dict: {equal}")

# Test the model types


### Miscellaneous

In [4]:
def create_binary_and_scaled_columns(df_to_use):
    """Append BIN_/SCALED_ companions for every REGION_ column and drop empty regions.

    For each column whose name starts with ``REGION_``:
    * ``BIN_<col>`` is 1 where the value is greater than 0, else 0;
    * ``SCALED_<col>`` is currently an unscaled copy of the column (the division
      by a pallet total is disabled in this version).

    Regions whose column sums to 0 are removed together with their BIN_ and
    SCALED_ companions. The input frame is not modified; a new frame is returned.
    """
    region_cols = [name for name in df_to_use.columns if name.startswith('REGION_')]
    regions = df_to_use[region_cols]

    # Derived target columns, named after the region they come from
    binary_part = regions.gt(0).astype(int).add_prefix("BIN_")
    scaled_part = regions.add_prefix("SCALED_")

    combined = pd.concat([df_to_use, binary_part, scaled_part], axis=1)

    # Regions without any demand carry no signal
    region_totals = combined[region_cols].sum()
    empty_regions = region_totals[region_totals == 0].index.tolist()

    to_drop = []
    for prefix in ("", "BIN_", "SCALED_"):
        to_drop.extend(prefix + name for name in empty_regions)
    combined = combined.drop(columns=to_drop)

    print(f"Removed {len(empty_regions)} regions with no demand")
    return combined


def filter_train_test_split(df_to_use):
    """Encode/scale the features and split them into classification and regression sets.

    Steps:
    1. add BIN_REGION_* / SCALED_REGION_* columns via
       ``create_binary_and_scaled_columns``;
    2. one-hot encode the categorical features and min-max scale the continuous ones;
    3. split 80/20 with ``random_state=42``;
    4. build the classifier sets (features -> BIN_REGION_*) and the regressor sets
       (features + BIN_REGION_* -> SCALED_REGION_*).

    Returns
    -------
    tuple
        ``(X_train_class, Y_train_class, X_test_class, Y_test_class,
        X_train_reg, Y_train_reg, X_test_reg, Y_test_reg, scaler, encoder)``
    """
    df_to_use = df_to_use.copy()
    categorical_features = ['LAADPC', 'dayofweekcreation', 'weeknr']
    continuous_features = ['PALLETPLAATSEN', 'AANTALORDERS']

    df_to_use = create_binary_and_scaled_columns(df_to_use)

    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    scaler = MinMaxScaler()

    # NOTE(review): encoder and scaler are fitted on the full data set, i.e. before
    # the train/test split, so test rows influence the scaling range and the
    # category vocabulary. Kept as-is to preserve results; confirm this is intended.
    encoded_categorical_features = encoder.fit_transform(df_to_use[categorical_features])
    scaled_continuous_features = scaler.fit_transform(df_to_use[continuous_features])

    # Names of every model input column, in encoded-then-scaled order
    encoded_feature_names = encoder.get_feature_names_out(categorical_features)
    scaled_feature_names = scaler.get_feature_names_out(continuous_features)
    feature_names = list(np.concatenate([encoded_feature_names, scaled_feature_names]))

    df_encoded = pd.DataFrame(encoded_categorical_features, columns=encoded_feature_names, index=df_to_use.index)
    df_scaled = pd.DataFrame(scaled_continuous_features, columns=continuous_features, index=df_to_use.index)

    # Replace the raw feature columns with their encoded/scaled versions
    df_processed = pd.concat([df_to_use.drop(columns=categorical_features + continuous_features), df_encoded, df_scaled], axis=1)

    train_df, test_df = train_test_split(df_processed, test_size=0.2, random_state=42)

    classification_targets = [col for col in df_to_use.columns if col.startswith('BIN_REGION_')]  # targets for classification model
    regression_targets = [col for col in df_to_use.columns if col.startswith('SCALED_REGION_')]  # targets for regression model

    # Classification: features only
    X_train_class = train_df[feature_names]
    Y_train_class = train_df[classification_targets]
    X_test_class = test_df[feature_names]
    Y_test_class = test_df[classification_targets]

    # Regression: features plus the true binary region indicators
    regression_inputs = feature_names + classification_targets
    X_train_reg = train_df[regression_inputs]
    Y_train_reg = train_df[regression_targets]
    X_test_reg = test_df[regression_inputs]
    Y_test_reg = test_df[regression_targets]

    return X_train_class, Y_train_class, X_test_class, Y_test_class, X_train_reg, Y_train_reg, X_test_reg, Y_test_reg, scaler, encoder

def softmax_with_zeros(x):
    """Softmax over the non-zero entries of a Series; zero entries stay 0.

    The maximum of the non-zero entries is subtracted before exponentiation.
    Returns a float Series carrying the index of ``x``. If every entry is zero
    the result is all zeros.
    """
    result = np.zeros_like(x, dtype=float)
    active = x != 0
    if not np.any(active):
        return pd.Series(result, index=x.index)

    selected = x[active]
    # Work on a plain ndarray so the normalising sum matches element order exactly
    weights = np.asarray(np.exp(selected - np.max(selected)))
    result[active] = weights
    result[active] = result[active] / result[active].sum()
    return pd.Series(result, index=x.index)

def eval_output(Y_test, Y_pred, company, model_name, extra_info=""):
    """Print and return regression metrics for one company/model combination.

    Parameters
    ----------
    Y_test, Y_pred : array-like
        True and predicted target values, passed straight to the sklearn metrics.
    company, model_name : str
        Only used in the printed header.
    extra_info : str, optional
        Free text appended to the printed header.

    Returns
    -------
    dict
        ``{"MSE": ..., "MAE": ..., "R2": ...}``
    """
    # mean_absolute_error is not part of the notebook-level sklearn import, so the
    # original call raised NameError; import it here to keep the function self-contained.
    from sklearn.metrics import mean_absolute_error

    #Y_pred = Y_pred.abs()
    mse = mean_squared_error(Y_test, Y_pred, multioutput='uniform_average')
    mae = mean_absolute_error(Y_test, Y_pred)
    r2 = r2_score(Y_test, Y_pred)

    print(f'Metrics for {company} with {model_name} {extra_info}:')
    print(f'Mean Squared Error: {mse}')
    print(f'Mean Absolute Error: {mae}')
    print(f'R-squared: {r2}')
    print("\n")
    return {"MSE": mse,
            "MAE": mae, 
            "R2": r2}

def get_binary_columns(df):
    """Return the sub-frame made of the columns whose name starts with 'BIN_REGION_'."""
    wanted = []
    for name in df.columns:
        if name.startswith('BIN_REGION_'):
            wanted.append(name)
    return df[wanted]


def filter_by_binary(predicteddf, testdf, binarydf):
    """
    Keep only the predicted/true values whose binary indicator equals 1.

    Parameters:
    - predicteddf: DataFrame of predicted values
    - testdf: DataFrame of true values
    - binarydf: DataFrame of binary values (1 or 0)

    Returns:
    - Tuple ``(filtered_predicted, filtered_test)`` of 1-D numpy arrays holding the
      selected cells in row-by-row (row-major) order.

    Side effects (kept for backward compatibility):
    - all three input frames get their index reset in place;
    - ``predicteddf`` and ``binarydf`` take over the column labels of ``testdf``.

    Raises:
    - AssertionError if the three frames do not share the same shape.
    """
    assert predicteddf.shape == testdf.shape == binarydf.shape, "All input DataFrames must have the same shape"

    predicteddf.reset_index(drop=True, inplace=True)
    testdf.reset_index(drop=True, inplace=True)
    binarydf.reset_index(drop=True, inplace=True)

    predicteddf.columns = testdf.columns
    binarydf.columns = testdf.columns

    # One boolean mask over the whole table replaces the per-row Python loop;
    # boolean indexing of a 2-D array yields the selected cells row by row.
    selected = binarydf.to_numpy() == 1
    filtered_predicted = predicteddf.to_numpy()[selected]
    filtered_test = testdf.to_numpy()[selected]

    return filtered_predicted, filtered_test

def create_dirty_data(X_test_reg, Y_pred_class):
    """Swap the BIN_REGION_* columns of the regression inputs for predicted ones.

    ``Y_pred_class`` is relabelled in place with the BIN_REGION_* column names found
    in ``X_test_reg``. The returned frame holds the remaining columns of
    ``X_test_reg`` (index reset) followed by the predicted columns, joined by index.
    ``X_test_reg`` itself is left untouched.
    """
    predicted_cols = [name for name in X_test_reg.columns if name.startswith('BIN_REGION_')]
    Y_pred_class.columns = predicted_cols

    # Drop the true indicators and line the rows up on a fresh 0..n-1 index
    features_only = X_test_reg.reset_index(drop=True).drop(columns=predicted_cols)
    return pd.concat([features_only, Y_pred_class], axis=1)

def evaluate_model(X_train, X_test, Y_train, Y_test, model, model_name, company=None):
    """Fit a multi-label classifier, print its test metrics and return them.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices.
    Y_train, Y_test : pandas.DataFrame
        One binary column per target.
    model : estimator
        Fitted in place. For ``model_name == 'AdaBoost'`` the same estimator is
        re-fitted once per target column and the predictions are stacked.
    model_name : str
        Label used in the printout and stored in the result.
    company : str, optional
        Name shown in the printed header. When omitted, the notebook-level
        variable ``company`` is used, which is what the function relied on before
        this parameter existed.

    Returns
    -------
    dict
        Keys ``model``, ``accuracy``, ``precision``, ``recall``, ``f1``.
        Accuracy is computed over all flattened label cells; precision, recall and
        F1 are macro averages with ``zero_division=1``.
    """
    if company is None:
        # Backward compatibility for callers that rely on the enclosing loop variable
        company = globals().get('company', 'unknown company')

    if model_name == 'AdaBoost':
        Y_pred_aggregate = np.array([model.fit(X_train, Y_train[target]).predict(X_test) for target in Y_train.columns]).T
        Y_pred_df = pd.DataFrame(Y_pred_aggregate, columns=Y_train.columns)
    else:
        model.fit(X_train, Y_train)
        Y_pred_df = pd.DataFrame(model.predict(X_test), columns=Y_test.columns)

    zerodiv = 1
    accuracy = accuracy_score(Y_test.values.flatten(), Y_pred_df.values.flatten())
    precision = precision_score(Y_test.values, Y_pred_df.values, average='macro', zero_division=zerodiv)
    recall = recall_score(Y_test.values, Y_pred_df.values, average='macro', zero_division=zerodiv)
    f1 = f1_score(Y_test.values, Y_pred_df.values, average='macro', zero_division=zerodiv)

    print(f'Metrics for {company} with {model_name}:')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')
    print("\n")

    return {
        'model': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


# Classifier selection


### Models test


In [5]:
import warnings
import re

# `message` is a regular expression matched against the start of the warning text,
# so messages containing regex metacharacters must be escaped.

# Suppress the specific sklearn warning
warnings.filterwarnings("ignore", message="X does not have valid feature names, but KMeans was fitted with feature names")

# Suppress the specific SettingWithCopyWarning
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)

# Suppress specific FutureWarnings
warnings.filterwarnings("ignore", category=FutureWarning, message=re.escape("DataFrame.applymap has been deprecated. Use DataFrame.map instead."))

# SAMME.R deprecation: the unescaped variant of this filter was removed because its
# "(the default)" was read as a regex group and never matched the literal parentheses.
warnings.filterwarnings("ignore", category=FutureWarning, message=re.escape("The SAMME.R algorithm (the default) is deprecated and will be removed in 1.6. Use the SAMME algorithm to circumvent this warning."))


In [None]:
# Compare three classifier families per company on binary "region has demand" targets.
classifier_dict = {}

categorical_features = ['LAADPC', 'dayofweekcreation', 'weeknr']
continuous_features = ['PALLETPLAATSEN', 'AANTALORDERS']
features = categorical_features + continuous_features
for company in clusters_dict:
    classifier_dict[company] = {}
    print(f"Processing {company}")
    df_condensed = clusters_dict[company]["model"].condense_orders()
    # ISO week number of the creation timestamp, used as a categorical feature
    df_condensed["weeknr"] = df_condensed["CREATIONDATETIME"].dt.strftime("%V")
    region_columns = [col for col in df_condensed.columns if col.startswith('REGION_')]

    # Filter data for the specific company
    df_to_use = df_condensed[df_condensed["OPDRACHTGEVERNAAM"] == company]

    # Regions with a non-zero total become the targets
    totals = df_to_use[region_columns].sum()
    non_zero_totals = totals[totals != 0]
    targets = list(non_zero_totals.keys())

    df_to_use = df_to_use[features + targets]

    # Prepare the feature matrix and target matrix
    X = df_to_use[features]
    Y = df_to_use[targets]

    # Convert target values to binary (0 = False, 1 = True).
    # Vectorised comparison instead of the deprecated DataFrame.applymap.
    Y_binary = Y.gt(0).astype(int)

    # Handling categorical variables
    encoder = OneHotEncoder(sparse_output=False)
    encoded_categorical_features = encoder.fit_transform(df_to_use[categorical_features])

    # Scale continuous variables
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_continuous_features = scaler.fit_transform(df_to_use[continuous_features])

    # Combine all features
    X_formatted = np.hstack([scaled_continuous_features, encoded_categorical_features])

    X_train, X_test, Y_train, Y_test = train_test_split(X_formatted, Y_binary, test_size=0.2, random_state=41)

    # Fresh estimators for every company so no fitted state carries over
    models = {
        'Random Forest': RandomForestClassifier(random_state=42, n_estimators=300, min_samples_split=2, min_samples_leaf=1, max_depth=6, bootstrap=False),
        'XGBoost': xgb.XGBClassifier(random_state=42, n_estimators=300, max_depth=6, learning_rate=0.01),
        'AdaBoost': AdaBoostClassifier(random_state=42, n_estimators=300, learning_rate=0.01)
    }
    for model_name, model in models.items():
        classifier_dict[company][model_name] = evaluate_model(X_train, X_test, Y_train, Y_test, model, model_name)

In [None]:
# Aggregate the per-company scores: plain mean and pallet-volume weighted mean.
metric_names = ['accuracy', 'precision', 'recall', 'f1']

# Take the model names from the first company instead of relying on a `company`
# variable leaked by an earlier loop; skip non-score entries such as 'fakename'.
first_company_scores = next(iter(classifier_dict.values()), {})
scored_models = [name for name, scores in first_company_scores.items() if isinstance(scores, dict)]

totals_dict = {}
totals_dict['volume'] = []
for model in scored_models:
    totals_dict[model] = {metric: [] for metric in metric_names}

for company in classifier_dict.keys():
    df = clusters_dict[company]['model'].df_input
    # The filter result was previously discarded; keep only this company's rows
    df = df[df["OPDRACHTGEVERNAAM"] == company]

    totals_dict['volume'].append(df["PALLETPLAATSEN"].sum())
    for model in scored_models:
        for metric in metric_names:
            totals_dict[model][metric].append(classifier_dict[company][model][metric])

# Unweighted means
for model in scored_models:
    print(model)
    for score in totals_dict[model]:
        print(score, np.mean(totals_dict[model][score]))
print("==================================")

# Volume-weighted scores: each company's weight is its share of all pallets
total_pallets = sum(totals_dict['volume'])
totals_dict["weight"] = np.asarray(totals_dict['volume']) / total_pallets
for model in scored_models:
    print(model)
    for score in totals_dict[model]:
        print(score, sum(np.asarray(totals_dict[model][score]) * totals_dict["weight"]))

## plot for each company and the three models

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Copy the display name stored under 'fakename' next to each company's scores
for company in classifier_dict:
    classifier_dict[company]['fakename'] = clusters_dict[company]['fakename']

# First ten companies in dictionary order
biggest_companies_10 = list(classifier_dict.keys())[0:10]
company_labels = [classifier_dict[company]['fakename'] for company in biggest_companies_10]
model_names = ['Random Forest', 'XGBoost', 'AdaBoost']

bar_width = 0.25
index = np.arange(len(biggest_companies_10))
fig, ax = plt.subplots(figsize=(12, 6))

# One group of bars per company, one bar per model within each group
for i, model_name in enumerate(model_names):
    bar_positions = index + i * bar_width
    recall_scores = [classifier_dict[company][model_name]['recall'] for company in biggest_companies_10]
    ax.bar(bar_positions, recall_scores, bar_width, label=model_name)

# Titles, ticks and legend
ax.set_title('Recall Scores for Top 10 Companies Across Models', fontsize=26)
ax.set_ylabel('Recall Scores', fontsize=16)
ax.set_xticks(index + bar_width)
ax.set_xticklabels(company_labels, rotation=45, fontsize=16)  # rotated for readability
plt.yticks(fontsize=16)
ax.legend(fontsize=16, loc='lower left')

plt.show()
# plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Copy the display name stored under 'fakename' next to each company's scores
for company in classifier_dict:
    classifier_dict[company]['fakename'] = clusters_dict[company]['fakename']

# First ten companies in dictionary order
biggest_companies_10 = list(classifier_dict.keys())[0:10]
company_labels = [classifier_dict[company]['fakename'] for company in biggest_companies_10]
model_names = ['Random Forest', 'XGBoost', 'AdaBoost']

bar_width = 0.25
index = np.arange(len(biggest_companies_10))
fig, ax = plt.subplots(figsize=(12, 6))

# One group of bars per company, one F1 bar per model within each group
for i, model_name in enumerate(model_names):
    bar_positions = index + i * bar_width
    f1_scores = [classifier_dict[company][model_name]['f1'] for company in biggest_companies_10]
    ax.bar(bar_positions, f1_scores, bar_width, label=model_name)

# Titles, ticks and legend
ax.set_title('F1 Scores for Top 10 Companies Across Models', fontsize=26)
ax.set_ylabel('F1 Scores', fontsize=16)
ax.set_xticks(index + bar_width)
ax.set_xticklabels(company_labels, rotation=45, fontsize=16)  # rotated for readability
ax.legend(fontsize=16, loc='lower left')
plt.yticks(fontsize=16)

plt.show()
# plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Copy the display name stored under 'fakename' next to each company's scores
for company in classifier_dict:
    classifier_dict[company]['fakename'] = clusters_dict[company]['fakename']

# First ten companies in dictionary order
biggest_companies_10 = list(classifier_dict.keys())[0:10]

# Initialize the plot with a larger figure size
fig, ax = plt.subplots(figsize=(12, 6))
bar_width = 0.25
index = np.arange(len(biggest_companies_10))

# Model names
model_names = ['Random Forest', 'XGBoost', 'AdaBoost']

# Plot accuracy scores for each model
for i, model_name in enumerate(model_names):
    accuracy_scores = [classifier_dict[company][model_name]['accuracy'] for company in biggest_companies_10]
    bar_positions = index + i * bar_width
    ax.bar(bar_positions, accuracy_scores, bar_width, label=model_name)

# Tick labels come from the 'fakename' entry of each company
company_labels = [classifier_dict[company]['fakename'] for company in biggest_companies_10]

# Set plot details (label spelling corrected from "Acccuracy")
ax.set_ylabel('Accuracy Scores')
ax.set_title('Accuracy Scores for Top 10 Companies Across Models')
ax.set_xticks(index + bar_width)
ax.set_xticklabels(company_labels, rotation=45)  # Rotate labels for better readability
ax.legend()

plt.show()
# plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Copy the display name stored under 'fakename' next to each company's scores
for company in classifier_dict:
    classifier_dict[company]['fakename'] = clusters_dict[company]['fakename']

# First ten companies in dictionary order
biggest_companies_10 = list(classifier_dict.keys())[0:10]

# Initialize the plot with a larger figure size
fig, ax = plt.subplots(figsize=(12, 6))
bar_width = 0.25
index = np.arange(len(biggest_companies_10))

# Model names
model_names = ['Random Forest', 'XGBoost', 'AdaBoost']

# Plot precision scores for each model
for i, model_name in enumerate(model_names):
    precision_scores = [classifier_dict[company][model_name]['precision'] for company in biggest_companies_10]
    bar_positions = index + i * bar_width
    ax.bar(bar_positions, precision_scores, bar_width, label=model_name)

# Tick labels come from the 'fakename' entry of each company
company_labels = [classifier_dict[company]['fakename'] for company in biggest_companies_10]

# Set plot details (label spelling corrected from "Precission")
ax.set_ylabel('Precision Scores')
ax.set_title('Precision Scores for Top 10 Companies Across Models')
ax.set_xticks(index + bar_width)
ax.set_xticklabels(company_labels, rotation=45)  # Rotate labels for better readability
ax.legend()

plt.show()
# plt.show()


# Classifier CV XGBoost


In [None]:
warnings.filterwarnings("ignore", category=FutureWarning, message=".*applymap has been deprecated.*")

# Grid-search XGBoost per company twice: once maximising macro recall, once macro F1.
categorical_features = ['LAADPC', 'dayofweekcreation', 'weeknr']
continuous_features = ['PALLETPLAATSEN', 'AANTALORDERS']
features = categorical_features + continuous_features

classifier_dict3 = {}

# Loop-invariant search space and scorers
param_grid_xgb = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [20, 30, 40, 50],
    'learning_rate': [0.001, 0.005, 0.01, 0.05, 0.1]
}


def custom_recall(y_true, y_pred):
    """Macro-averaged recall; undefined labels count as 0."""
    return recall_score(y_true, y_pred, average='macro', zero_division=0)


def custom_f1(y_true, y_pred):
    """Macro-averaged F1; undefined labels count as 0."""
    return f1_score(y_true, y_pred, average='macro', zero_division=0)


# (result key, scoring function, label passed to evaluate_model, closing separator)
search_objectives = [
    ('recall', custom_recall, "XGBoostRecall", "----------------------------------"),
    ('f1', custom_f1, "XGBoostF1", "=================================="),
]

for company in clusters_dict: 
    print("==================================")
    print(f"Training for {company}")

    df_condensed = clusters_dict[company]["model"].condense_orders()
    # 'weeknr' is one of the features; derive it here exactly as the other
    # modelling cells do, otherwise the column selection below can fail.
    df_condensed["weeknr"] = df_condensed["CREATIONDATETIME"].dt.strftime("%V")
    classifier_dict3[company] = {}
    # Filter data for the specific company
    df_to_use = df_condensed[df_condensed["OPDRACHTGEVERNAAM"] == company]
    region_columns = [col for col in df_condensed.columns if col.startswith('REGION_')]

    # Regions with a non-zero total become the targets
    totals = df_to_use[region_columns].sum()
    non_zero_totals = totals[totals != 0]
    targets = list(non_zero_totals.keys())

    df_to_use = df_to_use[features + targets]

    # Prepare the feature matrix and target matrix
    X = df_to_use[features]
    Y = df_to_use[targets]

    # Convert target values to binary (0 = False, 1 = True).
    # Vectorised comparison instead of the deprecated DataFrame.applymap.
    Y_binary = Y.gt(0).astype(int)

    # Handling categorical variables
    encoder = OneHotEncoder(sparse_output=False)
    encoded_categorical_features = encoder.fit_transform(df_to_use[categorical_features])

    # Scale continuous variables
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_continuous_features = scaler.fit_transform(df_to_use[continuous_features])

    # Combine all features
    X_formatted = np.hstack([scaled_continuous_features, encoded_categorical_features])

    X_train, X_test, Y_train, Y_test = train_test_split(X_formatted, Y_binary, test_size=0.2, random_state=42)

    for objective, scoring_function, label, separator in search_objectives:
        custom_scorer = make_scorer(scoring_function)
        model = GridSearchCV(xgb.XGBClassifier(random_state=42), param_grid_xgb, cv=3, scoring=custom_scorer, verbose=0)
        model.fit(X_train, Y_train)
        best_model = model.best_estimator_
        best_params = model.best_params_

        scores = evaluate_model(X_train, X_test, Y_train, Y_test, best_model, label)
        scores['best_params'] = best_params
        classifier_dict3[company][objective] = scores

        print(f"Best parameters are: {best_params}")
        # Always report the scores of the objective just tuned (the F1 printout
        # previously indexed with a stale `model_name` key).
        print(f"Accuracy: {scores['accuracy']}, Recall: {scores['recall']}, Precision: {scores['precision']}, F1: {scores['f1']}")
        clusters_dict[company][f"{objective}_params"] = best_params
        print(separator)



 # remove GSE object from dictionary:
import json

# Entries left out of the exported copy of each company's settings
excluded_keys = ('model', 'CPC_dict', 'CPC_dict2')

final_dict_export = {}
for company, company_info in clusters_dict.items():
    final_dict_export[company] = {
        key: value for key, value in company_info.items() if key not in excluded_keys
    }

# Write the stripped dictionary to disk as indented JSON
with open('dict_final_class_parameters.json', 'w') as file:
    json.dump(final_dict_export, file, indent=4)

#### Test Classifier results

In [None]:
import json
# NOTE(review): the export cell writes 'dict_final_class_parameters.json' while this
# cell reads 'dict_incl_class_parameters.json' — confirm which file is intended.
with open('dict_incl_class_parameters.json', 'r') as file:
    classifier_dict = json.load(file)

for company in classifier_dict:
    # Default of None: the keys may already have been stripped before saving
    classifier_dict[company].pop('CPC_dict2', None)
    classifier_dict[company].pop('CPC_dict', None)
    classifier_dict[company]['model'] = clusters_dict[company]['model']

categorical_features = ['LAADPC', 'dayofweekcreation', 'weeknr']
continuous_features = ['PALLETPLAATSEN', 'AANTALORDERS']
features = categorical_features + continuous_features
for company in classifier_dict:
    print(f"Processing {company}")
    df_condensed = clusters_dict[company]["model"].condense_orders()
    # ISO week number of the creation timestamp, used as a categorical feature
    df_condensed["weeknr"] = df_condensed["CREATIONDATETIME"].dt.strftime("%V")
    region_columns = [col for col in df_condensed.columns if col.startswith('REGION_')]

    # Filter data for the specific company
    df_to_use = df_condensed[df_condensed["OPDRACHTGEVERNAAM"] == company]

    # Regions with a non-zero total become the targets
    totals = df_to_use[region_columns].sum()
    non_zero_totals = totals[totals != 0]
    targets = list(non_zero_totals.keys())

    df_to_use = df_to_use[features + targets]

    # Prepare the feature matrix and target matrix
    X = df_to_use[features]
    Y = df_to_use[targets]

    # Convert target values to binary (0 = False, 1 = True).
    # Vectorised comparison instead of the deprecated DataFrame.applymap.
    Y_binary = Y.gt(0).astype(int)

    # Handling categorical variables
    encoder = OneHotEncoder(sparse_output=False)
    encoded_categorical_features = encoder.fit_transform(df_to_use[categorical_features])

    # Scale continuous variables
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_continuous_features = scaler.fit_transform(df_to_use[continuous_features])

    # Combine all features
    X_formatted = np.hstack([scaled_continuous_features, encoded_categorical_features])

    # NOTE(review): the split seed here (41) differs from the grid-search cell (42),
    # so this test set can overlap the rows used for tuning — confirm intended.
    X_train, X_test, Y_train, Y_test = train_test_split(X_formatted, Y_binary, test_size=0.2, random_state=41)

    recall_params = classifier_dict[company]['recall_params']
    f1_params = classifier_dict[company]['f1_params']
    model_recall = xgb.XGBClassifier(random_state=42,
                                     n_estimators=recall_params['n_estimators'],
                                     max_depth=recall_params['max_depth'],
                                     learning_rate=recall_params['learning_rate'])
    model_f1 = xgb.XGBClassifier(random_state=42,
                                 n_estimators=f1_params['n_estimators'],
                                 max_depth=f1_params['max_depth'],
                                 learning_rate=f1_params['learning_rate'])

    classifier_dict[company]['recall_model_scores'] = evaluate_model(X_train, X_test, Y_train, Y_test, model_recall, 'XGBoost_recall')
    classifier_dict[company]['f1_model_scores'] = evaluate_model(X_train, X_test, Y_train, Y_test, model_f1, 'XGBoost_f1')


In [None]:
# Pallet volume per company and its share of the total, used as the weight below
for company in classifier_dict:
    classifier_dict[company]['pallets'] = classifier_dict[company]['model'].df_input['PALLETPLAATSEN'].sum()
total_pallets = sum([classifier_dict[company]['pallets'] for company in classifier_dict])
for company in classifier_dict:
    classifier_dict[company]['weight'] = classifier_dict[company]['pallets'] / total_pallets

# (key of the score dict, text printed after "optimized on ")
score_sets = (('recall_model_scores', 'recall '), ('f1_model_scores', 'f1     '))
metric_names = ('accuracy', 'precision', 'recall', 'f1')

# Unweighted means over all companies
for scores_key, objective_text in score_sets:
    for metric in metric_names:
        values = [classifier_dict[company][scores_key][metric] for company in classifier_dict]
        print(f"Mean {metric} optimized on {objective_text}", np.mean(values))
print(" ==================================")

# Same metrics, each company weighted by its share of the pallet volume
print("weighed scores")
for scores_key, objective_text in score_sets:
    for metric in metric_names:
        weighted = [classifier_dict[company][scores_key][metric] * classifier_dict[company]['weight'] for company in classifier_dict]
        print(f"Mean {metric} optimized on {objective_text}", sum(weighted))

In [None]:
# Up to twenty company keys, ordered by descending pallet volume
biggest_companies = sorted(
    classifier_dict.keys(),
    key=lambda name: classifier_dict[name]['pallets'],
    reverse=True,
)[:20]

def plot_grouped_bars(dataframe, chunk_size, title=None, max_plots=None):
    """Draw one grouped bar chart per ``chunk_size`` rows of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a 'Company' column (used for the x tick labels); the remaining
        columns are handed to ``DataFrame.plot(kind='bar')``.
    chunk_size : int
        Number of rows (companies) per figure.
    title : str, optional
        Title used for every figure. When omitted, each figure gets a title naming
        the 1-based range of rows it shows.
    max_plots : int, optional
        Upper bound on the number of figures drawn.
    """
    n_rows = len(dataframe)
    n_chunks = -(-n_rows // chunk_size)  # ceiling division
    if max_plots is not None:
        n_chunks = min(n_chunks, max_plots)

    for i in range(n_chunks):
        start = i * chunk_size
        chunk = dataframe.iloc[start:start + chunk_size]

        # Use a per-chunk variable: overwriting `title` would freeze the first
        # automatic title for all later figures.
        if title is None:
            chunk_title = f'Model Performance Metrics for Companies {start + 1} to {start + len(chunk)}'
        else:
            chunk_title = title

        ax = chunk.plot(kind='bar', figsize=(12, 8), title=chunk_title)
        ax.set_xlabel('Company')
        ax.set_ylabel('Scores')
        ax.grid(True)
        ax.legend(title='Metric', loc='lower left')
        ax.set_xticks(range(len(chunk)))
        ax.set_xticklabels(chunk['Company'], rotation=45)
        plt.tight_layout()
        plt.show()

# (column title, key of the score dict, metric) in the order the columns should appear
column_specs = [
    ('Recall optimized model - Accuracy', 'recall_model_scores', 'accuracy'),
    ('Recall optimized model - Precision', 'recall_model_scores', 'precision'),
    ('Recall optimized model - Recall', 'recall_model_scores', 'recall'),
    ('Recall optimized model - F1', 'recall_model_scores', 'f1'),
    ('F1 optimized model - Accuracy', 'f1_model_scores', 'accuracy'),
    ('F1 optimized model - Precision', 'f1_model_scores', 'precision'),
    ('F1 optimized model - Recall', 'f1_model_scores', 'recall'),
    ('F1 optimized model - F1', 'f1_model_scores', 'f1'),
]

# One row per company in `biggest_companies`, labelled by its 'fakename'
table_columns = {'Company': [classifier_dict[company]['fakename'] for company in biggest_companies]}
for column_title, scores_key, metric in column_specs:
    table_columns[column_title] = [classifier_dict[company][scores_key][metric] for company in biggest_companies]
data = pd.DataFrame(table_columns)

plot_grouped_bars(data, 10, 'Model Performance Metrics for Top 10 Companies')

# Final parameters for regressors

In [None]:
# Reload the cluster settings and the tuned classifier parameters from disk
import json
with open('clusters_final_dict.json', 'r') as file:
    clusters_dict = json.load(file)

with open('dict_incl_class_parameters.json', 'r') as file:
    classifier_dict = json.load(file)

companies = list(clusters_dict.keys())

# Per company: a GeoSpatialEncoder trained on its orders plus a recall-tuned classifier
for company, company_info in clusters_dict.items():
    df_to_use = df_orders[(df_orders['OPDRACHTGEVERNAAM'] == company)]

    # Store the encoder first so the entry exists even if a later step fails
    company_info["GSE"] = gse = GeoSpatialEncoder(PC_obj)
    gse.set_verbose(False)
    gse.set_input_df(df_to_use)
    gse.clean_input_df()
    gse.train_kmeans(company_info['nclusters'], 'SHIPMENT_COUNT')

    recall_params = classifier_dict[company]['recall_params']
    company_info["Classifier"] = xgb.XGBClassifier(
        random_state=42,
        n_estimators=recall_params['n_estimators'],
        max_depth=recall_params['max_depth'],
        learning_rate=recall_params['learning_rate'],
    )

In [132]:
# For every company add a sibling entry "<company>_f1". It starts as a shallow
# copy of the company's entry (so the GSE object is shared, not duplicated) and
# gets its own classifier built from the parameters stored under 'f1_params'.
for company in companies:
    f1_entry = clusters_dict[company].copy()
    clusters_dict[company+"_f1"] = f1_entry

    f1_params = classifier_dict[company]['f1_params']
    f1_entry["Classifier"] = xgb.XGBClassifier(
        random_state=42,
        n_estimators=f1_params['n_estimators'],
        max_depth=f1_params['max_depth'],
        learning_rate=f1_params['learning_rate'],
    )

## Creation of pipeline class


In [133]:
class pipeline:
    """Classifier + regressor pipeline predicting demand per ``REGION_*`` column.

    The classifier predicts, for every region column, whether there is any
    demand (binary flag); the regressor predicts the demand values. Three
    regressor set-ups are supported:

    * clean   - fitted on the features plus the *true* binary region flags,
    * dirty   - fitted on the features plus the classifier's *predicted* flags,
    * paralel - fitted on the features only; the classifier output is only
      used afterwards to zero out regions predicted as inactive.

    ``get_X_and_Y`` fits the encoder/scaler and creates the train/test split,
    so it has to be called before any ``train_*``, ``score_*`` or
    ``predict_*`` method.
    """

    def __init__(self, classifier, regressor, mode, df, company):
        """Store the models and data.

        Args:
            classifier: estimator with ``fit``/``predict`` used on the binary targets.
            regressor: estimator with ``fit``/``predict`` used on the demand targets
                (may be ``None`` and assigned later via ``self.regressor``).
            mode: stored as-is; not read by any method of this class.
            df: DataFrame holding the feature columns and the ``REGION_*`` targets.
            company: stored as-is; not read by any method of this class.
        """
        self.classifier = classifier
        self.regressor = regressor
        self.mode = mode
        self.df = df
        self.company = company
        self.encoder = OneHotEncoder(sparse_output=False)
        self.scaler = MinMaxScaler(feature_range=(0, 1))
        self.categorical_features = ['LAADPC', 'dayofweekcreation', 'weeknr']
        self.continuous_features = ['PALLETPLAATSEN', 'AANTALORDERS']

    def prepare_X_values(self, X):
        """Transform raw feature rows with the already fitted scaler and encoder.

        Args:
            X: DataFrame containing the categorical and continuous feature columns.

        Returns:
            2-D array: scaled continuous columns first, then the one-hot columns
            (the same layout ``get_X_and_Y`` produces).
        """
        # Fixed: the feature lists are instance attributes; the bare names
        # raised NameError whenever this method was called.
        encoded_categorical_features = self.encoder.transform(X[self.categorical_features])
        scaled_continuous_features = self.scaler.transform(X[self.continuous_features])
        return np.hstack([scaled_continuous_features, encoded_categorical_features])

    def get_X_and_Y(self):
        """Fit encoder/scaler on ``self.df`` and create the train/test split.

        Sets ``X_train``, ``X_test``, ``Y_train``, ``Y_test`` and the binary
        targets ``Y_train_binary`` / ``Y_test_binary`` (1 where demand > 0).
        """
        region_columns = [col for col in self.df.columns if col.startswith('REGION_')]
        df_to_use = self.df

        # Only regions with a non-zero total are kept as targets.
        totals = df_to_use[region_columns].sum()
        non_zero_totals = totals[totals != 0]
        targets = list(non_zero_totals.keys())

        encoded_categorical_features = self.encoder.fit_transform(df_to_use[self.categorical_features])
        scaled_continuous_features = self.scaler.fit_transform(df_to_use[self.continuous_features])

        # Continuous columns come first; get_original_palletplaatsen relies on that.
        X_formatted = np.hstack([scaled_continuous_features, encoded_categorical_features])
        Y = df_to_use[targets]

        self.X_train, self.X_test, self.Y_train, self.Y_test = train_test_split(
            X_formatted, Y, test_size=0.2, random_state=42
        )
        # Vectorised replacement for the deprecated DataFrame.applymap call.
        self.Y_train_binary = (self.Y_train > 0).astype(np.int64)
        self.Y_test_binary = (self.Y_test > 0).astype(np.int64)

    def add_binary_to_X(self, X, Y_binary):
        """Append the binary region flags as extra columns to the feature matrix."""
        return np.hstack([X, Y_binary])

    def get_original_palletplaatsen(self, X_scaled):
        """Recover the unscaled ``PALLETPLAATSEN`` values from a formatted feature matrix.

        Args:
            X_scaled: array whose leading columns are the scaled continuous
                features, in the order of ``self.continuous_features``.

        Returns:
            Series named ``PALLETPLAATSEN`` with a default integer index.
        """
        n_continuous = len(self.continuous_features)
        original_values = self.scaler.inverse_transform(X_scaled[:, :n_continuous])
        original_df = pd.DataFrame(original_values, columns=list(self.continuous_features))
        return original_df["PALLETPLAATSEN"]

    def train_classifier(self):
        """Fit the classifier on the training features and binary targets."""
        self.classifier.fit(self.X_train, self.Y_train_binary)

    def predict_classifier(self, X):
        """Return the classifier's predictions as a DataFrame with the target column names."""
        Y_pred_df = pd.DataFrame(self.classifier.predict(X), columns=self.Y_train_binary.columns)
        return Y_pred_df

    def score_classifier(self):
        """Score the classifier on the test split.

        Accuracy is computed over the flattened label matrix; precision, recall
        and F1 are macro-averaged with ``zero_division=1``.
        """
        Y_pred = self.predict_classifier(self.X_test)
        zerodiv = 1
        accuracy = accuracy_score(self.Y_test_binary.values.flatten(), Y_pred.values.flatten())
        precision = precision_score(self.Y_test_binary, Y_pred, average='macro', zero_division=zerodiv)
        recall = recall_score(self.Y_test_binary, Y_pred, average='macro', zero_division=zerodiv)
        f1 = f1_score(self.Y_test_binary, Y_pred, average='macro', zero_division=zerodiv)

        return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

    def predict_destination(self, X):
        """Predict the binary region flags for raw (untransformed) feature rows."""
        inputvalues = self.prepare_X_values(X)
        return self.classifier.predict(inputvalues)

    def predict_demands(self, X):
        """Predict demand per region for raw feature rows.

        The classifier's predicted flags are appended to the features before
        the regressor is called, and regions predicted inactive are set to 0.
        """
        inputvalues = self.prepare_X_values(X)
        pred_test_Y = self.predict_classifier(inputvalues)
        X = self.add_binary_to_X(inputvalues, pred_test_Y)
        Y_pred = self.regressor.predict(X)
        Y_pred = self.filter_by_binary(Y_pred, pred_test_Y)
        return Y_pred

    def train_clean_regressor(self):
        """Fit the regressor on the features plus the true binary flags."""
        X = self.add_binary_to_X(self.X_train, self.Y_train_binary)
        self.regressor.fit(X, self.Y_train)

    def train_dirty_regressor(self):
        """Fit the regressor on the features plus the classifier's predicted flags."""
        pred_train_Y = self.predict_classifier(self.X_train)
        X = self.add_binary_to_X(self.X_train, pred_train_Y)
        self.regressor.fit(X, self.Y_train)

    def train_paralel_regressor(self):
        """Fit the regressor on the features only."""
        self.regressor.fit(self.X_train, self.Y_train)

    def _regression_scores(self, Y_pred):
        """Return MSE and R2 of ``Y_pred`` against the test targets."""
        mse = mean_squared_error(self.Y_test, Y_pred)
        r2 = r2_score(self.Y_test, Y_pred)
        return {"mean_squared_error": mse, "r2_score": r2}

    def _series_pipeline_prediction(self):
        """Classifier flags -> regressor on features+flags -> inactive regions zeroed.

        Returns the masked prediction as a DataFrame (test split).
        """
        pred_test_Y = self.predict_classifier(self.X_test)
        X = self.add_binary_to_X(self.X_test, pred_test_Y)
        Y_pred = self.regressor.predict(X)
        return self.filter_by_binary(Y_pred, pred_test_Y)

    def _paralel_pipeline_prediction(self):
        """Regressor on features only, masked with the classifier's predicted flags."""
        Y_pred_bin = self.predict_classifier(self.X_test)
        Y_pred = self.regressor.predict(self.X_test)
        return self.filter_by_binary(Y_pred, Y_pred_bin)

    def score_clean_regressor(self):
        """Score the regressor when it is given the true binary flags."""
        X = self.add_binary_to_X(self.X_test, self.Y_test_binary)
        return self._regression_scores(self.regressor.predict(X))

    def score_dirty_regressor(self):
        """Score the regressor when it is given the classifier's predicted flags."""
        pred_test_Y = self.predict_classifier(self.X_test)
        X = self.add_binary_to_X(self.X_test, pred_test_Y)
        return self._regression_scores(self.regressor.predict(X))

    def score_paralel_regressor(self):
        """Score the features-only regressor (no masking)."""
        return self._regression_scores(self.regressor.predict(self.X_test))

    def filter_by_binary(self, df_float, df_binary):
        """Zero every prediction whose binary flag is not 1.

        Args:
            df_float: regressor output (array-like), one column per target.
            df_binary: DataFrame of 0/1 flags with the same shape.

        Raises:
            ValueError: if the two inputs differ in shape.
        """
        df_float = pd.DataFrame(df_float, columns=self.Y_train.columns)
        if df_float.shape != df_binary.shape:
            raise ValueError("The two dataframes must have the same shape.")

        result_df = df_float.where(df_binary == 1, 0)
        return result_df

    def softmax_to_demand(self, df_predicted, given_demand):
        """Redistribute each row's known total demand over its non-zero predictions.

        The non-zero predictions of a row are first rescaled to sum to the
        row's demand, passed through a (max-shifted) softmax, multiplied by the
        demand and rounded. Rows whose predictions sum to 0 are left untouched.

        Note: both arguments have their index reset in place and
        ``df_predicted`` is modified in place and returned.

        Args:
            df_predicted: DataFrame of predicted demand per region.
            given_demand: Series (or one-column DataFrame) with the total
                demand per row, in the same row order.
        """
        df_predicted.reset_index(inplace=True, drop=True)
        given_demand.reset_index(inplace=True, drop=True)
        # Fixed: a blanket .squeeze() turned a one-row Series into a scalar,
        # which then failed on .loc below. Only reduce one-column DataFrames.
        if isinstance(given_demand, pd.DataFrame):
            given_demand = given_demand.iloc[:, 0]

        for row_index in df_predicted.index:
            demand = given_demand.loc[row_index]
            row_values = df_predicted.loc[row_index]

            if row_values.sum() == 0:
                continue  # nothing predicted for this row; avoids division by zero

            non_zero_indices = row_values != 0
            non_zero_values = row_values[non_zero_indices]
            scaling_factor = demand / non_zero_values.sum()

            # Subtracting the max before exp keeps the softmax numerically stable.
            scaled_values = non_zero_values * scaling_factor
            non_zero_adjusted = np.exp(scaled_values - np.max(scaled_values))
            df_predicted.loc[row_index, non_zero_indices] = np.round(
                (non_zero_adjusted / non_zero_adjusted.sum()) * demand
            )

        return df_predicted

    def scale_to_demand(self, df_predicted, given_demand):
        """Scale each row of predictions proportionally so it sums to the known demand.

        Values are rounded after scaling, so the row sum can differ from the
        demand by the rounding error. Rows whose predictions sum to 0 are left
        untouched.

        Note: both arguments have their index reset in place and
        ``df_predicted`` is modified in place and returned.
        """
        df_predicted.reset_index(inplace=True, drop=True)
        given_demand.reset_index(inplace=True, drop=True)
        for row, demand in given_demand.items():
            total_predicted = df_predicted.loc[row].sum()
            if total_predicted == 0:
                continue  # nothing predicted for this row; avoids division by zero
            scaling_factor = demand / total_predicted
            df_predicted.loc[row] = np.round(df_predicted.loc[row] * scaling_factor)
        return df_predicted

    def score_pipeline(self):
        """Score the full series pipeline (predicted flags + masked regression)."""
        return self._regression_scores(self._series_pipeline_prediction())

    def score_pipeline_with_correction(self):
        """Series pipeline, rows rescaled proportionally to the known PALLETPLAATSEN."""
        Y_pred = self._series_pipeline_prediction()
        original_palletplaatsen = self.get_original_palletplaatsen(self.X_test)
        Y_pred = self.scale_to_demand(Y_pred, original_palletplaatsen)
        return self._regression_scores(Y_pred)

    def score_pipeline_with_correction2(self):
        """Series pipeline, rows redistributed with ``softmax_to_demand``."""
        Y_pred = self._series_pipeline_prediction()
        original_palletplaatsen = self.get_original_palletplaatsen(self.X_test)
        Y_pred = self.softmax_to_demand(Y_pred, original_palletplaatsen)
        return self._regression_scores(Y_pred)

    def score_paralel_pipeline(self):
        """Score the features-only regressor masked by the classifier's flags."""
        return self._regression_scores(self._paralel_pipeline_prediction())

    def score_paralel_pipeline_with_correction(self):
        """Paralel pipeline, rows rescaled proportionally to the known PALLETPLAATSEN."""
        Y_pred_filtered = self._paralel_pipeline_prediction()
        original_palletplaatsen = self.get_original_palletplaatsen(self.X_test)
        Y_pred_scaled = self.scale_to_demand(Y_pred_filtered, original_palletplaatsen)
        return self._regression_scores(Y_pred_scaled)

    def score_paralel_pipeline_with_correction2(self):
        """Paralel pipeline, rows redistributed with ``softmax_to_demand``."""
        Y_pred_filtered = self._paralel_pipeline_prediction()
        original_palletplaatsen = self.get_original_palletplaatsen(self.X_test)
        Y_pred_softmax = self.softmax_to_demand(Y_pred_filtered, original_palletplaatsen)
        return self._regression_scores(Y_pred_softmax)

    # Correctly spelled aliases; the original "paralel" names stay available.
    train_parallel_regressor = train_paralel_regressor
    score_parallel_regressor = score_paralel_regressor
    score_parallel_pipeline = score_paralel_pipeline
    score_parallel_pipeline_with_correction = score_paralel_pipeline_with_correction
    score_parallel_pipeline_with_correction2 = score_paralel_pipeline_with_correction2

    
        

## Running pipeline class and finding best parameters

In [None]:
# Grid search over XGBRegressor hyper-parameters for every entry of
# clusters_dict. Per company the classifier is fitted and scored once; then for
# every parameter combination three regressor variants are trained and scored:
#   clean    - trained with the true binary flags,
#   dirty    - trained with the classifier's predicted flags,
#   parallel - trained on the features only.
# All scores end up in results_dict[company][(n_est, max_depth, learn_rate)]
# and the tuple-keyed part is exported to JSON at the end.
import json

from custom.Pipeline import Pipeline

results_dict = {}
for company in clusters_dict:
    results_dict[company] = {}
    print(company)
    print("====================================")
    pipe1 = Pipeline(clusters_dict[company]["Classifier"], None, "test", clusters_dict[company]["GSE"].condense_orders(), company)
    pipe1.get_X_and_Y()
    print("Classifier")
    pipe1.train_classifier()
    results_dict[company]["classifier"] = pipe1.score_classifier()
    print(results_dict[company]["classifier"])

    # Reset the trackers for every company. Starting at -inf (instead of an
    # arbitrary -9999) and at None guarantees that the summary below never
    # reports parameters left over from a previous company or fails on an
    # unbound name.
    best_r2_clean = best_r2_dirty = best_r2_parallel = float("-inf")
    best_params_clean = best_params_dirty = best_params_parallel = None

    for n_est in [1, 20, 50, 100, 200, 300]:
        for max_depth in [1, 3, 5, 10, 20, 50]:
            for learn_rate in [0.01, 0.05, 0.1]:
                paramset = (n_est, max_depth, learn_rate)
                scores = results_dict[company][paramset] = {}

                xgb_regressor = xgb.XGBRegressor(
                    random_state=42,
                    n_estimators=n_est,
                    max_depth=max_depth,
                    learning_rate=learn_rate,
                )
                pipe1.regressor = xgb_regressor

                # Every train_* call refits pipe1.regressor, so the scores
                # must be taken before the next variant is trained.
                pipe1.train_clean_regressor()
                scores["clean_regressor"] = {
                    'clean_input': pipe1.score_clean_regressor(),
                    'dirty_input': pipe1.score_dirty_regressor(),
                    "pipeline": pipe1.score_pipeline(),
                    "pipeline_with_correction": pipe1.score_pipeline_with_correction(),
                    "pipeline_with_correction2": pipe1.score_pipeline_with_correction2(),
                }

                pipe1.train_dirty_regressor()
                scores["dirty_regressor"] = {
                    'clean_input': pipe1.score_clean_regressor(),
                    'dirty_input': pipe1.score_dirty_regressor(),
                    "pipeline": pipe1.score_pipeline(),
                    "pipeline_with_correction": pipe1.score_pipeline_with_correction(),
                    "pipeline_with_correction2": pipe1.score_pipeline_with_correction2(),
                }

                # The parallel entry keeps the stand-alone regressor scores at
                # its top level and nests the pipeline scores next to them.
                pipe1.train_parallel_regressor()
                parallel_scores = pipe1.score_parallel_regressor()
                scores["parallel_regressor"] = parallel_scores
                parallel_scores["pipeline"] = pipe1.score_parallel_pipeline()
                parallel_scores["pipeline_with_correction"] = pipe1.score_parallel_pipeline_with_correction()
                parallel_scores["pipeline_with_correction2"] = pipe1.score_parallel_pipeline_with_correction2()

                # Track the best uncorrected pipeline R2 per variant.
                clean_r2 = scores["clean_regressor"]['pipeline']["r2_score"]
                if clean_r2 > best_r2_clean:
                    best_r2_clean = clean_r2
                    best_params_clean = paramset
                    print(f"Best clean regressor: {best_r2_clean} with parameters: {best_params_clean}")
                dirty_r2 = scores["dirty_regressor"]['pipeline']["r2_score"]
                if dirty_r2 > best_r2_dirty:
                    best_r2_dirty = dirty_r2
                    best_params_dirty = paramset
                    print(f"Best dirty regressor: {best_r2_dirty} with parameters: {best_params_dirty}")
                parallel_r2 = scores["parallel_regressor"]['pipeline']["r2_score"]
                if parallel_r2 > best_r2_parallel:
                    best_r2_parallel = parallel_r2
                    best_params_parallel = paramset
                    print(f"Best parallel regressor: {best_r2_parallel} with parameters: {best_params_parallel}")
    print("====================================")
    print(f"Best clean regressor: {best_r2_clean} with parameters: {best_params_clean}")
    print(f"Best dirty regressor: {best_r2_dirty} with parameters: {best_params_dirty}")
    print(f"Best parallel regressor: {best_r2_parallel} with parameters: {best_params_parallel}")
    print("====================================")

# Export: JSON object keys must be strings, so each (n_est, max_depth,
# learn_rate) tuple becomes "n_est_max_depth_learn_rate". Non-tuple keys (the
# 'classifier' scores) are not part of this export.
final_dict_export = {}
for company in results_dict:
    final_dict_export[company] = {}
    for key in results_dict[company]:
        if isinstance(key, tuple):
            final_dict_export[company]["_".join(str(part) for part in key)] = results_dict[company][key].copy()

with open('results_dict_pipelinecomparison_recall_f1_v3.json', 'w') as file:
    json.dump(final_dict_export, file, indent=4)

#### Export classifier scores to xlsx

In [19]:
# Collect the classifier scores of every company in results_dict into a table.
# Maps each output column to the key it is read from in the 'classifier'
# scores (the 'precission' column is filled from the 'precision' score).
column_to_score_key = {
    'accuracy': 'accuracy',
    'recall': 'recall',
    'precission': 'precision',
    'f1': 'f1',
}

data = {'Fake Company': [], 'Company': []}
data.update({column: [] for column in column_to_score_key})

for company in results_dict:
    data['Fake Company'].append(clusters_dict[company]['fakename'])
    data['Company'].append(company)
    classifier_scores = results_dict[company]['classifier']
    for column, score_key in column_to_score_key.items():
        data[column].append(classifier_scores[score_key])

df = pd.DataFrame(data)

# #export to XLSX
# df.to_excel("Classifier_scores_f1.xlsx")

#### Export model scores

In [19]:
# Build the export copy of results_dict: only the entries keyed by a parameter
# tuple are kept, and each tuple (a, b, c) is renamed to the string "a_b_c".
final_dict_export = {}
for company, company_results in results_dict.items():
    exported = {}
    final_dict_export[company] = exported
    for key, scores in company_results.items():
        if type(key) != tuple:
            continue
        exported[f"{key[0]}_{key[1]}_{key[2]}"] = scores.copy()
    # Drop these two keys if they are present.
    exported.pop('model', None)
    exported.pop('Classifier', None)


# # save final_dict to a json file
# import json
# with open('results_dict_pipelinecomparison_recall_v2.json', 'w') as file:
#     json.dump(final_dict_export, file, indent=4)  # indent=4 is optional for pretty printing


In [None]:
# Read the exported grid-search scores back from disk and show them.
import json

with open('results_dict_pipelinecomparison_recall_f1_v3.json', 'r') as file:
    results_dict = json.loads(file.read())

results_dict

In [257]:

# For every company / model / scored variant, find the parameter set with the
# highest R2 in results_dict, then merge each "<company>_f1" entry into its
# base company and save the result as JSON.
import json


def _keep_if_better(best_entry, candidate_scores, params):
    """Overwrite ``best_entry`` when ``candidate_scores`` has a higher R2.

    The MSE belonging to the winning parameter set is stored as well (it used
    to stay at its placeholder value 0); if the candidate carries no
    'mean_squared_error' the placeholder 0 is kept.
    """
    if candidate_scores["r2_score"] > best_entry["r2_score"]:
        best_entry["r2_score"] = candidate_scores["r2_score"]
        best_entry["MSE"] = candidate_scores.get("mean_squared_error", 0)
        best_entry["params"] = params


# Nested score dicts stored inside the 'parallel_regressor' entry; its own
# top-level scores are reported under the name 'regressor'.
parallel_stages = ('pipeline', 'pipeline_with_correction', 'pipeline_with_correction2')

results_dict_best = {}
for company in results_dict:
    company_best = {}
    results_dict_best[company] = company_best

    for paramset in results_dict[company]:
        # Keys look like "<n_estimators>_<max_depth>_<learning_rate>".
        parts = paramset.split("_")
        paramset_dict = {"n_estimators": int(parts[0]), "max_depth": int(parts[1]), "learning_rate": float(parts[2])}

        for model, model_scores in results_dict[company][paramset].items():
            if model == "parallel_regressor":
                candidates = {'regressor': model_scores}
                for stage in parallel_stages:
                    candidates[stage] = model_scores[stage]
            else:
                candidates = model_scores

            # Entries are created on first sight, so the layout no longer
            # depends on one hard-coded parameter key being present.
            model_best = company_best.setdefault(model, {})
            for variant, variant_scores in candidates.items():
                best_entry = model_best.setdefault(variant, {"r2_score": -999999, "MSE": 0, "params": None})
                _keep_if_better(best_entry, variant_scores, paramset_dict)

# Merge the two classifier flavours of a company into one entry.
final_results_dict_best = {}
for company in results_dict_best:
    # Only a trailing "_f1" marks the F1 variant; a plain substring test would
    # also hit company names that merely contain "f1".
    if company.endswith("_f1"):
        base_company = company[:-3]
        # setdefault makes the merge independent of the iteration order.
        final_results_dict_best.setdefault(base_company, {})["f1_optimised"] = results_dict_best[company]
    else:
        merged = final_results_dict_best.setdefault(company, {})
        merged["recall_optimised"] = results_dict_best[company]
        # NOTE(review): requires classifier_dict[company]['model'] to be an object
        # exposing df_input, i.e. not the plain dict loaded from JSON - confirm.
        merged['volume'] = classifier_dict[company]['model'].df_input['PALLETPLAATSEN'].sum()
        merged['fakename'] = classifier_dict[company]['fakename']
results_dict_best = final_results_dict_best

with open('results_dict_best_v3.json', 'w') as file:
    json.dump(results_dict_best, file, indent=4)

## R2 Scores Parallel Model


In [50]:
# Load the saved best-result summary (results_dict_best_v3.json) from disk.
import json

with open('results_dict_best_v3.json', 'r') as file:
    results_dict_best = json.loads(file.read())

In [None]:
# Order the companies by the second word of their fakename (shorter strings
# first, then alphabetically) and keep the first 25.
# NOTE(review): presumably the second word is a number, so this gives numeric order - confirm.
sorted_companies = sorted(
    results_dict_best.items(),
    key=lambda item: (len(item[1]['fakename'].split()[1]), item[1]['fakename'].split()[1]),
)[:25]

# One row per company: fakename and the best R2 of the stand-alone parallel
# regressor (recall-optimised classifier run).
data = []
for company, details in sorted_companies:
    fakename = details['fakename']
    # Not called `recall_score`: at notebook level that name would overwrite
    # sklearn.metrics.recall_score imported in the first cell and break every
    # later call to it.
    parallel_r2 = details['recall_optimised']['parallel_regressor']['regressor']['r2_score']
    data.append([fakename, parallel_r2])

df = pd.DataFrame(data, columns=['Company', 'Parallel Regressor R2'])

# Single-series bar chart, one bar per company.
fig, ax = plt.subplots(figsize=(12, 7))
width = 0.8  # bar width
x = df.index  # bar positions

rects1 = ax.bar(x, df['Parallel Regressor R2'], width, label='Parallel Regressor')
ax.set_xlabel('Company', fontsize=14)
ax.set_ylabel('R2 Score', fontsize=14)
ax.set_title('R2 Scores by Company for Recall Optimized Parallel Regressor', fontsize=18)
ax.set_xticks(x)
plt.yticks(fontsize=14)
ax.set_xticklabels(df['Company'], rotation=45, fontsize=14)
ax.legend(fontsize=14)
ax.grid(True)

plt.show()

In [None]:
# Plain and volume-weighted mean of the parallel pipeline's best R2
# (F1-optimised classifier run) over all companies.
sorted_companies = sorted(
    results_dict_best.items(),
    key=lambda item: (len(item[1]['fakename'].split()[1]), item[1]['fakename'].split()[1]),
)

data = []
for company, details in sorted_companies:
    data.append([
        details['fakename'],
        details['volume'],
        details['f1_optimised']['parallel_regressor']['pipeline']['r2_score'],
    ])

df = pd.DataFrame(data, columns=['Company', 'Volume', 'Parallel regressor'])

# Everything except the identifying and weighting columns is a score column.
r2_columns = df.drop(columns=['Company', 'Volume'])
volume_weights = df['Volume']

mean_values = r2_columns.mean()
# Weighted mean: sum(score * volume) / sum(volume), per score column.
weighted_means = r2_columns.mul(volume_weights, axis=0).sum() / volume_weights.sum()

print("Mean of each column:")
print(mean_values)
print("\nWeighted mean of each column:")
print(weighted_means)

## R2 Scores Series models with clean input

In [None]:
# Grouped bar chart of the best 'clean_input' R2 for the first 10 companies
# (ordered by the second word of their fakename: shorter first, then alphabetical).
sorted_companies = sorted(
    results_dict_best.items(),
    key=lambda item: (len(item[1]['fakename'].split()[1]), item[1]['fakename'].split()[1]),
)[:10]

# One tuple per bar series:
# (DataFrame column, classifier variant key, regressor key, legend label, bar offset in widths)
bar_series = [
    ('Recall (Clean Regressor)', 'recall_optimised', 'clean_regressor',
     'Clean Regressor; Classifier optimized on Recall', -1.5),
    ('F1 (Clean Regressor)', 'f1_optimised', 'clean_regressor',
     'Clean Regressor; Classifier optimized on F1', -0.5),
    ('Recall (Dirty Regressor)', 'recall_optimised', 'dirty_regressor',
     'Dirty Regressor; Classifier optimized on Recall', 0.5),
    ('F1 (Dirty Regressor)', 'f1_optimised', 'dirty_regressor',
     'Dirty Regressor; Classifier optimized on F1', 1.5),
]

data = []
for company, details in sorted_companies:
    row = [details['fakename']]
    for column, classifier_key, regressor_key, legend_label, offset in bar_series:
        row.append(details[classifier_key][regressor_key]['clean_input']['r2_score'])
    data.append(row)

df = pd.DataFrame(data, columns=['Company'] + [series[0] for series in bar_series])

fig, ax = plt.subplots(figsize=(14, 8))
width = 0.2  # width of a single bar
x = np.arange(len(df))  # centre of each company's bar group

# Four bars per company, placed symmetrically around the group centre.
for column, classifier_key, regressor_key, legend_label, offset in bar_series:
    ax.bar(x + offset * width, df[column], width, label=legend_label)

ax.set_title('R2 Scores per Company with Known Regions', fontsize=22)
ax.set_xlabel('Company', fontsize=16)
ax.set_ylabel('R2 Score', fontsize=16)
ax.set_xticks(x)
ax.set_xticklabels(df['Company'], rotation=45, fontsize=14)
ax.set_ylim(-0.2, 1)
ax.legend(fontsize=16)
ax.grid(True)
plt.yticks(fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Plain and volume-weighted mean of the best 'clean_input' R2 for the four
# classifier/regressor combinations, over all companies.
sorted_companies = sorted(
    results_dict_best.items(),
    key=lambda item: (len(item[1]['fakename'].split()[1]), item[1]['fakename'].split()[1]),
)

# Output column -> (classifier variant key, regressor key) it is read from.
r2_sources = {
    'Recall (Clean Regressor)': ('recall_optimised', 'clean_regressor'),
    'F1 (Clean Regressor)': ('f1_optimised', 'clean_regressor'),
    'Recall (Dirty Regressor)': ('recall_optimised', 'dirty_regressor'),
    'F1 (Dirty Regressor)': ('f1_optimised', 'dirty_regressor'),
}

data = []
for company, details in sorted_companies:
    row = [details['fakename'], details['volume']]
    for classifier_key, regressor_key in r2_sources.values():
        row.append(details[classifier_key][regressor_key]['clean_input']['r2_score'])
    data.append(row)

df = pd.DataFrame(data, columns=['Company', 'Volume', *r2_sources])

# Everything except the identifying and weighting columns is a score column.
r2_columns = df.drop(columns=['Company', 'Volume'])
volume_weights = df['Volume']

mean_values = r2_columns.mean()
# Weighted mean: sum(score * volume) / sum(volume), per score column.
weighted_means = r2_columns.mul(volume_weights, axis=0).sum() / volume_weights.sum()

print("Mean of each column:")
print(mean_values)
print("\nWeighted mean of each column:")
print(weighted_means)

## R2 scores series models with dirty input

In [None]:
# Sort by the second word of the fake name (shorter labels first) and keep the first 10 companies
sorted_companies = sorted(results_dict_best.items(), 
                          key=lambda x: (len(x[1]['fakename'].split()[1]), x[1]['fakename'].split()[1]))[:10]

# Extract the dirty-input R2 score of each regressor/classifier combination
data = []
for company, details in sorted_companies:
    fakename = details['fakename']
    recall_clean = details['recall_optimised']['clean_regressor']['dirty_input']['r2_score']
    f1_clean = details['f1_optimised']['clean_regressor']['dirty_input']['r2_score']
    recall_dirty = details['recall_optimised']['dirty_regressor']['dirty_input']['r2_score']
    f1_dirty = details['f1_optimised']['dirty_regressor']['dirty_input']['r2_score']
    data.append([fakename, recall_clean, f1_clean, recall_dirty, f1_dirty])

# Create a DataFrame
df = pd.DataFrame(data, columns=['Company', 'Recall (Clean Regressor)', 'F1 (Clean Regressor)', 
                                 'Recall (Dirty Regressor)', 'F1 (Dirty Regressor)'])

# Plotting: four bars per company, centred on the tick position
fig, ax = plt.subplots(figsize=(14, 8))
width = 0.2  # the width of the bars
x = np.arange(len(df))  # the label locations

ax.bar(x - width*1.5, df['Recall (Clean Regressor)'], width, label='Clean Regressor; Classifier optimized on Recall')
ax.bar(x - width/2, df['F1 (Clean Regressor)'], width, label='Clean Regressor; Classifier optimized on F1')
ax.bar(x + width/2, df['Recall (Dirty Regressor)'], width, label='Dirty Regressor; Classifier optimized on Recall')
ax.bar(x + width*1.5, df['F1 (Dirty Regressor)'], width, label='Dirty Regressor; Classifier optimized on F1')

# Add labels, title, and custom x-axis tick labels with font sizes
ax.set_xlabel('Company', fontsize=16)
ax.set_ylabel('R2 Score', fontsize=16)
ax.set_title('R2 Scores per Company with Unknown Regions', fontsize=22)
ax.set_xticks(x)
ax.set_xticklabels(df['Company'], rotation=45, fontsize=14)
ax.set_ylim(-0.2, 1)  # bars below -0.2 are clipped by this limit
ax.legend(fontsize=16)
ax.grid(True)
plt.yticks(fontsize=14)
# Lay out and show once; a second tight_layout()/show() pair would create and
# render an additional empty figure.
plt.tight_layout()
plt.show()

In [None]:
def _by_fakename_label(entry):
    """Order (company, details) pairs by the second word of the fake name, shorter labels first."""
    second_word = entry[1]['fakename'].split()[1]
    return (len(second_word), second_word)


sorted_companies = sorted(results_dict_best.items(), key=_by_fakename_label)

# (column name, classifier optimisation target, regressor) in output column order
score_specs = [
    ('Recall (Clean Regressor)', 'recall_optimised', 'clean_regressor'),
    ('F1 (Clean Regressor)', 'f1_optimised', 'clean_regressor'),
    ('Recall (Dirty Regressor)', 'recall_optimised', 'dirty_regressor'),
    ('F1 (Dirty Regressor)', 'f1_optimised', 'dirty_regressor'),
]
score_columns = [name for name, _, _ in score_specs]

# One row per company: fake name, volume, then the dirty-input R2 scores
data = []
for company, details in sorted_companies:
    dirty_scores = [details[target][regressor]['dirty_input']['r2_score']
                    for _, target, regressor in score_specs]
    data.append([details['fakename'], details['volume'], *dirty_scores])

df = pd.DataFrame(data, columns=['Company', 'Volume'] + score_columns)

# Restrict to the score columns (i.e. drop 'Company' and 'Volume')
scores_only = df[score_columns]
volumes = df['Volume']

# Unweighted mean of each score column
mean_values = scores_only.mean()

# Mean of each score column weighted by company volume
weighted_means = scores_only.mul(volumes, axis=0).sum() / volumes.sum()

print("Mean of each column:")
print(mean_values)
print("\nWeighted mean of each column:")
print(weighted_means)

## R2 Scores Pipelines

In [None]:
# Store each company's total pallet places as its 'volume' (used as a weight in the summaries).
for company in results_dict_best.keys():
    company_orders = clusters_dict[company]['model'].df_input
    # Keep only this company's rows; the original computed this filter but discarded the result.
    company_orders = company_orders[company_orders["OPDRACHTGEVERNAAM"] == company]

    results_dict_best[company]['volume'] = company_orders["PALLETPLAATSEN"].sum()

# Sort by the second word of the fake name (shorter labels first) and keep the first 10 companies
sorted_companies = sorted(results_dict_best.items(), 
                          key=lambda x: (len(x[1]['fakename'].split()[1]), x[1]['fakename'].split()[1]))[:10]

# Extract the pipeline R2 score of every regressor/classifier combination
data = []
for company, details in sorted_companies:
    fakename = details['fakename']
    volume = details['volume']
    recall_clean = details['recall_optimised']['clean_regressor']['pipeline']['r2_score']
    f1_clean = details['f1_optimised']['clean_regressor']['pipeline']['r2_score']
    recall_dirty = details['recall_optimised']['dirty_regressor']['pipeline']['r2_score']
    f1_dirty = details['f1_optimised']['dirty_regressor']['pipeline']['r2_score']
    recall_parallel = details['recall_optimised']['parallel_regressor']['pipeline']['r2_score']
    f1_parallel = details['f1_optimised']['parallel_regressor']['pipeline']['r2_score']
    data.append([fakename, volume, recall_clean, f1_clean, recall_dirty, f1_dirty, recall_parallel, f1_parallel])

# Create a DataFrame
df = pd.DataFrame(data, columns=['Company', 'Volume','Recall (Clean Regressor)', 'F1 (Clean Regressor)', 
                                 'Recall (Dirty Regressor)', 'F1 (Dirty Regressor)',
                                 'Recall (Parallel Regressor)', 'F1 (Parallel Regressor)'])

# Plotting
fig, ax = plt.subplots(figsize=(14, 8))
width = 0.15  # the width of the bars
x = np.arange(len(df))  # the label locations

# (bar offset in units of `width`, column, legend label)
bar_specs = [
    (-2, 'Recall (Clean Regressor)', 'Series pipeline with clean regressor; Recall Optimized'),
    (-1, 'F1 (Clean Regressor)', 'Series pipeline with clean regressor; F1 Optimized'),
    (0, 'Recall (Dirty Regressor)', 'Series pipeline with dirty regressor; Recall Optimized'),
    (1, 'F1 (Dirty Regressor)', 'Series pipeline with dirty regressor; F1 Optimized'),
    (2, 'Recall (Parallel Regressor)', 'Parallel pipeline; Recall Optimized'),
    (3, 'F1 (Parallel Regressor)', 'Parallel pipeline; F1 Optimized'),
]
for offset, column, label in bar_specs:
    ax.bar(x + offset * width, df[column], width, label=label)

# Add some text for labels, title, and custom x-axis tick labels, etc.
ax.set_xlabel('Company', fontsize=16)
ax.set_ylabel('R2 Score', fontsize=16)
ax.set_title('R2 Scores per Company with Different Regressor Configurations', fontsize=20)
# Bars are centred at offsets -2..3, so the middle of each group is x + width/2.
ax.set_xticks(x + width / 2)
ax.set_xticklabels(df['Company'], rotation=45, fontsize=14)
plt.yticks(fontsize=14)
ax.legend(loc='lower left', fontsize=16)
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
def _fakename_order(pair):
    """Sort key for (company, details) pairs: second word of the fake name, by length then text."""
    tag = pair[1]['fakename'].split()[1]
    return (len(tag), tag)


sorted_companies = sorted(results_dict_best.items(), key=_fakename_order)

# Regressor key -> label used inside the column names; each gets a Recall and an F1 column.
regressor_labels = [
    ('clean_regressor', 'Clean Regressor'),
    ('dirty_regressor', 'Dirty Regressor'),
    ('parallel_regressor', 'Parallel Regressor'),
]
targets = [('recall_optimised', 'Recall'), ('f1_optimised', 'F1')]

# Column order: Recall/F1 for clean, then dirty, then parallel.
score_columns = [f'{target_label} ({regressor_label})'
                 for _, regressor_label in regressor_labels
                 for _, target_label in targets]

# One row per company: fake name, volume, then the pipeline R2 scores in column order.
data = []
for company, details in sorted_companies:
    pipeline_scores = [details[target_key][regressor_key]['pipeline']['r2_score']
                       for regressor_key, _ in regressor_labels
                       for target_key, _ in targets]
    data.append([details['fakename'], details['volume']] + pipeline_scores)

df = pd.DataFrame(data, columns=['Company', 'Volume'] + score_columns)

# Everything except 'Company' and 'Volume'.
score_table = df[score_columns]

# Plain per-column mean.
mean_values = score_table.mean()

# Per-column mean weighted by company volume.
weighted_means = score_table.mul(df['Volume'], axis=0).sum() / df['Volume'].sum()

print("Mean of each column:")
print(mean_values)
print("\nWeighted mean of each column:")
print(weighted_means)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Uses the `df` currently in the kernel; it must have a 'Volume' column and a
# 'Recall (Parallel Regressor)' column (R2 of the parallel pipeline with the
# recall-optimised classifier) — presumably the table built in the preceding cell.

# For each R2 threshold, the share of total volume belonging to companies scoring above it
x = np.arange(0.0, 1.0, 0.01)
total_volume = df["Volume"].sum()  # loop-invariant, computed once
y = [df.loc[df["Recall (Parallel Regressor)"] > threshold, "Volume"].sum() / total_volume
     for threshold in x]

# Creating the line graph
plt.figure(figsize=(10, 6))
plt.plot(x, y, marker='')
# The thresholded quantity is an R2 score, not a recall value.
plt.title('Volume Proportion vs. R2 Threshold')
plt.xlabel('R2 Threshold')
plt.ylabel('Proportion of Total Volume')
plt.grid(True)
plt.show()

In [None]:
# Summary statistics of whichever `df` is currently in the kernel — presumably the
# per-company R2 table from the preceding cells (depends on execution order; verify).
df.describe()

## Pipeline comparison

In [None]:
def _fakename_key(item):
    """Sort key for (company, details) pairs: second word of the fake name, shorter labels first."""
    label = item[1]['fakename'].split()[1]
    return (len(label), label)


# First 10 companies in fake-name order
sorted_companies = sorted(results_dict_best.items(), key=_fakename_key)[:10]

# Result keys of the dirty regressor to compare, and the classifier targets they are read for
pipeline_variants = ['pipeline', 'pipeline_with_correction', 'pipeline_with_correction2']
classifier_targets = ['recall_optimised', 'f1_optimised']

# One row per company: fake name, three recall-optimised scores, three F1-optimised scores
data = []
for company, details in sorted_companies:
    row = [details['fakename']]
    for target in classifier_targets:
        dirty_results = details[target]['dirty_regressor']
        row.extend(dirty_results[variant]['r2_score'] for variant in pipeline_variants)
    data.append(row)

# Column names double as legend labels below
series_columns = [
    'Recall - Pipeline', 'Recall - Pipeline Correction', 'Recall - Pipeline Correction2',
    'F1 - Pipeline', 'F1 - Pipeline Correction', 'F1 - Pipeline Correction2',
]
df = pd.DataFrame(data, columns=['Company'] + series_columns)

# Plotting
fig, ax = plt.subplots(figsize=(14, 8))
width = 0.12  # the width of the bars
x = np.arange(len(df))  # the label locations

# Bars sit at offsets -1..4 (in units of `width`) around each label location
for position, column in enumerate(series_columns, start=-1):
    ax.bar(x + position * width, df[column], width, label=column)

ax.set_xlabel('Company')
ax.set_ylabel('R2 Score')
ax.set_title('R2 Scores by Company and Configuration for Dirty Regressor')
ax.set_xticks(x + 1.5*width)  # middle of the six-bar group
ax.set_xticklabels(df['Company'], rotation=45)
ax.legend()

plt.show()

## Export final parameters

In [None]:
import json

# Classifier hyper-parameters (recall-optimised) per company
with open('class_parameters_recall.json', 'r') as file:
    class_parameters = json.load(file)

# Best regressor results per company
with open('results_dict_best_v3.json', 'r') as file:
    regg_parameters = json.load(file)

# Merge both sources into one entry per company, casting the hyper-parameters to plain numbers
final_pipeline_parameters = {}
for company in class_parameters:
    classifier_entry = class_parameters[company]
    classifier_params = classifier_entry['params']
    # Regressor parameters are taken from the recall-optimised parallel pipeline
    regressor_params = regg_parameters[company]['recall_optimised']['parallel_regressor']['pipeline']['params']

    final_pipeline_parameters[company] = {
        'fakename': classifier_entry['fakename'],
        'n_clusters': classifier_entry['nclusters'],
        'class_parameters': {
            'learning_rate': float(classifier_params['learning_rate']),
            'max_depth': int(classifier_params['max_depth']),
            'n_estimators': int(classifier_params['n_estimators']),
        },
        'regressor_parameters': {
            'learning_rate': float(regressor_params['learning_rate']),
            'max_depth': int(regressor_params['max_depth']),
            'n_estimators': int(regressor_params['n_estimators']),
        },
    }

# Persist the merged parameters (pretty-printed) and show them as the cell output
with open('final_pipeline_parameters.json', 'w') as file:
    json.dump(final_pipeline_parameters, file, indent=4)
final_pipeline_parameters

In [None]:
# Inspect one regressor entry. `company` is whatever value a previous loop left in
# the kernel — presumably the last company of the export loop; verify execution order.
regg_parameters[company]['recall_optimised']['parallel_regressor']['pipeline']

## Pipeline

#### Print regressor pipeline scores 

In [None]:
# Output label -> key of the regressor inside each parameter-set entry.
# NOTE(review): other cells in this notebook read "paralel_regressor" (one 'l');
# confirm which spelling `results_dict` actually uses.
regressor_keys = {
    'clean': 'clean_regressor',
    'dirty': 'dirty_regressor',
    'parallel': 'parallel_regressor',
}

# For every company, print the best pipeline R2 per regressor and the parameter set that achieved it.
for company in results_dict:
    print(company)
    print("====================================")
    # Reset per company so a company without parameter sets reports None
    # instead of a previous company's parameters (or raising NameError).
    best = {label: (-9999, None) for label in regressor_keys}
    for paramset in results_dict[company]:
        if paramset == "classifier":  # classifier entry, not a regressor parameter set
            continue
        for label, key in regressor_keys.items():
            r2 = results_dict[company][paramset][key]['pipeline']["r2_score"]
            if r2 > best[label][0]:
                best[label] = (r2, paramset)
    for label, (best_r2, best_params) in best.items():
        print(f"Best {label} regressor: {best_r2} with parameters: {best_params}")
    

### Single Regressor scores

In [None]:
# Output column -> key of the regressor inside each parameter-set entry
regressor_columns = {
    'Clean Regressor': 'clean_regressor',
    'Dirty Regressor': 'dirty_regressor',
    'Parallel Regressor': 'paralel_regressor',
}

# Column-wise storage: real name, fake name, then one best-R2 column per regressor
data = {'Real Company': [], 'Company': []}
data.update({column: [] for column in regressor_columns})

for company, metrics in results_dict.items():
    # Every key except the classifier entry is a regressor parameter set
    paramsets = [name for name in metrics if name != "classifier"]

    # Best R2 over all parameter sets; -9999 is the floor when nothing beats it
    best_scores = {
        column: max([-9999, *(metrics[name][key]["r2_score"] for name in paramsets)])
        for column, key in regressor_columns.items()
    }

    data['Real Company'].append(company)
    data['Company'].append(clusters_dict[company]['fakename'])
    for column, best in best_scores.items():
        data[column].append(best)

# Convert to DataFrame
df_scores = pd.DataFrame(data)
def plot_grouped_bars(dataframe, chunk_size):
    """Draw one grouped bar chart per `chunk_size` rows of `dataframe`.

    Parameters:
        dataframe: DataFrame with a 'Company' column (used as the x axis); the
            remaining columns are handed to pandas' bar plot.
        chunk_size: number of companies per figure.

    Each figure is shown immediately; nothing is returned.
    """
    total = len(dataframe)
    for start in range(0, total, chunk_size):
        # Clamp so the title of the last, possibly shorter, chunk names the real last row.
        stop = min(start + chunk_size, total)
        chunk = dataframe.iloc[start:stop]
        ax = chunk.plot(kind='bar', x='Company', figsize=(12, 8),
                        title=f'R2 Scores for Regressor Models for Companies {start + 1} to {stop}')
        ax.set_xlabel('Company')
        ax.set_ylabel('R2 Score')
        ax.grid(True)
        plt.legend(title='Regressor Type')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# Plotting in groups of 10
plot_grouped_bars(df_scores, 10)

In [None]:
# Mean best-R2 per regressor type across all companies
for regressor_label in ('Clean Regressor', 'Dirty Regressor', 'Parallel Regressor'):
    print(f"Mean scores for {regressor_label}: {df_scores[regressor_label].mean()}")

In [179]:
# Export to XLSX (the DataFrame index is written as the first column)
df_scores.to_excel("Regressor_scores.xlsx")
# Keep the frame as the last expression so the notebook actually renders it;
# a bare expression before another statement is never displayed.
df_scores

### Pipeline Regressor Scores


#### Clean vs dirty trained regressor with clean input vs dirty input

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Column-wise storage: fake company name and its best parallel-regressor R2
data = {
    'Company': [],
    'Parallel (Recall)': [],
    }

# Define a variable to select the metric type
metric_type = None  # not read in this cell; kept so the kernel state is unchanged

# Best R2 of the parallel regressor per company.
# The F1 results are not needed here, so they are no longer looked up; the
# previous unused `results_dict_f1[company]` access could only add a failure mode.
for company, metrics_recall in results_dict_recall.items():
    parallel_scores = (
        metrics_recall[paramset]["paralel_regressor"]["r2_score"]
        for paramset in metrics_recall
        if paramset != "classifier"  # classifier entry is not a regressor parameter set
    )
    # -9999 is the floor reported when no parameter set beats it
    best_r2_parallel_recall = max([-9999, *parallel_scores])

    data['Company'].append(clusters_dict[company]['fakename'])
    data['Parallel (Recall)'].append(best_r2_parallel_recall)

# Convert to DataFrame
df_scores = pd.DataFrame(data)

def plot_grouped_bars(dataframe, chunk_size, title=None, max_plots=None):
    """Draw one grouped bar chart per `chunk_size` rows of `dataframe`.

    Parameters:
        dataframe: DataFrame with a 'Company' column (used as the x axis); the
            remaining columns are handed to pandas' bar plot.
        chunk_size: number of companies per figure.
        title: fixed title for every figure; when None each figure gets a
            title naming its own company range.
        max_plots: stop after this many figures; None plots every chunk.

    Each figure is shown immediately; nothing is returned.
    """
    total = len(dataframe)
    for plot_index, start in enumerate(range(0, total, chunk_size)):
        if max_plots is not None and plot_index >= max_plots:
            break
        # Clamp so the last, possibly shorter, chunk is labelled with the real last row.
        stop = min(start + chunk_size, total)
        chunk = dataframe.iloc[start:stop]
        # Build the default title per chunk; assigning it back to `title` would make
        # every later figure reuse the first chunk's range.
        if title is None:
            chunk_title = f'R2 Scores for regressor for Companies {start + 1} to {stop}'
        else:
            chunk_title = title
        ax = chunk.plot(kind='bar', x='Company', figsize=(12, 8), title=chunk_title)
        ax.set_xlabel('Company')
        ax.set_ylabel('R2 Score')
        ax.grid(True)
        ax.set_ylim(-1, 1)  # bars below -1 are clipped by this limit
        plt.legend(title='Regressor Type')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# Mean best R2 over all companies, then over the companies with a non-negative score only
print(f"Parallel Regressor: {df_scores['Parallel (Recall)'].mean()}")
print(f"Parallel Regressor score non negative: {df_scores[df_scores['Parallel (Recall)'] >= 0]['Parallel (Recall)'].mean()}")

# Plot in chunks of 26 companies, at most 9 figures, all with the same fixed title
plot_grouped_bars(df_scores[['Company', 'Parallel (Recall)']], 26,
                  "R2 score for companies A thru Z for parallel regressor", 9)

In [None]:
class Predictor:
    """Per-company registry of pipelines and trained GeoSpatialEncoders over one orders table.

    NOTE(review): this class is a work in progress — `predict_order_row` and
    `create_d_orders_from_a_row` do not produce a result yet.
    """

    def __init__(self, df_orders, pc=None):
        """Store the orders table and the postcode helper.

        Parameters:
            df_orders: orders table; `add_GSE` filters it on 'OPDRACHTGEVERNAAM'.
            pc: PC instance handed to every GeoSpatialEncoder; a new PC() is
                created when omitted.
        """
        # Always keep the helper on the instance. Previously a freshly created PC
        # was only bound to a local, so `self.pc` did not exist when `pc` was omitted.
        self.pc = PC() if pc is None else pc
        self.pipelines_list = {}  # company -> registered pipeline parameters
        self.GSE_list = {}        # company -> trained GeoSpatialEncoder
        self.df_orders = df_orders

    def add_pipeline(self, company, parameters):
        """Register `parameters` as the pipeline entry for `company`."""
        # The original assigned an undefined name `pipeline`, which raised NameError.
        # NOTE(review): storing the parameters as-is; building a pipeline object from
        # them may be the eventual intent — confirm.
        self.pipelines_list[company] = parameters

    def add_GSE(self, company, n_clusters):
        """Create, clean and k-means-train a GeoSpatialEncoder on `company`'s orders."""
        df_temp = self.df_orders[self.df_orders['OPDRACHTGEVERNAAM'] == company]
        encoder = GeoSpatialEncoder(self.pc)
        self.GSE_list[company] = encoder
        encoder.set_verbose(False)
        encoder.set_input_df(df_temp)
        encoder.clean_input_df()
        encoder.train_kmeans(n_clusters, 'SHIPMENT_COUNT')

    def predict_order_row(self, row):
        """Placeholder: only reads the company of `row`; returns None."""
        # Generate input row
        comp = row["OPDRACHTGEVERNAAM"]

    def create_d_orders_from_a_row(self, df):
        """Placeholder: not implemented yet."""
        # Generate input row
        pass




    

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# (output column, classifier target, regressor key, input key), in output column order
score_specs = [
    ('Clean Regressor Clean Input (Recall)', 'recall', 'clean_regressor', 'clean_input'),
    ('Dirty Regressor Clean Input (Recall)', 'recall', 'dirty_regressor', 'clean_input'),
    ('Clean Regressor Dirty Input (Recall)', 'recall', 'clean_regressor', 'dirty_input'),
    ('Dirty Regressor Dirty Input (Recall)', 'recall', 'dirty_regressor', 'dirty_input'),
    ('Clean Regressor Clean Input (F1)', 'f1', 'clean_regressor', 'clean_input'),
    ('Dirty Regressor Clean Input (F1)', 'f1', 'dirty_regressor', 'clean_input'),
    ('Clean Regressor Dirty Input (F1)', 'f1', 'clean_regressor', 'dirty_input'),
    ('Dirty Regressor Dirty Input (F1)', 'f1', 'dirty_regressor', 'dirty_input'),
]

# Column-wise storage: fake company name plus one best-R2 column per spec
data = {'Company': []}
data.update({column: [] for column, _, _, _ in score_specs})

# Define a variable to select the metric type
metric_type = None  # You can change this to compare various types

for company, metrics_recall in results_dict_recall.items():
    metrics_f1 = results_dict_f1[company]
    results_by_target = {'recall': metrics_recall, 'f1': metrics_f1}

    # Parameter sets are enumerated from the recall results; the same names are
    # looked up in the F1 results. The classifier entry is skipped.
    paramsets = [name for name in metrics_recall if name != "classifier"]

    # Best R2 over all parameter sets per column; -9999 is the floor when nothing beats it
    best_by_column = {}
    for column, target, regressor, input_kind in score_specs:
        candidates = (results_by_target[target][name][regressor][input_kind]["r2_score"]
                      for name in paramsets)
        best_by_column[column] = max([-9999, *candidates])

    data['Company'].append(clusters_dict[company]['fakename'])
    for column, best in best_by_column.items():
        data[column].append(best)

# Convert to DataFrame
df_scores = pd.DataFrame(data)

def plot_grouped_bars(dataframe, chunk_size, title=None, max_plots=None):
    """Plot `dataframe` as grouped bar charts, `chunk_size` companies per figure.

    Parameters:
        dataframe: DataFrame with a 'Company' column (x axis) and score columns.
        chunk_size: number of rows shown in each figure.
        title: title applied to every figure; if None, a per-figure title with
            the company range is generated.
        max_plots: maximum number of figures to draw (None = no limit).

    Figures are displayed as a side effect; the function returns None.
    """
    n_rows = len(dataframe)
    for figure_no, first in enumerate(range(0, n_rows, chunk_size)):
        if max_plots is not None and figure_no >= max_plots:
            break
        # Do not run past the last row, so the generated title names the true range.
        last = min(first + chunk_size, n_rows)
        chunk = dataframe.iloc[first:last]
        # The default title is computed for each chunk separately; overwriting `title`
        # itself would freeze the first chunk's range for all following figures.
        figure_title = title
        if figure_title is None:
            figure_title = f'R2 Scores for regressor for Companies {first + 1} to {last}'
        ax = chunk.plot(kind='bar', x='Company', figsize=(12, 8), title=figure_title)
        ax.set_xlabel('Company')
        ax.set_ylabel('R2 Score')
        ax.grid(True)
        ax.set_ylim(-1, 1)  # bars below -1 are clipped by this limit
        plt.legend(title='Regressor Type')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# Mean best-R2 per configuration across all companies (label equals the column name)
score_columns = [
    'Clean Regressor Clean Input (Recall)',
    'Dirty Regressor Clean Input (Recall)',
    'Clean Regressor Dirty Input (Recall)',
    'Dirty Regressor Dirty Input (Recall)',
    'Clean Regressor Clean Input (F1)',
    'Dirty Regressor Clean Input (F1)',
    'Clean Regressor Dirty Input (F1)',
    'Dirty Regressor Dirty Input (F1)',
]
for score_column in score_columns:
    print(f"{score_column}: {df_scores[score_column].mean()}")

# First figure: scores on clean input, first chunk of 10 companies only
clean_input_columns = [
    'Company',
    'Clean Regressor Clean Input (Recall)', 'Dirty Regressor Clean Input (Recall)',
    'Clean Regressor Clean Input (F1)', 'Dirty Regressor Clean Input (F1)',
]
plot_grouped_bars(df_scores[clean_input_columns], 10,
                  "R2 score for companies A thru J with known testing regions", 1)

# Second figure: scores on dirty input, first chunk of 10 companies only
dirty_input_columns = [
    'Company',
    'Clean Regressor Dirty Input (Recall)', 'Dirty Regressor Dirty Input (Recall)',
    'Clean Regressor Dirty Input (F1)', 'Dirty Regressor Dirty Input (F1)',
]
plot_grouped_bars(df_scores[dirty_input_columns], 10,
                  "R2 score for companies A thru J with predicted testing regions", 1)


#### Pipeline scores


In [None]:
# Output column -> key of the regressor inside each parameter-set entry
pipeline_columns = {
    'Clean Regressor': 'clean_regressor',
    'Dirty Regressor': 'dirty_regressor',
    'Parallel Regressor': 'paralel_regressor',
}

# Column-wise storage: company name plus one best pipeline-R2 column per regressor
data = {'Company': []}
data.update({column: [] for column in pipeline_columns})

for company, metrics in results_dict.items():
    # Every key except the classifier entry is a regressor parameter set
    paramsets = [name for name in metrics if name != "classifier"]

    # Best pipeline R2 over all parameter sets; -9999 is the floor when nothing beats it
    best_scores = {}
    for column, key in pipeline_columns.items():
        candidates = (metrics[name][key]['pipeline']["r2_score"] for name in paramsets)
        best_scores[column] = max([-9999, *candidates])

    data['Company'].append(company)
    for column, best in best_scores.items():
        data[column].append(best)

# Convert to DataFrame
df_scores = pd.DataFrame(data)
def plot_grouped_bars(dataframe, chunk_size):
    """Show the pipeline R2 scores as grouped bar charts, `chunk_size` companies per figure.

    Parameters:
        dataframe: DataFrame with a 'Company' column (x axis) and score columns.
        chunk_size: number of rows shown in each figure.

    Figures are displayed as a side effect; the function returns None.
    """
    n_rows = len(dataframe)
    for first in range(0, n_rows, chunk_size):
        # Clamp the end so the last, possibly shorter, chunk gets an accurate title.
        last = min(first + chunk_size, n_rows)
        chunk = dataframe.iloc[first:last]
        ax = chunk.plot(kind='bar', x='Company', figsize=(12, 8),
                        title=f'R2 Scores for Pipeline for Companies {first + 1} to {last}')
        ax.set_xlabel('Company')
        ax.set_ylabel('R2 Score')
        ax.grid(True)
        plt.legend(title='Regressor Type')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# Plotting in groups of 10
plot_grouped_bars(df_scores, 10)

#### Compare F1 with recall

In [None]:
# Display whichever `df_scores` is currently in the kernel (depends on which cell ran last)
df_scores

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# (output column, classifier target, regressor key), in output column order
score_specs = [
    ('Clean Regressor (Recall)', 'recall', 'clean_regressor'),
    ('Dirty Regressor (Recall)', 'recall', 'dirty_regressor'),
    ('Parallel Regressor (Recall)', 'recall', 'paralel_regressor'),
    ('Clean Regressor (F1)', 'f1', 'clean_regressor'),
    ('Dirty Regressor (F1)', 'f1', 'dirty_regressor'),
    ('Parallel Regressor (F1)', 'f1', 'paralel_regressor'),
]

# Column-wise storage: company, its total pallet places, then one best-R2 column per spec
data = {'Company': [], 'PALLETPLAATSEN': []}
data.update({column: [] for column, _, _ in score_specs})

# Define a variable to select the metric type
metric_type = 'pipeline'  # You can change this to compare various types

for company, metrics_recall in results_dict_recall.items():
    # NOTE(review): the F1 side is read from `results_dict`, whereas other cells use
    # `results_dict_f1` — confirm `results_dict` holds the F1-optimised results.
    metrics_f1 = results_dict[company]
    results_by_target = {'recall': metrics_recall, 'f1': metrics_f1}

    # Parameter sets come from the recall results; the classifier entry is skipped
    paramsets = [name for name in metrics_recall if name != "classifier"]

    # Best R2 over all parameter sets per column; -9999 is the floor when nothing beats it
    best_by_column = {}
    for column, target, regressor in score_specs:
        candidates = (results_by_target[target][name][regressor][metric_type]["r2_score"]
                      for name in paramsets)
        best_by_column[column] = max([-9999, *candidates])

    data['Company'].append(company)
    data['PALLETPLAATSEN'].append(df_orders[df_orders['OPDRACHTGEVERNAAM'] == company]['PALLETPLAATSEN'].sum())
    for column, best in best_by_column.items():
        data[column].append(best)

# Convert to DataFrame
df_scores = pd.DataFrame(data)

# Define a function to plot in chunks
def plot_grouped_bars(dataframe, chunk_size):
    """Draw grouped bar charts of R2 scores, ``chunk_size`` companies per figure.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs a 'Company' column (used as the x axis) and a 'PALLETPLAATSEN'
        column, which is dropped before plotting. The remaining columns are
        handed to ``DataFrame.plot(kind='bar')``.
    chunk_size : int
        Maximum number of rows (companies) shown in one figure.

    Notes
    -----
    The figure title reads the notebook-level variable ``metric_type``.
    Each figure is shown immediately with ``plt.show()``.
    """
    # drop() already returns a new frame, so the caller's frame is left untouched.
    dataframe = dataframe.drop(columns='PALLETPLAATSEN')
    total_rows = len(dataframe)
    chunks = total_rows // chunk_size + (1 if total_rows % chunk_size > 0 else 0)

    for i in range(chunks):
        start = i * chunk_size
        # Cap the upper bound so the last figure's title reports the companies actually shown.
        stop = min(start + chunk_size, total_rows)
        chunk = dataframe.iloc[start:stop]
        ax = chunk.plot(kind='bar', x='Company', figsize=(12, 8), title=f'R2 Scores for {metric_type.capitalize()} for Companies {start + 1} to {stop}')
        ax.set_xlabel('Company')
        ax.set_ylabel('R2 Score')
        ax.grid(True)
        plt.legend(title='Regressor Type')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()


# Score columns summarised below, in the order they are reported.
score_columns = [
    'Clean Regressor (Recall)',
    'Dirty Regressor (Recall)',
    'Parallel Regressor (Recall)',
    'Clean Regressor (F1)',
    'Dirty Regressor (F1)',
    'Parallel Regressor (F1)',
]

print("====================================")
print("Normal")
print("Taking all values")
for column in score_columns:
    print(f"Mean scores for {column}: {df_scores[column].mean()}")

# Same means, but each column only averages the companies whose score is >= 0.
print("Taking non negative values")
for column in score_columns:
    non_negative_scores = df_scores.loc[df_scores[column] >= 0, column]
    print(f"Mean scores for {column}: {non_negative_scores.mean()}")
print("====================================")

# Weight every company's score by its share of the total PALLETPLAATSEN;
# summing a weighted column then gives the pallet-weighted mean score.
total_pallets = df_scores['PALLETPLAATSEN'].sum()
for column in score_columns:
    df_scores[f"Weighed {column}"] = df_scores[column] * df_scores['PALLETPLAATSEN'] / total_pallets

print("Weighing each companies score by the amount of palletplaatsen")
print("Taking all values")
for column in score_columns:
    weighted_column = f"Weighed {column}"
    print(f"Mean scores for {column}: {df_scores[weighted_column].sum()}")
print("====================================")


# Plotting in groups of 10 (the helper columns with "Weighed" in their name are left out of the charts)
plot_grouped_bars(df_scores[[column for column in df_scores.columns if "Weighed" not in column]], 10)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Column-oriented accumulator: every list receives one entry per company.
data = {
    'Company': [],
    'Clean Regressor (Recall)': [],
    'Dirty Regressor (Recall)': [],
    'Parallel Regressor (Recall)': [],
    'Clean Regressor (F1)': [],
    'Dirty Regressor (F1)': [],
    'Parallel Regressor (F1)': []
}

# Define a variable to select the metric type
# (not read by the extraction below, which always uses the 'dirty_input' entry)
metric_type = None  # You can change this to compare various types

# For each company keep the highest R2 found over all parameter sets, per regressor and per results dict.
for company, metrics_recall in results_dict_recall.items():
    metrics_f1 = results_dict_f1[company]

    # Best score seen so far for every score column; -9999 stays in place when nothing beats it.
    best = {column: -9999 for column in data if column != 'Company'}

    for paramset in metrics_recall:
        # The "classifier" entry is skipped; only the other keys are treated as parameter sets.
        if paramset == "classifier":
            continue

        for label, metrics in (('Recall', metrics_recall), ('F1', metrics_f1)):
            entry = metrics[paramset]
            candidates = {
                f'Clean Regressor ({label})': entry["clean_regressor"]['dirty_input']["r2_score"],
                f'Dirty Regressor ({label})': entry["dirty_regressor"]['dirty_input']["r2_score"],
                # The parallel regressor's r2_score is read directly, without a 'dirty_input' level.
                f'Parallel Regressor ({label})': entry["paralel_regressor"]["r2_score"],
            }
            for column, score in candidates.items():
                if score > best[column]:
                    best[column] = score

    data['Company'].append(company)
    for column, score in best.items():
        data[column].append(score)

# Convert to DataFrame
df_scores = pd.DataFrame(data)

# Define a function to plot in chunks
def plot_grouped_bars(dataframe, chunk_size):
    """Draw grouped bar charts of R2 scores, ``chunk_size`` companies per figure.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs a 'Company' column, which is used as the x axis; the other
        columns are handed to ``DataFrame.plot(kind='bar')``.
    chunk_size : int
        Maximum number of rows (companies) shown in one figure.

    Notes
    -----
    Every figure uses a fixed y range of -1 to 1 and is shown immediately
    with ``plt.show()``.
    """
    total_rows = len(dataframe)
    chunks = total_rows // chunk_size + (1 if total_rows % chunk_size > 0 else 0)

    for i in range(chunks):
        start = i * chunk_size
        # Cap the upper bound so the last figure's title reports the companies actually shown.
        stop = min(start + chunk_size, total_rows)
        chunk = dataframe.iloc[start:stop]
        ax = chunk.plot(kind='bar', x='Company', figsize=(12, 8), title=f'R2 Scores for regressor for Companies {start + 1} to {stop}')
        ax.set_xlabel('Company')
        ax.set_ylabel('R2 Score')
        ax.grid(True)
        ax.set_ylim(-1, 1)
        plt.legend(title='Regressor Type')
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

# Only the clean and dirty regressor columns are summarised and plotted here.
reported_columns = [
    'Clean Regressor (Recall)',
    'Dirty Regressor (Recall)',
    'Clean Regressor (F1)',
    'Dirty Regressor (F1)',
]
for column in reported_columns:
    print(f"Mean R2 scores for {column}: {df_scores[column].mean()}")

# Plotting in groups of 10
plot_grouped_bars(df_scores[['Company'] + reported_columns], 10)



#### Continue


In [181]:
# Export the current df_scores table to "Pipeline_scores.xlsx" (relative path, so it lands in the working directory).
# NOTE(review): this cell's execution count (181) is out of sequence with its neighbours, so it is unclear
# which version of df_scores was exported — confirm before relying on the file.
df_scores.to_excel("Pipeline_scores.xlsx")

In [None]:
# NOTE(review): the df_scores built earlier in this section has columns such as 'Clean Regressor (Recall)';
# presumably this cell expects a df_scores with plain 'Clean Regressor' columns from another cell — confirm.
for regressor in ('Clean Regressor', 'Dirty Regressor', 'Parallel Regressor'):
    print(f"Mean scores for {regressor} Pipeline: {df_scores[regressor].mean()}")

### Output definitive regressor parameters


In [26]:
import json

# For every company, select the parameter set whose parallel regressor reached the
# highest R2 in results_dict_f1 and write the selection to reg_parameters_f1.json.
regg_final_output = {}
for company, metrics_f1 in results_dict_f1.items():
    # Reset the selection for every company so that a company without a usable
    # score can never inherit the parameters chosen for the previous company.
    best_r2_parallel_f1 = float("-inf")
    best_params_parallel_f1 = None

    for paramset in metrics_f1:
        # The "classifier" entry is skipped; only the other keys are treated as parameter sets.
        if paramset == "classifier":
            continue

        r2_parallel_f1 = metrics_f1[paramset]["paralel_regressor"]["r2_score"]

        if r2_parallel_f1 > best_r2_parallel_f1:
            best_r2_parallel_f1 = r2_parallel_f1
            best_params_parallel_f1 = paramset

    if best_params_parallel_f1 is None:
        raise ValueError(
            f"No parameter set with a comparable r2_score found for company {company!r}"
        )

    # The key is split on "_" and its first three fields are taken as
    # n_estimators, max_depth and learning_rate, in that order.
    n_est, max_depth, learn_rate = best_params_parallel_f1.split("_")[:3]

    # Values stay strings here; the cell that builds final_pipeline_parameters.json
    # casts them with int() / float() when it reads this file back.
    regg_final_output[company] = {
        'params': {
            'n_estimators': n_est,
            'max_depth': max_depth,
            'learning_rate': learn_rate,
        }
    }

# save file to a json file
with open('reg_parameters_f1.json', 'w') as file:
    json.dump(regg_final_output, file, indent=4)  # indent=4 is optional for pretty printing

    
    



In [None]:
# Debug inspection: displays the current value of `paramset`
# (presumably the last key visited by a preceding loop — confirm, or remove this cell).
paramset

# Create definitive pipeline inputs


In [None]:
import json

# Cast applied to each hyper-parameter when it is copied into the final output (values are read from JSON).
param_casts = {'learning_rate': float, 'max_depth': int, 'n_estimators': int}

with open('class_parameters_recall.json', 'r') as file:
    class_parameters = json.load(file)

# NOTE(review): despite its name, `recal_parameters` holds the regressor parameters loaded from reg_parameters_f1.json.
with open('reg_parameters_f1.json', 'r') as file:
    recal_parameters = json.load(file)

# Merge the classifier and regressor settings into one entry per company.
final_pipeline_parameters = {}
for company in class_parameters:
    company_class = class_parameters[company]
    final_pipeline_parameters[company] = {
        'fakename': company_class['fakename'],
        'n_clusters': company_class['nclusters'],
        'class_parameters': {
            name: cast(company_class['params'][name]) for name, cast in param_casts.items()
        },
        'regressor_parameters': {
            name: cast(recal_parameters[company]['params'][name]) for name, cast in param_casts.items()
        },
    }

# save final_dict to a json file
with open('final_pipeline_parameters.json', 'w') as file:
    json.dump(final_pipeline_parameters, file, indent=4)  # indent=4 is optional for pretty printing
final_pipeline_parameters