# Most probable outcomes

In [12]:
import pandas as pd

# Load the dataset. Commodity codes are read as text, exactly like the other
# cells that load this data, so a code such as '0105993000' keeps its
# leading zero instead of being subject to numeric type inference.
df = pd.read_excel('shorterData.xlsx', dtype={'Commodity Code': str})

# iDoc columns whose values are sorted within each row
idoc_columns = ['iDoc02', 'iDoc03', 'iDoc04', 'iDoc05', 'iDoc06', 'iDoc07', 'iDoc08', 'iDoc09', 'iDoc10']

def sort_idocs(row, columns=None):
    """Return a row's iDoc values sorted alphabetically and packed to the left.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row that contains the iDoc columns.
    columns : list of str, optional
        The columns to sort. When omitted, the module-level ``idoc_columns``
        list is used, so ``df.apply(sort_idocs, axis=1)`` keeps working.

    Returns
    -------
    pandas.Series
        Indexed by the sorted columns: the non-empty values in ascending
        order, followed by empty strings for the unused trailing columns.
    """
    cols = idoc_columns if columns is None else list(columns)

    # Keep only real values: NaN/None and empty strings count as "no document"
    present = [row[c] for c in cols if pd.notna(row[c]) and row[c] != '']

    # Order by text form so a row that mixes strings with numeric cells is
    # sorted instead of raising TypeError on the str/number comparison.
    present.sort(key=str)

    # Pad with empty strings so every column receives a value
    padding = [''] * (len(cols) - len(present))
    return pd.Series(dict(zip(cols, present + padding)))

# Apply the sorting to each row and replace the iDoc columns with the sorted values
df[idoc_columns] = df.apply(sort_idocs, axis=1)

# Save the sorted dataset. The confirmation message is built from the same
# path so it always names the file that was actually written.
output_path = 'alphaData.xlsx'
df.to_excel(output_path, index=False)

print(f"iDoc values have been sorted alphabetically and saved to '{output_path}'.")


iDoc values have been sorted alphabetically and saved to 'sortedData.xlsx'.


In [5]:
import pandas as pd

df = pd.read_excel('shorterData.xlsx', dtype={'Commodity Code': str})

# Define the iDoc columns
idoc_columns = ['iDoc02', 'iDoc03', 'iDoc04', 'iDoc05', 'iDoc06', 'iDoc07', 'iDoc08', 'iDoc09', 'iDoc10']


def strip_whitespace(value):
    """Remove every whitespace character from a string; return other values unchanged."""
    return ''.join(value.split()) if isinstance(value, str) else value


# Remove all whitespace from each entry in the iDoc columns.
# The cleaning is applied element-wise rather than through the `.str`
# accessor: `.str` raises on a column that holds no strings at all (for
# example an entirely empty column read as float) and replaces non-string
# cells with NaN, whereas this leaves NaN and other non-string cells as is.
for col in idoc_columns:
    df[col] = df[col].map(strip_whitespace)

# Save the cleaned DataFrame back to a new Excel file
# NOTE(review): the training cell in this notebook reads 'alphaData.xlsx',
# not this file - confirm whether the cleaned data is meant to feed training.
df.to_excel('cleanedData.xlsx', index=False)
print(df.head())

                    ImpName            ExName Commodity Code  \
0  Lensbower, Gregory L Esq   US Clipper Inc.     7308100000   
1            Mcmahan, Ben L   US Clipper Inc.     4820200000   
2    Rowley/hansell Petetin   US Clipper Inc.     4820200000   
3    Rowley/hansell Petetin   US Clipper Inc.     4820200000   
4               Appbyte Ltd  Demo Company Ltd     0105993000   

  CountryofOrigin_key                          iDoc02  \
0                  US                             NaN   
1                  US                             NaN   
2                  US                             NaN   
3                  US                             NaN   
4                  FR  C505~Guaranteenotrequired~CC~~   

                     iDoc03          iDoc04 iDoc05 iDoc06 iDoc07 iDoc08  \
0                       NaN             NaN    NaN    NaN    NaN    NaN   
1                       NaN             NaN    NaN    NaN    NaN    NaN   
2                       NaN             NaN    

In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import joblib

# Load and preprocess the dataset: every cell becomes text and empty cells
# are labelled 'Missing' so they can be treated as an ordinary category.
df = pd.read_excel('alphaData.xlsx', dtype={'Commodity Code': str})
df = df.fillna('Missing').astype(str)

X = df[['ImpName', 'ExName', 'Commodity Code', 'CountryofOrigin_key']]
y = df[['iDoc02', 'iDoc03', 'iDoc04', 'iDoc05', 'iDoc06', 'iDoc07', 'iDoc08', 'iDoc09', 'iDoc10']]

# One-Hot Encoding with unknown-category handling.
# handle_unknown='ignore' encodes a category unseen during fit as all zeros.
# No category is dropped: with drop='first' the dropped category is also
# encoded as all zeros, so an unknown value would be indistinguishable from
# the first known category of that feature.
encoder_X = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_encoded = encoder_X.fit_transform(X)
joblib.dump(encoder_X, 'onehot_encoder_X_with_unknown.pkl')

# Split data into training and testing sets (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Build the neural network model
def build_complex_model(output_size, input_dim=None):
    """Build and compile the dense classifier used for one iDoc column.

    Parameters
    ----------
    output_size : int
        Number of output units of the final softmax layer (one per class).
    input_dim : int, optional
        Number of input features. When omitted, the width of the
        module-level ``X_train`` array is used, as before.

    Returns
    -------
    tensorflow.keras.Model
        A compiled ``Sequential`` model: three Dense/BatchNormalization/
        Dropout stages (256, 128, 64 units) followed by the softmax layer,
        using Adam (learning rate 0.001) and sparse categorical
        cross-entropy, with accuracy as the reported metric.
    """
    if input_dim is None:
        # Fall back to the training matrix defined at module level
        input_dim = X_train.shape[1]

    model = Sequential([
        Dense(256, input_dim=input_dim, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(64, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(output_size, activation='softmax')
    ])
    # Sparse loss: targets are integer class ids, not one-hot vectors
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Early stopping callback to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Trained models and fitted label encoders, keyed by iDoc column name
models = {}
label_encoders = {}

# Train a model for each iDoc column
for col in y_train.columns:
    le = LabelEncoder()

    # Replace empty strings with 'Missing' before encoding. Local Series are
    # used so the frames returned by train_test_split are not written to
    # (assigning into those slices triggers SettingWithCopyWarning).
    y_train_col = y_train[col].replace('', 'Missing').astype(str)
    y_test_col = y_test[col].replace('', 'Missing').astype(str)

    # Fit the label encoder on the training data
    y_train_encoded = le.fit_transform(y_train_col)

    # Transform the test data, filtering out any rows where the label is unseen
    seen_mask = y_test_col.isin(le.classes_).to_numpy()
    y_test_encoded = le.transform(y_test_col[seen_mask])
    X_test_filtered = X_test[seen_mask]

    # Compute class weights to handle the "Missing" data weighting;
    # each weight is paired explicitly with the class id it belongs to.
    train_classes = np.unique(y_train_encoded)
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=train_classes,
        y=y_train_encoded
    )
    class_weights_dict = dict(zip(train_classes.tolist(), class_weights.tolist()))

    # Build the model
    model = build_complex_model(output_size=len(le.classes_))

    # Train the model with early stopping and class weights.
    # NOTE(review): the held-out rows passed as validation_data are the same
    # rows used for the accuracy printed below, so that figure is not an
    # independent estimate - consider a separate validation split.
    model.fit(X_train, y_train_encoded,
              epochs=20,
              batch_size=64,
              validation_data=(X_test_filtered, y_test_encoded),
              callbacks=[early_stopping],
              class_weight=class_weights_dict)  # Include class weights here

    # Save the model and label encoder, and keep both available in memory
    model.save(f'nn_model_{col}.h5')
    joblib.dump(le, f'label_encoder_{col}.pkl')
    models[col] = model
    label_encoders[col] = le

    # Evaluate the model
    loss, accuracy = model.evaluate(X_test_filtered, y_test_encoded)
    print(f"Accuracy for {col}: {accuracy:.2f}")
    print("-" * 30)




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy for iDoc02: 0.31
------------------------------


  saving_api.save_model(


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy for iDoc03: 0.44
------------------------------


  saving_api.save_model(


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


  saving_api.save_model(


Accuracy for iDoc04: 0.46
------------------------------
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


  saving_api.save_model(


Accuracy for iDoc05: 0.47
------------------------------
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


  saving_api.save_model(


Accuracy for iDoc06: 0.46
------------------------------
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


  saving_api.save_model(


Accuracy for iDoc07: 0.31
------------------------------
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


  saving_api.save_model(


Accuracy for iDoc08: 0.00
------------------------------
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20


  saving_api.save_model(


Accuracy for iDoc09: 0.00
------------------------------
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Accuracy for iDoc10: 0.91
------------------------------


  saving_api.save_model(


In [6]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import joblib

# Prediction with new unseen data
def predict_new_data(new_data):
    """Rank every known label of each iDoc column for the first row of ``new_data``.

    The fitted one-hot encoder, the per-column Keras models and the
    per-column label encoders are loaded from the working directory.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Feature rows to encode. Only the probabilities predicted for the
        first row are used.

    Returns
    -------
    dict
        Maps each iDoc column name to a list of ``(label, probability)``
        pairs ordered from most to least probable.
    """
    feature_encoder = joblib.load('onehot_encoder_X_with_unknown.pkl')
    target_columns = ['iDoc02', 'iDoc03', 'iDoc04', 'iDoc05', 'iDoc06', 'iDoc07', 'iDoc08', 'iDoc09', 'iDoc10']

    # Encode the raw features with the encoder that was fitted at training time
    features = feature_encoder.transform(new_data)

    ranked = {}
    for target in target_columns:
        net = tf.keras.models.load_model(f'nn_model_{target}.h5')
        label_encoder = joblib.load(f'label_encoder_{target}.pkl')

        # Class probabilities for the first input row
        first_row_probs = net.predict(features)[0]

        # Pair each label with its probability, most probable first
        pairs = list(zip(label_encoder.classes_, first_row_probs))
        pairs.sort(key=lambda pair: pair[1], reverse=True)
        ranked[target] = pairs

    return ranked


# One example declaration to score, given as a single-row frame
new_data = pd.DataFrame(
    [['Appbyte Ltd', 'Demo Company Ltd', '8529101100', 'PL']],
    columns=['ImpName', 'ExName', 'Commodity Code', 'CountryofOrigin_key'],
)

predictions = predict_new_data(new_data)
if predictions:
    for col, preds in predictions.items():
        print(f"Predictions for {col}:")
        for label, prob in preds:
            # Skip labels whose probability does not exceed 0.5%
            if not prob > 0.005:
                continue
            print(f"  {label}: {prob:.4f}")
        print("-" * 30)



































Predictions for iDoc02:
  C119~4320~AE~TEST~: 0.3932
  C505~GBCGU02511510000720200206120129~CC~~: 0.3113
  C505~Guaranteenotrequired~CC~ ~: 0.1978
  C505~GBCGU02511510000720200206120129~CC~ ~: 0.0184
  C064~3211~AC~Test~: 0.0134
------------------------------
Predictions for iDoc03:
  U110~Invoice~AE~Invoice~: 0.6505
  C506~GBDPO3712600~~~: 0.2213
  C505~Guaranteenotrequired~CC~ ~: 0.0565
  C505~GBCGU55320273485220191113093459~CC~ ~: 0.0111
  C672~Invoice~AE~test~: 0.0095
------------------------------
Predictions for iDoc04:
  U110~Invoice~AE~Invoice~: 0.3889
  Missing: 0.1674
  N935~INV0003~AC~ ~: 0.0239
  C506~GBDPO1108654~ ~ ~: 0.0228
  C672~Invoice~AE~test~: 0.0177
  Y053~3221~ ~test~: 0.0154
  C506~GBDPO1108654~~~: 0.0146
  N853~7655~AE~test~: 0.0129
  Y922~3111~ ~test~: 0.0120
  C601~GBIPO02511510000720200206120129~~~: 0.0114
  C505~Guaranteenotrequired~CC~ ~: 0.0099
  C601~ GBIPO89645889501520190613093448~~~: 0.0075
  Y929~4542~ ~test~: 0.0067
  Y123~2110~ ~test2~: 0.0066
  Y90

In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

# Prediction with new unseen data
def predict_new_data(new_data, top_k=5):
    """Return the most probable labels of each iDoc column for one data point.

    The fitted one-hot encoder, the per-column Keras models and the
    per-column label encoders are loaded from the working directory.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Feature rows to encode. Only the probabilities predicted for the
        first row are used.
    top_k : int, optional
        How many of the highest-probability labels to keep per column.
        Defaults to 5; ``None`` keeps every label.

    Returns
    -------
    dict
        Maps each iDoc column name to a list of at most ``top_k``
        ``(label, probability)`` pairs, most probable first.
    """
    # Load the encoder and models
    encoder_X = joblib.load('onehot_encoder_X_with_unknown.pkl')
    idoc_columns = ['iDoc02', 'iDoc03', 'iDoc04', 'iDoc05', 'iDoc06', 'iDoc07', 'iDoc08', 'iDoc09', 'iDoc10']

    # One-Hot Encode the new data using the encoder
    new_data_encoded = encoder_X.transform(new_data)

    predictions = {}

    # Iterate over each iDoc column to make predictions
    for col in idoc_columns:
        model = tf.keras.models.load_model(f'nn_model_{col}.h5')
        le = joblib.load(f'label_encoder_{col}.pkl')

        # Class probabilities of the first (single) data point
        prob_pred = model.predict(new_data_encoded)[0]

        # Rank the labels by probability and keep the requested number
        sorted_preds = sorted(zip(le.classes_, prob_pred), key=lambda item: item[1], reverse=True)
        predictions[col] = sorted_preds[:top_k]

    return predictions


# One example declaration to score, given as a single-row frame
new_data = pd.DataFrame(
    [['Appbyte Ltd', 'Demo Company Ltd', '3917330010', 'PL']],
    columns=['ImpName', 'ExName', 'Commodity Code', 'CountryofOrigin_key'],
)

predictions = predict_new_data(new_data)
if predictions:
    for col, preds in predictions.items():
        print(f"Top 5 Predictions for {col}:")
        for label, prob in preds:
            # Skip labels whose probability does not exceed 0.5%
            if not prob > 0.005:
                continue
            print(f"  {label}: {prob:.4f}")
        print("-" * 30)




































Top 5 Predictions for iDoc02:
  C505~Guaranteenotrequired~CC~~: 0.5098
  C506~GBDPO3712600~~~: 0.0476
  1207~20231020~AG~~: 0.0266
  1207~20230529~AG~~: 0.0227
  1207~20240215~AG~~: 0.0196
------------------------------
Top 5 Predictions for iDoc03:
  U110~Invoice~AE~Invoice~: 0.8049
  9120~RefNumber~AE~Reason~: 0.0163
  Y032~4444~~test~: 0.0146
  C644~2133~AE~test1~: 0.0093
  C506~GBDPO1108654~~~: 0.0083
------------------------------
Top 5 Predictions for iDoc04:
  C119~4320~AE~TEST~: 0.7521
  U110~Invoice~AE~Invoice~: 0.0746
  Missing: 0.0513
  C064~3211~AC~Test~: 0.0463
  C672~Invoice~AE~test~: 0.0089
------------------------------
Top 5 Predictions for iDoc05:
  Missing: 0.3074
  C505~Guaranteeenotrequired~CC~~: 0.0632
  Y926~3221~~test~: 0.0488
  U110~Invoice~AE~Invoice~: 0.0403
  C505~Guaranteenotrequired~CC~~: 0.0346
------------------------------
Top 5 Predictions for iDoc06:
  Missing: 0.0974
  Y922~3111~~test~: 0.0503
  Y931~Invoice~~Invoice~: 0.0492
  U110~Invoice~AE~Invoic

# Top-5 accuracy

In [10]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import joblib

# Prediction with new unseen data
def predict_new_data(new_data, actual_values, top_k=5):
    """Predict the top labels per iDoc column and report how many match the actual values.

    The fitted one-hot encoder, the per-column Keras models and the
    per-column label encoders are loaded from the working directory.
    For every iDoc column a line is printed stating whether the actual
    value is among the kept predictions, followed by the overall hit rate.

    Parameters
    ----------
    new_data : pandas.DataFrame
        Feature rows to encode. Only the probabilities predicted for the
        first row are used.
    actual_values : pandas.DataFrame
        Must have one column per iDoc column; the first row holds the
        expected label, compared by exact string equality.
    top_k : int, optional
        How many of the highest-probability labels to keep per column.
        Defaults to 5.

    Returns
    -------
    dict
        Maps each iDoc column name to a list of at most ``top_k``
        ``(label, probability)`` pairs, most probable first.
    """
    encoder_X = joblib.load('onehot_encoder_X_with_unknown.pkl')
    idoc_columns = ['iDoc02', 'iDoc03', 'iDoc04', 'iDoc05', 'iDoc06', 'iDoc07', 'iDoc08', 'iDoc09', 'iDoc10']

    # One-Hot Encode the new data using the encoder
    new_data_encoded = encoder_X.transform(new_data)

    predictions = {}

    # Iterate over each iDoc column to make predictions
    for col in idoc_columns:
        model = tf.keras.models.load_model(f'nn_model_{col}.h5')
        le = joblib.load(f'label_encoder_{col}.pkl')

        # Class probabilities of the first data point
        prob_pred = model.predict(new_data_encoded)[0]

        # Rank the labels by probability and keep the requested number
        sorted_preds = sorted(zip(le.classes_, prob_pred), key=lambda item: item[1], reverse=True)
        predictions[col] = sorted_preds[:top_k]

    # Evaluate top-k accuracy: one hit per column whose actual value was kept
    correct_predictions = 0
    total_predictions = len(idoc_columns)

    for col in idoc_columns:
        top_preds = [label for label, prob in predictions[col]]
        actual_value = actual_values[col].iloc[0]  # Actual value for this iDoc column

        if actual_value in top_preds:
            correct_predictions += 1
            print(f"Actual {col}: {actual_value} was in the top {top_k} predictions")
        else:
            print(f"Actual {col}: {actual_value} was NOT in the top {top_k} predictions")

    top_k_accuracy = correct_predictions / total_predictions
    print(f"Top-{top_k} accuracy: {top_k_accuracy:.2f}")

    return predictions

# Example usage: score one declaration and compare against its known documents.
# NOTE(review): this commodity code has 9 digits while the other examples in
# this notebook use 10 - confirm it matches how 'Commodity Code' is stored in
# the training data (a leading zero may have been dropped).
new_data = pd.DataFrame(
    [['Appbyte Ltd', 'Demo Company Ltd', '204505900', 'US']],
    columns=['ImpName', 'ExName', 'Commodity Code', 'CountryofOrigin_key'],
)

# Expected label for every iDoc column of that declaration (single row)
actual_values = pd.DataFrame([{
    'iDoc02': '9120~2211~AE~test3~',
    'iDoc03': 'C505~GBCGU02511510000720200206120129~CC~ ~',
    'iDoc04': 'C506~GBDPO3712600~~ ~',
    'iDoc05': 'C644~1210~AE~ ~',
    'iDoc06': 'N853~3431~AE~ ~',
    'iDoc07': 'Missing',
    'iDoc08': 'Missing',
    'iDoc09': 'Missing',
    'iDoc10': 'Missing',
}])

predictions = predict_new_data(new_data, actual_values)
if predictions:
    for col, preds in predictions.items():
        print(f"Top 5 Predictions for {col}:")
        for label, prob in preds:
            # Skip labels whose probability does not exceed 0.5%
            if not prob > 0.005:
                continue
            print(f"  {label}: {prob:.4f}")
        print("-" * 30)




































Actual iDoc02: 9120~2211~AE~test3~ was in the top 5 predictions
Actual iDoc03: C505~GBCGU02511510000720200206120129~CC~ ~ was in the top 5 predictions
Actual iDoc04: C506~GBDPO3712600~~ ~ was NOT in the top 5 predictions
Actual iDoc05: C644~1210~AE~ ~ was in the top 5 predictions
Actual iDoc06: N853~3431~AE~ ~ was in the top 5 predictions
Actual iDoc07: Missing was NOT in the top 5 predictions
Actual iDoc08: Missing was NOT in the top 5 predictions
Actual iDoc09: Missing was NOT in the top 5 predictions
Actual iDoc10: Missing was in the top 5 predictions
Top-5 accuracy: 0.56
Top 5 Predictions for iDoc02:
  9120~2211~AE~test3~: 0.4262
  1207~20230614 ~AG~~: 0.2208
  1207~20230613 ~AG~~: 0.1626
  C505~GBCGU02511510000720200206120129~CC~~: 0.0287
  C505~GBCGU0835173300020200828141500~CC~ ~: 0.0149
------------------------------
Top 5 Predictions for iDoc03:
  C505~Guaranteenotrequired~CC~ ~: 0.3996
  C505~GBCGU02511510000720200206120129~CC~ ~: 0.0731
  C506~GBDPO3712600~~~: 0.0607
  C119~

In [2]:
# Load the saved Keras model for the iDoc02 column.
# NOTE(review): this cell uses `tf` without importing it, so it relies on an
# earlier cell having run `import tensorflow as tf`.
model = tf.keras.models.load_model('nn_model_iDoc02.h5')

# Convert the model to TensorFlow Lite format
converter = tf.lite.TFLiteConverter.from_keras_model(model)
tflite_model = converter.convert()

# Save the converted model (written in binary mode)
with open('nn_model_iDoc02.tflite', 'wb') as f:
    f.write(tflite_model)








INFO:tensorflow:Assets written to: /var/folders/j8/s3w557v576v0dtnghwcjfxm40000gn/T/tmpcwgkxruh/assets


INFO:tensorflow:Assets written to: /var/folders/j8/s3w557v576v0dtnghwcjfxm40000gn/T/tmpcwgkxruh/assets
2024-09-13 12:12:24.328375: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:364] Ignored output_format.
2024-09-13 12:12:24.328399: W tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc:367] Ignored drop_control_dependency.
2024-09-13 12:12:24.328848: I tensorflow/cc/saved_model/reader.cc:45] Reading SavedModel from: /var/folders/j8/s3w557v576v0dtnghwcjfxm40000gn/T/tmpcwgkxruh
2024-09-13 12:12:24.329973: I tensorflow/cc/saved_model/reader.cc:91] Reading meta graph with tags { serve }
2024-09-13 12:12:24.329978: I tensorflow/cc/saved_model/reader.cc:132] Reading SavedModel debug info (if present) from: /var/folders/j8/s3w557v576v0dtnghwcjfxm40000gn/T/tmpcwgkxruh
2024-09-13 12:12:24.332517: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:375] MLIR V1 optimization pass is not enabled
2024-09-13 12:12:24.333520: I tensorflow/cc/saved_model/load