# ASME Autodesk Hackathon 2023

### Notebook 4: Inference on Test set

- This is the final notebook, for predicitng the result on the test set, and final submission

In [112]:
## Required python libraries
import pandas as pd
import numpy as np
import scipy as sp
from pathlib import Path
from sklearn.metrics import roc_curve, auc
import json
from itertools import cycle
import os
import math
import sys
import itertools
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix
# import graphviz
import pickle
from tqdm import tqdm

In [110]:
# For plotting
import plotly.io as pio
import plotly.graph_objects as go
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.utils import shuffle

# For ML model saving
import pickle
# sns.set_theme(style="whitegrid")
sns.set_theme(style="white", palette=None)

import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.pyplot import gca
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
%matplotlib inline
import seaborn as sns

In [93]:
plt.rcParams["axes.edgecolor"] = "black"
plt.rcParams["axes.linewidth"] = 2.50

# Where to save the figures, and dataset locations
PROJECT_ROOT_DIR = "../"
IMAGE_PATH = os.path.join(PROJECT_ROOT_DIR, "result_images", 'inference')
Feature_PATH = os.path.join(PROJECT_ROOT_DIR, "feature_extraction")
ML_model_PATH = os.path.join(PROJECT_ROOT_DIR, "trained_models")

os.makedirs(IMAGE_PATH, exist_ok=True)
os.makedirs(Feature_PATH, exist_ok=True)
os.makedirs(ML_model_PATH, exist_ok=True)

## function for automatically save the diagram/graph into the folder 
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGE_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [122]:
def show_confusion_matrix(y_true, y_pred, classes=None, classes_categorical=None, normalize=None, figsize=(10, 10), dpi=600, fontsize=10, axis_fontsize=14, tick_size=12):
    cm = confusion_matrix(y_true, y_pred, normalize=normalize, labels=classes)
    
    if normalize == 'true':
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    
    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    cax = ax.matshow(cm, cmap=plt.cm.Blues)
    
    cbar = plt.colorbar(cax, ax=ax, shrink=0.65)
    
    if classes:
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes_categorical, rotation=45, fontsize=tick_size, ha='left')
        plt.yticks(tick_marks, classes_categorical, fontsize=tick_size)
    
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], '.2f' if normalize else 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black",
                 fontsize=fontsize)
    
    ax.set_xlabel('Predicted label', fontsize=axis_fontsize)
    ax.set_ylabel('True label', fontsize=axis_fontsize)
    # plt.tight_layout()
    # plt.show()

### Feature extraction functions

In [26]:
%store -r encoder_test
%store -r transformed_test_dataset_final

In [27]:
transformed_test_dataset_final

Unnamed: 0,uuid,com_distance,center_of_mass_x,center_of_mass_y,center_of_mass_z,body_area,body_volume,material_category,volume_fraction,bounding_box_max_x,...,Toys,Utilities & Telecom,Virtual Reality,Water & Wastewater,Wood Working,Civil Infrastructure,Engineering & Construction,Media & Entertainment,Other Industries,Product Design & Manufacturing
0,bbdf29da-060c-11ec-a52a-02ef91e90f5f,18.789386,-3.018946e-11,-1.019470e-16,0.872410,7.679887,1.089020,Metal_Ferrous_Steel,0.000889,72.50,...,0,0,0,0,0,0,0,0,0,1
1,bbdf9f22-060c-11ec-ac81-02ef91e90f5f,17.384202,1.921808e+00,-6.312883e-01,-1.718481,972.513358,475.446755,Other,0.387934,72.50,...,0,0,0,0,0,0,0,0,0,1
2,bbe89f86-060c-11ec-8111-02ef91e90f5f,15.731161,3.467308e+00,2.811920e+00,6.277969,460.819725,241.470603,Other,0.197025,72.50,...,0,0,0,0,0,0,0,0,0,1
3,bbee1dec-060c-11ec-aabe-02ef91e90f5f,13.623338,5.457923e+00,-1.478449e+00,0.795617,73.200000,21.600000,Metal_Ferrous_Steel,0.017624,72.50,...,0,0,0,0,0,0,0,0,0,1
4,bbee44f6-060c-11ec-bd48-02ef91e90f5f,10.639008,1.037246e+01,-1.650000e+00,-3.440546,54.622106,11.563476,Other,0.009435,72.50,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,3b4ce74c-05cc-11ec-a9ce-064a63348d37,66.832452,2.540000e+00,4.000000e+01,2.540000,1677.212800,2064.512000,Wood,0.035476,127.54,...,0,0,0,0,0,0,0,0,0,1
151,3b4d0e4c-05cc-11ec-9c6d-064a63348d37,66.832452,1.025400e+02,4.000000e+01,72.540000,1677.212800,2064.512000,Wood,0.035476,127.54,...,0,0,0,0,0,0,0,0,0,1
152,3b4d355a-05cc-11ec-9f42-064a63348d37,66.832452,1.025400e+02,4.000000e+01,2.540000,1677.212800,2064.512000,Wood,0.035476,127.54,...,0,0,0,0,0,0,0,0,0,1
153,3b4d8388-05cc-11ec-acde-064a63348d37,66.832452,2.540000e+00,4.000000e+01,72.540000,1677.212800,2064.512000,Wood,0.035476,127.54,...,0,0,0,0,0,0,0,0,0,1


In [None]:
# df_feature_dataset_cleaned = df_feature_dataset.fillna(df_feature_dataset.median())
# df_feature_dataset_cleaned

In [75]:
selected_columns = ['Material Category (Target)', 'material_category', 'uuid', 
                   'body_area', 'volume_fraction', 'body_volume',  'com_distance', 'center_of_mass_y', 
                   'center_of_mass_z', 'center_of_mass_x', 'assembly_density', 'body_count', 'shell_count', 
                   'vertex_valence_1_2', 'face_to_loop_ratio', 'assembly_views_count', 'vertex_to_edge_ratio', 
                   'aspect_ratio_yz', 'assembly_area', 'bounding_box_z_dim', 'bounding_box_max_z', 'assembly_volume', 
                   'assembly_center_of_mass_z', 'bounding_box_max_y', 'aspect_ratio_xz', 'aspect_ratio_xy', 
                   'vertex_valence_2_5', 'face_count', 'loop_count','vertex_valence_5_10', 'tree_depth', 
                   'Infrastructure Design', "Industrial Asset Creation", "Engineering", 'assembly_comments_count', 
                   'occurrences_count', 'unique_component_occurrences', 'vertex_count', 'Miscellaneous', 
                   'Jewelry', 'bounding_box_x_dim', 'Wood Working', 'moment_of_inertia_yz', 'moment_of_inertia_xx']

selected_columns_v2 = ['Material Category (Target)', 'material_category', 'uuid', 
                   'body_area', 'volume_fraction', 'body_volume',  'com_distance', 'center_of_mass_y', 
                   'center_of_mass_z', 'center_of_mass_x', 'assembly_density', 'body_count', 'shell_count', 
                   'vertex_valence_1_2', 'face_to_loop_ratio', 'assembly_views_count', 'vertex_to_edge_ratio', 
                   'aspect_ratio_yz', 'assembly_area', 'bounding_box_z_dim', 'bounding_box_max_z', 'assembly_volume', 
                   'assembly_center_of_mass_z', 'bounding_box_max_y', 'aspect_ratio_xz', 'aspect_ratio_xy', 
                   'vertex_valence_2_5', 'face_count', 'loop_count','vertex_valence_5_10', 'tree_depth', 
                   'assembly_comments_count', 'occurrences_count', 'unique_component_occurrences',
                    'vertex_count', 'bounding_box_x_dim', 'moment_of_inertia_yz', 'moment_of_inertia_xx']

df_feature_test_dataset_selected = transformed_test_dataset_final[selected_columns_v2]
df_feature_test_dataset_selected

Unnamed: 0,Material Category (Target),material_category,uuid,body_area,volume_fraction,body_volume,com_distance,center_of_mass_y,center_of_mass_z,center_of_mass_x,...,loop_count,vertex_valence_5_10,tree_depth,assembly_comments_count,occurrences_count,unique_component_occurrences,vertex_count,bounding_box_x_dim,moment_of_inertia_yz,moment_of_inertia_xx
0,2,Metal_Ferrous_Steel,bbdf29da-060c-11ec-a52a-02ef91e90f5f,7.679887,0.000889,1.089020,18.789386,-1.019470e-16,0.872410,-3.018946e-11,...,580,0,3,0,5,5,804,80.695369,-1.148057e+02,4.106230e+02
1,4,Other,bbdf9f22-060c-11ec-ac81-02ef91e90f5f,972.513358,0.387934,475.446755,17.384202,-6.312883e-01,-1.718481,1.921808e+00,...,580,0,3,0,5,5,804,80.695369,-1.148057e+02,4.106230e+02
2,4,Other,bbe89f86-060c-11ec-8111-02ef91e90f5f,460.819725,0.197025,241.470603,15.731161,2.811920e+00,6.277969,3.467308e+00,...,580,0,3,0,5,5,804,80.695369,-1.148057e+02,4.106230e+02
3,2,Metal_Ferrous_Steel,bbee1dec-060c-11ec-aabe-02ef91e90f5f,73.200000,0.017624,21.600000,13.623338,-1.478449e+00,0.795617,5.457923e+00,...,580,0,3,0,5,5,804,80.695369,-1.148057e+02,4.106230e+02
4,4,Other,bbee44f6-060c-11ec-bd48-02ef91e90f5f,54.622106,0.009435,11.563476,10.639008,-1.650000e+00,-3.440546,1.037246e+01,...,580,0,3,0,5,5,804,80.695369,-1.148057e+02,4.106230e+02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150,6,Wood,3b4ce74c-05cc-11ec-a9ce-064a63348d37,1677.212800,0.035476,2064.512000,66.832452,4.000000e+01,2.540000,2.540000e+00,...,90,0,3,0,4,4,120,150.000000,-1.152969e+06,3.319517e+06
151,6,Wood,3b4d0e4c-05cc-11ec-9c6d-064a63348d37,1677.212800,0.035476,2064.512000,66.832452,4.000000e+01,72.540000,1.025400e+02,...,90,0,3,0,4,4,120,150.000000,-1.152969e+06,3.319517e+06
152,6,Wood,3b4d355a-05cc-11ec-9f42-064a63348d37,1677.212800,0.035476,2064.512000,66.832452,4.000000e+01,2.540000,1.025400e+02,...,90,0,3,0,4,4,120,150.000000,-1.152969e+06,3.319517e+06
153,6,Wood,3b4d8388-05cc-11ec-acde-064a63348d37,1677.212800,0.035476,2064.512000,66.832452,4.000000e+01,72.540000,2.540000e+00,...,90,0,3,0,4,4,120,150.000000,-1.152969e+06,3.319517e+06


In [76]:
Label_test = df_feature_test_dataset_selected['material_category'].to_list()
label_test_numeric = df_feature_test_dataset_selected['Material Category (Target)'].to_list()
y_test = df_feature_test_dataset_selected['Material Category (Target)'].to_list()
X_test = df_feature_test_dataset_selected.drop(columns=['Material Category (Target)', 'material_category', 'uuid']).to_numpy()

In [77]:
X_test

array([[ 7.67988651e+00,  8.88570896e-04,  1.08901988e+00, ...,
         8.06953685e+01, -1.14805739e+02,  4.10623042e+02],
       [ 9.72513358e+02,  3.87934285e-01,  4.75446755e+02, ...,
         8.06953685e+01, -1.14805739e+02,  4.10623042e+02],
       [ 4.60819725e+02,  1.97024640e-01,  2.41470603e+02, ...,
         8.06953685e+01, -1.14805739e+02,  4.10623042e+02],
       ...,
       [ 1.67721280e+03,  3.54763705e-02,  2.06451200e+03, ...,
         1.50000000e+02, -1.15296886e+06,  3.31951712e+06],
       [ 1.67721280e+03,  3.54763705e-02,  2.06451200e+03, ...,
         1.50000000e+02, -1.15296886e+06,  3.31951712e+06],
       [ 2.77863042e+04,  5.74567365e-01,  3.34363748e+04, ...,
         1.50000000e+02, -1.15296886e+06,  3.31951712e+06]])

## Step 2: Load the Trained Model

In [78]:
# ["KNN", "DT", "AdaBoost", "RF", "BalancedRandomForest", 'LightGBM', "XGBoost"]
metamodel_KNN_file = os.path.join(ML_model_PATH, 'v2_feature','metamodel_KNN.sav')
metamodel_DT_file = os.path.join(ML_model_PATH, 'v2_feature', 'metamodel_DT.sav')
metamodel_ada_file = os.path.join(ML_model_PATH, 'v2_feature', 'metamodel_ada.sav')
metamodel_RF_file = os.path.join(ML_model_PATH, 'v2_feature', 'metamodel_RF.sav')
metamodel_BRF_file = os.path.join(ML_model_PATH, 'v2_feature', 'metamodel_BRF.sav')
metamodel_LightGBM_file = os.path.join(ML_model_PATH, 'v2_feature', 'metamodel_LightGBM.sav')
metamodel_XGBoost_file = os.path.join(ML_model_PATH, 'v2_feature','metamodel_XGBoost.sav')
# metamodel_voting_file = os.path.join(ML_model_PATH, 'metamodel_ensemble_voting.sav')

In [79]:
# Load the Random Forest model (or any other model you want) from disk
loaded_KNN = pickle.load(open(metamodel_KNN_file, 'rb'))
loaded_DT = pickle.load(open(metamodel_DT_file, 'rb'))
loaded_ada = pickle.load(open(metamodel_ada_file, 'rb'))
loaded_RF = pickle.load(open(metamodel_RF_file, 'rb'))
loaded_brf = pickle.load(open(metamodel_BRF_file, 'rb'))
loaded_lgbm = pickle.load(open(metamodel_LightGBM_file, 'rb'))
loaded_xgb = pickle.load(open(metamodel_XGBoost_file, 'rb'))
# voting_clf = pickle.load(open(metamodel_voting_file, 'rb'))

In [80]:
# To load the scaler later
with open('StandardScaler.pkl', 'rb') as file:
    loaded_StandardScaler = pickle.load(file)

with open('MinMaxScaler.pkl', 'rb') as file:
    loaded_MinMaxScaler = pickle.load(file)

In [81]:
# Standardize the test data using the loaded scaler
X_test_standardized = loaded_StandardScaler.transform(X_test)
# X_test_standardized_minmax = loaded_MinMaxScaler.transform(X_test)

## Step 3: Conduct Inference on the Test Set

In [116]:
# Predict using the loaded model
predictions_DT = loaded_DT.predict(X_test_standardized)
predictions_KNN = loaded_KNN.predict(X_test_standardized)
predictions_ada = loaded_ada.predict(X_test_standardized)
predictions_RF = loaded_RF.predict(X_test_standardized)
predictions_brf = loaded_brf.predict(X_test_standardized)
predictions_lgbm = loaded_lgbm.predict(X_test_standardized)
predictions_xgb = loaded_xgb.predict(X_test_standardized)
# predictions_voting_clf = voting_clf.predict(X_test_standardized)

In [132]:
predictions_RF

array([2, 0, 0, 0, 0, 0, 2, 2, 3, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 4, 4, 3, 4, 4, 4, 4, 4, 4, 3, 3, 1, 1, 3, 1, 3, 1, 3, 2, 2,
       3, 3, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 3, 1, 3, 0, 1, 3,
       1, 1, 3, 1, 1, 3, 3, 0, 3, 3, 5, 3, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 5, 5, 5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6,
       6])

## Step 4. Append the results to submission file

In [55]:
# Load the submission file
submission_df = pd.read_csv(os.path.join(PROJECT_ROOT_DIR, "submission", "submission_templete.csv"))
submission_df

Unnamed: 0,assembly_id,body_id,material_category
0,131068_085c0ed9,bbdf29da-060c-11ec-a52a-02ef91e90f5f,
1,131068_085c0ed9,bbdf9f22-060c-11ec-ac81-02ef91e90f5f,
2,131068_085c0ed9,bbe89f86-060c-11ec-8111-02ef91e90f5f,
3,131068_085c0ed9,bbee1dec-060c-11ec-aabe-02ef91e90f5f,
4,131068_085c0ed9,bbee44f6-060c-11ec-bd48-02ef91e90f5f,
...,...,...,...
150,74576_73ac0093,3b4ce74c-05cc-11ec-a9ce-064a63348d37,
151,74576_73ac0093,3b4d0e4c-05cc-11ec-9c6d-064a63348d37,
152,74576_73ac0093,3b4d355a-05cc-11ec-9f42-064a63348d37,
153,74576_73ac0093,3b4d8388-05cc-11ec-acde-064a63348d37,


In [58]:
# Convert numerical predictions to categorical
categorical_predictions_RF = encoder_test.inverse_transform(predictions_RF)
categorical_predictions_RF

array(['Metal_Ferrous_Steel', 'Metal_Ferrous_Steel',
       'Metal_Ferrous_Steel', 'Metal_Ferrous_Steel',
       'Metal_Ferrous_Steel', 'Metal_Ferrous_Steel',
       'Metal_Ferrous_Steel', 'Metal_Ferrous_Steel',
       'Metal_Ferrous_Steel', 'Metal_Ferrous_Steel',
       'Metal_Ferrous_Steel', 'Metal_Ferrous_Steel',
       'Metal_Ferrous_Steel', 'Metal_Ferrous_Steel',
       'Metal_Ferrous_Steel', 'Metal_Ferrous_Steel',
       'Metal_Ferrous_Steel', 'Metal_Ferrous_Steel',
       'Metal_Ferrous_Steel', 'Metal_Ferrous_Steel',
       'Metal_Ferrous_Steel', 'Metal_Ferrous_Steel',
       'Metal_Ferrous_Steel', 'Metal_Non-Ferrous', 'Metal_Non-Ferrous',
       'Metal_Non-Ferrous', 'Metal_Non-Ferrous', 'Metal_Non-Ferrous',
       'Metal_Non-Ferrous', 'Metal_Non-Ferrous', 'Metal_Non-Ferrous',
       'Metal_Non-Ferrous', 'Metal_Non-Ferrous', 'Metal_Non-Ferrous',
       'Metal_Non-Ferrous', 'Metal_Non-Ferrous', 'Metal_Non-Ferrous',
       'Metal_Non-Ferrous', 'Metal_Non-Ferrous', 'Metal_Non-Ferro

In [59]:
# Create a temporary DataFrame from uuids and predictions
temp_df = pd.DataFrame({
    'body_id': transformed_test_dataset_final['uuid'],
    'Predicted_Material_Category': categorical_predictions_DT
})

In [60]:
# Merge the predictions with the original submission_df based on uuid
submission_df = submission_df.merge(temp_df, on='body_id', how='left')

# Drop the original material_category column
submission_df = submission_df.drop(columns=['material_category'])

# Rename the Predicted_Material_Category back to material_category
submission_df = submission_df.rename(columns={"Predicted_Material_Category": "material_category"})

# Save the updated DataFrame back to the CSV file
submission_df.to_csv('submission_Lequn_Chen.csv', index=False)

In [61]:
submission_df

Unnamed: 0,assembly_id,body_id,material_category
0,131068_085c0ed9,bbdf29da-060c-11ec-a52a-02ef91e90f5f,Metal_Ferrous_Steel
1,131068_085c0ed9,bbdf9f22-060c-11ec-ac81-02ef91e90f5f,Metal_Ferrous_Steel
2,131068_085c0ed9,bbe89f86-060c-11ec-8111-02ef91e90f5f,Metal_Ferrous_Steel
3,131068_085c0ed9,bbee1dec-060c-11ec-aabe-02ef91e90f5f,Metal_Ferrous_Steel
4,131068_085c0ed9,bbee44f6-060c-11ec-bd48-02ef91e90f5f,Metal_Ferrous_Steel
...,...,...,...
150,74576_73ac0093,3b4ce74c-05cc-11ec-a9ce-064a63348d37,Metal_Ferrous_Steel
151,74576_73ac0093,3b4d0e4c-05cc-11ec-9c6d-064a63348d37,Metal_Ferrous_Steel
152,74576_73ac0093,3b4d355a-05cc-11ec-9f42-064a63348d37,Metal_Ferrous_Steel
153,74576_73ac0093,3b4d8388-05cc-11ec-acde-064a63348d37,Metal_Ferrous_Steel
