In [10]:
# Import pandas and numpy for data manipulation
import pandas as pd
import numpy as np
import time

# Import modules from Scikit-learn
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest Model
from sklearn.model_selection import train_test_split   # Import train_test_split function
from sklearn import metrics   # import metrics modules for accuracy calculation
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm

In [11]:
# Read data
PATH = "../../my_data/identification-dataset/my_custom_data/anblock-error-dataset.csv"
df = pd.read_csv(PATH)

# Drop uncomplete rows
df.dropna(inplace=True)

In [12]:
# Set training data
train_df = df.drop('material', axis=1)

# Extracted features 
X = train_df.drop('encoded_material', axis=1)
y = train_df['encoded_material'] # Labels

In [13]:
# Feature Scaling
#scaler = MinMaxScaler()
#X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [14]:
# Split dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) # 70% training and 30% test

In [15]:
no_of_dt_in_forrest = [5, 10, 20, 30, 50, 100, 200, 500]

In [16]:
# Create an empty list to store results
def trainer(no_of_dt_in_forrest):
    results = []
    for trees in no_of_dt_in_forrest:
        # Create a pipeline object for the model
        # Create a pipeline object for our model
        pipe_RF = make_pipeline(StandardScaler(),
                        RandomForestClassifier(n_estimators=trees,   # no. of decision trees in the forest
                                               verbose=0)
                        )
        
        # Measure training time
        start_train = time.time()
        pipe_RF.fit(X_train, y_train)
        end_train = time.time()
        train_time_per_sample = (end_train - start_train) / len(X_train)

        # Measure test time
        start_test = time.time()
        y_pred = pipe_RF.predict(X_test)
        end_test = time.time()
        test_time_per_sample = (end_test - start_test) / len(X_test)
        
        # Evaluate the pipeline and store the results
        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred, average="macro")
        recall = metrics.recall_score(y_test, y_pred, average="macro")
        f1 = metrics.recall_score(y_test, y_pred, average="macro")


        results.append({
            'number_of_trees': trees,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'train_time_per_sample': train_time_per_sample,
            'test_time_per_sample': test_time_per_sample 
        })
    return results

In [17]:
results = []
for x in tqdm(np.arange(10)): # should be 10
    r = trainer(no_of_dt_in_forrest)
    results.append(r)

100%|██████████| 10/10 [13:04<00:00, 78.47s/it]


In [18]:
# show one of 10 samples
results_df = pd.DataFrame(results[0])
results_df

Unnamed: 0,number_of_trees,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,5,0.875315,0.875201,0.875074,0.875074,1.2e-05,1e-06
1,10,0.894575,0.894337,0.894358,0.894358,2.1e-05,2e-06
2,20,0.904821,0.904599,0.904627,0.904627,4e-05,4e-06
3,30,0.910699,0.910463,0.91054,0.91054,6.1e-05,6e-06
4,50,0.916018,0.915632,0.915837,0.915837,0.000102,9e-06
5,100,0.915234,0.914889,0.915042,0.915042,0.000201,1.7e-05
6,200,0.917474,0.917097,0.917303,0.917303,0.000465,4.1e-05
7,500,0.918426,0.918156,0.918236,0.918236,0.001127,0.000116


In [19]:
markdown_table = results_df.to_markdown(index=False)
markdown_table

'|   number_of_trees |   accuracy |   precision |   recall |       f1 |   train_time_per_sample |   test_time_per_sample |\n|------------------:|-----------:|------------:|---------:|---------:|------------------------:|-----------------------:|\n|                 5 |   0.875315 |    0.875201 | 0.875074 | 0.875074 |             1.16734e-05 |            1.04104e-06 |\n|                10 |   0.894575 |    0.894337 | 0.894358 | 0.894358 |             2.13847e-05 |            1.93785e-06 |\n|                20 |   0.904821 |    0.904599 | 0.904627 | 0.904627 |             4.04045e-05 |            3.57075e-06 |\n|                30 |   0.910699 |    0.910463 | 0.91054  | 0.91054  |             6.14222e-05 |            5.64604e-06 |\n|                50 |   0.916018 |    0.915632 | 0.915837 | 0.915837 |             0.0001019   |            8.68943e-06 |\n|               100 |   0.915234 |    0.914889 | 0.915042 | 0.915042 |             0.000200548 |            1.6954e-05  |\n|              

|   number_of_trees |   accuracy |   precision |   recall |       f1 |   train_time_per_sample |   test_time_per_sample |
|------------------:|-----------:|------------:|---------:|---------:|------------------------:|-----------------------:|
|                 5 |   0.8719   |    0.873856 | 0.87239  | 0.87239  |             1.28354e-05 |            1.12619e-06 |
|                10 |   0.901237 |    0.902022 | 0.90161  | 0.90161  |             2.03947e-05 |            1.96707e-06 |
|                20 |   0.904597 |    0.905448 | 0.904936 | 0.904936 |             4.03076e-05 |            3.68428e-06 |
|                30 |   0.906892 |    0.907208 | 0.907242 | 0.907242 |             6.0645e-05  |            5.43059e-06 |
|                50 |   0.910475 |    0.910688 | 0.910839 | 0.910839 |             0.000100613 |            8.82269e-06 |
|               100 |   0.913779 |    0.914085 | 0.91412  | 0.91412  |             0.000201685 |            1.75537e-05 |
|               200 |   0.913779 |    0.914384 | 0.914084 | 0.914084 |             0.000403243 |            3.5128e-05  |
|               500 |   0.913779 |    0.91414  | 0.914113 | 0.914113 |             0.000999109 |            8.805e-05   |

In [20]:
# Convert DataFrame to LaTeX table
latex_table = results_df.to_latex(index=False)
print(latex_table)

\begin{tabular}{rrrrrrr}
\toprule
 number\_of\_trees &  accuracy &  precision &   recall &       f1 &  train\_time\_per\_sample &  test\_time\_per\_sample \\
\midrule
               5 &  0.875315 &   0.875201 & 0.875074 & 0.875074 &               0.000012 &              0.000001 \\
              10 &  0.894575 &   0.894337 & 0.894358 & 0.894358 &               0.000021 &              0.000002 \\
              20 &  0.904821 &   0.904599 & 0.904627 & 0.904627 &               0.000040 &              0.000004 \\
              30 &  0.910699 &   0.910463 & 0.910540 & 0.910540 &               0.000061 &              0.000006 \\
              50 &  0.916018 &   0.915632 & 0.915837 & 0.915837 &               0.000102 &              0.000009 \\
             100 &  0.915234 &   0.914889 & 0.915042 & 0.915042 &               0.000201 &              0.000017 \\
             200 &  0.917474 &   0.917097 & 0.917303 & 0.917303 &               0.000465 &              0.000041 \\
             500 &  0

  latex_table = results_df.to_latex(index=False)
