In [2]:
# Import pandas and numpy for data manipulation
import pandas as pd
import numpy as np
import time

# Import modules from Scikit-learn
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest Model
from sklearn.model_selection import train_test_split   # Import train_test_split function
from sklearn import metrics   # import metrics modules for accuracy calculation
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

from tqdm import tqdm

In [3]:
# Read data
PATH = "../../my_data/identification-dataset/my_custom_data/anblock-error-dataset.csv"
df = pd.read_csv(PATH)

# Drop uncomplete rows
df.dropna(inplace=True)

In [4]:
# Set training data
train_df = df.drop('material', axis=1)

# Extracted features 
X = train_df.drop('encoded_material', axis=1)
y = train_df['encoded_material'] # Labels

In [5]:
# Feature Scaling
#scaler = MinMaxScaler()
#X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [6]:
no_of_dt_in_forrest = [5, 10, 20, 30, 50, 100, 200, 500]

In [7]:
def trainer(no_of_dt_in_forrest, X, y):
    # Randomly split dataset into training and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=22) # 70% training and 30% test
    
    # Create an empty list to store results
    results = []
    for trees in no_of_dt_in_forrest:
        # Create a pipeline object for the model
        # Create a pipeline object for our model
        pipe_RF = make_pipeline(StandardScaler(),
                        RandomForestClassifier(n_estimators=trees,   # no. of decision trees in the forest
                                               verbose=0)
                        )
        
        # Measure training time
        start_train = time.time()
        pipe_RF.fit(X_train, y_train)
        end_train = time.time()
        train_time_per_sample = (end_train - start_train) / len(X_train)

        # Measure test time
        start_test = time.time()
        y_pred = pipe_RF.predict(X_test)
        end_test = time.time()
        test_time_per_sample = (end_test - start_test) / len(X_test)
        
        # Evaluate the pipeline and store the results
        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred, average="macro")
        recall = metrics.recall_score(y_test, y_pred, average="macro")
        f1 = metrics.recall_score(y_test, y_pred, average="macro")


        results.append({
            'number_of_trees': trees,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'train_time_per_sample': train_time_per_sample,
            'test_time_per_sample': test_time_per_sample 
        })
    return results

In [8]:
results = []
for x in tqdm(np.arange(10)): # should be 10
    r = trainer(no_of_dt_in_forrest, X, y)
    results.append(r)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [13:16<00:00, 79.66s/it]


In [16]:
# Flatten the list of lists
flattened_results = [item for sublist in results for item in sublist]

# Convert to DataFrame
flattend_results_df = pd.DataFrame(flattened_results)

In [51]:
mean_df = flattend_results_df.groupby('number_of_trees').mean().reset_index()
mean_df

Unnamed: 0,number_of_trees,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,5,0.874665,0.875112,0.874588,0.874588,1.1e-05,1e-06
1,10,0.894927,0.894952,0.894858,0.894858,2e-05,2e-06
2,20,0.90383,0.903569,0.903795,0.903795,4.1e-05,4e-06
3,30,0.907822,0.907486,0.907806,0.907806,6e-05,5e-06
4,50,0.911416,0.911005,0.911406,0.911406,0.000101,9e-06
5,100,0.913706,0.913299,0.913693,0.913693,0.000201,1.7e-05
6,200,0.914977,0.91461,0.914963,0.914963,0.0004,3.4e-05
7,500,0.915145,0.914832,0.91512,0.91512,0.001003,9e-05


In [52]:
# Round the columns to the 4th decimal place
mean_df['accuracy'] = mean_df['accuracy'].round(2)
mean_df['precision'] = mean_df['precision'].round(2)
mean_df['recall'] = mean_df['recall'].round(2)
mean_df['f1'] = mean_df['f1'].round(2)

# Scale from seconds to milliseconds
mean_df['train_time_per_sample'] = mean_df['train_time_per_sample'] * 1000000 # to microsec. 
mean_df['train_time_per_sample'] = mean_df['train_time_per_sample'].round(2)

mean_df['test_time_per_sample'] = mean_df['test_time_per_sample'] * 1000000 # to microsec.
mean_df['test_time_per_sample'] = mean_df['test_time_per_sample'].round(2)

mean_df

Unnamed: 0,number_of_trees,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,5,0.87,0.88,0.87,0.87,10.51,1.1
1,10,0.89,0.89,0.89,0.89,20.17,1.91
2,20,0.9,0.9,0.9,0.9,40.71,3.6
3,30,0.91,0.91,0.91,0.91,60.35,5.22
4,50,0.91,0.91,0.91,0.91,101.06,8.8
5,100,0.91,0.91,0.91,0.91,200.86,17.3
6,200,0.91,0.91,0.91,0.91,399.93,34.06
7,500,0.92,0.91,0.92,0.92,1002.92,89.63


In [57]:
std_df = flattend_results_df.groupby('number_of_trees').std().reset_index()
std_df

Unnamed: 0,number_of_trees,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,5,0.004958,0.004891,0.004978,0.004978,6.294522e-07,9.004239e-08
1,10,0.002978,0.002967,0.00298,0.00298,5.56541e-07,7.702969e-08
2,20,0.00228,0.002377,0.002263,0.002263,1.191206e-06,2.314203e-08
3,30,0.002122,0.002134,0.002128,0.002128,1.422213e-06,1.950308e-07
4,50,0.001645,0.001624,0.001646,0.001646,2.077765e-06,1.429429e-07
5,100,0.001152,0.001168,0.001149,0.001149,3.217492e-06,2.585078e-07
6,200,0.000895,0.000907,0.000901,0.000901,3.991134e-06,7.279317e-07
7,500,0.000473,0.000485,0.000467,0.000467,1.607497e-05,6.375441e-06


In [58]:
# Round the columns to the 4th decimal place
std_df['accuracy'] = std_df['accuracy'].round(4)
std_df['precision'] = std_df['precision'].round(4)
std_df['recall'] = std_df['recall'].round(4)
std_df['f1'] = std_df['f1'].round(4)

# Scale from seconds to milliseconds
std_df['train_time_per_sample'] = std_df['train_time_per_sample'] * 1000000 # to microsec. 
std_df['train_time_per_sample'] = std_df['train_time_per_sample'].round(2)

std_df['test_time_per_sample'] = std_df['test_time_per_sample'] * 1000000 # to microsec.
std_df['test_time_per_sample'] = std_df['test_time_per_sample'].round(2)

std_df

Unnamed: 0,number_of_trees,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,5,0.005,0.0049,0.005,0.005,0.63,0.09
1,10,0.003,0.003,0.003,0.003,0.56,0.08
2,20,0.0023,0.0024,0.0023,0.0023,1.19,0.02
3,30,0.0021,0.0021,0.0021,0.0021,1.42,0.2
4,50,0.0016,0.0016,0.0016,0.0016,2.08,0.14
5,100,0.0012,0.0012,0.0011,0.0011,3.22,0.26
6,200,0.0009,0.0009,0.0009,0.0009,3.99,0.73
7,500,0.0005,0.0005,0.0005,0.0005,16.07,6.38


In [40]:
markdown_table = mean_df.to_markdown(index=False)
markdown_table

'|   number_of_trees |   accuracy |   precision |   recall |       f1 |   train_time_per_sample |   test_time_per_sample |\n|------------------:|-----------:|------------:|---------:|---------:|------------------------:|-----------------------:|\n|                 5 |   0.874665 |    0.875112 | 0.874588 | 0.874588 |             1.0512e-05  |            1.09501e-06 |\n|                10 |   0.894927 |    0.894952 | 0.894858 | 0.894858 |             2.0173e-05  |            1.91371e-06 |\n|                20 |   0.90383  |    0.903569 | 0.903795 | 0.903795 |             4.07087e-05 |            3.60219e-06 |\n|                30 |   0.907822 |    0.907486 | 0.907806 | 0.907806 |             6.035e-05   |            5.22043e-06 |\n|                50 |   0.911416 |    0.911005 | 0.911406 | 0.911406 |             0.000101059 |            8.79969e-06 |\n|               100 |   0.913706 |    0.913299 | 0.913693 | 0.913693 |             0.000200865 |            1.73036e-05 |\n|              

|   number_of_trees |   accuracy |   precision |   recall |       f1 |   train_time_per_sample |   test_time_per_sample |
|------------------:|-----------:|------------:|---------:|---------:|------------------------:|-----------------------:|
|                 5 |   0.8719   |    0.873856 | 0.87239  | 0.87239  |             1.28354e-05 |            1.12619e-06 |
|                10 |   0.901237 |    0.902022 | 0.90161  | 0.90161  |             2.03947e-05 |            1.96707e-06 |
|                20 |   0.904597 |    0.905448 | 0.904936 | 0.904936 |             4.03076e-05 |            3.68428e-06 |
|                30 |   0.906892 |    0.907208 | 0.907242 | 0.907242 |             6.0645e-05  |            5.43059e-06 |
|                50 |   0.910475 |    0.910688 | 0.910839 | 0.910839 |             0.000100613 |            8.82269e-06 |
|               100 |   0.913779 |    0.914085 | 0.91412  | 0.91412  |             0.000201685 |            1.75537e-05 |
|               200 |   0.913779 |    0.914384 | 0.914084 | 0.914084 |             0.000403243 |            3.5128e-05  |
|               500 |   0.913779 |    0.91414  | 0.914113 | 0.914113 |             0.000999109 |            8.805e-05   |

In [59]:
# Convert DataFrame to LaTeX table
latex_table = mean_df.to_latex(index=False)
print(latex_table)

\begin{tabular}{rrrrrrr}
\toprule
 number\_of\_trees &  accuracy &  precision &  recall &   f1 &  train\_time\_per\_sample &  test\_time\_per\_sample \\
\midrule
               5 &      0.87 &       0.88 &    0.87 & 0.87 &                  10.51 &                  1.10 \\
              10 &      0.89 &       0.89 &    0.89 & 0.89 &                  20.17 &                  1.91 \\
              20 &      0.90 &       0.90 &    0.90 & 0.90 &                  40.71 &                  3.60 \\
              30 &      0.91 &       0.91 &    0.91 & 0.91 &                  60.35 &                  5.22 \\
              50 &      0.91 &       0.91 &    0.91 & 0.91 &                 101.06 &                  8.80 \\
             100 &      0.91 &       0.91 &    0.91 & 0.91 &                 200.86 &                 17.30 \\
             200 &      0.91 &       0.91 &    0.91 & 0.91 &                 399.93 &                 34.06 \\
             500 &      0.92 &       0.91 &    0.92 & 0.92 & 

  latex_table = mean_df.to_latex(index=False)
