In [1]:
# Import pandas and numpy for data manipulation
import pandas as pd
import numpy as np
import time

# Import modules from Scikit-learn
from sklearn.ensemble import RandomForestClassifier  # Import Random Forest Model
from sklearn.model_selection import train_test_split   # Import train_test_split function
from sklearn import metrics   # import metrics modules for accuracy calculation
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read data
PATH = "../../my_data/identification-dataset/my_custom_data/anblock-error-dataset.csv"
df = pd.read_csv(PATH)

# Drop uncomplete rows
df.dropna(inplace=True)

In [3]:
# Set training data
train_df = df.drop('material', axis=1)

# Extracted features 
X = train_df.drop('encoded_material', axis=1)
y = train_df['encoded_material'] # Labels

In [4]:
# Feature Scaling
#scaler = MinMaxScaler()
#X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [5]:
# Split dataset into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3) # 70% training and 30% test

In [6]:
no_of_dt_in_forrest = [5, 10, 20, 30, 50, 100, 200, 500]

In [7]:
# Create an empty list to store results
def trainer(no_of_dt_in_forrest):
    results = []
    for trees in no_of_dt_in_forrest:
        # Create a pipeline object for the model
        # Create a pipeline object for our model
        pipe_RF = make_pipeline(StandardScaler(),
                        RandomForestClassifier(n_estimators=trees,   # no. of decision trees in the forest
                                               verbose=1)
                        )
        
        # Measure training time
        start_train = time.time()
        pipe_RF.fit(X_train, y_train)
        end_train = time.time()
        train_time_per_sample = (end_train - start_train) / len(X_train)

        # Measure test time
        start_test = time.time()
        y_pred = pipe_RF.predict(X_test)
        end_test = time.time()
        test_time_per_sample = (end_test - start_test) / len(X_test)
        
        # Evaluate the pipeline and store the results
        accuracy = metrics.accuracy_score(y_test, y_pred)
        precision = metrics.precision_score(y_test, y_pred, average="macro")
        recall = metrics.recall_score(y_test, y_pred, average="macro")
        f1 = metrics.recall_score(y_test, y_pred, average="macro")


        results.append({
            'number_of_trees': trees,
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'train_time_per_sample': train_time_per_sample,
            'test_time_per_sample': test_time_per_sample 
        })
    return results

In [8]:
results = []
for x in np.arange(1): # should be 10
    r = trainer(no_of_dt_in_forrest)
    results.append(r)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.5s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    1.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

In [9]:
# show one of 10 samples
results_df = pd.DataFrame(results[0])
results_df

Unnamed: 0,number_of_trees,accuracy,precision,recall,f1,train_time_per_sample,test_time_per_sample
0,5,0.8719,0.873856,0.87239,0.87239,1.3e-05,1e-06
1,10,0.901237,0.902022,0.90161,0.90161,2e-05,2e-06
2,20,0.904597,0.905448,0.904936,0.904936,4e-05,4e-06
3,30,0.906892,0.907208,0.907242,0.907242,6.1e-05,5e-06
4,50,0.910475,0.910688,0.910839,0.910839,0.000101,9e-06
5,100,0.913779,0.914085,0.91412,0.91412,0.000202,1.8e-05
6,200,0.913779,0.914384,0.914084,0.914084,0.000403,3.5e-05
7,500,0.913779,0.91414,0.914113,0.914113,0.000999,8.8e-05
