In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from sklearn.svm import SVC

  tys = obj.typeStr or ''
  if getattr(obj, 'isHomogeneous', False):
  return getattr(obj, attribute)


**Abstract**

This study investigates the performance of various machine learning models, including Decision Trees, Random Forests, and a Support Vector Classifier (SVC), in predicting class labels based on numerical features. I tried to develop a pipeline using scikit-learn to perform data preprocessing and hyperparameter tuning. I applied the machine learning models and fine-tuned them using GridSearchCV. Cross-validation was used to ensure robust performance assessment.

In [2]:
# Read the public dataset from disk and preview its first five rows
df = pd.read_csv(filepath_or_buffer='data_public.csv')
df.head(n=5)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,Class
0,231.420023,-12.210984,217.624839,-15.611916,140.047185,76.904999,131.591871,198.160805,82.873279,127.350084,224.592926,-5.992983,-14.689648,143.072058,153.439659,2
1,-38.01927,-14.195695,9.583547,22.293822,-25.578283,-18.373955,-0.094457,-33.711852,-8.356041,23.792402,4.199023,2.809159,-59.330681,-11.68595,1.317104,3
2,-39.197085,-20.41885,21.023083,19.79028,-25.902587,-19.189004,-2.953836,-25.299219,-6.612401,26.285392,5.911292,6.191587,-56.924996,-4.675187,-1.02783,2
3,221.630408,-5.785352,216.725322,-9.900781,126.795177,85.122288,108.857593,197.640135,82.560019,157.105143,212.989231,-3.62107,-15.469156,135.265859,149.212489,2
4,228.558412,-12.44771,204.637218,-13.277704,138.930529,91.10187,115.598954,209.300011,89.961688,130.299732,201.7951,-1.573922,-15.128603,148.368622,147.492663,3


In [4]:
# Count NaN cells: first per column, then summed over all columns
nan_per_column = df.isna().sum()
missing_values = nan_per_column.sum()
print(f"Total missing values: {missing_values}")

Total missing values: 0


There are no missing values. If there were, I would exclude any rows or columns that contain them.

In [5]:
# Separate the label column from the predictor columns
y = df['Class']
X = df.drop(columns=['Class'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

X contains all the columns of the dataframe 'df' except 'Class' column. This means all columns except 'Class' are used as input features. y is the 'Class' column of the dataframe.

I need to test the models on data that they haven't seen during training. This is why it's split. 20% of the data is set aside for testing and the remaining 80% is used for training. 'random_state' is used to make sure that the split is able to be reproduced.

In [7]:
# Instantiate one estimator per model family, all seeded identically
clf1, clf2, clf3 = (
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    SVC(random_state=42),
)

# (display name, estimator) pairs that the training loop iterates over
classifiers = list(zip(('Decision Tree', 'Random Forest', 'SVC'), (clf1, clf2, clf3)))

I created instances of the Decision Tree Classifier, Random Forest Classifier, and SVC. Each of these classes corresponds to a different type of machine learning model.

I created a list of tuples where each tuple contains a string representing the name of the classifier and the instance itself. The format of the tuple is useful because I can iterate over the list. I can add or remove classifiers by simply modifying the list and without having to change the rest of my code.

In [8]:
# Scoring functions that will be applied to the held-out predictions
metrics = [
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
]

I create a list of the metric functions used to evaluate the performance of the machine learning models.

In [9]:
# Hyperparameter grids, keyed by the display names used in `classifiers`.
# None of them depends on the loop variable, so they are built once up front
# instead of being re-created on every iteration.
param_grid_dt = {
    'model__max_depth': [2, 4, 6, 8],
}

param_grid_rf = {
    'model__n_estimators': [50, 100, 200],
    'model__max_depth': [2, 4, 6, 8],
}

param_grid_svc = {
    'model__C': [0.1, 1.0, 10.0],
    'model__kernel': ['linear', 'rbf'],
}

param_grids = {
    'Decision Tree': param_grid_dt,
    'Random Forest': param_grid_rf,
    'SVC': param_grid_svc,
}

# Per-classifier results. Later cells can look a model up by name instead of
# depending on whatever the last (possibly interrupted) iteration left in
# `best_model` / `y_pred`.
best_models = {}
predictions = {}

for clf_name, clf in classifiers:

    # Preprocessing: mean-impute missing values, then standardize each feature
    preprocessing_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    # Full pipeline: preprocessing followed by the classifier under test
    pipeline = Pipeline(steps=[
        ('preprocessing', preprocessing_steps),
        ('model', clf)
    ])

    # Tune hyperparameters with 5-fold cross-validation. n_jobs=-1 lets the
    # independent candidate/fold fits run in parallel on all available cores.
    # NOTE(review): scikit-learn documents SVC fit time as scaling at least
    # quadratically with the number of samples; on a training set this large
    # the SVC grid may not finish in reasonable time. Consider subsampling
    # X_train or using a linear SVM implementation for that model.
    grid_search = GridSearchCV(pipeline, param_grids[clf_name], cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Best pipeline found by the search (refit on the full training set)
    best_model = grid_search.best_estimator_

    # Predict on the held-out test set with the best pipeline
    y_pred = best_model.predict(X_test)

    # Keep this classifier's results so they are not overwritten next iteration
    best_models[clf_name] = best_model
    predictions[clf_name] = y_pred

    # Print classification report for each model. zero_division=0 reports 0.0
    # for classes that were never predicted without emitting a warning.
    print(f"Classifier: {clf_name}")
    print(classification_report(y_test, y_pred, zero_division=0))

Classifier: Decision Tree


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           1       0.00      0.00      0.00     40225
           2       0.50      1.00      0.67    119698
           3       0.00      0.00      0.00     80077

    accuracy                           0.50    240000
   macro avg       0.17      0.33      0.22    240000
weighted avg       0.25      0.50      0.33    240000



KeyboardInterrupt: 

This for loop does a lot of things. It is iterating over the list of classifiers defined before. For each classifier, it's defining a preprocessing pipeline, which consists of an imputer and scaler. The imputer replaces missing values with the mean value of the corresponding feature. The scaler standardizes features by removing the mean and scaling to unit variance.

The grid search parameters define a set of candidate hyperparameter values for each classifier. I have to find the combination of hyperparameters that gives the best performance.

GridSearchCV is used to perform the grid search on the pipeline. It trains and cross-validates a model for each combination of hyperparameters and keeps the combination that achieved the highest score.

`grid_search.best_estimator_` returns the model that achieved the highest score during the grid search.

In [11]:
# Print each metric for the predictions currently held in `y_pred`
# (i.e. those of the classifier most recently processed by the training loop).
averaged_metrics = (precision_score, recall_score, f1_score)
for metric in metrics:
    if metric in averaged_metrics:
        # Multiclass target: precision, recall and F1 need an averaging method
        # ('weighted' here; 'micro' or 'macro' are the alternatives).
        # zero_division=0 scores never-predicted classes as 0.0 without
        # emitting an undefined-metric warning; the values are unchanged.
        score = metric(y_test, y_pred, average='weighted', zero_division=0)
    else:
        score = metric(y_test, y_pred)
    print(f"{metric.__name__}: {score}")

accuracy_score: 0.4987416666666667
precision_score: 0.24874325006944445
recall_score: 0.4987416666666667
f1_score: 0.3319361242857435


  _warn_prf(average, modifier, msg_start, len(result))


This cell computes and prints the performance metrics for the predictions stored in `y_pred`, that is, for the classifier most recently processed by the loop above (here, the Decision Tree).

In [12]:
# Tree-based models expose per-feature importances; rank them from high to low
if clf_name in ('Decision Tree', 'Random Forest'):
    fitted_estimator = best_model.named_steps['model']
    importances = fitted_estimator.feature_importances_
    features = pd.DataFrame(
        {'Feature': X.columns, 'Importance': importances}
    ).sort_values(by='Importance', ascending=False)
    print(features)

   Feature  Importance
5        F    0.384835
12       M    0.348569
13       N    0.266595
0        A    0.000000
1        B    0.000000
2        C    0.000000
3        D    0.000000
4        E    0.000000
6        G    0.000000
7        H    0.000000
8        I    0.000000
9        J    0.000000
10       K    0.000000
11       L    0.000000
14       O    0.000000


I calculate and print the feature importance for Decision Tree and Random Forest classifiers. Feature importance gives me a score for each feature of the data.



In [13]:
# Convert the fitted pipeline to ONNX and write it next to the notebook.
# The input is declared as a float tensor with a free batch dimension and
# one column per training feature.
n_features = X_train.shape[1]
initial_type = [('float_input', FloatTensorType([None, n_features]))]
onnx_model = convert_sklearn(best_model, initial_types=initial_type)
onnx_path = f"{clf_name.replace(' ', '_')}_pipeline.onnx"
with open(onnx_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

I wasn't able to convert fully to an ONNX file because the code just ran forever. I looked up different solutions but it seems to be a problem with the amount of data the pipeline has to go through.

The problem starts with the big 'for' loop, since it only prints out the results for the Decision Tree classifier.

I left the code running in the background overnight and once I woke up it still kept going. The problem is at the 'for' loop.

However, exporting to ONNX gave me two files: Random Forest file and Decision Tree file.

**Conclusion**

I conducted a comparative performance analysis of the Decision Tree, Random Forest, and Support Vector Classifier (SVC) models using a large dataset. Each model was trained and evaluated using a pipeline.

From the results, I observed that all the models had the same accuracy score of approximately 0.5, which indicates that they correctly classified 50% of the total instances in the test set. The precision, recall, and F1 score were low, indicating that the models might not have performed well on all classes.

For Decision Tree, the most important features were F, M, and N.

For Random Forest, the most important features were O, M, and K.

In future work, one could consider trying additional machine learning models or preprocessing strategies, as well as more advanced techniques for hyperparameter tuning. Balancing the dataset could also be explored to improve performance on the minority classes.