# Installing the Libraries

In [None]:
!pip install xgboost onnxmltools skl2onnx onnxruntime 



# Importing the libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

import onnxmltools
from onnxmltools.convert.common.data_types import FloatTensorType
import onnxruntime as ort
import pickle

# Data reading and Visualization

In [None]:
# Read the raw CSV into memory, then report its dimensions and column labels.
df = pd.read_csv('data_public.csv')
print(
    f"Initial dataset shape: {df.shape}",
    f"Columns: {df.columns}",
    sep="\n",
)

Initial dataset shape: (1200000, 16)
Columns: Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N',
       'O', 'Class'],
      dtype='object')


In [None]:
# Preview the first ten rows to sanity-check the parsed values.
df.head(10)

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,Class
0,231.420023,-12.210984,217.624839,-15.611916,140.047185,76.904999,131.591871,198.160805,82.873279,127.350084,224.592926,-5.992983,-14.689648,143.072058,153.439659,3
1,-38.01927,-14.195695,9.583547,22.293822,-25.578283,-18.373955,-0.094457,-33.711852,-8.356041,23.792402,4.199023,2.809159,-59.330681,-11.68595,1.317104,2
2,-39.197085,-20.41885,21.023083,19.79028,-25.902587,-19.189004,-2.953836,-25.299219,-6.612401,26.285392,5.911292,6.191587,-56.924996,-4.675187,-1.02783,2
3,221.630408,-5.785352,216.725322,-9.900781,126.795177,85.122288,108.857593,197.640135,82.560019,157.105143,212.989231,-3.62107,-15.469156,135.265859,149.212489,3
4,228.558412,-12.44771,204.637218,-13.277704,138.930529,91.10187,115.598954,209.300011,89.961688,130.299732,201.7951,-1.573922,-15.128603,148.368622,147.492663,3
5,235.027198,-16.081132,213.391582,-12.934912,122.413766,80.22254,125.240412,185.694965,66.072251,142.555417,213.442083,0.728989,-26.046291,137.529415,143.348783,3
6,-35.819795,-16.688245,5.738227,17.570011,-31.523595,-20.625764,0.077354,-28.944922,-10.699215,26.982097,2.958726,-2.365574,-56.082634,-6.918042,-0.996789,2
7,-28.620633,-16.324678,6.614499,19.866385,-23.119998,-22.328572,1.477065,-26.383605,-8.616671,28.874319,1.743714,4.953251,-55.702022,-8.014284,2.449081,3
8,-41.092898,-11.525839,12.02701,18.670988,-19.612979,-25.918632,5.266337,-25.972718,-13.37121,25.709532,-1.515002,-1.041023,-54.683633,-12.382565,5.424004,2
9,222.645628,-10.907163,215.098599,-15.552786,126.227053,85.659915,106.932482,195.793702,79.786718,138.150287,223.018301,-11.896866,-15.113817,146.038066,140.611871,1


In [None]:
# Summarize column dtypes, non-null counts, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200000 entries, 0 to 1199999
Data columns (total 16 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   A       1200000 non-null  float64
 1   B       1200000 non-null  float64
 2   C       1200000 non-null  float64
 3   D       1200000 non-null  float64
 4   E       1200000 non-null  float64
 5   F       1200000 non-null  float64
 6   G       1200000 non-null  float64
 7   H       1200000 non-null  float64
 8   I       1200000 non-null  float64
 9   J       1200000 non-null  float64
 10  K       1200000 non-null  float64
 11  L       1200000 non-null  float64
 12  M       1200000 non-null  float64
 13  N       1200000 non-null  float64
 14  O       1200000 non-null  float64
 15  Class   1200000 non-null  int64  
dtypes: float64(15), int64(1)
memory usage: 146.5 MB


In [None]:
# Count rows per target label to see how balanced the classes are.
df['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
3,569521
2,449885
1,180594


# Splitting the dependent and independent variables

In [None]:
# Features are every column except the last; the target is the last column.
# This is a positional split, so it relies on the label column being last.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [None]:
# Re-encode the target labels as consecutive integers starting at 0, assigned
# in ascending order of the original label values (presumably because the
# classifier expects zero-based class ids — confirm against the XGBoost version in use).
unique_classes = sorted(y.unique())
class_mapping = dict(zip(unique_classes, range(len(unique_classes))))
y_mapped = y.map(class_mapping)

# Data Pre-processing


In [None]:
def remove_outliers(X, y, threshold=1.5):
    """Drop every row in which at least one feature falls outside the IQR fences.

    For each column of ``X`` the fences are ``Q1 - threshold * IQR`` and
    ``Q3 + threshold * IQR``, where ``Q1``/``Q3`` are the 25th/75th percentiles
    of that column and ``IQR = Q3 - Q1``. A row is discarded as soon as any of
    its values is strictly below the lower fence or strictly above the upper one.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; quantiles are computed column-wise.
    y : pandas.Series
        Targets, indexed by the resulting boolean row mask of ``X``.
    threshold : float, default 1.5
        Multiplier applied to the IQR when building the fences.

    Returns
    -------
    tuple
        ``(X_cleaned, y_cleaned)`` containing only the retained rows, each with
        a fresh 0..n-1 index.
    """
    lower_quartile = X.quantile(0.25)
    upper_quartile = X.quantile(0.75)
    spread = upper_quartile - lower_quartile

    lower_fence = lower_quartile - threshold * spread
    upper_fence = upper_quartile + threshold * spread

    # Cell-wise outlier flags; comparisons against NaN are False, so missing
    # values never mark a row as an outlier.
    too_low = X < lower_fence
    too_high = X > upper_fence
    keep = ~(too_low | too_high).any(axis=1)

    return X[keep].reset_index(drop=True), y[keep].reset_index(drop=True)

In [None]:
# Filter outlier rows, then report how many rows survive.
# NOTE(review): the IQR fences are computed on the full dataset before the
# train/test split, so held-out rows both influence and are subject to the
# filter — consider deriving the fences from the training split only.
X_cleaned, y_cleaned = remove_outliers(X, y_mapped)
print(
    f"Size before outlier removal: {len(df)}",
    f"Size after outlier removal: {len(X_cleaned)}",
    sep="\n",
)

Size before outlier removal: 1200000
Size after outlier removal: 999473


# Splitting the train and test data

In [None]:
# Hold out 20% of the cleaned rows for evaluation. The classes are imbalanced,
# so stratify on the target to keep class proportions approximately equal in
# the train and test splits; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_cleaned,
    y_cleaned,
    test_size=0.2,
    random_state=42,
    stratify=y_cleaned,
)

# Pipelining with StandardScaler

In [None]:
# Chain feature scaling and the XGBoost classifier into one estimator so the
# scaler is fitted only on whatever data is passed to `pipeline.fit`.
pipeline = Pipeline([
    # Step 1: standardize the features.
    ('scaler', StandardScaler()),
    # Step 2: gradient-boosted tree classifier with a fixed seed.
    ('classifier', XGBClassifier(
        n_estimators=50,
        max_depth=5,
        learning_rate=0.1,
        random_state=42
    ))
])

# Model fitting using pipeline  

In [None]:
# Fit the scaler and then the classifier on the training split.
pipeline.fit(X_train, y_train)

# Saving scaler in pkl format

In [None]:
# Persist the fitted scaler on its own so the same standardization can be
# re-applied to inputs outside this notebook.
# NOTE: only unpickle this file from a trusted source — loading a pickle can
# execute arbitrary code.
scaler = pipeline.named_steps['scaler']
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

# Accuracy and Predictions

In [None]:
# Score the held-out split: overall accuracy plus a per-class report.
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("XGBoost Accuracy:", accuracy)
# zero_division=0 makes undefined precision/recall values show up as 0.
print(
    "\nClassification Report:",
    classification_report(y_test, y_pred, zero_division=0),
    sep="\n",
)

XGBoost Accuracy: 0.770104304759999

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     15984
           1       0.75      1.00      0.86     89899
           2       0.80      0.68      0.74     94012

    accuracy                           0.77    199895
   macro avg       0.52      0.56      0.53    199895
weighted avg       0.71      0.77      0.73    199895



# Saving the model in ONNX

In [None]:
# Declare one float input tensor named "input": variable batch size, one
# column per feature.
initial_types = [("input", FloatTensorType([None, X_test.shape[1]]))]

# Convert only the fitted XGBoost step of the pipeline. The scaler step is not
# passed to the converter, so inputs must be scaled separately before inference.
onnx_model = onnxmltools.convert_xgboost(
    pipeline.named_steps['classifier'],
    name="XGBoostClassifier",
    initial_types=initial_types
)

In [None]:
# Serialize the converted model and write the bytes to disk.
onnx_file_path = "xgboost_model.onnx"
with open(onnx_file_path, "wb") as f:
    f.write(onnx_model.SerializeToString())

# Confirm where the file was written.
print(f"ONNX model saved to {onnx_file_path}")

ONNX model saved to xgboost_model.onnx


# Note

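Please download the scaler.pkl file to test the ONNX model: the exported model contains only the XGBoost classifier, so inputs must first be standardized with the saved scaler.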