In [2]:
!pip install skl2onnx

Collecting skl2onnx
  Downloading skl2onnx-1.8.0-py2.py3-none-any.whl (230 kB)
[K     |████████████████████████████████| 230 kB 2.3 MB/s eta 0:00:01
Collecting onnxconverter-common<1.9,>=1.6.1
  Downloading onnxconverter_common-1.8.1-py2.py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 2.6 MB/s eta 0:00:011
Collecting onnx>=1.2.1
  Downloading onnx-1.9.0-cp38-cp38-manylinux2010_x86_64.whl (12.2 MB)
[K     |████████████████████████████████| 12.2 MB 5.5 MB/s eta 0:00:01
Collecting typing-extensions>=3.6.2.1
  Using cached typing_extensions-3.10.0.0-py3-none-any.whl (26 kB)
Installing collected packages: typing-extensions, onnx, onnxconverter-common, skl2onnx
Successfully installed onnx-1.9.0 onnxconverter-common-1.8.1 skl2onnx-1.8.0 typing-extensions-3.10.0.0


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

In [5]:
def load_data():
    data_set = pd.read_csv("../../Datasets/Jan_2020_ontime.csv")

    features_columns = [
        "DAY_OF_MONTH",
        "DAY_OF_WEEK",
        "OP_UNIQUE_CARRIER",
        "OP_CARRIER_AIRLINE_ID",
        "OP_CARRIER",
        "TAIL_NUM",
        "OP_CARRIER_FL_NUM",
        "ORIGIN_AIRPORT_ID",
        "ORIGIN_AIRPORT_SEQ_ID",
        "ORIGIN",
        "DEST_AIRPORT_ID",
        "DEST_AIRPORT_SEQ_ID",
        "DEST",
        "DEP_TIME",
        "DEP_DEL15",
        "DEP_TIME_BLK",
        "DISTANCE",
    ]

    targets_columns = ["ARR_TIME"]

    data_set.replace("", float("NaN"), inplace=True)
    data_set.dropna(subset=features_columns + targets_columns, inplace=True)

    x = data_set[features_columns].values

    op_unique_labelEncoder = preprocessing.LabelEncoder()
    x[:, 2] = op_unique_labelEncoder.fit_transform(x[:, 2])
    op_carrier_labelEncoder = preprocessing.LabelEncoder()
    x[:, 4] = op_carrier_labelEncoder.fit_transform(x[:, 4])
    tail_num_labelEncoder = preprocessing.LabelEncoder()
    x[:, 5] = tail_num_labelEncoder.fit_transform(x[:, 5])
    origin_labelEncoder = preprocessing.LabelEncoder()
    x[:, 9] = origin_labelEncoder.fit_transform(x[:, 9])
    dest_labelEncoder = preprocessing.LabelEncoder()
    x[:, 12] = dest_labelEncoder.fit_transform(x[:, 12])
    dep_time_labelEncoder = preprocessing.LabelEncoder()
    x[:, 15] = dep_time_labelEncoder.fit_transform(x[:, 15])

    scaler = StandardScaler()
    x = scaler.fit_transform(x)

    y = np.ravel(data_set[targets_columns].values)

    print("-------------------------------------")
    print("X shape : ", x.shape)
    print("X samples : ", x[:5])
    print("-------------------------------------")
    print("y shape : ", y.shape)
    print("y samples : ", y[:5])
    print("-------------------------------------")

    X_trainset, X_testset, y_trainset, y_testset = train_test_split(x, y, test_size=0.3, random_state=42)

    return X_trainset, X_testset, y_trainset, y_testset

In [6]:
def test_decision_tree(X_trainset, X_testset, y_trainset, y_testset):
    model = DecisionTreeRegressor(criterion="mse")
    model.fit(X_trainset, y_trainset)
    predictions = model.predict(X_testset)
    mse_score = mean_squared_error(y_testset, predictions)
    r2_score = model.score(X_testset, y_testset)
    print("Decision Tree score (MSE) :", mse_score)
    print("Decision Tree score (R2) :", r2_score)
    return model

In [7]:
def test_random_forest(X_trainset, X_testset, y_trainset, y_testset):
    model = RandomForestRegressor(criterion="mse")
    model.fit(X_trainset, y_trainset)
    predictions = model.predict(X_testset)
    mse_score = mean_squared_error(y_testset, predictions)
    r2_score = model.score(X_testset, y_testset)
    print("Random Forest score (MSE) :", mse_score)
    print("Random Forest score (R2) :", r2_score)
    return model

In [8]:
def test_linear_regression(X_trainset, X_testset, y_trainset, y_testset):
    model = LinearRegression()
    model.fit(X_trainset, y_trainset)
    predictions = model.predict(X_testset)
    mse_score = mean_squared_error(y_testset, predictions)
    r2_score = model.score(X_testset, y_testset)
    print("Linear Regression score (MSE) :", mse_score)
    print("Linear Regression score (R2) :", r2_score)
    return model

In [9]:
X_trainset, X_testset, y_trainset, y_testset = load_data()

print("Results: ")
test_decision_tree(X_trainset, X_testset, y_trainset, y_testset)
rf_model = test_random_forest(X_trainset, X_testset, y_trainset, y_testset)
test_linear_regression(X_trainset, X_testset, y_trainset, y_testset)
print("-------------------------------------")

-------------------------------------
X shape :  (600271, 17)
X samples :  [[-1.66633536 -0.49556467 -0.65923795  0.99130754 -0.65923795 -0.43531104
   0.9764811   0.83576861  0.8357903   0.73722338 -0.44490263 -0.4448962
  -0.39904773 -0.66083924 -0.39698826 -0.62718763 -1.06359251]
 [-1.66633536 -0.49556467 -0.65923795  0.99130754 -0.65923795 -1.48783174
   0.97867688  1.78012723  1.78011772  1.75386264  0.83578902  0.83581072
   0.73721858 -0.61255758 -0.39698826 -0.62718763 -0.36520698]
 [-1.66633536 -0.49556467 -0.65923795  0.99130754 -0.65923795 -1.65590274
   0.98032372 -0.68045163 -0.68046527 -0.61829564  1.80761955  1.80762977
   1.82038832  1.03907761 -0.39698826  1.02630341 -0.28704217]
 [-1.66633536 -0.49556467 -0.65923795  0.99130754 -0.65923795 -1.55301517
   0.98087266 -1.22936008 -1.22935496 -1.25369517 -0.25538912 -0.25539549
  -0.12294563  1.03505413 -0.39698826  1.02630341 -0.92935295]
 [-1.66633536 -0.49556467 -0.65923795  0.99130754 -0.65923795 -0.11642347
   0.981

17

In [None]:
initial_type = [('float_input', FloatTensorType([None, X_trainset.shape[1]]))]
onx = convert_sklearn(rf_model, initial_types=initial_type)

with open("rf.onnx", "wb") as f:
    f.write(onx.SerializeToString())