# Boris Data Experiments

Data link: https://web.stanford.edu/~murmann/gmid

# Content

- Train different models using cross-validation on the Boris dataset.

- Build CSV file containing all the models with the measures of different metrics.

- Calculate the inference time (Atomic, Bulk, Throughput).

- Generate visualizations that compare the different models to build more intuition.

## Downloading the data then extracting it

`gdown` is the simplest solution to-date

In [2]:
# Install the `gdown` command-line downloader, then fetch the file with the
# given Google Drive id (presumably the zip archive the next cell extracts).
!pip install gdown
!gdown https://drive.google.com/uc?id=1eDrC3g6pXTQwkxYJMXKkCze7k4-3xo-q

In [3]:
# Create the target directory and quietly (-q) extract the archive into it.
!mkdir boris_data
!unzip -q gm_ID_starter_kit_v2.3.zip -d boris_data/


## Read Matlab files in Python

We will use SciPy (`scipy.io.loadmat`) for this task.

In [4]:
import scipy.io

# Load both MATLAB files in one pass. The list order ("nch" file first, then
# the "pch" file) is what the flattening cell relies on when indexing `trans`.
mat_paths = ('boris_data/180nch.mat', 'boris_data/180pch.mat')
trans = [scipy.io.loadmat(mat_path) for mat_path in mat_paths]
n_trans, p_trans = trans


In [5]:
import numpy as np
import pandas as pd

# Flatten the 4-D lookup tables of both devices into one long table with one
# row per (L, VGS, VDS, VSB, Type) grid point.
# The accumulator used to be called `all`, which shadowed the builtin.
rows = []

for dev_idx, type_ in enumerate(("nch", "pch")):
    # presumably loadmat wraps the MATLAB struct in nested arrays and [0][0]
    # unwraps the single record — TODO confirm against the .mat layout
    a = trans[dev_idx][type_][0][0]
    # The sweep axes are read by position counted from the end of the record.
    VGS = a[-6]
    VDS = a[-5]
    VSB = a[-4]
    L = a[-3]
    # The first 13 fields are the output tables, in the same order as the
    # output column names given to the DataFrame below.
    outputs = list(a)[:13]

    # Distinct index names per axis (the original reused `i` for two loops).
    for i, l in enumerate(L):
        for n, vgs in enumerate(VGS):
            for m, vds in enumerate(VDS):
                for v, vsb in enumerate(VSB):
                    this = [out[i, n, m, v] for out in outputs]
                    rows.append([l[0], vgs[0], vds[0], vsb[0], type_, *this])

df = pd.DataFrame(rows, columns=["L","VGS","VDS","VSB","Type","ID","VT","GM","GMB","GDS","CGG","CGS","CGD","CGB","CDD","CSS","STH","SFL"])
df

In [6]:
import os
# Make /kaggle/working the current directory so relative paths used below
# resolve there (presumably Kaggle's writable output directory).
os.chdir(r"/kaggle/working")

In [7]:
# Save the flattened table as CSV, without the index column, for later reloads.
df.to_csv("boris_data.csv", index=False)

In [8]:
from IPython.display import FileLink
# Display a link to the generated CSV in the notebook output.
FileLink(r'boris_data.csv')

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
import seaborn as sns
import random


# Reload the flattened table from disk and split it by device type.
data = pd.read_csv('boris_data.csv')
device_type = data["Type"]
nch_df = data[device_type == "nch"]
pch_df = data[device_type == "pch"]

# Work on a random 10% subsample of the "nch" rows. No random_state is passed,
# so the rows drawn differ from run to run.
randomData = nch_df.sample(frac=0.1)

randomData.head()

In [12]:
# Inputs are the four sweep columns; the regression target is the CGG column.
feature_columns = ["L", "VGS", "VDS", "VSB"]
X = randomData[feature_columns]
y = randomData['CGG']
X

In [13]:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import  DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.utils import all_estimators
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, max_error
from sklearn.base import clone
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
import math

import tensorflow as tf
from tensorflow import keras

In [14]:
# Hold out 20% of the sampled rows for testing; the fixed random_state keeps
# the split repeatable for a given X, y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)
X_train


# Define Models

In [15]:
# Shared seed for the estimators that accept `random_state`.
RS = 15
# Candidate regressors keyed by display name. Insertion order matters: the
# results table is indexed in this order and a later cell picks rows by position.
estimators = {
    
    "SVM" : SVR(),
    "LinearRegression":LinearRegression(),
    "KNN":KNeighborsRegressor(),
    "RandomForest" : RandomForestRegressor(random_state=RS),
    "DecisionTree":DecisionTreeRegressor(random_state=RS),
    "XGB":XGBRegressor(random_state=RS),
}


# Cross Validation

In [16]:
def cross_validation(model, X, y, CV, shuffle=True):
    """Score `model` with K-fold cross-validation, then refit it on all data.

    Parameters
    ----------
    model : estimator
        Scikit-learn style regressor. A fresh clone is fitted for every fold;
        `model` itself is fitted once at the end on all of `X`, `y`.
    X, y : array-like
        Features and targets. Rows are selected with integer index arrays
        (`X[idx]`), so NumPy arrays are expected rather than DataFrames.
    CV : int
        Number of folds.
    shuffle : bool, default=True
        Whether rows are shuffled before being split into folds.

    Returns
    -------
    tuple
        (mean RMSE, mean MAE, mean max error) over the folds, followed by the
        wall-clock time in seconds of the final refit on the full data.
    """
    # KFold raises a ValueError if random_state is set while shuffle=False,
    # so the seed is only passed when shuffling is requested.
    folds = KFold(n_splits=CV, shuffle=shuffle, random_state=42 if shuffle else None)
    RMSE, MAE, MAX = [], [], []

    for train_index, test_index in folds.split(X, y):
        fold_model = clone(model)
        X_train_folds, y_train_folds = X[train_index], y[train_index]
        X_test_fold, y_test_fold = X[test_index], y[test_index]

        fold_model.fit(X_train_folds, y_train_folds)
        y_pred_fold = fold_model.predict(X_test_fold)

        RMSE.append(np.sqrt(mean_squared_error(y_test_fold, y_pred_fold)))
        MAE.append(mean_absolute_error(y_test_fold, y_pred_fold))
        MAX.append(max_error(y_test_fold, y_pred_fold))

    # Refit the caller's model on the whole dataset and time that fit.
    # perf_counter is the clock intended for measuring short durations.
    start = time.perf_counter()
    model.fit(X, y)
    fit_time = time.perf_counter() - start

    return np.mean(RMSE), np.mean(MAE), np.mean(MAX), fit_time

# Append the result to a Dataframe

In [17]:
def cross_on_n_model(models, X, y, CV=10, shuffle=True, scaling=False):
    """Cross-validate every model in `models` and tabulate the scores.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator.
    X, y : array-like
        Features and targets, forwarded unchanged to `cross_validation`.
    CV : int, default=10
        Number of folds.
    shuffle : bool, default=True
        Whether rows are shuffled before splitting.
    scaling : bool, default=False
        If True, each estimator is wrapped in a Pipeline that applies
        `StandardScaler` first; the estimator step is named after the model.

    Returns
    -------
    pandas.DataFrame
        One row per model (indexed by model name, in `models` order) with the
        columns "RMSE", "MAE", "MAX" and "Fit Time (s)".

    Notes
    -----
    `cross_validation` refits the model it is given on all of `X`, `y`, so the
    estimators are fitted as a side effect. With `scaling=True` it is the
    wrapping Pipeline that is fitted (presumably fitting the wrapped estimator
    in place on scaled features — verify before reusing it on raw inputs).
    """
    scores = []
    for model_name, model in models.items():
        if scaling:
            model = Pipeline([('scaler', StandardScaler()), (model_name, model)])

        scores.append(cross_validation(model, X, y, CV, shuffle))
        # Progress message (replaces a leftover debug print of "ee").
        print(f"cross-validated {model_name}")

    return pd.DataFrame(
        scores,
        columns=["RMSE", "MAE", "MAX", "Fit Time (s)"],
        index=list(models),
    )

# The First result we obtain from Cross-Validation

In [18]:
# Convert to a NumPy array: cross_validation selects rows with integer index arrays.
y_train = y_train.to_numpy()

In [19]:
# Same conversion for the features, so X[train_index] selects rows by position.
X_train =X_train.to_numpy()

In [20]:
# 3-fold cross-validation of every estimator, each behind a StandardScaler step.
result_1 = cross_on_n_model(estimators, X_train, y_train, 3, scaling=True)
result_1

# MAE Error

In [21]:
# Bar chart of the cross-validated MAE per model; tick labels are rotated
# 90 degrees so the model names do not overlap.
ax = sns.barplot(x=result_1.index, y=result_1["MAE"])
ax.set_xticklabels(ax.get_xticklabels(), rotation = 90)
plt.title("MAE Error", fontsize=18)
plt.show()

# Fit time (s)

In [22]:
# Bar chart of the time each model took to refit on the full training data.
ax = sns.barplot(x=result_1.index, y=result_1["Fit Time (s)"])
ax.set_xticklabels(ax.get_xticklabels(),rotation = 90)
plt.title("Fit Time (s)", fontsize=18)
plt.show()

# Calculate the Inference Time Based on 3 different techniques

### 1- Atomic

Atomic time is estimated by predicting one instance at a time and taking the average over the whole test set.

### 2- Bulk

Bulk time is estimated by predicting the whole test set at once, repeating this process many times, and taking the average.

### 3- Throughput

Throughput measures how many instances the model can predict per second.

In [23]:
def atomic_time_estimator(estimators, X_test, verbose=False, measuring_unit = 10**3):
    """Measure the prediction latency of each estimator one instance at a time.

    Parameters
    ----------
    estimators : dict
        Maps a name to a fitted object exposing `predict`.
    X_test : numpy.ndarray
        2-D array; each row is predicted separately as a (1, n_features) batch.
    verbose : bool, default=False
        If True, print the min, median and max latency of every estimator.
    measuring_unit : number, default=10**3
        Factor applied to the elapsed seconds (10**3 gives milliseconds).

    Returns
    -------
    dict
        Maps each estimator name to an array with one latency per row of
        `X_test`, in seconds multiplied by `measuring_unit`.
    """
    n_instances = X_test.shape[0]
    runtimes = {}
    for estimator_name, estimator in estimators.items():
        timings = np.zeros(n_instances, dtype=float)
        for i in range(n_instances):
            instance = X_test[[i], :]  # list index keeps the row 2-D
            # perf_counter is monotonic and high resolution; time.time() can
            # be too coarse for sub-millisecond predictions.
            start = time.perf_counter()
            estimator.predict(instance)
            timings[i] = (time.perf_counter() - start) * measuring_unit
        runtimes[estimator_name] = timings
        # Skip the summary for empty input, where min/max are undefined.
        if verbose and n_instances:
            print(
                f"atomic runtimes {estimator_name}:",
                timings.min(),
                np.percentile(timings, 50),
                timings.max(),
            )
    return runtimes

In [24]:
# The timing helpers index rows NumPy-style (X_test[[i], :]), so convert the DataFrame.
X_test =X_test.to_numpy()

In [25]:
# Per-instance latency in milliseconds on the first 200 test rows, printing
# min / median / max for each estimator.
estimators_atomic_inference_time = atomic_time_estimator(estimators , X_test[:200], True, 10**3)

In [26]:
def bulk_time_estimator(estimators, X_test, n_bulk_repeats=50, verbose=False, measuring_unit = 10**3):
    """Measure per-instance prediction time when predicting all of `X_test` at once.

    Parameters
    ----------
    estimators : dict
        Maps a name to a fitted object exposing `predict`.
    X_test : numpy.ndarray
        Batch passed to `predict` in a single call.
    n_bulk_repeats : int, default=50
        How many times the whole-batch prediction is timed.
    verbose : bool, default=False
        If True, print the min, median and max per-instance time of every estimator.
    measuring_unit : number, default=10**3
        Factor applied to the elapsed seconds (10**3 gives milliseconds).

    Returns
    -------
    dict
        Maps each estimator name to an array of length `n_bulk_repeats`; every
        entry is one batch time divided by the number of rows in `X_test`.
    """
    n_instances = X_test.shape[0]
    runtimes = {}
    for estimator_name, estimator in estimators.items():
        timings = np.zeros(n_bulk_repeats, dtype=float)
        for i in range(n_bulk_repeats):
            # perf_counter is the clock intended for benchmarking short intervals.
            start = time.perf_counter()
            estimator.predict(X_test)
            timings[i] = (time.perf_counter() - start) * measuring_unit
        # Vectorized normalisation to a per-instance figure.
        timings = timings / float(n_instances)
        runtimes[estimator_name] = timings
        # Skip the summary when there are no repeats, where min/max are undefined.
        if verbose and timings.size:
            print(
                f"bulk runtimes {estimator_name}:",
                timings.min(),
                np.percentile(timings, 50),
                timings.max(),
            )
    return runtimes

In [27]:
# Whole-batch prediction of the first 200 test rows, repeated 50 times, with
# the per-estimator summary printed (default unit factor: milliseconds).
estimators_bulk_inference_time = bulk_time_estimator(estimators , X_test[:200], 50,True)

In [28]:
def boxplot_runtimes(estimators_runtimes, pred_type):
    """
    Plot a new `Figure` with one boxplot of prediction runtimes per estimator.

    Parameters
    ----------
    estimators_runtimes : dict
        Maps an estimator name to a sequence of per-instance latencies. The
        y-axis is labelled in milliseconds, so values are expected in ms.
    pred_type : str
        'bulk' or 'atomic'; capitalized and appended to the plot title.

    Nothing is returned and the figure is not shown; the caller decides when
    to call `plt.show()`.
    """
    fig, ax1 = plt.subplots(figsize=(10, 6))
    bp = plt.boxplot(estimators_runtimes.values())

    # One tick label per estimator, in the same order as the boxes.
    cls_infos = ["%s" % estimator_name for estimator_name in estimators_runtimes]
    plt.setp(ax1, xticklabels=cls_infos)
    plt.xticks(rotation=45)

    # Black boxes and whiskers, red "+" markers for outliers.
    plt.setp(bp["boxes"], color="black")
    plt.setp(bp["whiskers"], color="black")
    plt.setp(bp["fliers"], color="red", marker="+")

    # Light horizontal grid drawn underneath the boxes.
    ax1.yaxis.grid(True, linestyle="-", which="major", color="lightgrey", alpha=0.5)
    ax1.set_axisbelow(True)

    ax1.set_title("Prediction Time per Instance - %s" % pred_type.capitalize())
    ax1.set_ylabel("Prediction Time (ms)")


In [29]:
def benchmark_throughputs(estimators, X, duration_secs=0.1, verbose=True):
    """Benchmark single-instance prediction throughput of each estimator.

    Parameters
    ----------
    estimators : dict
        Maps a name to a fitted object exposing `predict`.
    X : numpy.ndarray
        2-D array; only its first row is used, reshaped to (1, n_features).
    duration_secs : float, default=0.1
        Minimum time to keep predicting for each estimator.
    verbose : bool, default=True
        If True, print the throughput of every estimator.

    Returns
    -------
    dict
        Maps each estimator name to its throughput in predictions per second.
    """
    throughputs = {}
    instance = X[0][None, :]
    for estimator_name, estimator in estimators.items():
        n_predictions = 0
        elapsed = 0.0
        start_time = time.perf_counter()
        while elapsed < duration_secs:
            estimator.predict(instance)
            n_predictions += 1
            elapsed = time.perf_counter() - start_time
        # Divide by the time actually spent, not the nominal duration: the
        # last prediction always overshoots, and a model whose single
        # prediction is slower than `duration_secs` would otherwise be
        # reported as 1 / duration_secs.
        throughputs[estimator_name] = n_predictions / elapsed if elapsed > 0 else 0.0

        if verbose:
            print(
                f"Throughputs {estimator_name}:",
                throughputs[estimator_name]
            )
    return throughputs

In [30]:
# Predictions per second for each estimator, using the first test row.
throughputs = benchmark_throughputs(estimators, X_test)

In [31]:
# Add the inference measurements to the results table. The values are
# assigned by position, so this relies on the timing dicts and `result_1`
# sharing the iteration order of `estimators`.
result_1["Throughputs (ins/s)"] = throughputs.values()
# Median (50th percentile) latency per estimator across its measurements.
result_1["Atomic_time (ms)"] = np.percentile(list(estimators_atomic_inference_time.values()), 50, axis=1)
result_1["Bulk_time (ms)"] = np.percentile(list(estimators_bulk_inference_time.values()), 50, axis=1)
result_1

In [32]:
# Boxplot of the atomic latencies. NOTE(review): no estimator defined in this
# notebook is named "Small_Neural_network", so the filter currently drops nothing.
boxplot_runtimes({estimator_name: time for estimator_name, time in estimators_atomic_inference_time.items()
                  if estimator_name !="Small_Neural_network"}, "atomic")
plt.show()

In [33]:
# Boxplot of the bulk latencies for every estimator except "SVM", which is
# plotted on its own in the next cell; the y-axis is clamped to [-0.1, 0.6].
boxplot_runtimes({estimator_name: time for estimator_name, time in estimators_bulk_inference_time.items()
                  if estimator_name !="SVM"}, "bulk")
plt.ylim([-0.1, 0.6])
plt.show()

In [34]:
# Bulk latencies of the SVM alone, on an automatically scaled y-axis.
boxplot_runtimes({"SVM": estimators_bulk_inference_time["SVM"]}, "bulk")
plt.show()

# Throughputs

In [35]:
# Throughput bars for rows 1, 2 and 4 of the results table only; given the
# order of `estimators` these are LinearRegression, KNN and DecisionTree.
sns.barplot(x=result_1.iloc[[1, 2, 4]].index, y=result_1.iloc[[1, 2, 4]]["Throughputs (ins/s)"])
plt.title("Throughputs", fontsize=18)
plt.show()

In [36]:
# Fit a random forest directly on X, y (no scaling step) for the batch-size
# timing experiment below.
model = RandomForestRegressor(random_state=RS)
model.fit(X,y)

In [40]:
# Display the feature table the forest was fitted on.
X

In [44]:
# Display the fitted estimator's representation.
model

In [41]:
import time

# Average wall-clock time of `model.predict` over 10 repeats for each batch
# size. Slicing clamps at len(X), so every size larger than the number of
# rows in X times the same full-table prediction.
inter_time = {}
for batch_size in [10, 100, 1000, 10000, 100000]:
    batch = X[:batch_size]  # sliced once, outside the timed region
    t = []
    for _ in range(10):
        # perf_counter is the clock intended for benchmarking short intervals.
        start = time.perf_counter()
        model.predict(batch)
        t.append(time.perf_counter() - start)
    inter_time[batch_size] = np.mean(t)

In [42]:
# Mean prediction time in seconds, keyed by requested batch size.
inter_time

# Memory crash

In [None]:
# WARNING: deliberately unbounded loop. Every iteration appends another
# 10**6-character string and the loop has no exit condition, so memory use
# grows until the process fails. Do not run this cell unless that is the intent.
man = []
while True:
    man.append(" " * 10**6)

# Tuning Random Forest Model

In [45]:
import pandas as pd
import numpy as np

In [46]:
# Reload the complete table (both device types) and preview the first rows.
data = pd.read_csv("boris_data.csv")
data.head()

In [47]:
# (rows, columns) of the full table.
data.shape

# Split the data.

In [49]:
# Deterministic split: rows at positions 1, 4, 7, ... go to the test set and
# all remaining rows to the training set.
data.index = range(len(data))
idx_test = range(1, len(data), 3)
# Built in ascending order. The original took a set difference, whose
# iteration order is not guaranteed and which needed two N-element sets.
idx_train = [i for i in range(len(data)) if i % 3 != 1]
data_train = data.iloc[idx_train]
data_test = data.iloc[idx_test]

In [50]:
# (rows, columns) of the training split.
data_train.shape

In [51]:
# Training inputs are the four sweep columns plus the device type; the target is ID.
X_train = data_train[["L","VGS","VDS","VSB","Type"]]
y_train = data_train["ID"]
# One-hot encode the non-numeric "Type" column.
X_train = pd.get_dummies(X_train)

In [52]:
# Same feature/target selection for the test split.
X_test = data_test[["L","VGS","VDS","VSB","Type"]]
y_test = data_test["ID"]
# NOTE(review): encoded independently of X_train; the columns only match if
# both splits contain the same "Type" values — confirm.
X_test = pd.get_dummies(X_test)
y_test = y_test.values

In [53]:
# Preview the first 10 encoded training rows.
X_train.head(10)

In [54]:
# Shapes of the training features and target.
X_train.shape, y_train.shape

In [55]:
# Shapes of the test features and target.
X_test.shape, y_test.shape

In [56]:
import tensorflow as tf
from tensorflow import keras

In [58]:
# All column names of the training table, in file order.
columns = list(data_train.columns)
columns

In [59]:
# Everything after the first five columns (the four sweep inputs and "Type")
# is treated as a prediction target.
target_columns = columns[5:]
target_columns

In [60]:
# Shapes of the two splits.
data_train.shape, data_test.shape

In [61]:
# Boolean mask over the test rows: True where no target column equals exactly 0.
((data_test[target_columns] == 0).sum(axis=1) == 0).values

In [62]:
# Keep only the test rows without any zero-valued target, so that percentage
# errors (which divide by the true value) are defined for every row.
data_test_no_zero = data_test[((data_test[target_columns] == 0).sum(axis=1) == 0).values]
data_test_no_zero.head()

In [63]:
# Compare the sizes of the filtered and unfiltered test sets.
data_test_no_zero.shape, data_test.shape

In [64]:
from sklearn.utils import shuffle

# Rebuild the one-hot encoded features and the ID target for both splits.
X_train = data_train[["L","VGS","VDS","VSB","Type"]]
X_train = pd.get_dummies(X_train)
X_test = data_test[["L","VGS","VDS","VSB","Type"]]
X_test = pd.get_dummies(X_test)
y_train = data_train["ID"]
y_test = data_test["ID"] 
y_test = y_test.values
# Shuffle features and target together; no random_state is given, so the
# order differs between runs.
X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train)

In [65]:
# (rows, columns) of the encoded training features.
X_train.shape

In [66]:
def MAxPE(y_test, y_pred):
    """Return the maximum absolute percentage error, in percent.

    Parameters
    ----------
    y_test : array-like
        True values. Entries equal to 0 make the ratio infinite or NaN, so
        callers should filter such rows out first.
    y_pred : array-like
        Predicted values, aligned element-wise with `y_test`.

    Returns
    -------
    float
        max(|y_test - y_pred| / |y_test|) * 100.
    """
    # Divide by the magnitude of the true value: with a signed denominator a
    # negative target produced a negative "percentage" that could never be the max.
    return np.max(np.abs(y_test - y_pred) / np.abs(y_test)) * 100

In [73]:
def MAPE(y_test, y_pred):
    """Return the mean absolute percentage error, in percent.

    Parameters
    ----------
    y_test : array-like
        True values. Entries equal to 0 make the ratio infinite or NaN, so
        callers should filter such rows out first.
    y_pred : array-like
        Predicted values, aligned element-wise with `y_test`.

    Returns
    -------
    float
        mean(|y_test - y_pred| / |y_test|) * 100.
    """
    # Divide by the magnitude of the true value: with a signed denominator the
    # errors of negative targets were negative and cancelled positive ones.
    return (np.abs(y_test - y_pred) / np.abs(y_test)).mean() * 100

In [70]:
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeRegressor

# Train one decision tree per target column and score it on the complete test
# split, including rows whose true value is zero.
X_train = data_train[["L","VGS","VDS","VSB","Type"]]
X_train = pd.get_dummies(X_train)
X_test = data_test[["L","VGS","VDS","VSB","Type"]]
X_test = pd.get_dummies(X_test)

models = {}
# This dict used to be called `MAPE`, which rebound the MAPE() helper and made
# the later `MAPE(y_test, y_pred)` calls fail when the notebook runs top to bottom.
MAPE_with_zeros = {}

for target in target_columns:
    y_train = data_train[target]
    y_test = data_test[target]
    y_test = y_test.values
    # Shuffle features and target together (unseeded, so run-dependent).
    X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train)
    tree = DecisionTreeRegressor()
    tree.fit(X_train_shuffled, y_train_shuffled)
    y_pred = tree.predict(X_test)
    models[target] = tree
    # Compute the metric once and reuse it for both storage and printing.
    mape_value = keras.metrics.mean_absolute_percentage_error(y_test, y_pred).numpy()
    MAPE_with_zeros[target] = mape_value
    print(target, mape_value)

In [74]:
from sklearn.utils import shuffle
from sklearn.tree import DecisionTreeRegressor


# Train one decision tree per target column again, this time scoring only on
# the test rows that have no zero-valued target.
feature_names = ["L","VGS","VDS","VSB","Type"]
X_train = pd.get_dummies(data_train[feature_names])
X_test = pd.get_dummies(data_test_no_zero[feature_names])

models = {}
MAPE_keras = {}
MAPE_ours = {}

for target in target_columns:
    y_train = data_train[target]
    y_test = data_test_no_zero[target].values
    # Shuffle features and target together before fitting.
    X_train_shuffled, y_train_shuffled = shuffle(X_train, y_train)
    tree = DecisionTreeRegressor()
    tree.fit(X_train_shuffled, y_train_shuffled)
    y_pred = tree.predict(X_test)
    models[target] = tree
    # Score with both the Keras metric and the notebook's own MAPE helper.
    keras_score = keras.metrics.mean_absolute_percentage_error(y_test, y_pred).numpy()
    own_score = MAPE(y_test, y_pred)
    MAPE_keras[target] = keras_score
    MAPE_ours[target] = own_score
    print(target, keras_score, own_score)

In [76]:
import os
# exist_ok=True lets the cell be re-run: without it, makedirs raises
# FileExistsError once the directory is already there.
os.makedirs("models", exist_ok=True)

In [77]:
import joblib
# Persist every fitted tree as models/<target>tree.pkl.
for target, model in models.items():
    joblib.dump(model, f"models/{target}tree.pkl")

In [78]:
import shutil
# Pack the contents of ./models into Models.zip in the current directory.
shutil.make_archive("Models", 'zip', "./models")

In [79]:
from IPython.display import FileLink
# Display a link to the archive in the notebook output.
FileLink(r'./Models.zip')

In [82]:
# Per-target MAPE (in percent) computed with the notebook's own helper.
MAPE_ours

In [83]:
# Number of rows in the current test feature table.
X_test.shape[0]

In [84]:
# Per-row prediction time, in seconds, of each target's model on X_test.
T = {}
import time
n_test_rows = float(X_test.shape[0])
for target, model in models.items():
    # Not used by the timing itself; kept because it leaves `y_test` bound to
    # the last target's values, which a later cell inspects.
    y_test = data_test[target]
    y_test = y_test.values
    # perf_counter is the clock intended for benchmarking short intervals.
    start = time.perf_counter()
    y_pred = model.predict(X_test)
    end = time.perf_counter()
    T[target] = (end - start) / n_test_rows

In [85]:
# Convert every entry from seconds to microseconds in place. Re-running this
# cell multiplies the values again, so run it once per timing pass.
for target, t in T.items():
    T[target] = t * (10**6)

In [86]:
# Per-row prediction time for each target's model, after the unit conversion.
T

In [87]:
# MAPE values and timings side by side, each in dict insertion order.
[list(MAPE_ours.values()), list(T.values())]

In [88]:
# Summary table indexed by target name, starting with a single "MAPE" column.
result = pd.DataFrame(data=list(MAPE_ours.values()), index=MAPE_ours.keys(), columns=["MAPE"])

In [89]:
# Hard-coded values, one per row of `result`. Presumably the on-disk size of
# each saved model in MB, entered by hand — verify against the saved files.
size = [301, 8.07, 301, 301,
        305, 271, 280, 258, 269, 254, 275, 305, 301]

In [90]:
# Assigned by position, so `size` must have one entry per row of `result`.
result["Size (MB)"] = size

In [91]:
# Assigned by position; relies on T and MAPE_ours sharing the same key order.
result["Time(micro-s)"] = T.values()

In [92]:
# Shape of whatever `y_test` was last bound to by the preceding cells.
y_test.shape

In [93]:
# Maximum absolute percentage error of each target's model on the zero-free
# test rows. Assumes X_test still holds the features of data_test_no_zero.
max_A = []
for name, model in models.items():
    # Use the loop's own key. The original read the stale global `target`,
    # so every model was scored against the same (last) target column.
    y_test = data_test_no_zero[name]
    y_pred = model.predict(X_test)
    max_A.append(MAxPE(y_test, y_pred))

In [94]:
# Position of the largest absolute error. NOTE(review): `target` and `model`
# are whatever the preceding loops left bound, not values chosen here.
y_test = data_test_no_zero[target]
y_pred = model.predict(X_test)
np.argmax(np.abs(y_test - y_pred))

In [110]:
# Summary table: MAPE, size and per-row prediction time for each target's model.
result

In [99]:
import seaborn as sns
import matplotlib.pyplot as plt
# Bar chart of the MAPE per target, using seaborn's default theme.
sns.set_theme()
plt.figure(figsize=(10, 8))
sns.barplot(x=result.index, y=result["MAPE"])
plt.title("Mean Absolute Percentage Error")
plt.show()

In [100]:
# Bar chart of the "Size (MB)" column per target.
plt.figure(figsize=(10, 8))
sns.barplot(x=result.index, y=result["Size (MB)"])
plt.title("Size of each Model in MB")
plt.show()

In [101]:
# Bar chart of the per-row prediction time of each target's model.
plt.figure(figsize=(10, 8))
sns.barplot(x=result.index, y=result["Time(micro-s)"])
# The plotted column is a time per row in microseconds, not a throughput; the
# old title also misspelled "Throughput" and used "S" instead of "s".
plt.title("Prediction time per instance (\u03BCs)")
plt.show()