### Creation of model based on previously created training and testing data
* Currently supports only linear models for continuous data. 
* Exports data into "models" folder

In [15]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [66]:

import numpy as np
from sklearn.model_selection import train_test_split
from joblib import dump
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures

import itertools

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

In [2]:
mods_supported = [0, 8, 16, 64, 1024]
mod_vals = set()

for length in range(1, len(mods_supported)+1):
    combinations = itertools.combinations(mods_supported, length)
    for combination in list(combinations):
        mod_vals.add(sum(combination))

mod_vals = list(mod_vals)
mod_vals.sort()


In [61]:
mod_val = 0
data = np.load(f"data/training{mod_val}.npz", allow_pickle=True)
X = data['X']
y = data['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
y_train[y_train==0] = 1
y_test[y_test==0] = 1

y_log_train = np.log(y_train)
y_log_test = np.log(y_test)

poly_feats = PolynomialFeatures(degree=2)
X_poly_train = poly_feats.fit_transform(X_train)

polyreg = LinearRegression()
polyreg.fit(X_poly_train, y_log_train)





LinearRegression()

In [62]:

#poly_feats = PolynomialFeatures(degree=2)
y_pred = np.exp(polyreg.predict(poly_feats.fit_transform(X_test)))
diff = 0
for j in range(len(y_test)):
    diff += abs(y_pred[j]-y_test[j])

diff

103013.77769497452

In [67]:
polys = set()

for mod_val in mod_vals:

    print(f"generating models for mod_val {mod_val}...")

    data = np.load(f"data/training{mod_val}.npz", allow_pickle=True)
    X = data['X']
    y = data['y']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    y_train[y_train==0] = 1
    y_test[y_test==0] = 1

    y_log_train = np.log(y_train)
    y_log_test = np.log(y_test)
    
    linreg = LinearRegression()
    ridge = Ridge()
    lasso = Lasso()
    elastic = ElasticNet()
    rf = RandomForestClassifier()
    knn = KNeighborsClassifier(n_neighbors=3)

    regression_models = [linreg, ridge, lasso, elastic]
    regression_titles = ["linreg", "ridge", "lasso", "elastic"]
    
    classification_models = [rf, knn]
    classification_titles = ["rf", "knn"]

    for i in range(len(regression_models)):
        regression_models[i].fit(X_train, y_log_train)

    y_log_train = (np.rint(y_log_train)).astype(int)
    y_log_test = (np.rint(y_log_test)).astype(int)

    for i in range(len(classification_models)):
        classification_models[i].fit(X_train, y_log_train)

    models = regression_models + classification_models
    titles = regression_titles + classification_titles
    diffs = [0 for i in range(len(models))]

    for i in range(len(models)):
        y_pred = np.exp(models[i].predict(X_test))
        for j in range(len(y_test)):
            diffs[i] += abs(y_pred[j]-y_test[j])


    poly_feats = PolynomialFeatures(degree=2)
    X_poly_train = poly_feats.fit_transform(X_train)

    polyreg = LinearRegression()
    polyreg.fit(X_poly_train, y_log_train)

    y_pred = np.exp(polyreg.predict(poly_feats.fit_transform(X_test)))  

    diff = 0
    for j in range(len(y_test)):
        diff += abs(y_pred[j]-y_test[j])

    models.append(polyreg)
    titles.append("polyreg")
    diffs.append(diff)

    diffs = [round(x) for x in diffs]
    leastIndex = -1
    least = np.inf
    for i in range(len(models)):
        # print(f"{titles[i]} diffs: {diffs[i]}")
        if diffs[i] < least:
            least = diffs[i]
            leastIndex = i
    print(f"least diffs for {mod_val}: {titles[leastIndex]} with {diffs[leastIndex]}")
    # print("~~~~~~~~~~")
    if titles[leastIndex] == "polyreg":
            polys.add(mod_val)
    dump(models[leastIndex], f'models/model{mod_val}.joblib')

dump(polys, "models/model_info")
print("done!")

generating models for mod_val 0...
linreg diffs: 138417
ridge diffs: 139526
lasso diffs: 1068872
elastic diffs: 768146
rf diffs: 424611
knn diffs: 971752
polyreg diffs: 109994
least diffs for 0: polyreg with 109994
~~~~~~~~~~
generating models for mod_val 8...
linreg diffs: 262939
ridge diffs: 262399
lasso diffs: 1129278
elastic diffs: 714091
rf diffs: 372570
knn diffs: 1003143
polyreg diffs: 120932
least diffs for 8: polyreg with 120932
~~~~~~~~~~
generating models for mod_val 16...
linreg diffs: 227199
ridge diffs: 227598
lasso diffs: 964838
elastic diffs: 719253
rf diffs: 373439
knn diffs: 971350
polyreg diffs: 155336
least diffs for 16: polyreg with 155336
~~~~~~~~~~
generating models for mod_val 24...
linreg diffs: 230056
ridge diffs: 229990
lasso diffs: 1635152
elastic diffs: 934145
rf diffs: 418224
knn diffs: 1133548
polyreg diffs: 181746
least diffs for 24: polyreg with 181746
~~~~~~~~~~
generating models for mod_val 64...
linreg diffs: 289747
ridge diffs: 288263
lasso diffs: 9

In [69]:
from joblib import load
polys = load("models/model_info")
print(polys)

{0, 1024, 64, 8, 72, 1032, 1096, 16, 80, 1040, 1104, 24, 1048, 88}
