In [None]:
from IPython.core.display_functions import display
from sklearn.metrics import make_scorer, mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Notebook-wide configuration.
np.set_printoptions(precision=3, suppress=True)
k_fold_value = 5
# Mean absolute error wrapped as a scorer object for cross_val_score.
ecm = make_scorer(mean_absolute_error)

# Load the raw table; the last 20 rows are set aside as the prediction set
# (it keeps every original column, including 'SampleNo' and 'x6').
raw_dataset = pd.read_csv('dataset_v1.csv', sep=',', skipinitialspace=True)
predict_set = raw_dataset.copy().tail(20)

# Training data: everything except the held-out rows, without the two
# columns that are not used for modelling.
dataset = raw_dataset.drop(index=predict_set.index).drop(columns=['SampleNo', 'x6'])

predictors = ['x1', 'x2', 'x3', 'x4', 'x5']
all_test_results = {}

features = dataset[predictors]
labels = dataset['Y']

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, cross_val_predict


class ModelResult:
    """Container for the outcome of one regression run.

    The constructor copies six entries of ``info_dict`` onto the instance
    under the same names: ``model``, ``arg``, ``predictors``, ``predicted``,
    ``error`` and ``estimator``.  The values are stored as given (no copies,
    no validation); a missing key raises ``KeyError``.
    """

    def __init__(self, info_dict: dict):
        # Keys are read in this order, so the first missing one is the one
        # reported by the KeyError.
        for field in ('model', 'arg', 'predictors', 'predicted', 'error', 'estimator'):
            setattr(self, field, info_dict[field])


def single_linear_regression(x, y, arg=None, inputs=None, keep_estimator=False):
    """Cross-validate an ordinary least-squares fit of ``y`` on ``x``.

    Args:
        x: feature matrix passed to scikit-learn as ``X``.
        y: target values.
        arg: unused; present so the signature matches ``polynomial_regression``.
        inputs: stored unchanged as ``predictors`` on the result.
        keep_estimator: when true, the estimator is additionally fitted on
            all of ``x``/``y`` before being returned.

    Returns:
        ModelResult with ``model='single_lin_regression'``, an empty ``arg``,
        the out-of-fold predictions in ``predicted`` and the mean of the
        per-fold ``ecm`` scores in ``error``.
    """
    estimator = LinearRegression()
    # The fold count comes from the notebook-level constant rather than a
    # repeated literal, so it can be tuned in one place.
    cv_scores = cross_val_score(estimator=estimator,
                                X=x,
                                y=y,
                                cv=k_fold_value,
                                scoring=ecm)
    cv_predictions = cross_val_predict(estimator=estimator,
                                       X=x,
                                       y=y,
                                       cv=k_fold_value)
    # The cross-validation helpers work on clones, so without this explicit
    # fit the returned estimator cannot be used for prediction.
    if keep_estimator:
        estimator.fit(x, y)

    return ModelResult({'model': 'single_lin_regression',
                        'arg': '',
                        'predictors': inputs,
                        'predicted': cv_predictions,
                        'error': cv_scores.mean(),
                        'estimator': estimator})


def polynomial_regression(x, y, arg=None, inputs=None, keep_estimator=False):
    """Cross-validate a linear fit on polynomial features of ``x``.

    Args:
        x: feature matrix; expanded with ``PolynomialFeatures`` (no bias
            column) before fitting.
        y: target values.
        arg: polynomial degree, forwarded as ``PolynomialFeatures(degree=arg)``.
            It must be supplied by the caller; the ``None`` default only
            mirrors the signature of ``single_linear_regression``.
        inputs: stored unchanged as ``predictors`` on the result.
        keep_estimator: when true, the estimator is additionally fitted on
            the full expanded feature matrix before being returned.

    Returns:
        ModelResult with ``model='polynomial_regression'``,
        ``arg='order=<degree>'``, the out-of-fold predictions in
        ``predicted`` and the mean of the per-fold ``ecm`` scores in ``error``.
    """
    estimator = LinearRegression()
    poly_features = PolynomialFeatures(degree=arg, include_bias=False).fit_transform(x)
    # The fold count comes from the notebook-level constant rather than a
    # repeated literal, so it can be tuned in one place.
    cv_scores = cross_val_score(estimator=estimator,
                                X=poly_features,
                                y=y,
                                cv=k_fold_value,
                                scoring=ecm)
    cv_predictions = cross_val_predict(estimator=estimator,
                                       X=poly_features,
                                       y=y,
                                       cv=k_fold_value)
    # The cross-validation helpers work on clones, so without this explicit
    # fit the returned estimator cannot be used for prediction.
    if keep_estimator:
        estimator.fit(poly_features, y)
    return ModelResult({'model': 'polynomial_regression',
                        'arg': f'order={arg}',
                        'predictors': inputs,
                        'predicted': cv_predictions,
                        'error': cv_scores.mean(),
                        'estimator': estimator})

In [None]:
def split_by_threshold(ss, predictor, threshold):
    """Partition the rows of ``ss`` around ``threshold`` on column ``predictor``.

    Returns a 2-tuple: first the rows whose value is strictly below the
    threshold, then the rows whose value is greater than or equal to it.
    """
    column = ss[predictor]
    below = ss[column < threshold]
    at_or_above = ss[column >= threshold]
    return below, at_or_above


def _parse_condition(condition):
    """Split a condition such as ``'x5>5'`` into (predictor, operator, threshold)."""
    hits = [pos for pos in (condition.find('<'), condition.find('>')) if pos != -1]
    if not hits:
        raise ValueError(f"condition {condition!r} has no '<' or '>' operator")
    pos = min(hits)
    predictor = condition[:pos].strip()
    if not predictor:
        raise ValueError(f"condition {condition!r} has no predictor name")
    raw_threshold = condition[pos + 1:].strip()
    try:
        threshold = int(raw_threshold)
    except ValueError:
        # Allows fractional thresholds; still a ValueError for non-numeric text.
        threshold = float(raw_threshold)
    return predictor, condition[pos], threshold


def decision_sets(source_set, conditions):
    """Split ``source_set`` into the leaves of a chain of threshold decisions.

    Each condition has the form ``'<column><op><number>'`` with ``op`` being
    ``<`` or ``>`` (e.g. ``'x5>5'``, ``'x1<20'``).  Column names of any
    length and integer or fractional thresholds are accepted.  The rows that
    satisfy a condition are emitted as one leaf; the remaining rows are
    passed on to the next condition.

    Note that ``>`` selects values greater than *or equal to* the threshold,
    because the split is done by ``split_by_threshold``.

    Args:
        source_set: DataFrame to partition (a copy is taken first).
        conditions: iterable of condition strings, applied in order.

    Returns:
        A list with one DataFrame per condition followed by the rows that
        matched none of them.  An empty ``conditions`` yields an empty list.

    Raises:
        ValueError: if a condition has no ``<``/``>`` operator, no column
            name, or a non-numeric threshold.
    """
    return_set = []
    sample_set = source_set.copy()
    for condition in conditions:
        predictor, operator, threshold = _parse_condition(condition)
        lt, gt = split_by_threshold(sample_set, predictor, threshold)
        if operator == '<':
            return_set.append(lt)
            sample_set = gt
        else:
            return_set.append(gt)
            sample_set = lt
    # The remainder is appended exactly once, after the last condition.
    # Detecting the last condition by comparing strings would emit it early
    # whenever the same condition text appears more than once.
    if return_set:
        return_set.append(sample_set)
    return return_set


def subplot_for_five(data):
    """Scatter 'Y' against each entry of the global ``predictors`` list.

    The panels are laid out on a 2x3 grid; the sixth slot is hidden and the
    two bottom panels are repositioned.  ``data`` must provide every
    predictor column and a 'Y' column.
    """
    _, axes = plt.subplots(2, 3, figsize=(18, 10))
    axes[1][2].set_visible(False)
    # Explicit positions for the two visible panels of the bottom row.
    for col, left in ((0, 0.24), (1, 0.55)):
        axes[1][col].set_position([left, 0.125, 0.228, 0.343])
    for i, predictor in enumerate(predictors):
        row, col = divmod(i, 3)
        sns.scatterplot(ax=axes[row, col], x=data[predictor], y=data['Y'], alpha=0.5, color='b')

In [None]:
# Overview: 'Y' against each predictor for the full training set.
subplot_for_five(dataset)

In [None]:
# First split: x5 below 5 versus x5 at or above 5; plot each side.
n1_lt, n1_gt = split_by_threshold(dataset, predictor='x5', threshold=5)
subplot_for_five(n1_gt)
subplot_for_five(n1_lt)

In [None]:
# Second split, applied to the x5 < 5 side: x1 below 20 versus at or above 20.
n2_lt, n2_gt = split_by_threshold(n1_lt, predictor='x1', threshold=20)
subplot_for_five(n2_lt)
subplot_for_five(n2_gt)

In [None]:
def predict(train_sets, test_sets):
    """Fit one model per decision branch and predict the matching test subset.

    Args:
        train_sets: branch specifications as accepted by
            ``get_decision_models`` (one dict per branch).
        test_sets: sequence of DataFrames, positionally aligned with
            ``train_sets``; ``test_sets[i]`` is predicted by the i-th model.

    Returns:
        DataFrame with a single 'Predictions' column, indexed by the test
        rows' own index and sorted by that index.  The same frame is also
        displayed in full (row limit lifted).  Branches that receive no test
        rows contribute nothing.
    """
    decision_models = get_decision_models(train_sets)
    frames = []
    for i, decision_model in enumerate(decision_models):
        subset = test_sets[i]
        # scikit-learn rejects zero-row input, so an empty branch is skipped
        # instead of aborting the whole prediction run.
        if len(subset) == 0:
            continue
        test_set = subset[decision_model.predictors]

        if decision_model.model == 'polynomial_regression':
            # The degree is recovered from the 'order=<n>' tag stored on the
            # result so the same feature expansion is applied to the test rows.
            degree = int(decision_model.arg.split('=')[-1])
            test_set = PolynomialFeatures(degree=degree, include_bias=False).fit_transform(test_set)

        y_predicted = decision_model.estimator.predict(test_set)
        frames.append(pd.DataFrame({'Predictions': y_predicted}, index=subset.index))

    # Concatenate once instead of growing a frame inside the loop.
    if frames:
        predictions = pd.concat(frames).sort_index()
    else:
        predictions = pd.DataFrame(columns=['Predictions'])
    with pd.option_context('display.max_rows', None):
        display(predictions)
    # Returned so callers that store the result get the table, not None.
    return predictions


def get_decision_models(train_sets):
    """Fit the model of every branch specification and return the results.

    Each element of ``train_sets`` is a mapping with the keys 'model'
    (callable), 'data' (indexable by the predictor list and by 'Y'),
    'predictor' and 'arg'.  The callable is invoked with keyword arguments
    ``x``, ``y``, ``inputs``, ``arg`` and ``keep_estimator=True``; its return
    values are collected in input order.
    """
    def fit_branch(spec):
        model_fn = spec['model']
        frame = spec['data']
        columns = spec['predictor']
        return model_fn(x=frame[columns],
                        y=frame['Y'],
                        inputs=columns,
                        arg=spec['arg'],
                        keep_estimator=True)

    return [fit_branch(spec) for spec in train_sets]


def plot_target_sets(conditions):
    """Cross-validate one model per decision branch, plot the fits and tabulate errors.

    Args:
        conditions: iterable of dicts, one per branch, with the keys
            'model' (callable such as ``single_linear_regression``),
            'data' (DataFrame holding the predictor columns and 'Y'),
            'predictor' (list of column names), 'arg' (forwarded to the
            model) and 'title' (used as subplot title and table row label).

    For a branch with one predictor, the actual and cross-validated values
    are drawn as a 2-D scatter.  With two predictors, a 3-D scatter of the
    data is drawn together with a surface through the cross-validated
    predictions.  Branches with any other number of predictors are not
    plotted (their subplot slot stays empty) but still appear in the table.

    Side effects: creates a figure, prints the sum of the branches' mean CV
    errors and displays a summary table.  Returns None.
    """
    conditions = list(conditions)
    fig = plt.figure(figsize=(18, 5))
    # Three columns reproduce the usual layout; more branches widen the grid
    # instead of failing on a fourth subplot.
    n_cols = max(3, len(conditions))

    cv_error = 0
    decision_df = pd.DataFrame(columns=['Model', 'Predictors', 'Args', 'Error'])

    for plot_i, train_set in enumerate(conditions, start=1):
        data = train_set['data']
        inputs = train_set['predictor']
        train_model = train_set['model'](x=data[inputs],
                                         y=data['Y'],
                                         inputs=inputs, arg=train_set['arg'])

        decision_df.loc[train_set['title']] = [train_model.model, train_model.predictors,
                                               train_model.arg, train_model.error]

        if len(inputs) == 1:
            ax = fig.add_subplot(1, n_cols, plot_i)
            sns.scatterplot(ax=ax, x=data[inputs[0]], y=data['Y'],
                            color='b', label='Actual Data')
            sns.scatterplot(ax=ax, x=data[inputs[0]], y=train_model.predicted,
                            color='r', label='Predicted')
            ax.set_title(train_set['title'])
        elif len(inputs) == 2:
            ax = fig.add_subplot(1, n_cols, plot_i, projection='3d')
            ax.view_init(elev=2., azim=-96)
            ax.set_box_aspect(aspect=None, zoom=1.35)

            x_axis, y_axis = inputs
            z_axis = 'Y'

            # Plot this branch's own data; reading a notebook-level frame
            # here would draw the wrong points for any other branch.
            ax.scatter(data[x_axis], data[y_axis], data[z_axis], c=data[z_axis])
            ax.plot_trisurf(data[x_axis], data[y_axis], train_model.predicted,
                            linewidth=0, antialiased=True, alpha=0.3, color='r')

            ax.set_xlabel(x_axis)
            ax.set_ylabel(y_axis)
            ax.set_zlabel(z_axis)
            ax.set_title(train_set['title'])

        cv_error += train_model.error

    print('Total CV Score (MAE) =', cv_error)
    display(decision_df)

In [None]:
# First candidate tree: one single-predictor model per leaf.
x5_train, x1_train, x3_train = decision_sets(dataset, ['x5>5', 'x1<20'])
test_one = [
    dict(predictor=['x5'], data=x5_train, title='x5>5',
         model=single_linear_regression, arg=None),
    dict(predictor=['x1'], data=x1_train, title='x5<5 && x1<20',
         model=polynomial_regression, arg=3),
    dict(predictor=['x3'], data=x3_train, title='x5<5 && x1>20',
         model=polynomial_regression, arg=2),
]

plot_target_sets(test_one)

In [None]:
from itertools import combinations

predictors = ['x1', 'x2', 'x3', 'x4', 'x5']

# Every subset of at least two predictors, as tuples in combinations() order.
predictor_combinations = [combo
                          for size in range(2, len(predictors) + 1)
                          for combo in combinations(predictors, size)]

# Exhaustive search on the third leaf: polynomial degree 2-4 for every subset.
# One record is collected per model and the table is built once at the end.
# Keying rows by the float error via .loc would silently overwrite any two
# models that happen to share the same error.
model_records = []
for order in range(2, 5):
    for combo in predictor_combinations:
        inp = list(combo)
        target_model = polynomial_regression(x=x3_train[inp],
                                             y=x3_train['Y'],
                                             inputs=inp, arg=order)
        model_records.append({'Error': target_model.error, 'Order': order, 'Predictors': inp})

# Indexed by CV error, best (lowest) first, with columns 'Order' and 'Predictors'.
models_df = (pd.DataFrame(model_records, columns=['Error', 'Order', 'Predictors'])
             .set_index('Error')
             .sort_index())
display(models_df)

In [None]:
# Second candidate tree: the last leaf now uses two predictors (x3 and x1).
x5_train, x1_train, x3_train = decision_sets(dataset, ['x5>5', 'x1<20'])
train_decisions = [
    dict(predictor=['x5'], data=x5_train, title='x5>5',
         model=single_linear_regression, arg=None),
    dict(predictor=['x1'], data=x1_train, title='x5<5 and x1<20',
         model=polynomial_regression, arg=3),
    dict(predictor=['x3', 'x1'], data=x3_train, title='x5<5 and x1>20',
         model=polynomial_regression, arg=2),
]

plot_target_sets(train_decisions)

In [None]:
# Final run: fit the chosen model on every training leaf, then predict the
# held-out rows after routing them through the same two decisions.
x5_train, x1_train, x3_train = decision_sets(dataset, ['x5>5', 'x1<20'])

train = [
    dict(model=single_linear_regression, title='x5>5', data=x5_train,
         predictor=['x5'], arg=None),
    dict(model=polynomial_regression, title='x5<5 and x1<20', data=x1_train,
         predictor=['x1'], arg=3),
    dict(model=polynomial_regression, title='x5<5 and x1>20', data=x3_train,
         predictor=['x3', 'x1'], arg=2),
]

test = decision_sets(predict_set, ['x5>5', 'x1<20'])

target_set_predictions = predict(train_sets=train, test_sets=test)