# DISTINCTNESS

In [ ]:
import random
import warnings

import numpy as np
import pandas as pd

from library.A_data_collection import make_dataset_for_regression
from library.D_data_analysis import regression
from library.my_functions import pollution_first_second_third_experiments, plot_results, pollution_fourth_experiment, \
    pollution_fifth_experiment, pollution_sixth_experiment, pollution_seventh_experiment, pollution_eighth_experiment, \
    pollution_ninth_tenth_experiments

# Silence warnings for the whole notebook. The FutureWarning filter is
# registered first and the catch-all Warning filter second, as before.
warnings.simplefilter('ignore', FutureWarning)
warnings.simplefilter('ignore', Warning)

# Single seed used for Python's and NumPy's global generators; the experiment
# cells also hand it to regression(...).
SEED = 2023
np.random.seed(SEED)
random.seed(SEED)

# Names of the regression algorithms evaluated by every experiment, in the
# order their results are collected and plotted.
REGRESSION_ALGORITHMS = [
    "LinearRegressor",
    "BayesianRidge",
    "GPRegressor",
    "SVMRegressor",
    "KNNRegressor",
    "MLPRegressor",
]

## Experiment 1. Distinctness - Mid-high distinctness

In [0]:
# Experiment 1: for every algorithm, sweep the value handed to
# pollution_first_second_third_experiments from 0.0 to 0.5 in steps of 0.05,
# collect what regression(...) returns for each run, then plot all curves.
DESCRIPTION = 'Distinctness - Mid-high distinctness'
X_LABEL = "Percentage of not distinct values"
X_AXIS = [step * 0.05 for step in range(11)]
RESULTS_PER_ALGORITHM = []

for algorithm in REGRESSION_ALGORITHMS:
    RESULTS_ALGORITHM = []

    for percentage in X_AXIS:
        # The dataset is regenerated (same seed) for every single run.
        # NOTE(review): presumably so each pollution starts from clean data -
        # confirm before hoisting this call out of the loops.
        features, y = make_dataset_for_regression(
            n_samples=1000,
            n_features=3,
            n_informative=3,
            n_targets=1,
            bias=0.0,
            effective_rank=None,
            tail_strength=0.5,
            noise=0.0,
            seed=2023,
        )
        X = pollution_first_second_third_experiments(pd.DataFrame(features), percentage)
        RESULTS_ALGORITHM.append(regression(X, y, algorithm, SEED))

    RESULTS_PER_ALGORITHM.append(RESULTS_ALGORITHM)

plot_results(
    x_axis_values=X_AXIS,
    x_label=X_LABEL,
    results=RESULTS_PER_ALGORITHM,
    title=DESCRIPTION,
    algorithms=REGRESSION_ALGORITHMS,
)

## Experiment 2. Distinctness - Mid-low distinctness

In [0]:
# Experiment 2: same procedure as the mid-high sweep, but the pollution value
# ranges from 0.5 to 1.0 in steps of 0.05.
DESCRIPTION = 'Distinctness - Mid-low distinctness'
X_LABEL = "Percentage of not distinct values"
X_AXIS = [n * 0.05 for n in range(10, 21)]


def _evaluate_mid_low(algorithm, percentage):
    """Run one (algorithm, percentage) combination and return regression's result.

    A new 1000-sample, 3-feature dataset is generated on every call, wrapped
    in a DataFrame and polluted before being passed to regression(...).
    """
    X, y = make_dataset_for_regression(
        n_samples=1000, n_features=3, n_informative=3, n_targets=1,
        bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, seed=2023
    )
    X = pollution_first_second_third_experiments(pd.DataFrame(X), percentage)
    return regression(X, y, algorithm, SEED)


# Outer level: one list per algorithm; inner level: one entry per X_AXIS value.
# The evaluation order (algorithm first, then percentage) matches a nested loop.
RESULTS_PER_ALGORITHM = [
    [_evaluate_mid_low(algorithm, percentage) for percentage in X_AXIS]
    for algorithm in REGRESSION_ALGORITHMS
]

plot_results(
    x_axis_values=X_AXIS, x_label=X_LABEL, results=RESULTS_PER_ALGORITHM, title=DESCRIPTION,
    algorithms=REGRESSION_ALGORITHMS
)

## Experiment 3. Fixed Distinctness of Different Datasets

In [0]:
# Experiment 3: keep the pollution argument fixed at 0.2 and vary the dataset
# size instead, from 1000 to 2000 samples in steps of 100.
DESCRIPTION = 'Distinctness - Fixed Distinctness of Different Datasets'
X_LABEL = "Number of Samples"
X_AXIS = list(range(1000, 2001, 100))
RESULTS_PER_ALGORITHM = []

for algorithm in REGRESSION_ALGORITHMS:
    RESULTS_ALGORITHM = []

    for sample_count in X_AXIS:
        # Here the x-axis value drives n_samples, not the pollution call.
        raw_features, y = make_dataset_for_regression(
            n_samples=sample_count, n_features=3, n_informative=3, n_targets=1,
            bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, seed=2023
        )
        X = pd.DataFrame(raw_features)
        X = pollution_first_second_third_experiments(X, 0.2)
        outcome = regression(X, y, algorithm, SEED)
        RESULTS_ALGORITHM.append(outcome)

    RESULTS_PER_ALGORITHM.append(RESULTS_ALGORITHM)

plot_results(
    x_axis_values=X_AXIS,
    x_label=X_LABEL,
    results=RESULTS_PER_ALGORITHM,
    title=DESCRIPTION,
    algorithms=REGRESSION_ALGORITHMS,
)

## Experiment 4. Distinctness - Different percentages among features

In [0]:
# Experiment 4: 9-feature dataset (4 informative) polluted through
# pollution_fourth_experiment with values from 0.05 to 0.5 in steps of 0.05.
DESCRIPTION = 'Distinctness - Different percentages among features'
X_LABEL = "Percentage of not distinct values"
X_AXIS = [k * 0.05 for k in range(1, 11)]


def _evaluate_fourth(algorithm, percentage):
    """Generate, pollute and evaluate one dataset; return regression's result."""
    generated, target = make_dataset_for_regression(
        n_samples=1000, n_features=9, n_informative=4, n_targets=1,
        bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, seed=2023
    )
    polluted = pollution_fourth_experiment(pd.DataFrame(generated), percentage)
    return regression(polluted, target, algorithm, SEED)


RESULTS_PER_ALGORITHM = []
for algorithm in REGRESSION_ALGORITHMS:
    # One result per X_AXIS entry, evaluated in X_AXIS order.
    RESULTS_PER_ALGORITHM.append(
        [_evaluate_fourth(algorithm, percentage) for percentage in X_AXIS]
    )

plot_results(
    x_axis_values=X_AXIS, x_label=X_LABEL, results=RESULTS_PER_ALGORITHM, title=DESCRIPTION,
    algorithms=REGRESSION_ALGORITHMS
)

## Experiment 5. Distinctness - Random noise

In [0]:
# Experiment 5: pollute through pollution_fifth_experiment with values from
# 0.00625 to 0.0625 in steps of 0.00625 and evaluate every algorithm.
DESCRIPTION = 'Distinctness - Random noise'
X_LABEL = "Percentage of not distinct values"
X_AXIS = [n * 0.00625 for n in range(1, 11)]
RESULTS_PER_ALGORITHM = []

# Keyword arguments shared by every dataset generated in this experiment.
_fifth_dataset_kwargs = {
    "n_samples": 1000,
    "n_features": 3,
    "n_informative": 3,
    "n_targets": 1,
    "bias": 0.0,
    "effective_rank": None,
    "tail_strength": 0.5,
    "noise": 0.0,
    "seed": 2023,
}

for algorithm in REGRESSION_ALGORITHMS:
    RESULTS_ALGORITHM = []

    for percentage in X_AXIS:
        # A new dataset is generated for each run before it is polluted.
        X, y = make_dataset_for_regression(**_fifth_dataset_kwargs)
        X = pollution_fifth_experiment(pd.DataFrame(X), percentage)
        RESULTS_ALGORITHM.append(regression(X, y, algorithm, SEED))

    RESULTS_PER_ALGORITHM.append(RESULTS_ALGORITHM)

plot_results(
    x_axis_values=X_AXIS,
    x_label=X_LABEL,
    results=RESULTS_PER_ALGORITHM,
    title=DESCRIPTION,
    algorithms=REGRESSION_ALGORITHMS,
)

## Experiment 6. Distinctness - Categorical variables

In [0]:
# Experiment 6: the x-axis is an integer count from 1 to 10 that is passed as
# the second argument of pollution_sixth_experiment.
DESCRIPTION = 'Distinctness - Categorical variables'
X_LABEL = "Number of categorical variables"
X_AXIS = list(range(1, 11))


def _evaluate_sixth(algorithm, variable_count):
    """Evaluate one algorithm for one x-axis value and return regression's result."""
    X, y = make_dataset_for_regression(
        n_samples=1000, n_features=3, n_informative=3, n_targets=1,
        bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, seed=2023
    )
    frame = pd.DataFrame(X)
    frame = pollution_sixth_experiment(frame, variable_count)
    return regression(frame, y, algorithm, SEED)


# results[a][k] belongs to REGRESSION_ALGORITHMS[a] and X_AXIS[k].
RESULTS_PER_ALGORITHM = [
    [_evaluate_sixth(algorithm, variable_count) for variable_count in X_AXIS]
    for algorithm in REGRESSION_ALGORITHMS
]

plot_results(
    x_axis_values=X_AXIS, x_label=X_LABEL, results=RESULTS_PER_ALGORITHM, title=DESCRIPTION,
    algorithms=REGRESSION_ALGORITHMS
)

## Experiment 7. Distinctness - Outliers

In [0]:
# Experiment 7: pollute through pollution_seventh_experiment with values from
# 0.0125 to 0.125 in steps of 0.0125 and evaluate every algorithm.
DESCRIPTION = 'Distinctness - Outliers'
X_LABEL = "Percentage of outliers"
X_AXIS = [multiple * 0.0125 for multiple in range(1, 11)]
RESULTS_PER_ALGORITHM = []

for algorithm in REGRESSION_ALGORITHMS:
    RESULTS_ALGORITHM = []

    for outlier_share in X_AXIS:
        data, y = make_dataset_for_regression(
            n_samples=1000,
            n_features=3,
            n_informative=3,
            n_targets=1,
            bias=0.0,
            effective_rank=None,
            tail_strength=0.5,
            noise=0.0,
            seed=2023,
        )
        # Wrap in a DataFrame first: the pollution helper receives a DataFrame.
        clean_frame = pd.DataFrame(data)
        X = pollution_seventh_experiment(clean_frame, outlier_share)
        RESULTS_ALGORITHM.append(regression(X, y, algorithm, SEED))

    RESULTS_PER_ALGORITHM.append(RESULTS_ALGORITHM)

plot_results(
    x_axis_values=X_AXIS,
    x_label=X_LABEL,
    results=RESULTS_PER_ALGORITHM,
    title=DESCRIPTION,
    algorithms=REGRESSION_ALGORITHMS,
)

## Experiment 8. High and Low Distinctness Combined

In [0]:
# Experiment 8: pollute through pollution_eighth_experiment with values from
# 0.0 to 0.5 in steps of 0.05 and evaluate every algorithm.
DESCRIPTION = 'Distinctness - High and Low Distinctness Combined'
# NOTE(review): this label is identical to the one used by the "Outliers"
# experiment; it may be a copy-paste leftover - confirm what the second
# argument of pollution_eighth_experiment represents before changing it.
X_LABEL = "Percentage of outliers"
X_AXIS = [tick * 0.05 for tick in range(11)]
RESULTS_PER_ALGORITHM = []

for algorithm in REGRESSION_ALGORITHMS:
    # Results for this algorithm, one entry per X_AXIS value.
    RESULTS_ALGORITHM = []

    for level in X_AXIS:
        generated, y = make_dataset_for_regression(
            n_samples=1000, n_features=3, n_informative=3, n_targets=1,
            bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, seed=2023
        )
        X = pollution_eighth_experiment(pd.DataFrame(generated), level)
        RESULTS_ALGORITHM.append(regression(X, y, algorithm, SEED))

    RESULTS_PER_ALGORITHM.append(RESULTS_ALGORITHM)

plot_results(
    x_axis_values=X_AXIS,
    x_label=X_LABEL,
    results=RESULTS_PER_ALGORITHM,
    title=DESCRIPTION,
    algorithms=REGRESSION_ALGORITHMS,
)

## Experiment 9. Distinctness over most Informative Feature

In [0]:
# Experiment 9: 9-feature dataset (4 informative) polluted through
# pollution_ninth_tenth_experiments with its default `informative` setting,
# using values from 0.0 to 0.5 in steps of 0.05.
DESCRIPTION = 'Distinctness - Distinctness over most Informative Feature'
# The swept value is the distinctness pollution percentage, not an outlier
# share; the previous "Percentage of outliers" label was carried over from the
# outliers experiment and mislabeled the plot's x-axis.
X_LABEL = "Percentage of not distinct values"
RESULTS_PER_ALGORITHM = []
X_AXIS = [n * 0.05 for n in range(0, 11)]

for algorithm in REGRESSION_ALGORITHMS:
    RESULTS_ALGORITHM = []

    for percentage in X_AXIS:
        # Regenerate the dataset for every run; SEED (instead of a repeated
        # literal) keeps generation and regression on the same seed.
        X, y = make_dataset_for_regression(
            n_samples=1000, n_features=9, n_informative=4, n_targets=1,
            bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, seed=SEED
        )
        X = pd.DataFrame(X)
        # The target is passed to the pollution helper as well.
        # NOTE(review): presumably used to rank feature informativeness - confirm.
        X = pollution_ninth_tenth_experiments(X, y, percentage)
        RESULTS_ALGORITHM.append(regression(X, y, algorithm, SEED))

    RESULTS_PER_ALGORITHM.append(RESULTS_ALGORITHM)

plot_results(
    x_axis_values=X_AXIS, x_label=X_LABEL, results=RESULTS_PER_ALGORITHM, title=DESCRIPTION,
    algorithms=REGRESSION_ALGORITHMS
)

## Experiment 10. Distinctness over less Informative Features

In [0]:
# Experiment 10: same setup as the most-informative-feature experiment, but
# pollution_ninth_tenth_experiments is called with informative=False.
DESCRIPTION = 'Distinctness - Distinctness over less Informative Features'
# The swept value is handed to the helper as `percentage` of distinctness
# pollution; the previous "Percentage of outliers" label was carried over from
# the outliers experiment and mislabeled the plot's x-axis.
X_LABEL = "Percentage of not distinct values"
RESULTS_PER_ALGORITHM = []
X_AXIS = [n * 0.05 for n in range(0, 11)]

for algorithm in REGRESSION_ALGORITHMS:
    RESULTS_ALGORITHM = []

    for percentage in X_AXIS:
        # Regenerate the dataset for every run; SEED (instead of a repeated
        # literal) keeps generation and regression on the same seed.
        X, y = make_dataset_for_regression(
            n_samples=1000, n_features=9, n_informative=4, n_targets=1,
            bias=0.0, effective_rank=None, tail_strength=0.5, noise=0.0, seed=SEED
        )
        X = pd.DataFrame(X)
        X = pollution_ninth_tenth_experiments(X, y, percentage=percentage, informative=False)
        RESULTS_ALGORITHM.append(regression(X, y, algorithm, SEED))

    RESULTS_PER_ALGORITHM.append(RESULTS_ALGORITHM)

plot_results(
    x_axis_values=X_AXIS, x_label=X_LABEL, results=RESULTS_PER_ALGORITHM, title=DESCRIPTION,
    algorithms=REGRESSION_ALGORITHMS
)