<h2><center>Boston Housing dataset</center></h2>
<div><center>Predikce mediánu hodnoty bytů v Bostonu v tisících dolarů.</center></div>
<hr>

<h3><center>Tabulka sloupců s popisem</center></h3>

| Název sloupce |  Popis |
| :-: | :-: |
| Crim | Per capita crime rate by town. |
| Zn | Proportion of residential land zoned for lots over 25,000 sq.ft. |
| Indus | Proportion of non-retail business acres per town. |
| Chas | Charles River dummy variable (= 1 if tract bounds river; 0 otherwise). |
| Nox | Nitrogen oxides concentration (parts per 10 million). |
| Rm | Average number of rooms per dwelling. |
| Age | Proportion of owner-occupied units built prior to 1940. |
| Dis | Weighted mean of distances to five Boston employment centres. |
| Rad | Index of accessibility to radial highways. |
| Tax | Full-value property-tax rate per \$10,000. |
| Ptratio | Pupil-teacher ratio by town. |
| B | 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town. |
| Lstat | Lower status of the population (percent). |
| Medv | Median value of owner-occupied homes in \$1000s. |

In [1]:
// Instalace Nuget packagů
#r "nuget: Microsoft.Data.Analysis"
    
#r "nuget: XPlot.Plotly"
#r "nuget: XPlot.Plotly.Interactive"
#r "nuget: MathNet.Numerics"
    
#r "nuget: Microsoft.ML"
#r "nuget: Microsoft.ML.AutoML"
#r "nuget: Microsoft.ML.DataView"

Loading extensions from `XPlot.Plotly.Interactive.dll`

Configuring PowerShell Kernel for XPlot.Plotly integration.

Installed support for XPlot.Plotly.

Loading extensions from `Microsoft.Data.Analysis.Interactive.dll`

In [2]:
// Načtení namespaců
using System;
using System.Linq;
using System.IO;
using System.Globalization;

using XPlot.Plotly;
using Microsoft.Data.Analysis;
using Microsoft.AspNetCore.Html;
using Microsoft.DotNet.Interactive.Formatting;
using static Microsoft.DotNet.Interactive.Formatting.PocketViewTags;

using MathNet.Numerics.Statistics;

using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.AutoML;
using Microsoft.ML.Trainers;

In [3]:
// Formátovač tabulek - použít jen v případě, že se data nezobrazují v tabulce
/*
Formatter.Register<DataFrame>((df, writer) =>
{
    var headers = new List<IHtmlContent>();
    headers.Add(th(i("index")));
    headers.AddRange(df.Columns.Select(c => (IHtmlContent) th(c.Name)));
    var rows = new List<List<IHtmlContent>>();
    var take = 10;
    for (var i = 0; i < Math.Min(take, df.Rows.Count); i++)
    {
        var cells = new List<IHtmlContent>();
        cells.Add(td(i));
        foreach (var obj in df.Rows[i])
        {
            cells.Add(td(obj));
        }
        rows.Add(cells);
    }

    var t = table(
        thead(
            headers),
        tbody(
            rows.Select(
                r => tr(r))));

    writer.Write(t);
}, "text/html");
*/

In [4]:
// One row of the Boston Housing CSV file. Every property is bound to a
// column of the file through its LoadColumn index.
class ModelInput
{
    // Per capita crime rate by town.
    [LoadColumn(0)] public float Crim { get; set; }

    // Proportion of residential land zoned for lots over 25,000 sq.ft.
    [LoadColumn(1)] public float Zn { get; set; }

    // Proportion of non-retail business acres per town.
    [LoadColumn(2)] public float Indus { get; set; }

    // Charles River dummy variable (1 if tract bounds river; 0 otherwise).
    [LoadColumn(3)] public float Chas { get; set; }

    // Nitrogen oxides concentration (parts per 10 million).
    [LoadColumn(4)] public float Nox { get; set; }

    // Average number of rooms per dwelling.
    [LoadColumn(5)] public float Rm { get; set; }

    // Proportion of owner-occupied units built prior to 1940.
    [LoadColumn(6)] public float Age { get; set; }

    // Weighted mean of distances to five Boston employment centres.
    [LoadColumn(7)] public float Dis { get; set; }

    // Index of accessibility to radial highways.
    [LoadColumn(8)] public float Rad { get; set; }

    // Full-value property-tax rate per $10,000.
    [LoadColumn(9)] public float Tax { get; set; }

    // Pupil-teacher ratio by town.
    [LoadColumn(10)] public float Ptratio { get; set; }

    // Derived "B" column of the data set (see the column table at the top of the notebook).
    [LoadColumn(11)] public float B { get; set; }

    // Lower status of the population (percent).
    [LoadColumn(12)] public float Lstat { get; set; }

    // Median value of owner-occupied homes in $1000s; exposed under the column name "Label".
    [LoadColumn(13)]
    [ColumnName("Label")]
    public float Medv { get; set; }
}

In [5]:
// Shape of a scored row as read back by the Predictions helper.
class ModelOutput
{
    // Actual target value, read from the "Label" column.
    [ColumnName("Label")] public float Medv { get; set; }

    // Content of the "Features" column.
    public float[] Features { get; set; }

    // Content of the "Score" column (printed as the prediction).
    public float Score { get; set; }
}

In [6]:
// Column names (the target column Medv is exposed as "Label")
var columnNames = new List<string>
{
    "Crim", "Zn", "Indus", "Chas", "Nox",
    "Rm", "Age", "Dis", "Rad", "Tax",
    "Ptratio", "B", "Lstat", "Label"
};

// ML.NET context used by all following cells
var context = new MLContext();

// Load the data with the Microsoft.ML library
var data = context.Data.LoadFromTextFile<ModelInput>(
    "./Dataset/bostonHousing.csv",
    hasHeader: true,
    separatorChar: ',');

// maxRows: -1 -> take all rows
var df = data.ToDataFrame(maxRows: -1);

In [7]:
// Display the data
df

index,Crim,Zn,Indus,Chas,Nox,Rm,Age,Dis,Rad,Tax,Ptratio,B,Lstat,Label
⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️


In [8]:
// Data type of each column and the number of items in each column
df.Info()

index,Info,Crim,Zn,Indus,Chas,Nox,Rm,Age,Dis,Rad,Tax,Ptratio,B,Lstat,Label
0,DataType,System.Single,System.Single,System.Single,System.Single,System.Single,System.Single,System.Single,System.Single,System.Single,System.Single,System.Single,System.Single,System.Single,System.Single
1,Length (excluding null values),506,506,506,506,506,506,506,506,506,506,506,506,506,506


In [9]:
// Základní popis dat
df.Description()

index,Description,Crim,Zn,Indus,Chas,Nox,Rm,Age,Dis,Rad,Tax,Ptratio,B,Lstat,Label
0,Length (excluding null values),506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
1,Max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0
2,Min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
3,Mean,3.6135232,11.363636,11.136797,0.06916996,0.55469555,6.2846355,68.57492,3.7950428,9.549407,408.23715,18.455584,356.67456,12.653064,22.532806


In [10]:
// Check every column for null values
List<long> nullCount = columnNames
    .Select(name => df.Columns[name].NullCount)
    .ToList();

// Two-column summary: column name next to its null count
StringDataFrameColumn colNames = new StringDataFrameColumn("Column name", columnNames);
PrimitiveDataFrameColumn<long> nullCountCol = new PrimitiveDataFrameColumn<long>("Null Count", nullCount);

DataFrame nullCountDf = new DataFrame(colNames, nullCountCol);
nullCountDf

index,Column name,Null Count
0,Crim,0
1,Zn,0
2,Indus,0
3,Chas,0
4,Nox,0
5,Rm,0
6,Age,0
7,Dis,0
8,Rad,0
9,Tax,0


<h4><center>Vizualizace dat</center></h4>
<hr>

In [11]:
// Builds one histogram per requested column of the notebook-level data frame `df`.
//   columnName: names of the columns to plot; each name is also used as the
//               chart title and the x-axis title.
//   returns:    the charts, in the same order as the given names.
List<PlotlyChart> histogramCreator(List<string> columnName)
{
    var charts = new List<PlotlyChart>();

    foreach (string name in columnName)
    {
        var layout = new Layout.Layout
        {
            title = name,
            xaxis = new Xaxis { title = name },
            yaxis = new Yaxis { title = "Quantity" }
        };

        var chart = Chart.Plot(new XPlot.Plotly.Histogram { x = df.Columns[name] });
        chart.WithLayout(layout);

        charts.Add(chart);
    }

    return charts;
}

In [12]:
// Builds a marker-only scatter plot of two columns of a data frame.
//   df:          data frame that holds both columns.
//   columnNameX: column plotted on the x axis (also the x-axis title).
//   columnNameY: column plotted on the y axis (also the y-axis title).
//   returns:     the chart, titled "<columnNameX> and <columnNameY>".
PlotlyChart scatterCreator(DataFrame df, string columnNameX, string columnNameY)
{
    // Marker appearance: fill colour, size and a thin border
    var markerStyle = new Marker
    {
        color = "rgb(164, 194, 244)",
        size = 12,
        line = new Line
        {
            color = "blue",
            width = 0.5
        }
    };

    var trace = new Scatter
    {
        x = df.Columns[columnNameX],
        y = df.Columns[columnNameY],
        mode = "markers",
        marker = markerStyle
    };

    var layout = new Layout.Layout
    {
        title = columnNameX + " and " + columnNameY,
        xaxis = new Xaxis { title = columnNameX },
        yaxis = new Yaxis { title = columnNameY }
    };

    var chart = Chart.Plot(trace);
    chart.WithLayout(layout);
    return chart;
}

In [13]:
// Builds a box plot of a single data frame column.
//   y:          column whose values are plotted.
//   columnName: text used as the chart title and the x-axis title.
//   returns:    the configured chart.
PlotlyChart BoxPlotCreator(DataFrameColumn y, string columnName)
{
    var trace = new Box
    {
        y = y,
        marker = new Marker() { color = "#3D9970" }
    };

    var layout = new Layout.Layout
    {
        title = columnName,
        xaxis = new Xaxis { title = columnName }
    };

    var chart = Chart.Plot(trace);
    chart.WithLayout(layout);
    return chart;
}

In [14]:
// Display all histograms
var histograms = histogramCreator(columnNames);
Chart.ShowAll(histograms)

In [15]:
// Display scatter plots for every ordered pair of two different columns
var scatters = new List<PlotlyChart>();

foreach (string xName in columnNames)
{
    foreach (string yName in columnNames)
    {
        // A column is never plotted against itself
        if (xName == yName)
        {
            continue;
        }

        scatters.Add(scatterCreator(df, xName, yName));
    }
}
Chart.ShowAll(scatters)

In [16]:
// Visualization with box plots: one chart per column
var boxPlotList = new List<PlotlyChart>();
foreach (string name in columnNames)
{
    boxPlotList.Add(BoxPlotCreator(df.Columns[name], name));
}
Chart.ShowAll(boxPlotList);

In [17]:
// Convert the data frame columns to arrays of doubles (one array per column)
var matrixColumns = columnNames
    .Select(name => df.Columns[name]
        .Cast<Single>()
        .Select(value => (double)value)
        .ToArray())
    .ToArray();

In [18]:
// Compute the Pearson correlation matrix of the columns
var corrMatrix = Correlation.PearsonMatrix(matrixColumns);

// Convert the matrix to a data frame so it can be shown as a table:
// the i-th matrix column becomes the data frame column named columnNames[i]
var doubleDataFrameColumn = columnNames
    .Select((name, index) => new DoubleDataFrameColumn(name, corrMatrix.Column(index)))
    .ToList();

DataFrame corrDf = new DataFrame(doubleDataFrameColumn);
corrDf

index,Crim,Zn,Indus,Chas,Nox,Rm,Age,Dis,Rad,Tax,Ptratio,B,Lstat,Label
0,1.0,-0.2004692202866453,0.4065834258061122,-0.0558915823245154,0.4209717271248409,-0.2192467014372476,0.3527342526356969,-0.3796700868864803,0.6255051470770733,0.5827643137403288,0.2899456365707369,-0.3850639502349168,0.455621480361751,-0.3883046116696425
1,-0.2004692202866453,1.0,-0.5338281892911505,-0.0426967192961217,-0.5166037115057294,0.3119905892088462,-0.5695373403888099,0.6644082202949497,-0.3119478260185367,-0.3145633246775997,-0.3916785346935419,0.1755203105154521,-0.4129945756794634,0.3604453446258362
2,0.4065834258061122,-0.5338281892911505,1.0,0.0629380267147618,0.7636514584146167,-0.3916758589920042,0.6447785149379233,-0.7080269893125726,0.5951292956051599,0.720760196143969,0.3832476378762378,-0.3569765437040713,0.6037997236980921,-0.4837251712806936
3,-0.0558915823245154,-0.0426967192961217,0.0629380267147618,1.0,0.0912028045604225,0.0912512267756152,0.0865177742478617,-0.0991757781360887,-0.0073682408860775,-0.0355865175859114,-0.1215151685150101,0.04878848856082,-0.0539292972323931,0.1752601777605944
4,0.4209717271248409,-0.5166037115057294,0.7636514584146167,0.0912028045604225,1.0,-0.3021881631124234,0.731470113479704,-0.7692301182715754,0.6114405651848068,0.6680232075332124,0.1889327158519947,-0.3800506433584298,0.5908789196087519,-0.4273207759388387
5,-0.2192467014372476,0.3119905892088462,-0.3916758589920042,0.0912512267756152,-0.3021881631124234,1.0,-0.2402649119965519,0.2052461992125454,-0.209846668850359,-0.2920478376595949,-0.3555015015921619,0.1280686305396979,-0.6138082595197818,0.6953599371216597
6,0.3527342526356969,-0.5695373403888099,0.6447785149379233,0.0865177742478617,0.731470113479704,-0.2402649119965519,1.0,-0.747880540405257,0.4560224544847905,0.5064555951378126,0.261515022672746,-0.2735339728262818,0.6023385272575823,-0.3769545671480852
7,-0.3796700868864803,0.6644082202949497,-0.7080269893125726,-0.0991757781360887,-0.7692301182715754,0.2052461992125454,-0.747880540405257,1.0,-0.4945879287217531,-0.5344315821147608,-0.2324705730333224,0.2915116671876719,-0.4969958312889488,0.2499287387632882
8,0.6255051470770733,-0.3119478260185367,0.5951292956051599,-0.0073682408860775,0.6114405651848068,-0.209846668850359,0.4560224544847905,-0.4945879287217531,1.0,0.9102281885331868,0.4647412682883685,-0.4444128183296645,0.4886763363330782,-0.3816262315669482
9,0.5827643137403288,-0.3145633246775997,0.720760196143969,-0.0355865175859114,0.6680232075332124,-0.2920478376595949,0.5064555951378126,-0.5344315821147608,0.9102281885331868,1.0,0.4608531171186831,-0.4418080101588348,0.543993412492276,-0.4685359352928887


In [19]:
// Visualize the correlation matrix as a heatmap; both axes are labelled with the column names
Chart.Plot(
    new Heatmap
    {
        x = columnNames,
        y = columnNames,
        z = corrMatrix.ToColumnArrays()
    }
)

In [20]:
// Visualizes the effect of a normalization on one column.
// Fits the chosen normalizer on `uniqDf`, transforms `uniqDf` with it, shows a
// histogram of the normalized column and returns a box plot of the same column.
//   type:       "MeanVariance", "LogMeanVariance", "MinMax", "Binning" or "RobustScaling".
//   columnName: column to normalize; the normalized values replace it under the same name.
//   uniqDf:     data frame used both for fitting and for transforming.
//   returns:    box plot of the normalized column.
//   throws:     ArgumentException when `type` is not one of the supported names
//               (previously this surfaced as a NullReferenceException at Fit).
// Uses the notebook-level MLContext `context`.
PlotlyChart Normalize(string type, string columnName, DataFrame uniqDf)
{
    Microsoft.ML.Transforms.NormalizingEstimator normalizeNoCdf = null;
    
    switch(type)
    {
        case "MeanVariance":
            normalizeNoCdf = context.Transforms.NormalizeMeanVariance(outputColumnName: columnName,
                  inputColumnName: columnName);
            break;
        case "LogMeanVariance":
            normalizeNoCdf = context.Transforms.NormalizeLogMeanVariance(outputColumnName: columnName,
                  inputColumnName: columnName);
            break;
        case "MinMax":
            normalizeNoCdf = context.Transforms.NormalizeMinMax(outputColumnName: columnName,
                  inputColumnName: columnName);
            break;
        case "Binning":
            normalizeNoCdf = context.Transforms.NormalizeBinning(outputColumnName: columnName,
                  inputColumnName: columnName);
            break;
        case "RobustScaling":
            normalizeNoCdf = context.Transforms.NormalizeRobustScaling(outputColumnName: columnName,
                  inputColumnName: columnName);
            break;
        default:
            // Fail with a clear message instead of dereferencing a null estimator below
            throw new ArgumentException(
                $"Unknown normalization type '{type}'. Expected MeanVariance, LogMeanVariance, MinMax, Binning or RobustScaling.",
                nameof(type));
    }

    // Fit on the data and apply the fitted transform to the same data
    var normalizeNoCdfTransform = normalizeNoCdf.Fit(uniqDf);
    var noCdfData = normalizeNoCdfTransform.Transform(uniqDf);
    var normalizedColumn = noCdfData.ToDataFrame(maxRows: -1).Columns[columnName];
    
    var hist = Chart.Plot(
            new XPlot.Plotly.Histogram
            {
                x = normalizedColumn,
                name = columnName
            }
        );
    
    var hist_layout = new Layout.Layout
        {
            title = columnName,
            xaxis = new Xaxis
            {
                title = columnName
            },
            yaxis = new Yaxis
            {
                title = "Quantity"
            }
        };

    hist.WithLayout(hist_layout);
    
    // The histogram is displayed as a side effect; the box plot is left to the caller
    Chart.Show(hist);
    
    return BoxPlotCreator(normalizedColumn, columnName);
}

In [21]:
// Visualize Min-Max normalization of the column columnNames[1] ("Zn")
//-------------------------------
Chart.Show(Normalize("MinMax", columnNames[1], df));

In [22]:
// Display the values of the given column (sorted, then counted per value)
df.Columns[columnNames[1]].Sort().ValueCounts()

index,Values,Counts
⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️


<h4><center>Trénování modelu</center></h4>
<hr>

In [20]:
// Split the data set into a hold-out set (about 10 % of the rows, used later by
// the Evaluation cells) and the remaining rows, which are split once more into
// training and test sets further below.
DataFrameRowCollection rows = df.Rows;

List<DataFrameRow> validationDataRows = new List<DataFrameRow>();
List<DataFrameRow> noValidationDataRows = new List<DataFrameRow>();
// Hash set instead of a list: the membership checks below run once per row
HashSet<int> removedRowsIndex = new HashSet<int>();
Random random = new Random();  

// Randomly pick 10 percent of the rows (rounded up), each row at most once
int holdOutCount = (int)Math.Ceiling(rows.Count * 0.1);
while (removedRowsIndex.Count < holdOutCount)
{
    int randomRow = random.Next((int)rows.Count);
    // Add returns false for an index that was already picked; draw again in that case
    if (removedRowsIndex.Add(randomRow))
    {
        validationDataRows.Add(rows[randomRow]);
    }
}

// Every row that was not picked goes to the train/test part
for (int i = 0; i < rows.Count; i++)
{
    if (!removedRowsIndex.Contains(i)) 
    { 
        noValidationDataRows.Add(rows[i]);
    }
}

// Rebuild both row lists as data frames, column by column
DataFrame validationDf = new DataFrame();
DataFrame trainValDf = new DataFrame();
for (int i = 0; i < columnNames.Count; i++)
{
    // Copy the loop counter so the lambdas below never depend on when the
    // lazily evaluated Select sequences are enumerated
    int columnIndex = i;

    validationDf.Columns.Add(new PrimitiveDataFrameColumn<Single>(columnNames[columnIndex],
    validationDataRows.Select(row => (float)row[columnIndex])));

    trainValDf.Columns.Add(new PrimitiveDataFrameColumn<Single>(columnNames[columnIndex],
    noValidationDataRows.Select(row => (float)row[columnIndex])));
}
IDataView trainValData = (IDataView)trainValDf;

In [45]:
// Display the hold-out rows
validationDf

index,Crim,Zn,Indus,Chas,Nox,Rm,Age,Dis,Rad,Tax,Ptratio,B,Lstat,Label
⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️,⏮⏪◀️Page1▶️⏩⏭️


In [46]:
// All columns except the label are used as features
string[] featureColumnNames = trainValData.Schema
    .Select(column => column.Name)
    .Where(columnName => columnName != "Label")
    .ToArray();

// Split the data set into training and test sets (30 % test, fixed seed)
var split = context.Data.TrainTestSplit(trainValData, testFraction: 0.3, seed: 1);

// Build the pipeline: normalize Crim, B and Tax in place,
// then concatenate all feature columns into "Features"
var pipeline = context.Transforms.NormalizeLogMeanVariance(outputColumnName: "Crim", inputColumnName: "Crim")
    .Append(context.Transforms.NormalizeMinMax(outputColumnName: "B", inputColumnName: "B"))
    .Append(context.Transforms.NormalizeMinMax(outputColumnName: "Tax", inputColumnName: "Tax"))
    .Append(context.Transforms.Concatenate("Features", featureColumnNames));





In [47]:
// FastTree regression appended to the shared preprocessing pipeline
var fastTreeTrainer = context.Regression.Trainers.FastTree();
var fastTreePipeline = pipeline.Append(fastTreeTrainer);

// Train the model on the training set
var fastTreeModel = fastTreePipeline.Fit(split.TrainSet);

// Score the test set and compute the regression metrics
IDataView predictionsTree = fastTreeModel.Transform(split.TestSet);
RegressionMetrics metrics = context.Regression.Evaluate(predictionsTree);

Console.WriteLine($"RSquared: {metrics.RSquared}");
Console.WriteLine($"MAE: {metrics.MeanAbsoluteError}");
Console.WriteLine($"MSE: {metrics.MeanSquaredError}");

RSquared: 0.8817849216670102
MAE: 2.2341792875056643
MSE: 8.221065548365644


In [48]:
// FastForest regression appended to the shared preprocessing pipeline
var fastForestTrainer = context.Regression.Trainers.FastForest();
var fastForestPipeline = pipeline.Append(fastForestTrainer);

// Train the model on the training set
var fastForestModel = fastForestPipeline.Fit(split.TrainSet);

// Score the test set and compute the regression metrics
IDataView predictionsForest = fastForestModel.Transform(split.TestSet);
RegressionMetrics metrics = context.Regression.Evaluate(predictionsForest);

Console.WriteLine($"RSquared: {metrics.RSquared}");
Console.WriteLine($"MAE: {metrics.MeanAbsoluteError}");
Console.WriteLine($"MSE: {metrics.MeanSquaredError}");

RSquared: 0.8321438300902938
MAE: 2.342313022064648
MSE: 11.673270406658403


In [49]:
// Another decision-tree-based variant: LightGBM regression
var lightGbmTrainer = context.Regression.Trainers.LightGbm();
var lightGbmPipeline = pipeline.Append(lightGbmTrainer);

// Train the model on the training set
var lightGbmModel = lightGbmPipeline.Fit(split.TrainSet);

// Score the test set and compute the regression metrics
IDataView predictionsGbm = lightGbmModel.Transform(split.TestSet);
RegressionMetrics metrics = context.Regression.Evaluate(predictionsGbm);

Console.WriteLine($"RSquared: {metrics.RSquared}");
Console.WriteLine($"MAE: {metrics.MeanAbsoluteError}");
Console.WriteLine($"MSE: {metrics.MeanSquaredError}");

RSquared: 0.8789298114153168
MAE: 2.2311332723219617
MSE: 8.419619310355843


In [50]:
// Ordinary least squares regression appended to the shared preprocessing pipeline
var olsTrainer = context.Regression.Trainers.Ols();
var olsPipeline = pipeline.Append(olsTrainer);

// Train the model on the training set
var olsModel = olsPipeline.Fit(split.TrainSet);

// Score the test set and compute the regression metrics
IDataView predictionsOls = olsModel.Transform(split.TestSet);
RegressionMetrics metrics = context.Regression.Evaluate(predictionsOls);

Console.WriteLine($"RSquared: {metrics.RSquared}");
Console.WriteLine($"MAE: {metrics.MeanAbsoluteError}");
Console.WriteLine($"MSE: {metrics.MeanSquaredError}");

RSquared: 0.7431596933947204
MAE: 3.13231713480229
MSE: 17.86152008559033


In [51]:
// Prints up to ten randomly chosen label/prediction pairs of a scored data set.
//   modelName:   heading printed above the samples.
//   context:     ML.NET context used to read `predictions` as ModelOutput rows.
//   predictions: scored data; its rows are materialized as ModelOutput objects.
void Predictions(string modelName, MLContext context, IDataView predictions)
{
    var shuffler = new Random();

    // Materialize all rows first, then print the heading
    List<ModelOutput> scoredRows = context.Data
        .CreateEnumerable<ModelOutput>(predictions, reuseRowObject: false)
        .ToList();

    Console.WriteLine(modelName + "\n-----------------------------");

    // Ordering by random keys shuffles the rows; the first ten are printed
    var sample = scoredRows
        .OrderBy(unused => shuffler.Next())
        .Take(10);

    foreach (var row in sample)
    {
        Console.WriteLine($"Label: {row.Medv}, Prediction: {row.Score}\n");
    }
}

In [27]:
// Print a random sample of FastTree predictions on the test set
Predictions("FastTree", context, predictionsTree)

FastTree
-----------------------------
Label: 21.2, Prediction: 21.848665

Label: 19.6, Prediction: 20.290531

Label: 19.6, Prediction: 21.330753

Label: 6.3, Prediction: 12.047173

Label: 20.1, Prediction: 21.632467

Label: 36.4, Prediction: 38.418896

Label: 30.5, Prediction: 30.359362

Label: 19.3, Prediction: 19.644054

Label: 12.7, Prediction: 11.121975

Label: 50, Prediction: 47.724052



In [28]:
// Print a random sample of FastForest predictions on the test set
Predictions("FastForest", context, predictionsForest)

FastForest
-----------------------------
Label: 20.3, Prediction: 21.917263

Label: 29.6, Prediction: 24.026922

Label: 19.8, Prediction: 22.767035

Label: 12.7, Prediction: 13.146951

Label: 18, Prediction: 17.091589

Label: 26.6, Prediction: 29.482344

Label: 17.9, Prediction: 12.9172325

Label: 14.9, Prediction: 14.200992

Label: 12.7, Prediction: 16.444347

Label: 14.6, Prediction: 15.843056



In [29]:
// Print a random sample of LightGBM predictions on the test set
Predictions("Gbm", context, predictionsGbm)

Gbm
-----------------------------
Label: 19.4, Prediction: 18.468046

Label: 12.7, Prediction: 15.705626

Label: 23.3, Prediction: 26.10189

Label: 26.7, Prediction: 33.548138

Label: 25, Prediction: 23.131868

Label: 7.2, Prediction: 9.077284

Label: 22.8, Prediction: 25.093542

Label: 23, Prediction: 25.355524

Label: 22, Prediction: 24.855787

Label: 23.8, Prediction: 24.33288



In [30]:
// Print a random sample of OLS predictions on the test set
Predictions("Ols", context, predictionsOls)

Ols
-----------------------------
Label: 19.6, Prediction: 22.850594

Label: 22.6, Prediction: 26.655876

Label: 22.2, Prediction: 26.472473

Label: 13.8, Prediction: 20.402573

Label: 18.5, Prediction: 24.95713

Label: 19.8, Prediction: 18.760113

Label: 18, Prediction: 18.633408

Label: 12.7, Prediction: 18.802534

Label: 32.7, Prediction: 30.13248

Label: 23.9, Prediction: 24.94287



<h4><center>Evaluace modelu</center></h4>
<hr>

In [39]:
// Evaluation function for regression models.
// Scores `valDf` with `model`, prints R-squared, MAE and MSE, prints 20 randomly
// drawn (with replacement) label/score pairs and shows overlaid histograms of the
// predictions and the labels.
//   model:     trained transformer applied to `valDf`.
//   valDf:     data frame to evaluate on.
//   context:   ML.NET context used to compute the regression metrics.
//   modelName: title of the histogram chart.
void Evaluation(ITransformer model, DataFrame valDf, MLContext context, string modelName)
{
    var valPredictions = model.Transform(valDf);

    var valMetrics = context.Regression.Evaluate(valPredictions);

    Console.WriteLine($"R-squared: {valMetrics.RSquared}");
    Console.WriteLine($"Mean Absolute Error: {valMetrics.MeanAbsoluteError}");
    Console.WriteLine($"Mean Squared Error: {valMetrics.MeanSquaredError}");

    // Pair each label with its score BEFORE dropping zero values. Filtering the two
    // columns independently could remove a value from only one of them and shift
    // every following pair (or make the lists differ in length).
    var pairs = valPredictions.GetColumn<Single>("Label")
        .Zip(valPredictions.GetColumn<Single>("Score"), (label, prediction) => new { Label = label, Score = prediction })
        .Where(pair => pair.Label != 0 && pair.Score != 0)
        .ToList();

    var score = pairs.Select(pair => pair.Score).ToList();
    var valLabel = pairs.Select(pair => pair.Label).ToList();

    Console.WriteLine("Náhodný výběr predikcí:");
    Random rand = new Random();
    // Nothing to sample from an empty result; indexing would throw otherwise
    int sampleCount = valLabel.Count > 0 ? 20 : 0;
    for (int i = 0; i < sampleCount; i++)
    {
        var index = rand.Next(0, valLabel.Count);
        Console.WriteLine($"Label: {valLabel[index]} | Score: {score[index]}");
    }

    var hist = Chart.Plot(
        new List<XPlot.Plotly.Histogram>() {
            new XPlot.Plotly.Histogram
            {
                x = score,
                marker = new Marker(){ color = "rgb(52, 67, 213)" },
                name = "Prediction"
            },
            new XPlot.Plotly.Histogram
            {
                x = valLabel,
                marker = new Marker(){ color = "rgb(39, 226, 101)" },
                name = "Label"
            }
        }
    );

    var hist_layout = new Layout.Layout
    {
        title = modelName,
        yaxis = new Yaxis
        {
            title = "Quantity"
        },
        bargap = 0.15
    };

    hist.WithLayout(hist_layout);
    hist.Show();
}

In [40]:
// Evaluate the FastTree model on the hold-out data
Evaluation(fastTreeModel, validationDf, context, "Fast Tree")

R-squared: 0.8958404047040569
Mean Absolute Error: 2.5325248007680856
Mean Squared Error: 11.011221471286387
Náhodný výběr predikcí:
Label: 50 | Score: 44.725727
Label: 8.5 | Score: 12.432202
Label: 22.8 | Score: 25.19674
Label: 18.6 | Score: 20.345438
Label: 14.5 | Score: 12.696164
Label: 14.6 | Score: 11.264418
Label: 15.6 | Score: 15.815836
Label: 21.4 | Score: 24.406475
Label: 35.2 | Score: 42.38282
Label: 43.5 | Score: 48.238728
Label: 43.5 | Score: 48.238728
Label: 22.5 | Score: 23.271666
Label: 20.1 | Score: 20.429728
Label: 32 | Score: 34.37011
Label: 22.8 | Score: 25.19674
Label: 12.8 | Score: 10.309395
Label: 18.6 | Score: 20.345438
Label: 21.1 | Score: 19.544222
Label: 23.7 | Score: 26.28244
Label: 14.3 | Score: 16.37824


In [41]:
// Evaluate the FastForest model on the hold-out data
Evaluation(fastForestModel, validationDf, context, "Fast Forest")

R-squared: 0.7632663048346531
Mean Absolute Error: 2.8619693494310567
Mean Squared Error: 25.02627952590715
Náhodný výběr predikcí:
Label: 20.1 | Score: 20.50778
Label: 10.2 | Score: 15.54994
Label: 14.5 | Score: 15.672198
Label: 20.6 | Score: 22.075916
Label: 23.7 | Score: 26.606888
Label: 14.8 | Score: 16.926413
Label: 16.6 | Score: 19.838179
Label: 14.5 | Score: 15.672198
Label: 20.1 | Score: 20.32595
Label: 39.8 | Score: 38.299408
Label: 50 | Score: 22.357214
Label: 50 | Score: 41.227646
Label: 17.8 | Score: 24.052418
Label: 15.6 | Score: 17.691248
Label: 21.1 | Score: 22.266865
Label: 22.2 | Score: 24.483326
Label: 23.7 | Score: 26.606888
Label: 44.8 | Score: 41.605473
Label: 28.7 | Score: 32.528297
Label: 17.5 | Score: 19.363976


In [44]:
// Evaluate the LightGBM model on the hold-out data (chart title spelling fixed)
Evaluation(lightGbmModel, validationDf, context, "LightGBM")

R-squared: 0.8807571613789241
Mean Absolute Error: 2.5034953846650967
Mean Squared Error: 12.605745070254402
Náhodný výběr predikcí:
Label: 14.8 | Score: 14.461023
Label: 14.5 | Score: 12.895227
Label: 19.6 | Score: 17.065653
Label: 24.7 | Score: 24.82804
Label: 24.7 | Score: 24.82804
Label: 14.6 | Score: 14.76269
Label: 21.2 | Score: 19.782991
Label: 28.1 | Score: 26.73675
Label: 22.8 | Score: 25.91593
Label: 28.1 | Score: 26.73675
Label: 23.7 | Score: 26.278484
Label: 21.7 | Score: 22.385906
Label: 24.7 | Score: 24.82804
Label: 22.2 | Score: 21.69516
Label: 18.6 | Score: 16.728546
Label: 43.5 | Score: 49.8472
Label: 28.7 | Score: 32.17497
Label: 5 | Score: 10.442846
Label: 25.2 | Score: 24.583466
Label: 50 | Score: 43.421223


In [42]:
// Evaluate the OLS model on the hold-out data
Evaluation(olsModel, validationDf, context, "Ordinary Least Squares")

R-squared: 0.7128826808506185
Mean Absolute Error: 3.863514114828671
Mean Squared Error: 30.352579427879125
Náhodný výběr predikcí:
Label: 33.4 | Score: 28.670979
Label: 25.2 | Score: 28.536564
Label: 28.7 | Score: 31.090733
Label: 44.8 | Score: 38.544674
Label: 39.8 | Score: 34.625546
Label: 16.6 | Score: 18.505169
Label: 28.1 | Score: 25.583588
Label: 33.4 | Score: 28.670979
Label: 17.8 | Score: 21.91546
Label: 22.2 | Score: 23.844402
Label: 28.1 | Score: 25.583588
Label: 28.1 | Score: 25.583588
Label: 28.2 | Score: 32.45364
Label: 20.1 | Score: 14.635078
Label: 21.5 | Score: 23.86486
Label: 43.5 | Score: 40.634293
Label: 35.2 | Score: 36.519062
Label: 39.8 | Score: 34.625546
Label: 28.7 | Score: 25.276806
Label: 23.7 | Score: 28.016262


<h4><center>Auto ML </center></h4>
<div><center>Automaticky natrénovaný model s nejvyšším RSquared</center></div>
<hr>

In [33]:
// AutoML: run a regression experiment with a 10-second time budget on the full data set
var experiment = context.Auto().CreateRegressionExperiment(maxExperimentTimeInSeconds: 10);
var result = experiment.Execute(data);

// Print the trainer of the best run and its R-squared validation metric
var bestRun = result.BestRun;
Console.WriteLine(bestRun.TrainerName);
Console.WriteLine(bestRun.ValidationMetrics.RSquared);

FastTreeTweedieRegression
0.862374573388809
