# ML.NET - DataFrame-AutoML-Demo

In [28]:
// Show a title (h1) and a subtitle (h4) as HTML elements in the cell output.
display(h1("ML.NET & DataFrame demo on Jupyter!!"));
display(h4("This notebook simplifies approaches with the new DataFrame in .NET."));

In [2]:
// ML.NET Nuget packages 
#r "nuget:Microsoft.ML"
    
// DataFrame Nuget package 
#r "nuget:Microsoft.Data.Analysis"
    
// AutoML Nuget package 
#r "nuget:Microsoft.ML.AutoML"
    
//Install XPlot package
#r "nuget:XPlot.Plotly"

Installed package Microsoft.Data.Analysis version 0.4.0

Installed package XPlot.Plotly version 3.0.1

Installed package Microsoft.ML.AutoML version 0.17.0

Installed package Microsoft.ML version 1.5.0

In [6]:
using Microsoft.ML;
using Microsoft.Data.Analysis;
using XPlot.Plotly;

In [7]:
using Microsoft.AspNetCore.Html;
// Register a "text/html" formatter for DataFrame. The frame is written as an HTML
// table: an "index" header followed by one header per column, and at most the
// first 10 rows, each prefixed with its row number.
Formatter<DataFrame>.Register((frame, output) =>
{
    const int maxRows = 10;

    // Header cells: the "index" caption first, then every column name.
    var headerCells = new List<IHtmlContent>();
    headerCells.Add(th(i("index")));
    foreach (var column in frame.Columns)
    {
        headerCells.Add((IHtmlContent) th(column.Name));
    }

    // Body rows: never more than maxRows, and never more than the frame holds.
    var bodyRows = new List<List<IHtmlContent>>();
    var shownRows = Math.Min(maxRows, frame.Rows.Count);
    for (var rowIndex = 0; rowIndex < shownRows; rowIndex++)
    {
        var rowCells = new List<IHtmlContent>();
        rowCells.Add(td(rowIndex));
        foreach (var value in frame.Rows[rowIndex])
        {
            rowCells.Add(td(value));
        }
        bodyRows.Add(rowCells);
    }

    // Assemble the table and hand it to the formatter's writer.
    var markup = table(
        thead(headerCells),
        tbody(bodyRows.Select(cells => tr(cells))));

    output.Write(markup);
}, "text/html");

In [8]:
// Load the training CSV into a DataFrame.
var trainDf = DataFrame.LoadCsv(@"./Datasets/taxi/taxi-fare-train-small.csv");
// Last expression of the cell, deliberately without ';' so that the summary
// returned by Description() is shown as the cell output.
trainDf.Description()

index,Description,rate_code,passenger_count,trip_time_in_secs,trip_distance,fare_amount
0,Length (excluding null values),9999.0,9999.0,9999.0,9999.0,9999.0
1,Max,5.0,6.0,4680.0,2855.0,965.0
2,Min,1.0,1.0,0.0,0.0,3.0
3,Mean,1.0265026,1.7669767,665.39124,61.70327,60.569256


In [9]:
// Take the first 5 rows of the training data with Head(5) and display them.
var dataFrameWithFiveRows = trainDf.Head(5);
display(dataFrameWithFiveRows);


index,vendor_id,rate_code,passenger_count,trip_time_in_secs,trip_distance,payment_type,fare_amount
0,CMT,1,1,1271,38,CRD,175
1,CMT,1,1,474,15,CRD,8
2,CMT,1,1,637,14,CRD,85
3,CMT,1,1,181,6,CSH,45
4,CMT,1,1,661,11,CRD,85


In [11]:
// Histogram over the "fare_amount" column, with autobinx turned off and nbinsx set to 20.
var faresHistogram = Chart.Plot(
    new Graph.Histogram()
    {
        x = trainDf.Columns["fare_amount"],
        autobinx = false,
        nbinsx = 20
    });

// Give the chart its title, then render it in the cell output.
var layout = new Layout.Layout()
{
    title = "Distribution of taxi trips per cost"
};
faresHistogram.WithLayout(layout);

display(faresHistogram);

In [14]:
// Scatter plot of trip_time_in_secs (y) against trip_distance (x).
// Each marker is colored by its fare_amount value using the "Jet" color scale.

var chart = Chart.Plot(
    new Graph.Scatter()
    {
        x = trainDf.Columns["trip_distance"],
        y = trainDf.Columns["trip_time_in_secs"],
        mode = "markers",
        marker = new Graph.Marker()
        {
            color = trainDf.Columns["fare_amount"],
            colorscale = "Jet"
        }
    }
);

// Apply the title, size the chart to 500 x 500 and turn the legend on before displaying it.
var layout = new Layout.Layout(){title="Plot Time vs. Distance & color scale on Fares"};
chart.WithLayout(layout);
chart.Width = 500;
chart.Height = 500;
chart.WithLegend(true);

display(chart);

In [15]:
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.AutoML;

In [19]:
//%%time

// Create the ML.NET context used to build the AutoML experiment.
var mlContext = new MLContext();

// Configure a regression experiment with maxExperimentTimeInSeconds = 30, then
// run it on the training DataFrame using "fare_amount" as the label column.
var experiment = mlContext.Auto().CreateRegressionExperiment(maxExperimentTimeInSeconds: 30);
var result = experiment.Execute(trainDf, labelColumnName:"fare_amount");

In [23]:
// Read the test dataset into a DataFrame (NEW) and display its summary.
// Only the test CSV is loaded in this cell; the training data lives in trainDf.

display("Test Dataset");
var testDf = DataFrame.LoadCsv(@"./Datasets/taxi/taxi-fare-test-small.csv");
display(testDf.Description());

Test Dataset

index,Description,rate_code,passenger_count,trip_time_in_secs,trip_distance,fare_amount
0,Length (excluding null values),2000.0,2000.0,2000.0,2000.0,2000.0
1,Max,5.0,6.0,4380.0,2098.0,685.0
2,Min,1.0,1.0,0.0,0.0,25.0
3,Mean,1.02,1.2125,655.26,225.903,108.3825


In [24]:
// Make predictions on the test DataFrame with the model of the experiment's best run.
// NOTE: the result is consumed as a data view (Schema, GetColumn), not as a DataFrame.
var predictionsDataView = result.BestRun.Model.Transform(testDf);

// (CDLTLL) Is there any way to convert from an IDataView to a DataFrame?

// Show a heading followed by the schema of the predictions data view.
display(h4("Schema of DataView with Predictions:"));
display(predictionsDataView.Schema);


index,Name,Index,IsHidden,Type,Annotations
0,vendor_id,0,True,{ Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory<System.Char> },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
1,vendor_id,1,True,"{ Microsoft.ML.Data.KeyDataViewType: Count: 2, RawType: System.UInt32 }","{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ { Microsoft.ML.DataViewSchema+Column: Name: KeyValues, Index: 0, IsHidden: False, Type: { Microsoft.ML.Data.VectorDataViewType: Dimensions: [ 2 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory`1[System.Char] }, Size: 2, RawType: Microsoft.ML.Data.VBuffer`1[System.ReadOnlyMemory`1[System.Char]] }, Annotations: { Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] } } ] }"
2,vendor_id,2,False,"{ Microsoft.ML.Data.VectorDataViewType: Dimensions: [ 2 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.NumberDataViewType: RawType: System.Single }, Size: 2, RawType: Microsoft.ML.Data.VBuffer<System.Single> }","{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ { Microsoft.ML.DataViewSchema+Column: Name: SlotNames, Index: 0, IsHidden: False, Type: { Microsoft.ML.Data.VectorDataViewType: Dimensions: [ 2 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory`1[System.Char] }, Size: 2, RawType: Microsoft.ML.Data.VBuffer`1[System.ReadOnlyMemory`1[System.Char]] }, Annotations: { Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] } }, { Microsoft.ML.DataViewSchema+Column: Name: CategoricalSlotRanges, Index: 1, IsHidden: False, Type: { Microsoft.ML.Data.VectorDataViewType: Dimensions: [ 1, 2 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.NumberDataViewType: RawType: System.Int32 }, Size: 2, RawType: Microsoft.ML.Data.VBuffer`1[System.Int32] }, Annotations: { Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] } }, { Microsoft.ML.DataViewSchema+Column: Name: IsNormalized, Index: 2, IsHidden: False, Type: { Microsoft.ML.Data.BooleanDataViewType: RawType: System.Boolean }, Annotations: { Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] } } ] }"
3,rate_code,3,False,{ Microsoft.ML.Data.NumberDataViewType: RawType: System.Single },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
4,passenger_count,4,False,{ Microsoft.ML.Data.NumberDataViewType: RawType: System.Single },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
5,trip_time_in_secs,5,False,{ Microsoft.ML.Data.NumberDataViewType: RawType: System.Single },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
6,trip_distance,6,False,{ Microsoft.ML.Data.NumberDataViewType: RawType: System.Single },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
7,payment_type,7,True,{ Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory<System.Char> },{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] }
8,payment_type,8,True,"{ Microsoft.ML.Data.KeyDataViewType: Count: 4, RawType: System.UInt32 }","{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ { Microsoft.ML.DataViewSchema+Column: Name: KeyValues, Index: 0, IsHidden: False, Type: { Microsoft.ML.Data.VectorDataViewType: Dimensions: [ 4 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory`1[System.Char] }, Size: 4, RawType: Microsoft.ML.Data.VBuffer`1[System.ReadOnlyMemory`1[System.Char]] }, Annotations: { Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] } } ] }"
9,payment_type,9,False,"{ Microsoft.ML.Data.VectorDataViewType: Dimensions: [ 4 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.NumberDataViewType: RawType: System.Single }, Size: 4, RawType: Microsoft.ML.Data.VBuffer<System.Single> }","{ Microsoft.ML.DataViewSchema+Annotations: Schema: [ { Microsoft.ML.DataViewSchema+Column: Name: SlotNames, Index: 0, IsHidden: False, Type: { Microsoft.ML.Data.VectorDataViewType: Dimensions: [ 4 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory`1[System.Char] }, Size: 4, RawType: Microsoft.ML.Data.VBuffer`1[System.ReadOnlyMemory`1[System.Char]] }, Annotations: { Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] } }, { Microsoft.ML.DataViewSchema+Column: Name: CategoricalSlotRanges, Index: 1, IsHidden: False, Type: { Microsoft.ML.Data.VectorDataViewType: Dimensions: [ 1, 2 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.NumberDataViewType: RawType: System.Int32 }, Size: 2, RawType: Microsoft.ML.Data.VBuffer`1[System.Int32] }, Annotations: { Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] } }, { Microsoft.ML.DataViewSchema+Column: Name: IsNormalized, Index: 2, IsHidden: False, Type: { Microsoft.ML.Data.BooleanDataViewType: RawType: System.Boolean }, Annotations: { Microsoft.ML.DataViewSchema+Annotations: Schema: [ ] } } ] }"


In [25]:
// Extract the actual values and the predicted values into two arrays.
// GetColumn<T> hands back a lazily evaluated sequence, so without ToArray() the
// predictions would be recomputed on every enumeration (once per Max() call below
// and again when the chart reads the series). Materializing once avoids that.
var trueValues = predictionsDataView.GetColumn<float>("fare_amount").ToArray();
var predictedValues = predictionsDataView.GetColumn<float>("Score").ToArray();

// One marker per test row: actual fare on x, predicted fare on y.
var predictedVsTrue = new Graph.Scatter()
{
    x = trueValues,
    y = predictedValues,
    mode = "markers",
};

// Largest value on either axis; used as the end point of the reference line.
var maximumValue = Math.Max(trueValues.Max(), predictedValues.Max());

// Diagonal from (0, 0) to (max, max): points on this line are perfect predictions.
var perfectLine = new Graph.Scatter()
{
    x = new[] {0f, maximumValue},
    y = new[] {0f, maximumValue},
    mode = "lines",
};

// Draw both traces on a 600 x 600 chart with axis titles and no legend.
var chart = Chart.Plot(new[] {predictedVsTrue, perfectLine });
chart.WithXTitle("True Values");
chart.WithYTitle("Predicted Values");
chart.WithLegend(false);
chart.Width = 600;
chart.Height = 600;
display(chart);

In [27]:
// (CDLTLL) QUESTIONS TO FOLLOW UP:

// Question 1: show the question as a heading, then take Head(5) of the training
// data again and display it so the rendered result can be inspected.
display(h4("1. Why results from .Head() is not another DataFrame that can show all the rows/columns properly?"));
var dataFrameWithFiveRows = trainDf.Head(5);
display(dataFrameWithFiveRows);
// NOTE(review): the author judged this rendered result unsatisfactory ("Result is not good...");
// the specific problem is not recorded in this cell - confirm before acting on it.

// Question 2: heading only; no code accompanies it in this cell.
display(h4("2. Any way to convert from an IDataView to an DataFrame?"));

index,vendor_id,rate_code,passenger_count,trip_time_in_secs,trip_distance,payment_type,fare_amount
0,CMT,1,1,1271,38,CRD,175
1,CMT,1,1,474,15,CRD,8
2,CMT,1,1,637,14,CRD,85
3,CMT,1,1,181,6,CSH,45
4,CMT,1,1,661,11,CRD,85
