In [1]:
// Show the notebook title and a one-line summary using the h1/h4 HTML helpers passed to display(...).
display(h1("ML.NET & DataFrame demo on Jupyter!!"));
display(h4("This notebook simplifies approaches with the new DataFrame in .NET."));

In [2]:
// ML.NET NuGet package
#r "nuget:Microsoft.ML,1.4.0-preview"

// DataFrame NuGet package
#r "nuget:Microsoft.Data.DataFrame,1.0.0-e190910-1"

// AutoML NuGet package. Pinned like every other reference in this cell so that a
// fresh kernel does not resolve a newer AutoML build that could conflict with the
// Microsoft.ML preview pinned above.
// NOTE(review): 0.16.0-preview is believed to be the AutoML build released with
// Microsoft.ML 1.4.0-preview -- confirm against the NuGet version list.
#r "nuget:Microsoft.ML.AutoML,0.16.0-preview"

// XPlot plotting package
#r "nuget:XPlot.Plotly,2.0.0"

using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.Data;
using XPlot.Plotly;

In [3]:
// Temporary DataFrame formatter for this early preview: writes a DataFrame as an
// HTML table containing an index column plus the frame's columns, limited to the
// first 20 rows.

using Microsoft.AspNetCore.Html;
Formatter<DataFrame>.Register((df, writer) =>
{
    const int maxRows = 20;

    // Header: an italic "index" cell followed by one header cell per column.
    var headerCells = new List<IHtmlContent>();
    headerCells.Add(th(i("index")));
    foreach (var column in df.Columns)
    {
        headerCells.Add((IHtmlContent) th(column));
    }

    // Body: one list of cells per displayed row, starting with the row number.
    var visibleRowCount = Math.Min(maxRows, df.RowCount);
    var bodyRows = new List<List<IHtmlContent>>();
    for (var rowIndex = 0; rowIndex < visibleRowCount; rowIndex++)
    {
        var rowCells = new List<IHtmlContent>();
        rowCells.Add(td(rowIndex));
        foreach (var value in df[rowIndex])
        {
            rowCells.Add(td(value));
        }
        bodyRows.Add(rowCells);
    }

    var htmlTable = table(
        thead(headerCells),
        tbody(bodyRows.Select(rowCells => tr(rowCells))));

    writer.Write(htmlTable);
}, "text/html");

In [4]:
// Load the training data from a CSV file (relative path) into a DataFrame.
var trainDf = DataFrame.ReadCsv(@"taxi-fare-train-small.csv");
// Trailing expression without ';' so the notebook shows the result of Description().
trainDf.Description()

index,Description,fare_amount,trip_distance,trip_time_in_secs,passenger_count,rate_code
0,Length,9999.0,9999.0,9999.0,9999.0,9999.0
1,Max,120.0,54.7,4680.0,6.0,5.0
2,Min,2.5,0.0,0.0,1.0,1.0
3,Mean,11.458276,2.7357678,665.39124,1.7669767,1.0265026


In [5]:
// Call Head(5) on the training frame and display the result.
// NOTE(review): the recorded output shows a single "Unnamed: 1" column with the
// values run together; this is raised as a follow-up question in the last cell.
var dataFrameWithFiveRows = trainDf.Head(5);
display(dataFrameWithFiveRows);


index,Unnamed: 1
0,CMT1112713.8CRD17.5
1,CMT114741.5CRD8
2,CMT116371.4CRD8.5
3,CMT111810.6CSH4.5
4,CMT116611.1CRD8.5


In [6]:
// Histogram of the "fare_amount" column of the training data,
// with automatic binning turned off and nbinsx set to 20.
var faresHistogram = Chart.Plot(
    new Graph.Histogram()
    {
        x = trainDf["fare_amount"],
        autobinx = false,
        nbinsx = 20
    });

var layout = new Layout.Layout()
{
    title = "Distribution of taxi trips per cost"
};
faresHistogram.WithLayout(layout);

display(faresHistogram);

In [7]:
// Scatter plot of trip time vs. trip distance from the training data,
// with each marker colored by its fare amount ("Jet" color scale).

var chart = Chart.Plot(
    new Graph.Scatter()
    {
        x = trainDf["trip_distance"],
        y = trainDf["trip_time_in_secs"],
        mode = "markers",
        marker = new Graph.Marker()
        {
            // Must read from trainDf like x and y: the identifier `df` used here
            // before is not declared anywhere in the notebook (error CS0103).
            color = trainDf["fare_amount"],
            colorscale = "Jet"
        }
    }
);

var layout = new Layout.Layout(){title="Plot Time vs. Distance & color scale on Fares"};
chart.WithLayout(layout);
chart.Width = 500;
chart.Height = 500;
chart.WithLegend(true);

display(chart);

In [8]:
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.AutoML;

In [9]:
%%time

// Context object from which the AutoML experiment is created.
var mlContext = new MLContext();

// Create a regression experiment with a 30-second budget (maxExperimentTimeInSeconds)
// and execute it on the training DataFrame with "fare_amount" as the label column.
var experiment = mlContext.Auto().CreateRegressionExperiment(maxExperimentTimeInSeconds: 30);
var result = experiment.Execute(trainDf, labelColumnName:"fare_amount");

Wall time: 38543.9302ms

In [10]:
// One scatter trace per trainer: training time (x) against mean absolute error (y)
// for each AutoML run. Runs that produced no validation metrics are skipped first,
// because reading MeanAbsoluteError through a null ValidationMetrics would throw
// when the lazy sequences are enumerated for rendering.
var scatters = result.RunDetails
    .Where(run => run.ValidationMetrics != null)
    .GroupBy(
        run => run.TrainerName,
        (name, details) => new Graph.Scatter()
        {
            name = name,
            x = details.Select(run => run.RuntimeInSeconds),
            y = details.Select(run => run.ValidationMetrics.MeanAbsoluteError),
            mode = "markers",
            marker = new Graph.Marker() { size = 12 }
        });

var chart = Chart.Plot(scatters);
chart.WithXTitle("Training Time");
chart.WithYTitle("Error");
display(chart);

// Name of the trainer behind the best run selected by the experiment.
display(h3($"Best Trainer:{result.BestRun.TrainerName}"));

In [11]:
// Read the test dataset into a DataFrame (NEW) and display its Description() table.

display("Test Dataset");
var testDf = DataFrame.ReadCsv(@"taxi-fare-test-small.csv");
display(testDf.Description());



index,Description,fare_amount,trip_distance,trip_time_in_secs,passenger_count,rate_code
0,Length,2000.0,2000.0,2000.0,2000.0,2000.0
1,Max,68.5,20.98,4380.0,6.0,5.0
2,Min,2.5,0.0,0.0,1.0,1.0
3,Mean,10.83825,2.4539683,655.26,1.2125,1.02


In [12]:
// Make predictions: apply the best run's model to the test DataFrame.
// The result is kept as a data view (not a DataFrame); its schema is displayed below.
var predictionsDataView = result.BestRun.Model.Transform(testDf);

// (CDLTLL) Open question: is there any way to convert from an IDataView to a DataFrame?

display(h4("Schema of DataView with Predictions:"));
display(predictionsDataView.Schema);


index,Name,Index,IsHidden,Type,Annotations
0,vendor_id,0,True,{ TextDataViewType: RawType: ReadOnlyMemory<Char> },{ Annotations: Schema: [ ] }
1,vendor_id,1,True,"{ KeyDataViewType: Count: 2, RawType: UInt32 }","{ Annotations: Schema: [ { Column: Name: KeyValues, Index: 0, IsHidden: False, Type: { VectorDataViewType: Dimensions: [ 2 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory`1[System.Char] }, Size: 2, RawType: Microsoft.ML.Data.VBuffer`1[System.ReadOnlyMemory`1[System.Char]] }, Annotations: { Annotations: Schema: [ ] } } ] }"
2,vendor_id,2,False,"{ VectorDataViewType: Dimensions: [ 2 ], IsKnownSize: True, ItemType: { NumberDataViewType: RawType: Single }, Size: 2, RawType: VBuffer<Single> }","{ Annotations: Schema: [ { Column: Name: SlotNames, Index: 0, IsHidden: False, Type: { VectorDataViewType: Dimensions: [ 2 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory`1[System.Char] }, Size: 2, RawType: Microsoft.ML.Data.VBuffer`1[System.ReadOnlyMemory`1[System.Char]] }, Annotations: { Annotations: Schema: [ ] } }, { Column: Name: CategoricalSlotRanges, Index: 1, IsHidden: False, Type: { VectorDataViewType: Dimensions: [ 1, 2 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.NumberDataViewType: RawType: System.Int32 }, Size: 2, RawType: Microsoft.ML.Data.VBuffer`1[System.Int32] }, Annotations: { Annotations: Schema: [ ] } }, { Column: Name: IsNormalized, Index: 2, IsHidden: False, Type: { BooleanDataViewType: RawType: System.Boolean }, Annotations: { Annotations: Schema: [ ] } } ] }"
3,rate_code,3,False,{ NumberDataViewType: RawType: Single },{ Annotations: Schema: [ ] }
4,passenger_count,4,False,{ NumberDataViewType: RawType: Single },{ Annotations: Schema: [ ] }
5,trip_time_in_secs,5,False,{ NumberDataViewType: RawType: Single },{ Annotations: Schema: [ ] }
6,trip_distance,6,False,{ NumberDataViewType: RawType: Single },{ Annotations: Schema: [ ] }
7,payment_type,7,True,{ TextDataViewType: RawType: ReadOnlyMemory<Char> },{ Annotations: Schema: [ ] }
8,payment_type,8,True,"{ KeyDataViewType: Count: 4, RawType: UInt32 }","{ Annotations: Schema: [ { Column: Name: KeyValues, Index: 0, IsHidden: False, Type: { VectorDataViewType: Dimensions: [ 4 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory`1[System.Char] }, Size: 4, RawType: Microsoft.ML.Data.VBuffer`1[System.ReadOnlyMemory`1[System.Char]] }, Annotations: { Annotations: Schema: [ ] } } ] }"
9,payment_type,9,False,"{ VectorDataViewType: Dimensions: [ 4 ], IsKnownSize: True, ItemType: { NumberDataViewType: RawType: Single }, Size: 4, RawType: VBuffer<Single> }","{ Annotations: Schema: [ { Column: Name: SlotNames, Index: 0, IsHidden: False, Type: { VectorDataViewType: Dimensions: [ 4 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.TextDataViewType: RawType: System.ReadOnlyMemory`1[System.Char] }, Size: 4, RawType: Microsoft.ML.Data.VBuffer`1[System.ReadOnlyMemory`1[System.Char]] }, Annotations: { Annotations: Schema: [ ] } }, { Column: Name: CategoricalSlotRanges, Index: 1, IsHidden: False, Type: { VectorDataViewType: Dimensions: [ 1, 2 ], IsKnownSize: True, ItemType: { Microsoft.ML.Data.NumberDataViewType: RawType: System.Int32 }, Size: 2, RawType: Microsoft.ML.Data.VBuffer`1[System.Int32] }, Annotations: { Annotations: Schema: [ ] } }, { Column: Name: IsNormalized, Index: 2, IsHidden: False, Type: { BooleanDataViewType: RawType: System.Boolean }, Annotations: { Annotations: Schema: [ ] } } ] }"


In [13]:
// Extract the actual and predicted values into two arrays.
// The columns are materialized once with ToArray() so that computing the maximum
// and rendering the chart reuse the same data instead of enumerating the
// data view again for every use.
var trueValues = predictionsDataView.GetColumn<float>("fare_amount").ToArray();
var predictedValues = predictionsDataView.GetColumn<float>("Score").ToArray();

var predictedVsTrue = new Graph.Scatter()
{
    x = trueValues,
    y = predictedValues,
    mode = "markers",
};

// Largest value on either axis; DefaultIfEmpty() makes an empty prediction set
// yield 0 instead of Max() throwing on an empty sequence.
var maximumValue = Math.Max(
    trueValues.DefaultIfEmpty().Max(),
    predictedValues.DefaultIfEmpty().Max());

// Diagonal reference line from the origin to (max, max): points on it are perfect predictions.
var perfectLine = new Graph.Scatter()
{
    x = new[] {0, maximumValue},
    y = new[] {0, maximumValue},
    mode = "lines",
};

var chart = Chart.Plot(new[] {predictedVsTrue, perfectLine });
chart.WithXTitle("True Values");
chart.WithYTitle("Predicted Values");
chart.WithLegend(false);
chart.Width = 600;
chart.Height = 600;
display(chart);

In [14]:
// (CDLTLL) QUESTIONS TO FOLLOW UP:

display(h4("1. Why results from .Head() is not another DataFrame that can show all the rows/columns properly?"));
// Repeat the Head(5) call to reproduce the rendering problem next to the question.
var dataFrameWithFiveRows = trainDf.Head(5);
display(dataFrameWithFiveRows);
// Result is not good: the output shows one "Unnamed: 1" column with each row's values run together.

display(h4("2. Any way to convert from an IDataView to an DataFrame?"));

index,Unnamed: 1
0,CMT1112713.8CRD17.5
1,CMT114741.5CRD8
2,CMT116371.4CRD8.5
3,CMT111810.6CSH4.5
4,CMT116611.1CRD8.5
