# ML.NET - Fraud Detection Classifier

## Davi Ramos -> Cientista de Dados 👋
(davi.info@gmail.com)

[![Linkedin Badge](https://img.shields.io/badge/-LinkedIn-blue?style=flat-square&logo=Linkedin&logoColor=white&link=https://www.linkedin.com/in/davi-ramos/)](https://www.linkedin.com/in/davi-ramos/)
[![Twitter Badge](https://img.shields.io/badge/-Twitter-1DA1F2?style=flat-square&logo=Twitter&logoColor=white&link=https://twitter.com/Daviinfo/)](https://twitter.com/Daviinfo/)
<a href="https://github.com/DaviRamos"><img src="https://img.shields.io/github/followers/DaviRamos.svg?label=GitHub&style=social" alt="GitHub"></a>

In [1]:
#r "nuget:Microsoft.ML"
#r "nuget:Microsoft.ML.FastTree"
#r "nuget:Microsoft.ML.AutoML"
#r "nuget:Microsoft.Data.Analysis"

Installed package Microsoft.ML version 1.5.0

Installed package Microsoft.ML.AutoML version 0.17.0

Installed package Microsoft.Data.Analysis version 0.4.0

Installed package Microsoft.ML.FastTree version 1.5.0

#### Register an HTML formatter for the DataFrame

In [2]:
using Microsoft.Data;
using Microsoft.Data.Analysis;
using XPlot.Plotly;

In [3]:
using Microsoft.AspNetCore.Html;
// Registers an HTML renderer for DataFrame values: a header row (an italic
// "index" cell followed by one cell per column name) and at most the first
// 10 data rows, each prefixed with its row number.
Formatter<DataFrame>.Register((frame, output) =>
{
    const int maxRows = 10;

    // Header cells: the "index" label first, then every column name.
    var headerCells = new List<IHtmlContent>();
    headerCells.Add(th(i("index")));
    foreach (var column in frame.Columns)
    {
        headerCells.Add((IHtmlContent) th(column.Name));
    }

    // Body cells: one list per rendered row, capped at maxRows.
    var bodyCells = new List<List<IHtmlContent>>();
    var rowLimit = Math.Min(maxRows, frame.Rows.Count);
    for (var rowIndex = 0; rowIndex < rowLimit; rowIndex++)
    {
        var rowCells = new List<IHtmlContent>();
        rowCells.Add(td(rowIndex));
        foreach (var value in frame.Rows[rowIndex])
        {
            rowCells.Add(td(value));
        }
        bodyCells.Add(rowCells);
    }

    var head = thead(headerCells);
    var body = tbody(bodyCells.Select(rowCells => tr(rowCells)));
    var markup = table(head, body);

    output.Write(markup);
}, "text/html");

#### Fetch the data

In [4]:
using System.IO;
using System.Net.Http;

// Local path of the transactions CSV used by this notebook.
string dataPath = "data.csv";

// Download the sample dataset only when there is no local copy yet.
if (!File.Exists(dataPath))
{
    // Dispose the client once the download finishes, and await the request
    // instead of blocking the kernel thread on .Result.
    using (var client = new HttpClient())
    {
        var contents = await client
            .GetStringAsync("https://aslottepublic.blob.core.windows.net/public/data-small.csv");

        // Write to dataPath (not a second hard-coded file name) so the
        // existence check above and the written file always agree.
        File.WriteAllText(dataPath, contents);
    }
}

In [21]:
// Load the CSV at dataPath into a DataFrame, using ',' as the field separator.
var dataFrame = DataFrame.LoadCsv(dataPath, separator: ',');
// Trailing expression with no semicolon: the cell's value is what the notebook displays.
dataFrame

index,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,983964,C1231006815,170136,16029636,M1979787155,0,0,0,0
1,1,PAYMENT,186428,C1666544295,21249,1938472,M2044282225,0,0,0,0
2,1,TRANSFER,181,C1305486145,181,0,C553264065,0,0,1,0
3,1,CASH_OUT,181,C840083671,181,0,C38997010,21182,0,1,0
4,1,PAYMENT,1166814,C2048537720,41554,2988586,M1230701703,0,0,0,0
5,1,PAYMENT,781771,C90045638,53860,4604229,M573487274,0,0,0,0
6,1,PAYMENT,710777,C154988899,183195,17608724,M408069119,0,0,0,0
7,1,PAYMENT,786164,C1912850431,17608724,16822560,M633326333,0,0,0,0
8,1,PAYMENT,402436,C1265012928,2671,0,M1176932104,0,0,0,0
9,1,DEBIT,533777,C712410124,41720,3638223,C195600860,41898,4034879,0,0


#### Explore the data

In [22]:
// Summarize the DataFrame's columns; the trailing expression is displayed as the cell output.
dataFrame.Description()

index,Description,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,Length (excluding null values),402037.0,402037.0,402037.0,402037.0,402037.0,402037.0,402037.0,402037
1,Max,18.0,1000000000.0,3889999900.0,3889999900.0,4150000100.0,4150000100.0,1.0,0
2,Min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0
3,Mean,12.74421,15681869.0,84593056.0,88583936.0,91173880.0,109914320.0,0.00051239063,0


In [36]:
// Histogram of the "amount" column with nbinsx set to 100.
var amountHistogram = new Graph.Histogram
{
    nbinsx = 100,
    x = dataFrame.Columns["amount"]
};

// Trailing expression: the resulting chart is the cell's displayed value.
Chart.Plot(amountHistogram)

In [37]:
// Histogram of the "newbalanceOrig" column with nbinsx set to 100.
var newBalanceOrigHistogram = new Graph.Histogram
{
    nbinsx = 100,
    x = dataFrame.Columns["newbalanceOrig"]
};

// Trailing expression: the resulting chart is the cell's displayed value.
Chart.Plot(newBalanceOrigHistogram)

#### Train your model

In [24]:
using Microsoft.ML;
using Microsoft.ML.Trainers.FastTree;
using Microsoft.ML.Data;

In [25]:
/// <summary>
/// Row schema for the transactions CSV. Each property is bound to a
/// zero-based file column through <c>LoadColumn</c> and given its
/// data-view column name through <c>ColumnName</c>.
/// </summary>
internal sealed class Transaction
{
    /// <summary>File column 0, named "step".</summary>
    [LoadColumn(0)]
    [ColumnName("step")]
    public float Step { get; set; }

    /// <summary>File column 1, named "type".</summary>
    [LoadColumn(1)]
    [ColumnName("type")]
    public string Type { get; set; }

    /// <summary>File column 2, named "amount".</summary>
    [LoadColumn(2)]
    [ColumnName("amount")]
    public float Amount { get; set; }

    /// <summary>File column 3, named "nameOrig".</summary>
    [LoadColumn(3)]
    [ColumnName("nameOrig")]
    public string NameOrig { get; set; }

    /// <summary>File column 4, named "oldbalanceOrg".</summary>
    [LoadColumn(4)]
    [ColumnName("oldbalanceOrg")]
    public float OldbalanceOrg { get; set; }

    /// <summary>File column 5, named "newbalanceOrig".</summary>
    [LoadColumn(5)]
    [ColumnName("newbalanceOrig")]
    public float NewbalanceOrig { get; set; }

    /// <summary>File column 6, named "nameDest".</summary>
    [LoadColumn(6)]
    [ColumnName("nameDest")]
    public string NameDest { get; set; }

    /// <summary>File column 7, named "oldbalanceDest".</summary>
    [LoadColumn(7)]
    [ColumnName("oldbalanceDest")]
    public float OldbalanceDest { get; set; }

    /// <summary>File column 8, named "newbalanceDest".</summary>
    [LoadColumn(8)]
    [ColumnName("newbalanceDest")]
    public float NewbalanceDest { get; set; }

    /// <summary>File column 9, named "isFraud"; read as a boolean.</summary>
    [LoadColumn(9)]
    [ColumnName("isFraud")]
    public bool IsFraud { get; set; }

    /// <summary>File column 10, named "isFlaggedFraud".</summary>
    [LoadColumn(10)]
    [ColumnName("isFlaggedFraud")]
    public float IsFlaggedFraud { get; set; }
}

#### Load the data

In [28]:
// ML.NET entry point, created with a fixed seed (1).
var mlContext = new MLContext(seed: 1);

// Read the comma-separated file at dataPath using the Transaction schema, skipping the header row.
var data = mlContext.Data.LoadFromTextFile<Transaction>(dataPath, hasHeader: true, separatorChar: ',');
// Split into TrainSet / TestSet using TrainTestSplit's default settings.
var testTrainData = mlContext.Data.TrainTestSplit(data);

#### Create a data processing pipeline

In [29]:
// Feature-engineering pipeline, expressed as one flat estimator chain:
//   1. one-hot encode "type";
//   2. one-hot hash encode "nameDest";
//   3. concatenate the encoded and numeric columns into "Features";
//   4. min-max normalize "Features".
// Each step is appended to the chain itself. Previously the normalizer was
// appended to the Concatenate estimator because of a misplaced closing
// parenthesis, which produced a nested chain.
var dataProcessingPipeline = mlContext.Transforms.Categorical.OneHotEncoding("type")
    .Append(mlContext.Transforms.Categorical.OneHotHashEncoding("nameDest"))
    .Append(mlContext.Transforms.Concatenate("Features", "type", "nameDest", "amount", "oldbalanceOrg", "oldbalanceDest", "newbalanceOrig", "newbalanceDest"))
    .Append(mlContext.Transforms.NormalizeMinMax("Features"));

#### Create a training pipeline

In [30]:
// FastTree settings: NumberOfTrees = 10, NumberOfLeaves = 10, with
// "isFraud" as the label column and "Features" as the feature column.
var fastTreeOptions = new FastTreeBinaryTrainer.Options
{
    LabelColumnName = "isFraud",
    FeatureColumnName = "Features",
    NumberOfTrees = 10,
    NumberOfLeaves = 10
};

// Full training pipeline: data processing followed by the FastTree trainer.
var fastTreeTrainer = mlContext.BinaryClassification.Trainers.FastTree(fastTreeOptions);
var trainingPipeline = dataProcessingPipeline.Append(fastTreeTrainer);

#### Train our model

In [31]:
// Fit the whole training pipeline on the training split only.
var trainedModel = trainingPipeline.Fit(testTrainData.TrainSet);

#### Evaluate performance

In [32]:
// Apply the trained model to the held-out test split.
var predictions = trainedModel.Transform(testTrainData.TestSet);

// Compute binary-classification metrics against the "isFraud" label and show them.
var metrics = mlContext.BinaryClassification.Evaluate(predictions, labelColumnName: "isFraud");
display(metrics)

LogLoss,LogLossReduction,Entropy,AreaUnderRocCurve,Accuracy,PositivePrecision,PositiveRecall,NegativePrecision,NegativeRecall,F1Score,AreaUnderPrecisionRecallCurve,ConfusionMatrix
0.0959483896673138,-17.94258589321288,0.0050652213065425,0.7893789001317654,0.9996023758045676,0,0,0.9996023758045676,1,0,0.2870154656113997,"{ Microsoft.ML.Data.ConfusionMatrix: PerClassPrecision: [ 0, 0.9996023758045677 ], PerClassRecall: [ 0, 1 ], Counts: [ [ 0, 16 ], [ 0, 40223 ] ], NumberOfClasses: 2 }"


## AutoML

In [38]:
//%%time 

using Microsoft.ML.AutoML;

// Budget handed to the AutoML binary-classification experiment; the API
// interprets this value as the maximum experiment time in seconds.
uint experimentBudget = 50;

// Create the experiment, then run it on the training split with "isFraud" as the label.
var experiment = mlContext.Auto().CreateBinaryClassificationExperiment(experimentBudget);
var result = experiment.Execute(testTrainData.TrainSet, labelColumnName: "isFraud");

In [39]:
// Keep only the runs that produced validation metrics.
var completedRuns = result.RunDetails.Where(run => run.ValidationMetrics != null);

// One scatter trace per trainer: x is the run time in seconds, y is the validation accuracy.
var scatters = completedRuns.GroupBy(
    run => run.TrainerName,
    (trainerName, trainerRuns) => new Graph.Scatter()
    {
        name = trainerName,
        x = trainerRuns.Select(run => run.RuntimeInSeconds),
        y = trainerRuns.Select(run => run.ValidationMetrics.Accuracy),
        mode = "markers",
        marker = new Graph.Marker() { size = 12 }
    });

// Plot every trace on one chart, label the axes, and show it.
var chart = Chart.Plot(scatters);
chart.WithXTitle("Training Time");
chart.WithYTitle("Accuracy");
display(chart);

// Report which trainer produced the best run.
Console.WriteLine("Best Trainer:" + result.BestRun.TrainerName);

Best Trainer:FastTreeBinary


#### Evaluate AutoML

In [40]:
// Score the test split with the model from AutoML's best run.
// (Re-declares `predictions` and `metrics` from the earlier evaluation cell.)
var predictions = result.BestRun.Model.Transform(testTrainData.TestSet);
// Compute binary-classification metrics against the "isFraud" label and show them.
var metrics = mlContext.BinaryClassification.Evaluate(predictions, labelColumnName: "isFraud");
display(metrics)

LogLoss,LogLossReduction,Entropy,AreaUnderRocCurve,Accuracy,PositivePrecision,PositiveRecall,NegativePrecision,NegativeRecall,F1Score,AreaUnderPrecisionRecallCurve,ConfusionMatrix
0.0026941868974778,0.4681008519809713,0.0050652213065425,0.9403808144593888,0.9997017818534256,0.8333333333333334,0.3125,0.999726592598116,0.9999751386022924,0.4545454545454545,0.4525390441223951,"{ Microsoft.ML.Data.ConfusionMatrix: PerClassPrecision: [ 0.8333333333333334, 0.999726592598116 ], PerClassRecall: [ 0.3125, 0.9999751386022923 ], Counts: [ [ 5, 11 ], [ 1, 40222 ] ], NumberOfClasses: 2 }"


#### Save

In [41]:
// Persist the model together with the input schema to "MLModel.zip".
// NOTE(review): this saves `trainedModel` (the hand-built FastTree pipeline), not
// `result.BestRun.Model` from the AutoML section above — confirm which one is intended.
mlContext.Model.Save(trainedModel, data.Schema, "MLModel.zip");