# ML.Net - Building a Fraud Classifier in ML.NET with Jupyter Notebooks

## Import Nuget Packages

In [1]:
// ML.NET Nuget packages 
#r "nuget:Microsoft.ML"     

// ML.NET FastTree Nuget packages 
#r "nuget:Microsoft.ML.FastTree"

// ML.NET LightGbm Nuget packages     
#r "nuget:Microsoft.ML.LightGbm"

// ML.NET AutoML Nuget packages   
#r "nuget:Microsoft.ML.AutoML"

// ML.NET Data.Analysis Nuget packages      
#r "nuget:Microsoft.Data.Analysis  

Installed package Microsoft.ML.LightGbm version 1.5.0

Installed package Microsoft.Data.Analysis version 0.4.0

Installed package Microsoft.ML version 1.5.0

Installed package Microsoft.ML.FastTree version 1.5.0

Installed package Microsoft.ML.AutoML version 0.17.0

## Declare Using

In [3]:
using Microsoft.ML;
using Microsoft.ML.Trainers.FastTree;
using System;
using System.Diagnostics;
using Microsoft.ML.Data;
using XPlot.Plotly;
using Microsoft.ML.AutoML;

In [5]:
// Create new stopwatch.
Stopwatch stopwatch = new Stopwatch();

// Begin timing.
stopwatch.Start();
Console.WriteLine("Time elapsed: {0}", stopwatch.Elapsed);

Time elapsed: 00:00:00.0000020


In [6]:
var mlContext = new MLContext(seed: 1);

var DataPath = "./Datasets/fraudulent-classifier/data.csv";

In [7]:
public sealed class Transaction
{
     [ColumnName("Step"), LoadColumn(0)]
     public float Step { get; set; }

     [ColumnName("Type"), LoadColumn(1)]
     public string Type { get; set; }

     [ColumnName("Amount"), LoadColumn(2)]
     public float Amount { get; set; }

     [ColumnName("NameOrig"), LoadColumn(3)]
     public string NameOrig { get; set; }

     [ColumnName("OldbalanceOrg"), LoadColumn(4)]
     public float OldbalanceOrg { get; set; }

     [ColumnName("NewbalanceOrig"), LoadColumn(5)]
     public float NewbalanceOrig { get; set; }

     [ColumnName("NameDest"), LoadColumn(6)]
     public string NameDest { get; set; }

     [ColumnName("OldbalanceDest"), LoadColumn(7)]
     public float OldbalanceDest { get; set; }

     [ColumnName("NewbalanceDest"), LoadColumn(8)]
     public float NewbalanceDest { get; set; }

     [ColumnName("IsFraud"), LoadColumn(9)]
     public bool IsFraud { get; set; }

     [ColumnName("IsFlaggedFraud"), LoadColumn(10)]
     public float IsFlaggedFraud { get; set; }

}

In [8]:
//Load
var data = mlContext.Data.LoadFromTextFile<Transaction>(DataPath, hasHeader: true, separatorChar: ',');
var testTrainData = mlContext.Data.TrainTestSplit(data);
Console.WriteLine("Time elapsed: {0}-TrainTestSplit", stopwatch.Elapsed);

Time elapsed: 00:00:04.6118122-TrainTestSplit


In [9]:
private static IEstimator<ITransformer> BuildDataProcessingPipeline(MLContext mlContext)
{
    return mlContext.Transforms.Categorical.OneHotEncoding(nameof(Transaction.Type))
            .Append(mlContext.Transforms.Categorical.OneHotHashEncoding(nameof(Transaction.NameDest))
            .Append(mlContext.Transforms.Concatenate("Features", nameof(Transaction.Type), nameof(Transaction.NameDest), 
            nameof(Transaction.Amount), nameof(Transaction.OldbalanceOrg), nameof(Transaction.OldbalanceDest), 
            nameof(Transaction.NewbalanceOrig), nameof(Transaction.NewbalanceDest))));
}

In [10]:
private static IEstimator<ITransformer> BuildTrainingPipeline(MLContext mlContext, IEstimator<ITransformer> dataProcessingPipeline)
{
    return dataProcessingPipeline
  .Append(mlContext.BinaryClassification.Trainers.LbfgsLogisticRegression(labelColumnName: nameof(Transaction.IsFraud)));
}

In [11]:
private void PrintMetrics(BinaryClassificationMetrics metrics)
{
    Console.WriteLine($"Accuracy: {metrics.Accuracy}");
    Console.WriteLine($"AUCPC: {metrics.AreaUnderPrecisionRecallCurve}");
    Console.WriteLine($"Recall: {metrics.PositiveRecall}");
    Console.WriteLine($"Precision: {metrics.PositivePrecision}");
    Console.WriteLine($"F1Score: {metrics.F1Score}");    
}

In [12]:
mlContext.Transforms.Categorical.OneHotEncoding(nameof(Transaction.Type))

In [13]:
//Transform
var dataProcessingPipeline = BuildDataProcessingPipeline(mlContext);
Console.WriteLine("Time elapsed: {0}-BuildDataProcessingPipeline", stopwatch.Elapsed);

Time elapsed: 00:00:13.6751175-BuildDataProcessingPipeline


In [14]:
 //Train
var trainingPipeline = BuildTrainingPipeline(mlContext, dataProcessingPipeline);
Console.WriteLine("Time elapsed: {0}-BuildTrainingPipeline", stopwatch.Elapsed);

Time elapsed: 00:00:14.5835199-BuildTrainingPipeline


In [15]:
var trainedModel = trainingPipeline.Fit(testTrainData.TrainSet);

Console.WriteLine("Time elapsed: {0}-Fit", stopwatch.Elapsed);

Time elapsed: 00:00:19.4764243-Fit


In [16]:
var predictions = trainedModel.Transform(testTrainData.TestSet);

Console.WriteLine("Time elapsed: {0}-Transform", stopwatch.Elapsed);

Time elapsed: 00:00:19.5601631-Transform


In [17]:
//Evaluate
var metrics = mlContext.BinaryClassification.Evaluate(predictions, labelColumnName: nameof(Transaction.IsFraud));  

Console.WriteLine("Time elapsed: {0}-Evaluate", stopwatch.Elapsed);

Time elapsed: 00:00:20.0072584-Evaluate


In [18]:
PrintMetrics(metrics)

Accuracy: 0,9997514848778548
AUCPC: 0,6363275178989002
Recall: 0,5625
Precision: 0,75
F1Score: 0,6428571428571429


In [19]:
metrics

LogLoss,LogLossReduction,Entropy,AreaUnderRocCurve,Accuracy,PositivePrecision,PositiveRecall,NegativePrecision,NegativeRecall,F1Score,AreaUnderPrecisionRecallCurve,ConfusionMatrix
inf,-inf,0.0050652213065425,0.918822874972031,0.9997514848778548,0.75,0.5625,0.9998259875208192,0.9999254158068768,0.6428571428571429,0.6363275178989002,"{ Microsoft.ML.Data.ConfusionMatrix: PerClassPrecision: [ 0.75, 0.9998259875208193 ], PerClassRecall: [ 0.5625, 0.9999254158068767 ], Counts: [ [ 9, 7 ], [ 3, 40220 ] ], NumberOfClasses: 2 }"


In [20]:
//Save
mlContext.Model.Save(trainedModel, data.Schema, @"./Datasets/fraudulent-classifier/LModel_LbfgsLogisticRegression.zip");
Console.WriteLine("End Program Time elapsed: {0}", stopwatch.Elapsed);
//stopwatch.Stop();

End Program Time elapsed: 00:00:23.7501100


In [21]:
var trainingPipeline = dataProcessingPipeline
  .Append(mlContext.BinaryClassification.Trainers.FastTree(new FastTreeBinaryTrainer.Options 
{ 
  NumberOfLeaves = 10, 
  NumberOfTrees = 50,  
  LabelColumnName = nameof(Transaction.IsFraud),
  FeatureColumnName = "Features" 
}));

In [22]:
var trainedModel = trainingPipeline.Fit(testTrainData.TrainSet);

Console.WriteLine("Time elapsed: {0}-Fit", stopwatch.Elapsed);

Time elapsed: 00:00:33.0346438-Fit


In [23]:
var predictions = trainedModel.Transform(testTrainData.TestSet);

Console.WriteLine("Time elapsed: {0}-Transform", stopwatch.Elapsed);

Time elapsed: 00:00:33.1722258-Transform


In [24]:
var metrics = mlContext.BinaryClassification.Evaluate(predictions, labelColumnName: nameof(Transaction.IsFraud));  

Console.WriteLine("Time elapsed: {0}-Evaluate", stopwatch.Elapsed);

Time elapsed: 00:00:33.8326844-Evaluate


In [25]:
PrintMetrics(metrics)

Accuracy: 0,9996520788289968
AUCPC: 0,3414573850471399
Recall: 0,1875
Precision: 0,75
F1Score: 0,3


In [26]:
//Save
mlContext.Model.Save(trainedModel, data.Schema, @"./Datasets/fraudulent-classifier/LModel_FastTree.zip");
Console.WriteLine("End Program Time elapsed: {0}", stopwatch.Elapsed);
//stopwatch.Stop();

End Program Time elapsed: 00:00:34.0181581


# AutoML

In [27]:
using Microsoft.ML.AutoML;


var result = mlContext.Auto()
                .CreateBinaryClassificationExperiment(50)
                .Execute(testTrainData.TrainSet, labelColumnName: "IsFraud");

In [28]:
var scatters = result.RunDetails.Where(d => d.ValidationMetrics != null).GroupBy(    
    r => r.TrainerName,
    (name, details) => new Graph.Scatter()
    {
        name = name,
        x = details.Select(r => r.RuntimeInSeconds),
        y = details.Select(r => r.ValidationMetrics.Accuracy),
        mode = "markers",
        marker = new Graph.Marker() { size = 12 }
    });

var chart = Chart.Plot(scatters);
chart.WithXTitle("Training Time");
chart.WithYTitle("Accuracy");
display(chart);

Console.WriteLine($"Best Trainer:{result.BestRun.TrainerName}");

Best Trainer:FastTreeBinary


## Evaluate AutoML

In [29]:
var predictions = result.BestRun.Model.Transform(testTrainData.TestSet);
var metrics = mlContext.BinaryClassification.Evaluate(predictions, labelColumnName: "IsFraud");
display(metrics)

LogLoss,LogLossReduction,Entropy,AreaUnderRocCurve,Accuracy,PositivePrecision,PositiveRecall,NegativePrecision,NegativeRecall,F1Score,AreaUnderPrecisionRecallCurve,ConfusionMatrix
0.0026941868974778,0.4681008519809713,0.0050652213065425,0.9403808144593888,0.9997017818534256,0.8333333333333334,0.3125,0.999726592598116,0.9999751386022924,0.4545454545454545,0.4525390441223951,"{ Microsoft.ML.Data.ConfusionMatrix: PerClassPrecision: [ 0.8333333333333334, 0.999726592598116 ], PerClassRecall: [ 0.3125, 0.9999751386022923 ], Counts: [ [ 5, 11 ], [ 1, 40222 ] ], NumberOfClasses: 2 }"


In [30]:
PrintMetrics(metrics)

Accuracy: 0,9997017818534257
AUCPC: 0,4525390441223951
Recall: 0,3125
Precision: 0,8333333333333334
F1Score: 0,45454545454545453


## Save

In [31]:
mlContext.Model.Save(trainedModel, data.Schema, "./Datasets/fraudulent-classifier/ML_Autoodel.zip"); 

## Test Prediction

In [32]:
public class FraudPrediction
{
    // ColumnName attribute is used to change the column name from
    // its default value, which is the name of the field.
    [ColumnName("PredictedLabel")]
    public bool IsFraud { get; set; }

    [ColumnName("Score")]
    public float Score { get; set; }
}

In [33]:
PredictionEngine<Transaction, FraudPrediction> predictionEngine;

In [34]:
//Define DataViewSchema for data preparation pipeline and trained model
DataViewSchema modelSchema;

var model = mlContext.Model.Load(@"./Datasets/fraudulent-classifier/ML_Autoodel.zip", out modelSchema);
            
predictionEngine = mlContext.Model.CreatePredictionEngine<Transaction, FraudPrediction>(model);



In [35]:
var transaction = new Transaction 
{
        Amount = 1500f,
        OldbalanceDest = 100,
        NewbalanceDest = 300,
        NameDest = "C123",
        NameOrig = "B123"
};
          
var result = predictionEngine.Predict(transaction);

result

IsFraud,Score
False,-20.025032
