# ML.NET - Tokenization

In [6]:
// Install the ML.NET NuGet package
#r "nuget:Microsoft.ML" 

## Import the ML.NET namespaces

In [7]:
using Microsoft.ML;
using Microsoft.ML.Data;

## Declare data-classes for input data and predictions

In [8]:
/// <summary>
/// Input row for the tokenization pipelines: a boolean sentiment label plus the raw text.
/// In this notebook only <see cref="Text"/> is populated and read by the transforms.
/// </summary>
public class SentimentData
{
    /// <summary>
    /// Sentiment label, mapped to column index 0 for file-based loading.
    /// NOTE(review): the meaning of true/false (presumably positive/negative) is not
    /// established anywhere in this notebook - confirm against the data source.
    /// </summary>
    [LoadColumn(0)]
    public bool Sentiment { get; set; }

    /// <summary>
    /// Raw text to tokenize, mapped to column index 1 for file-based loading.
    /// This is the "Text" input column named by the tokenization transforms.
    /// </summary>
    [LoadColumn(1)]
    public string Text { get; set; }
}
  
/// <summary>
/// Prediction output row holding the tokens produced for one input text.
/// </summary>
public class SentimentTokens
{
    /// <summary>
    /// Tokens in the order they were produced. The property name corresponds to the
    /// "Tokens" output column written by the tokenization pipelines in this notebook.
    /// </summary>
    public string[] Tokens { get; set; }
}

## Helper function to print the tokens

In [9]:
/// <summary>
/// Writes a blank separator to the console, then every token of
/// <paramref name="tokens"/> on its own line, followed by a trailing empty line.
/// </summary>
/// <param name="tokens">Prediction result whose <c>Tokens</c> array is printed.</param>
private static void PrintTokens(SentimentTokens tokens)
{
    // WriteLine of a newline string yields two line breaks: visual gap before the list.
    Console.WriteLine(Environment.NewLine);

    var items = tokens.Tokens;
    var buffer = new StringBuilder();

    // Accumulate everything first so the whole list is emitted with a single write.
    for (var index = 0; index < items.Length; index++)
    {
        buffer.Append(items[index]).Append(Environment.NewLine);
    }

    Console.WriteLine(buffer.ToString());
}

## Build the word and character tokenization pipelines and print the tokens

In [10]:
// MLContext is the entry point for every ML.NET operation used below.
var context = new MLContext();

// The estimators below are fitted on an empty data set: it is only needed to supply
// the input schema (the "Text" column) to Fit, as the cell output demonstrates.
var emptyData = new List<SentimentData>();

var data = context.Data.LoadFromEnumerable(emptyData);

// Single sample shared by both pipelines so their outputs are directly comparable.
var sampleInput = new SentimentData { Text = "This is a test sentence, and it is a long one." };

// --- Word tokenization: split "Text" on space, period and comma into "Tokens". ---
var tokenization = context.Transforms.Text.TokenizeIntoWords("Tokens", "Text", separators: new[] { ' ', '.', ',' });

var tokenModel = tokenization.Fit(data);

var engine = context.Model.CreatePredictionEngine<SentimentData, SentimentTokens>(tokenModel);

var tokens = engine.Predict(sampleInput);

PrintTokens(tokens);

// --- Character tokenization: one key per character (no start/end marker characters),
// then map the keys back to their values so they can be read as strings. ---
var charTokenization = context.Transforms.Text.TokenizeIntoCharactersAsKeys("Tokens", "Text", useMarkerCharacters: false)
    .Append(context.Transforms.Conversion.MapKeyToValue("Tokens"));

var charTokenModel = charTokenization.Fit(data);

var charEngine = context.Model.CreatePredictionEngine<SentimentData, SentimentTokens>(charTokenModel);

var charTokens = charEngine.Predict(sampleInput);

PrintTokens(charTokens);



This
is
a
test
sentence
and
it
is
a
long
one



T
h
i
s
<␠>
i
s
<␠>
a
<␠>
t
e
s
t
<␠>
s
e
n
t
e
n
c
e
,
<␠>
a
n
d
<␠>
i
t
<␠>
i
s
<␠>
a
<␠>
l
o
n
g
<␠>
o
n
e
.

