In [2]:
using DataFrames
using CSV
using Gadfly
using TextAnalysis
using MLJ
using Chain
using Pipe
using StableRNGs

In [3]:
df = CSV.read("spam_dataset.csv", DataFrames.DataFrame)
first(df, 10) |> pretty

┌──────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│[1m Category [0m│[1m Message                                                                                                                                                          [0m│
│[90m String7  [0m│[90m String                                                                                                                                                           [0m│
│[90m Textual  [0m│[90m Textual                                                                                                                                                          [0m│
├──────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ ham      │ Go until jurong point, crazy.. Available only in bugis 

In [4]:
println(size(df))

(5572, 2)


In [5]:
df = @chain df begin
    DataFrames.transform(:Message => ByRow(x -> StringDocument(x)) => :Message2)
  end
  
  @chain df begin
  first(5)
  pretty()
  end

┌──────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬──────────────────────────┐
│[1m Category [0m│[1m Message                                                                                                                                                     [0m│[1m Message2                 [0m│
│[90m String7  [0m│[90m String                                                                                                                                                      [0m│[90m StringDocument{String}   [0m│
│[90m Textual  [0m│[90m Textual                                                                                                                                                     [0m│[90m Unknown                  [0m│
├──────────┼──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────

In [6]:
text(df[:, :Message2][1])

"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."

In [7]:
tokens(df[:, :Message2][1])

21-element Vector{String}:
 "Go"
 "until"
 "jurong"
 "point"
 ","
 "crazy"
 "Available"
 "only"
 "in"
 "bugis"
 ⋮
 "world"
 "la"
 "e"
 "buffet"
 "Cine"
 "there"
 "got"
 "amore"
 "wat"

In [8]:
ngrams(df[:, :Message2][1])

Dict{String, Int64} with 21 entries:
  "amore"     => 1
  "Available" => 1
  "e"         => 1
  "there"     => 1
  "wat"       => 1
  "got"       => 1
  "Go"        => 1
  ","         => 1
  "in"        => 1
  "world"     => 1
  "point"     => 1
  "until"     => 1
  "Cine"      => 1
  "crazy"     => 1
  "la"        => 1
  "great"     => 1
  "jurong"    => 1
  "only"      => 1
  "buffet"    => 1
  ⋮           => ⋮

In [9]:
ngrams(df[:, :Message2][1], 2)

Dict{AbstractString, Int64} with 20 entries:
  "jurong point"    => 1
  "there got"       => 1
  "bugis n"         => 1
  ", crazy"         => 1
  "only in"         => 1
  "crazy Available" => 1
  "buffet Cine"     => 1
  "great world"     => 1
  "n great"         => 1
  "Cine there"      => 1
  "la e"            => 1
  "Go until"        => 1
  "point ,"         => 1
  "amore wat"       => 1
  "until jurong"    => 1
  "Available only"  => 1
  "got amore"       => 1
  "e buffet"        => 1
  "world la"        => 1
  "in bugis"        => 1

In [11]:
remove_case!.(df[:, :Message2])
prepare!.(df[:, :Message2], strip_html_tags| strip_punctuation| strip_numbers)
stem!.(df[:, :Message2])

@chain df begin
DataFrames.transform(:Message2 => ByRow(x->text(x)) => :Message2)
first(10)
pretty()
end

┌──────────┬──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│[1m Category [0m│[1m Message                                                                                                                                                          [0m│[1m Message2                                                                                                                                          [0m│
│[90m String7  [0m│[90m String                                                                                                                                                           [0m│[90m String                                                                                                                           

In [12]:
crps = Corpus(df[:, :Message2]);

In [13]:
update_lexicon!(crps)

In [14]:
m = DocumentTermMatrix(crps)

A 5572 X 7274 DocumentTermMatrix

In [15]:
dense_dtm = dtm(m, :dense)
println("Size of the dense document term matrix : $(size(dense_dtm))")

Size of the dense document term matrix : (5572, 7274)


In [16]:
tfidf_mat = tf_idf(m);
println("Size of the tf-idf matrix : $(size(tfidf_mat))")

Size of the tf-idf matrix : (5572, 7274)


In [17]:
X, y = tfidf_mat, df[:, :Category];

In [18]:
DecisionTreeClassifier = @load DecisionTreeClassifier pkg=DecisionTree
tree_model = DecisionTreeClassifier()

import MLJDecisionTreeInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\matheus.pavani\.julia\packages\MLJModels\hAzAn\src\loading.jl:159


DecisionTreeClassifier(
  max_depth = -1, 
  min_samples_leaf = 1, 
  min_samples_split = 2, 
  min_purity_increase = 0.0, 
  n_subfeatures = 0, 
  post_prune = false, 
  merge_purity_threshold = 1.0, 
  display_depth = 5, 
  feature_importance = :impurity, 
  rng = Random._GLOBAL_RNG())

In [19]:
tree = machine(tree_model, coerce(X, Continuous), coerce(y, Multiclass))

│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc DecisionTree.DecisionTreeClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{AbstractMatrix{Continuous}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:Union{AbstractVector{<:Count}, AbstractVector{<:OrderedFactor}, AbstractVector{<:Continuous}}}, AbstractVector{<:Finite}}
└ @ MLJBase C:\Users\matheus.pavani\.julia\packages\MLJBase\Fl6Zc\src\machines.jl:176


Machine trained 0 times; caches data
  model: DecisionTreeClassifier(max_depth = -1, …)
  args: 
    1:	Source @649 ⏎ `AbstractMatrix{Continuous}`
    2:	Source @633 ⏎ `AbstractVector{Multiclass{2}}`


In [20]:
rng = StableRNG(42)
train, test = partition(eachindex(y), 0.85, shuffle=true, rng=rng);

In [21]:
MLJ.fit!(tree, rows=train)

┌ Info: Training machine(DecisionTreeClassifier(max_depth = -1, …), …).
└ @ MLJBase C:\Users\matheus.pavani\.julia\packages\MLJBase\Fl6Zc\src\machines.jl:498


Machine trained 1 time; caches data
  model: DecisionTreeClassifier(max_depth = -1, …)
  args: 
    1:	Source @649 ⏎ `AbstractMatrix{Continuous}`
    2:	Source @633 ⏎ `AbstractVector{Multiclass{2}}`


In [22]:
yhat = MLJ.predict(tree, coerce(X[test, :], Continuous))
log_loss(yhat, y[test]) |> mean

0.7329451048026216

In [23]:
accuracy(mode.(yhat), y[test])

0.9796650717703349

In [24]:
ConfusionMatrix()(mode.(yhat), y[test])

│ using: negative='ham' and positive='spam'.
└ @ MLJBase C:\Users\matheus.pavani\.julia\packages\MLJBase\Fl6Zc\src\measures\confusion_matrix.jl:116


              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │     ham     │    spam     │
├─────────────┼─────────────┼─────────────┤
│     ham     │     722     │     10      │
├─────────────┼─────────────┼─────────────┤
│    spam     │      7      │     97      │
└─────────────┴─────────────┴─────────────┘


In [25]:
models("forest")

5-element Vector{NamedTuple{(:name, :package_name, :is_supervised, :abstract_type, :deep_properties, :docstring, :fit_data_scitype, :human_name, :hyperparameter_ranges, :hyperparameter_types, :hyperparameters, :implemented_methods, :inverse_transform_scitype, :is_pure_julia, :is_wrapper, :iteration_parameter, :load_path, :package_license, :package_url, :package_uuid, :predict_scitype, :prediction_type, :reporting_operations, :reports_feature_importances, :supports_class_weights, :supports_online, :supports_training_losses, :supports_weights, :transform_scitype, :input_scitype, :target_scitype, :output_scitype)}}:
 (name = IForestDetector, package_name = OutlierDetectionPython, ... )
 (name = RandomForestClassifier, package_name = DecisionTree, ... )
 (name = RandomForestClassifier, package_name = ScikitLearn, ... )
 (name = RandomForestRegressor, package_name = DecisionTree, ... )
 (name = RandomForestRegressor, package_name = ScikitLearn, ... )

In [26]:
RandomForestClassifier = @load RandomForestClassifier pkg=DecisionTree
forest_model = RandomForestClassifier()

import MLJDecisionTreeInterface ✔


┌ Info: For silent loading, specify `verbosity=0`. 
└ @ Main C:\Users\matheus.pavani\.julia\packages\MLJModels\hAzAn\src\loading.jl:159


RandomForestClassifier(
  max_depth = -1, 
  min_samples_leaf = 1, 
  min_samples_split = 2, 
  min_purity_increase = 0.0, 
  n_subfeatures = -1, 
  n_trees = 10, 
  sampling_fraction = 0.7, 
  feature_importance = :impurity, 
  rng = Random._GLOBAL_RNG())

In [27]:
forest = machine(forest_model, coerce(X, Continuous), coerce(y, Multiclass))

│ supports. Suppress this type check by specifying `scitype_check_level=0`.
│ 
│ Run `@doc DecisionTree.RandomForestClassifier` to learn more about your model's requirements.
│ 
│ Commonly, but non exclusively, supervised models are constructed using the syntax
│ `machine(model, X, y)` or `machine(model, X, y, w)` while most other models are
│ constructed with `machine(model, X)`.  Here `X` are features, `y` a target, and `w`
│ sample or class weights.
│ 
│ In general, data in `machine(model, data...)` is expected to satisfy
│ 
│     scitype(data) <: MLJ.fit_data_scitype(model)
│ 
│ In the present case:
│ 
│ scitype(data) = Tuple{AbstractMatrix{Continuous}, AbstractVector{Multiclass{2}}}
│ 
│ fit_data_scitype(model) = Tuple{Table{<:Union{AbstractVector{<:Count}, AbstractVector{<:OrderedFactor}, AbstractVector{<:Continuous}}}, AbstractVector{<:Finite}}
└ @ MLJBase C:\Users\matheus.pavani\.julia\packages\MLJBase\Fl6Zc\src\machines.jl:176


Machine trained 0 times; caches data
  model: RandomForestClassifier(max_depth = -1, …)
  args: 
    1:	Source @330 ⏎ `AbstractMatrix{Continuous}`
    2:	Source @146 ⏎ `AbstractVector{Multiclass{2}}`


In [28]:
MLJ.fit!(forest, rows=train)

┌ Info: Training machine(RandomForestClassifier(max_depth = -1, …), …).
└ @ MLJBase C:\Users\matheus.pavani\.julia\packages\MLJBase\Fl6Zc\src\machines.jl:498


Machine trained 1 time; caches data
  model: RandomForestClassifier(max_depth = -1, …)
  args: 
    1:	Source @330 ⏎ `AbstractMatrix{Continuous}`
    2:	Source @146 ⏎ `AbstractVector{Multiclass{2}}`


In [29]:
yhat = MLJ.predict(forest, coerce(X[test, :], Continuous));

In [30]:
println("Log-Loss on the Test Set : $(log_loss(yhat, coerce(y[test], Multiclass)) |> mean)")

Log-Loss on the Test Set : 0.37833313067620566


In [31]:
println("Accuracy on the Test Set : $(accuracy(mode.(yhat), coerce(y[test], Multiclass)))")

Accuracy on the Test Set : 0.9581339712918661


In [32]:
ConfusionMatrix()(mode.(yhat), coerce(y[test], Multiclass))

│ using: negative='ham' and positive='spam'.
└ @ MLJBase C:\Users\matheus.pavani\.julia\packages\MLJBase\Fl6Zc\src\measures\confusion_matrix.jl:116


              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │     ham     │    spam     │
├─────────────┼─────────────┼─────────────┤
│     ham     │     728     │     34      │
├─────────────┼─────────────┼─────────────┤
│    spam     │      1      │     73      │
└─────────────┴─────────────┴─────────────┘
