In [2]:
using DataFrames, CSV, Plots, Statistics, MLJ

## Loading dataset, treating missings and categorical columns

In [3]:
function openClevelandData(path::String)

    # Oppening the DataFrame
    cleveland = DataFrame(CSV.File(path; header =  [:age, :sex, :cp, :trestbps, :chol, :fbs, :restecg, :thalach, :exang, :oldpeak, :slope, :ca, :thal, :target]))

    # Treating missings
    allowmissing!(cleveland)
    cleveland .= ifelse.(cleveland .=="?", missing, cleveland)
    dropmissing!(cleveland)

    # Converting some colunms to Int
    cleveland.age = convert.(Int, cleveland.age)
    cleveland.sex = convert.(Int, cleveland.sex)
    cleveland.cp = convert.(Int, cleveland.cp)
    cleveland.trestbps = convert.(Int, cleveland.trestbps)
    cleveland.chol = convert.(Int, cleveland.chol)
    cleveland.fbs = convert.(Int, cleveland.fbs)
    cleveland.restecg = convert.(Int, cleveland.restecg)
    cleveland.thalach = convert.(Int, cleveland.thalach)
    cleveland.exang = convert.(Int, cleveland.exang)
    cleveland.slope = convert.(Int, cleveland.slope)
    cleveland.target = convert.(Int, cleveland.target)
    
    
    #Converting ca and thal from String to Int
    cleveland.ca, cleveland.thal = parse.(Float64, cleveland.ca), parse.(Float64, cleveland.thal)
    cleveland.ca, cleveland.thal = convert.(Int, cleveland.ca), convert.(Int, cleveland.thal)
    
    #Converting categorical values
    cleveland.sex = categorical(cleveland.sex)
    cleveland.cp = categorical(cleveland.cp)
    cleveland.fbs = categorical(cleveland.fbs)
    cleveland.restecg = categorical(cleveland.restecg)
    cleveland.exang = categorical(cleveland.exang)
    cleveland.oldpeak = categorical(cleveland.oldpeak)
    cleveland.slope = categorical(cleveland.slope)
    cleveland.thal = categorical(cleveland.thal)
    

    cleveland

end

cleveland = openClevelandData("Data\\processed.cleveland.data")

Unnamed: 0_level_0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope
Unnamed: 0_level_1,Int64,Cat…,Cat…,Int64,Int64,Cat…,Cat…,Int64,Cat…,Cat…,Cat…
1,63,1,1,145,233,1,2,150,0,2.3,3
2,67,1,4,160,286,0,2,108,1,1.5,2
3,67,1,4,120,229,0,2,129,1,2.6,2
4,37,1,3,130,250,0,0,187,0,3.5,3
5,41,0,2,130,204,0,2,172,0,1.4,1
6,56,1,2,120,236,0,0,178,0,0.8,1
7,62,0,4,140,268,0,2,160,0,3.6,3
8,57,0,4,120,354,0,0,163,1,0.6,1
9,63,1,4,130,254,0,2,147,0,1.4,2
10,53,1,4,140,203,1,2,155,1,3.1,3


## Separating the input (X) and the result column (y)

In [4]:
y, X = unpack(cleveland, ==(:target), colname -> true);

## Particionating dataset into train and test and transforming the result in binary
We'll run logistic regression, so our output needs to be binary.

In [5]:
for i = 1:size(y)[1]
   y[i] == 0 ? y[i] = 0 : y[i] = 1
end


train, test = partition(eachindex(y), 0.75, stratify=y);

## Running Logistic Regression

In [6]:
using MLJLinearModels
LRClassifier =  MLJLinearModels.LogisticClassifier()

LR = machine(LRClassifier, X, categorical(y))

└ @ MLJBase C:\Users\Rogerio\.julia\packages\MLJBase\uMlf8\src\machines.jl:73


[34mMachine{LogisticClassifier} @615[39m trained 0 times.
  args: 
    1:	[34mSource @380[39m ⏎ `Table{Union{AbstractArray{Count,1}, AbstractArray{Multiclass{40},1}, AbstractArray{Multiclass{4},1}, AbstractArray{Multiclass{2},1}, AbstractArray{Multiclass{3},1}}}`
    2:	[34mSource @098[39m ⏎ `AbstractArray{Multiclass{2},1}`


In [7]:
MLJ.fit!(LR, rows=train);
ŷ = MLJ.predict(LR, X[test,:]);

┌ Info: Training [34mMachine{LogisticClassifier} @615[39m.
└ @ MLJBase C:\Users\Rogerio\.julia\packages\MLJBase\uMlf8\src\machines.jl:317


### Analysing metrics

In [28]:
print("Acurácia: ",(accuracy(mode.(ŷ), categorical(y[test])))*100, "%")

Acurácia: 81.08108108108108%

In [13]:
confusion_matrix(mode.(ŷ), categorical(y[test]))

│ using: negative='0' and positive='1'.
└ @ MLJBase C:\Users\Rogerio\.julia\packages\MLJBase\uMlf8\src\measures\confusion_matrix.jl:83


              ┌───────────────────────────┐
              │       Ground Truth        │
┌─────────────┼─────────────┬─────────────┤
│  Predicted  │      0      │      1      │
├─────────────┼─────────────┼─────────────┤
│      0      │     37      │     11      │
├─────────────┼─────────────┼─────────────┤
│      1      │      3      │     23      │
└─────────────┴─────────────┴─────────────┘


#### Considering the total of 74 instances, we have:

In [29]:
print("True negative = ", (TN/Total)*100, "%\nTrue Positive = ", (TP/Total)*100, "%\nFalse Negative = ", (FN/Total)*100, "%\nFalse Positive = ", (FP/Total)*100, "%\n\n")

True negative = 45.76271186440678%
True Positive = 27.11864406779661%
False Negative = 18.64406779661017%
False Positive = 8.47457627118644%



#### Considering the total of 26 positive classifications, we have:

In [30]:
print("True Positive = ", (TP/TotalP)*100, "%\nFalse Positive = ", (FP/TotalP)*100, "%\n\n")

True Positive = 61.53846153846154%
False Positive = 19.230769230769234%



#### Considering the total of 48 negative classifications, we have:

In [31]:
print("True Negative = ", (TN/TotalN)*100, "%\nFalse Negative = ", (FN/TotalN)*100, "%\n\n")

True Negative = 56.25%
False Negative = 22.916666666666664%

