In [1]:
using DataFrames
using ScikitLearn: fit!, predict, @sk_import
using CSV

Loading Car evaluation dataset

Dataset contains 1,728 rows × 7 columns

(may take around a minute if loading the data for the first time)

In [3]:
data = DataFrame(CSV.File("car.data"; header=["buying", "maint", "doors", "persons", "lug_boot", "safety","class_values"]))

Unnamed: 0_level_0,buying,maint,doors,persons,lug_boot,safety,class_values
Unnamed: 0_level_1,String,String,String,String,String,String,String
1,vhigh,vhigh,2,2,small,low,unacc
2,vhigh,vhigh,2,2,small,med,unacc
3,vhigh,vhigh,2,2,small,high,unacc
4,vhigh,vhigh,2,2,med,low,unacc
5,vhigh,vhigh,2,2,med,med,unacc
6,vhigh,vhigh,2,2,med,high,unacc
7,vhigh,vhigh,2,2,big,low,unacc
8,vhigh,vhigh,2,2,big,med,unacc
9,vhigh,vhigh,2,2,big,high,unacc
10,vhigh,vhigh,2,4,small,low,unacc


Encoding all categorical data(string type) that helps in training the model

In [4]:
@sk_import preprocessing: LabelEncoder
enc = LabelEncoder()
for i = 1:7
    data[!,i] = enc.fit_transform(data[!,i])
end

Converting DataFrame to Arrays and splitting input columns(x) and output column(y)

In [5]:
data_x = convert(Array, data[:,[1,2,3,4,5,6]])
data_y = convert(Array, data[:,7])

1728-element Array{Int64,1}:
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 ⋮
 2
 1
 3
 2
 0
 1
 2
 1
 3
 2
 1
 3

Splitting data into training and testing sub-datasets with 70/30 split respectively

In [6]:
@sk_import model_selection: train_test_split
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=0.3, random_state=1)

└ @ ScikitLearn.Skcore /Users/akshaysharma/.julia/packages/ScikitLearn/NJwUf/src/Skcore.jl:179


4-element Array{Array{Int64,N} where N,1}:
 [1 0 … 1 1; 0 1 … 1 0; … ; 3 2 … 2 2; 2 0 … 0 0]
 [2 1 … 2 1; 0 0 … 0 2; … ; 0 3 … 1 0; 3 1 … 0 0]
 [2, 0, 2, 1, 2, 2, 0, 2, 2, 0  …  2, 2, 2, 2, 1, 0, 0, 0, 2, 2]
 [2, 0, 2, 2, 2, 0, 2, 2, 0, 2  …  2, 2, 2, 2, 2, 2, 2, 2, 2, 0]

# Naive Bayesian Classifier

In [7]:
@sk_import naive_bayes: CategoricalNB
model = CategoricalNB()

PyObject CategoricalNB()

In [8]:
fit!(model, x_train, y_train)

PyObject CategoricalNB()

### Performance on Testing Dataset

In [9]:
@time predictions_test = predict(model, x_test)

  0.028887 seconds (28.77 k allocations: 1.557 MiB)


519-element Array{Int64,1}:
 2
 0
 2
 2
 2
 2
 2
 2
 0
 2
 2
 2
 2
 ⋮
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 0

In [10]:
@sk_import metrics: accuracy_score
accuracy = accuracy_score(predictions_test, y_test)

0.8689788053949904

### Performance on Training Dataset

In [11]:
@time predictions_train = predict(model, x_train)

  0.000969 seconds (36 allocations: 11.312 KiB)


1209-element Array{Int64,1}:
 2
 0
 2
 0
 2
 2
 1
 2
 2
 2
 2
 2
 2
 ⋮
 0
 0
 2
 2
 2
 2
 1
 0
 0
 0
 2
 2

In [12]:
@sk_import metrics: accuracy_score
accuracy = accuracy_score(predictions_train, y_train)



0.8817204301075269

**Size of Naive bayesian model**

In [13]:
using PyCall
@pyimport pickle
p = pickle.dumps(model)
print(sizeof(p)," bytes")

2424 bytes

# Decision Tree Classifier

In [14]:
@sk_import tree: DecisionTreeClassifier
model = DecisionTreeClassifier()

PyObject DecisionTreeClassifier()

In [15]:
fit!(model, x_train, y_train)

PyObject DecisionTreeClassifier()

### Performance On Testing Dataset

In [16]:
@time predictions_test = predict(model, x_test)

  0.000549 seconds (36 allocations: 5.875 KiB)


519-element Array{Int64,1}:
 2
 0
 2
 2
 2
 0
 2
 2
 0
 2
 0
 2
 2
 ⋮
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 0

In [17]:
@sk_import metrics: accuracy_score
accuracy = accuracy_score(predictions_test, y_test)



0.9691714836223507

### Performance On Training Dataset

In [18]:
@time predictions_train = predict(model, x_train)

  0.000589 seconds (36 allocations: 11.312 KiB)


1209-element Array{Int64,1}:
 2
 0
 2
 1
 2
 2
 0
 2
 2
 0
 2
 2
 2
 ⋮
 0
 3
 2
 2
 2
 2
 1
 0
 0
 0
 2
 2

In [19]:
@sk_import metrics: accuracy_score
accuracy = accuracy_score(predictions_train, y_train)



1.0

**Size of Decision tree model**

In [20]:
using PyCall
@pyimport pickle
p = pickle.dumps(model)
print(sizeof(p)," bytes")

14757 bytes

# Support Vector Machine

In [21]:
@sk_import svm: LinearSVC
model = LinearSVC()

PyObject LinearSVC()

In [22]:
fit!(model, x_train, y_train)

PyObject LinearSVC()

### Performance on Testing Dataset

In [23]:
@time predictions_test = predict(model, x_test)

  0.002837 seconds (39 allocations: 7.250 KiB)


519-element Array{Int64,1}:
 2
 0
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 ⋮
 2
 2
 2
 0
 2
 2
 2
 2
 2
 2
 2
 2

In [24]:
@sk_import metrics: accuracy_score
accuracy = accuracy_score(predictions_test, y_test)



0.6955684007707129

### Performance on Training Dataset

In [25]:
@time predictions_train = predict(model, x_train)

  0.000658 seconds (36 allocations: 11.312 KiB)


1209-element Array{Int64,1}:
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 ⋮
 0
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2

In [26]:
@sk_import metrics: accuracy_score
accuracy = accuracy_score(predictions_train, y_train)



0.6989247311827957

**Size of support vector machine model**

In [27]:
using PyCall
@pyimport pickle
p = pickle.dumps(model)
print(sizeof(p)," bytes")

884 bytes

# Neural Networks

In [28]:
@sk_import neural_network: MLPClassifier
model = MLPClassifier()

PyObject MLPClassifier()

In [29]:
@time fit!(model, x_train, y_train)

  1.696584 seconds (21 allocations: 1.109 KiB)


PyObject MLPClassifier()

### Performance On Testing Dataset

In [30]:
@time predictions_test = predict(model, x_test)

  0.003514 seconds (36 allocations: 5.875 KiB)


519-element Array{Int64,1}:
 2
 0
 2
 2
 2
 0
 2
 2
 0
 2
 0
 2
 2
 ⋮
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 2
 0

In [31]:
@sk_import metrics: accuracy_score
accuracy = accuracy_score(predictions_test, y_test)



0.8689788053949904

### Performance On Training Dataset

In [32]:
@time predictions_train = predict(model, x_train)

  0.003493 seconds (36 allocations: 11.312 KiB)


1209-element Array{Int64,1}:
 2
 0
 2
 0
 2
 2
 0
 2
 2
 0
 2
 2
 2
 ⋮
 0
 0
 2
 2
 2
 2
 0
 2
 0
 0
 2
 2

In [33]:
@sk_import metrics: accuracy_score
accuracy = accuracy_score(predictions_train, y_train)



0.9007444168734491

**Size of neural network model**

In [34]:
using PyCall
@pyimport pickle
p = pickle.dumps(model)
print(sizeof(p)," bytes")

34777 bytes