In [2]:
import pandas as pd
import numpy as np

import weka.core.jvm as jvm
from weka.core.dataset import create_instances_from_lists, Instances
from weka.core.converters import Loader
from weka.classifiers import Classifier, Evaluation
from weka.core.classes import Random

import weka.plot.graph as graph
jvm.start()

In [66]:
# inport data



loader = Loader(classname="weka.core.converters.ArffLoader")
data = loader.load_file("vote.arff")
data.class_is_last()

INFO:weka.core.jvm:JVM already running, call jvm.stop() first


In [41]:
cls = Classifier(classname="weka.classifiers.trees.J48")
#print(cls.to_help())

In [4]:
cls.build_classifier(data)
print(cls)

J48 pruned tree
------------------

physician-fee-freeze = n: democrat (253.41/3.75)
physician-fee-freeze = y
|   synfuels-corporation-cutback = n: republican (145.71/4.0)
|   synfuels-corporation-cutback = y
|   |   mx-missile = n
|   |   |   adoption-of-the-budget-resolution = n: republican (22.61/3.32)
|   |   |   adoption-of-the-budget-resolution = y
|   |   |   |   anti-satellite-test-ban = n: democrat (5.04/0.02)
|   |   |   |   anti-satellite-test-ban = y: republican (2.21)
|   |   mx-missile = y: democrat (6.03/1.03)

Number of Leaves  : 	6

Size of the tree : 	11



In [71]:
print(cls)

J48 pruned tree
------------------

physician-fee-freeze = n: democrat (253.41/3.75)
physician-fee-freeze = y
|   synfuels-corporation-cutback = n: republican (145.71/4.0)
|   synfuels-corporation-cutback = y
|   |   mx-missile = n
|   |   |   adoption-of-the-budget-resolution = n: republican (22.61/3.32)
|   |   |   adoption-of-the-budget-resolution = y
|   |   |   |   anti-satellite-test-ban = n: democrat (5.04/0.02)
|   |   |   |   anti-satellite-test-ban = y: republican (2.21)
|   |   mx-missile = y: democrat (6.03/1.03)

Number of Leaves  : 	6

Size of the tree : 	11



In [5]:
graph.plot_dot_graph(cls.graph)

In [6]:
n = 5
evaluation = Evaluation(data)                     # initialize with priors
evaluation.crossvalidate_model(cls, data, n, Random(1))  # 5-fold CV
print("Accuracy = %g" % evaluation.percent_correct + "%")

z = 1.96
accuracy = evaluation.percent_correct/100
margin = z * np.sqrt( (accuracy * (1 - accuracy)) / n)
print("95% "+"Confidence Interval = (%g, %g)" % (accuracy - margin, accuracy + margin))

print(evaluation.summary())

#print("Number of incorrect = %g" % evaluation.incorrect)
print(evaluation.class_details())

Accuracy = 96.5517%
95% Confidence Interval = (0.805579, 1.12546)

Correctly Classified Instances         420               96.5517 %
Incorrectly Classified Instances        15                3.4483 %
Kappa statistic                          0.9275
Mean absolute error                      0.059 
Root mean squared error                  0.1731
Relative absolute error                 12.4478 %
Root relative squared error             35.5458 %
Total Number of Instances              435     

=== Detailed Accuracy By Class ===

                 TP Rate  FP Rate  Precision  Recall   F-Measure  MCC      ROC Area  PRC Area  Class
                 0.966    0.036    0.977      0.966    0.972      0.928    0.967     0.967     democrat
                 0.964    0.034    0.947      0.964    0.956      0.928    0.967     0.932     republican
Weighted Avg.    0.966    0.035    0.966      0.966    0.966      0.928    0.967     0.953     



In [7]:
n = 5
seed = 1
rnd = Random(seed)
rand_data = Instances.copy_instances(data)
rand_data.randomize(rnd)
classifier = Classifier(classname="weka.classifiers.trees.J48")

for i in range(n):
    train = rand_data.train_cv(n, i)
    test = rand_data.test_cv(n, i)

    cls = Classifier.make_copy(classifier)
    cls.build_classifier(train)
    evaluation = Evaluation(rand_data)
    evaluation.test_model(cls, train)

    print("-------------%g-th fold-------------" % i)
    print("Accuracy for training data = %g" % evaluation.percent_correct + "%")
    
    evaluation = Evaluation(rand_data)
    evaluation.test_model(cls, test)
    print("Accuracy for test data = %g" % evaluation.percent_correct + "%")
    
    #graph.plot_dot_graph(cls.graph)

-------------0-th fold-------------
Accuracy for training data = 97.9885%
Accuracy for test data = 94.2529%
-------------1-th fold-------------
Accuracy for training data = 97.7011%
Accuracy for test data = 95.4023%
-------------2-th fold-------------
Accuracy for training data = 97.4138%
Accuracy for test data = 88.5057%
-------------3-th fold-------------
Accuracy for training data = 96.2644%
Accuracy for test data = 98.8506%
-------------4-th fold-------------
Accuracy for training data = 96.8391%
Accuracy for test data = 94.2529%


In [8]:
print(data)

@relation vote

@attribute handicapped-infants {n,y}
@attribute water-project-cost-sharing {n,y}
@attribute adoption-of-the-budget-resolution {n,y}
@attribute physician-fee-freeze {n,y}
@attribute el-salvador-aid {n,y}
@attribute religious-groups-in-schools {n,y}
@attribute anti-satellite-test-ban {n,y}
@attribute aid-to-nicaraguan-contras {n,y}
@attribute mx-missile {n,y}
@attribute immigration {n,y}
@attribute synfuels-corporation-cutback {n,y}
@attribute education-spending {n,y}
@attribute superfund-right-to-sue {n,y}
@attribute crime {n,y}
@attribute duty-free-exports {n,y}
@attribute export-administration-act-south-africa {n,y}
@attribute Class {democrat,republican}

@data
n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y,republican
n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?,republican
?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n,democrat
n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y,democrat
y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y,democrat
n,y,y,n,y,y,n,n,n,n,n,n,y,y,y,y,democrat
n,y,n,y,y,y,n,n,n,n,n,n,?,y,y,y,democrat
n,y,n,y,y,y,n,n,n,n,n,

In [49]:
import arff
import pandas as pd
data_dir = "https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/"
data1 = pd.read_csv(data_dir + "house-votes-84.data", header=None)
name1 = ["handicapped-infants", "water-project-cost-sharing", "adoption-of-the-budget-resolution", "physician-fee-freeze",
        "el-salvador-aid", "religious-groups-in-schools", "anti-satellite-test-ban", "aid-to-nicaraguan-contras",
        "mx-missile", "immigration", "synfuels-corporation-cutback", "education-spending",
        "superfund-right-to-sue", "crime", "duty-free-exports", "export-administration-act-south-africa", "Class"]

In [52]:
data1 = data1.reindex(columns=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,0])

In [53]:
data1 = np.array(data1)

In [57]:
obj = {
   'description': u'',
   "relation": "vote",
   'attributes': [
       ("handicapped-infants", ["n", "y"]),
       ('water-project-cost-sharing', ['n', 'y']),
       ('adoption-of-the-budget-resolution', ['n', 'y']),
       ('physician-fee-freeze', ['n', 'y']),
       
       ('el-salvador-aid', ['n', 'y']),
       ('religious-groups-in-schools', ['n', 'y']),
       ('anti-satellite-test-ban', ['n', 'y']),
       ('aid-to-nicaraguan-contras', ['n', 'y']),
       
       ('mx-missile', ['n', 'y']),
       ('immigration', ['n', 'y']),
       ('synfuels-corporation-cutback', ['n', 'y']),
       ('education-spending', ['n', 'y']),
       
       ('superfund-right-to-sue', ['n', 'y']),
       ('crime', ['n', 'y']),
       ('duty-free-exports', ['n', 'y']),
       ('export-administration-act-south-africa', ['n', 'y']),
       
       ('\'Class\'', ['democrat', 'republican']),
   ],
   'data': data1,
}




fp = open("vote2.arff", "w")
arff.dump(obj, fp)
fp.close()

In [70]:
import weka.core.converters as converters
x = converters.load_any_file("vote2.arff")
x.class_is_last()
cls.build_classifier(x)