# Imports

In [21]:
import pandas as pd
from functions import *
import json

# Read and split the data from the CSV file

In [22]:
# Load the voting records from the CSV file and partition them into two frames.
# NOTE(review): 0.8 is presumably the training fraction expected by split_data — confirm.
data = pd.read_csv('republican_democrat.csv', sep=',')
(train_data, test_data) = split_data(data, 0.8)

In [23]:
# Display the training split as the cell's rich output.
train_data

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,religious-groups-in-schools,anti-satellite-test-ban,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa,Target
31,y,y,y,n,n,y,y,n,y,n,n,n,y,?,democrat
253,n,y,n,y,y,n,n,n,n,y,y,y,n,y,republican
49,n,?,n,y,y,n,n,n,n,y,y,y,n,n,republican
397,y,y,n,n,?,n,n,n,y,n,y,y,n,y,democrat
172,n,y,y,n,n,y,y,y,n,n,?,n,y,y,democrat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
368,n,y,y,n,y,y,n,y,n,n,n,n,y,y,democrat
113,n,?,n,y,y,n,n,y,n,y,y,y,n,n,republican
29,y,y,y,n,n,y,y,n,y,n,n,n,y,y,democrat
393,?,?,?,?,y,n,y,n,n,y,y,n,n,?,republican


In [24]:
# Build the decision tree from the training split, with 'Target' as the class
# column, then render the resulting structure as indented JSON for inspection.
result_tree = id3(train_data, 'Target')
tree_as_json = json.dumps(result_tree, indent=4)
print(tree_as_json)

{
    "physician-fee-freeze": {
        "n": {
            "adoption-of-the-budget-resolution": {
                "y": "democrat",
                "n": {
                    "religious-groups-in-schools": {
                        "?": "democrat",
                        "n": {
                            "duty-free-exports": {
                                "y": "democrat",
                                "n": "republican"
                            }
                        },
                        "y": "democrat"
                    }
                },
                "?": "democrat"
            }
        },
        "y": {
            "synfuels-corporation-cutback": {
                "n": {
                    "export-administration-act-south-africa": {
                        "y": "republican",
                        "n": "republican",
                        "?": {
                            "adoption-of-the-budget-resolution": {
                                "n": "republ

In [26]:
# Score the learned tree against the held-out split.
accuracy = evaluate(result_tree, test_data, 'Target')

# Two spaces after the colon reproduce what print("Accuracy: ", accuracy) emits.
print(f"Accuracy:  {accuracy!s}")



Accuracy:  0.927710843373494


In [50]:
# Every column name of the training split except the class column.
feature_columns = train_data.columns.drop('Target')
cols_to_aggregate = feature_columns.tolist()

In [54]:
# Rows per class in the training split: count the non-null 'handicapped-infants'
# cells within each 'Target' group. The explicit `axis=0` argument was dropped:
# it is the default, and passing `axis` to groupby is deprecated in recent pandas.
train_data[['Target', 'handicapped-infants']].groupby(by=['Target']).agg('count')

Unnamed: 0_level_0,handicapped-infants
Target,Unnamed: 1_level_1
democrat,200
republican,132


In [65]:
# Most frequent value of the 'Target' column in the training split
# (the first one in pandas' mode ordering if several values tie).
target_modes = train_data['Target'].mode()
apriori_mode = target_modes.iloc[0]

In [55]:
from sklearn.metrics import accuracy_score,precision_score, recall_score, cohen_kappa_score, f1_score, confusion_matrix

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,religious-groups-in-schools,anti-satellite-test-ban,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa,Target
0,n,y,n,y,y,n,n,y,?,y,y,y,n,y,republican
1,n,y,n,y,y,n,n,n,n,y,y,y,n,?,republican
2,n,y,y,n,y,n,n,n,y,n,y,n,n,y,democrat
3,n,y,n,y,y,n,n,n,n,n,?,y,y,y,democrat
4,n,y,n,y,y,n,n,n,n,n,y,y,?,y,republican
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,n,y,n,y,y,n,n,y,n,y,?,y,n,n,republican
79,y,n,y,n,y,n,y,y,n,n,y,y,n,y,democrat
80,y,n,y,n,n,y,y,y,n,y,n,?,y,y,democrat
81,y,y,y,n,n,y,y,y,y,n,y,n,n,y,democrat


In [66]:
# Classify every held-out row with the learned tree. The row yielded by
# iterrows() is passed directly: the first element of each pair is the index
# *label*, which must not be fed to the *positional* .iloc accessor — that is
# only correct when the index happens to be exactly 0..n-1.
predictions_tree = [predict(result_tree, row) for _, row in test_data.iterrows()]

# Baseline: predict the training split's most frequent class for every row.
predictions_apriori = [apriori_mode] * len(test_data)

In [67]:
# Attach both prediction lists as new columns. assign() returns a new frame
# rather than writing into the object handed back by split_data, so the cell
# is safe to re-run and cannot trigger pandas' SettingWithCopyWarning.
test_data = test_data.assign(
    pred_tree=predictions_tree,
    pred_apriori=predictions_apriori,
)
test_data

Unnamed: 0,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,religious-groups-in-schools,anti-satellite-test-ban,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa,Target,pred_tree,pred_apriori
0,n,y,n,y,y,n,n,y,?,y,y,y,n,y,republican,republican,democrat
1,n,y,n,y,y,n,n,n,n,y,y,y,n,?,republican,republican,democrat
2,n,y,y,n,y,n,n,n,y,n,y,n,n,y,democrat,democrat,democrat
3,n,y,n,y,y,n,n,n,n,n,?,y,y,y,democrat,republican,democrat
4,n,y,n,y,y,n,n,n,n,n,y,y,?,y,republican,republican,democrat
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78,n,y,n,y,y,n,n,y,n,y,?,y,n,n,republican,republican,democrat
79,y,n,y,n,y,n,y,y,n,n,y,y,n,y,democrat,democrat,democrat
80,y,n,y,n,n,y,y,y,n,y,n,?,y,y,democrat,democrat,democrat
81,y,y,y,n,n,y,y,y,y,n,y,n,n,y,democrat,democrat,democrat


In [88]:
def generate_results_report(true, pred, pos_label="democrat"):
    """Print classification metrics comparing predicted labels with the truth.

    Prints, in order: accuracy, confusion matrix, Cohen's kappa, precision,
    recall and F1 score, each computed with the scikit-learn metric functions
    imported by this notebook.

    Parameters
    ----------
    true : array-like
        Ground-truth class labels.
    pred : array-like
        Predicted class labels, in the same order as ``true``.
    pos_label : str, default "democrat"
        Label treated as the positive class for precision, recall and F1.
        The default keeps the report identical to the previous hard-coded
        behaviour.

    Returns
    -------
    None
        The report is written to standard output only.
    """
    print("Acurácia %.3f" % accuracy_score(true, pred))

    print("Matriz de Confusão", confusion_matrix(true, pred))

    print("Estatística Kappa %.3f" % cohen_kappa_score(true, pred))

    # The three metrics below are computed for a single positive class.
    print("Precisão %.3f" % precision_score(true, pred, pos_label=pos_label))

    print("Sensitividade %.3f" % recall_score(true, pred, pos_label=pos_label))

    print("F1 Score %.3f" % f1_score(true, pred, pos_label=pos_label))


In [89]:
# Report the metrics of the tree's predictions against the true 'Target' labels.
generate_results_report(
    true=test_data['Target'],
    pred=test_data['pred_tree'],
)

Acurácia 0.928
Matriz de Confusão [[52  5]
 [ 1 25]]
Estatística Kappa 0.839
Precisão 0.981
Sensitividade 0.912
F1 Score 0.945
