# **Maximum Entropy Classifier**

This program predicts music preference based on age and gender and returns data entropy and average prediction accuracy over 30 runs.<br>
Run the code by clicking <b>Run All</b>.

In [1]:
#import pandas, scipy and sklearn packages

import pandas as pd
import scipy.stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from itertools import repeat
import numpy as np

**1. Read in the dataset**

In [2]:
df = pd.read_csv('SupervisedLearning/test-new.csv')
dfTrain = pd.read_csv('SupervisedLearning/train-new.csv')
#TODO: Write code below to inspect the first five rows of the data frame

df.head(5)

Unnamed: 0,dvcat,weight,dead,airbag,seatbelt,frontal,sex,ageOFocc,yearacc,yearVeh,abcat,occRole,deploy,caseid
0,40-54,18.895,alive,airbag,belted,0,m,25,2002,2000.0,deploy,pass,1,81:99:1
1,25-39,266.532,alive,none,none,1,f,28,2001,1991.0,unavail,driver,0,76:12:2
2,25-39,51.81,alive,airbag,belted,1,f,36,1999,1994.0,deploy,driver,1,78:53:2
3,10-24,1567.626,alive,airbag,belted,0,f,24,2002,1994.0,nodeploy,driver,0,11:47:1
4,25-39,31.342,alive,none,none,1,f,46,1997,1990.0,unavail,pass,0,12:87:2


In [3]:
# Clean data
df = df.drop(columns = ['dvcat'])
dfTrain = dfTrain.drop(columns = ['dvcat'])
df = df.drop(columns = ['caseid'])
dfTrain = dfTrain.drop(columns = ['caseid'])

df = df.dropna()
dfTrain = dfTrain.dropna()

combine = [dfTrain, df]

# Convert strings to integers
dead_mapping = {'alive': 1, 'dead':0}
airbag_mapping = {'airbag':1, 'none':0}
seatbelt_mapping = {'belted':1, 'none':0}
sex_mapping = {'m':1, 'f':0}
abcat_mapping = {'deploy':1, 'nodeploy':0}
role_mapping = {'driver':1, 'pass':0}

for dataset in combine:
    dataset['dead'] = dataset['dead'].map(dead_mapping)
    dataset['dead'] = dataset['dead'].fillna(0)
    dataset['airbag'] = dataset['airbag'].map(airbag_mapping)
    dataset['airbag'] = dataset['airbag'].fillna(0)
    dataset['seatbelt'] = dataset['seatbelt'].map(seatbelt_mapping)
    dataset['seatbelt'] = dataset['seatbelt'].fillna(0)
    dataset['sex'] = dataset['sex'].map(sex_mapping)
    dataset['sex'] = dataset['sex'].fillna(0)
    dataset['occRole'] = dataset['occRole'].map(role_mapping)
    dataset['occRole'] = dataset['occRole'].fillna(0)

dfTrain.head()

Unnamed: 0,weight,dead,airbag,seatbelt,frontal,sex,ageOFocc,yearacc,yearVeh,abcat,occRole,deploy,injSeverity
0,53.342,0,1,1,1,0,48,2002,1997,deploy,1,1,3
1,154.96,1,0,0,1,1,26,2001,1968,unavail,1,0,3
2,38.994,1,0,0,1,0,51,2002,1994,unavail,1,0,3
3,168.568,1,1,1,1,1,27,1998,1996,deploy,0,1,3
4,27.751,1,1,1,0,1,26,2002,1997,nodeploy,0,0,0


**2. Split the dataset**

In [8]:
# Run this section to inspect X
X = df.drop(columns = ['ageOFocc'])

#TODO: Write code to inspect X
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8651 entries, 0 to 8651
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   weight    8651 non-null   float64
 1   dead      8651 non-null   int64  
 2   airbag    8651 non-null   int64  
 3   seatbelt  8651 non-null   int64  
 4   frontal   8651 non-null   int64  
 5   sex       8651 non-null   int64  
 6   yearacc   8651 non-null   int64  
 7   yearVeh   8651 non-null   float64
 8   abcat     8651 non-null   object 
 9   occRole   8651 non-null   int64  
 10  deploy    8651 non-null   int64  
dtypes: float64(2), int64(8), object(1)
memory usage: 811.0+ KB


In [9]:
# Uncomment this section to inpect y
y = df['ageOFocc']

#TODO: Write code to inspect y
y.info()

<class 'pandas.core.series.Series'>
Int64Index: 8651 entries, 0 to 8651
Series name: ageOFocc
Non-Null Count  Dtype
--------------  -----
8651 non-null   int64
dtypes: int64(1)
memory usage: 135.2 KB


**3. Compute entropy of data set**

In [11]:
# Compute the maximum entropy value
k = y.unique().size
maxE = np.log2(k)
p_data = y.value_counts(normalize=True)           # counts occurrence of each value
entropy = scipy.stats.entropy(p_data)  # get entropy from counts

# normalize the value to be between 0 and 1.
normalizedE = entropy/maxE

#TODO: Write code to display the entropy value
entropy

4.025941101403273

**4. Testing: entropy-based decision tree classifier averaged over 30 runs**

In [13]:
avg_score = 0.0
ntimes = 30

for i in repeat(None, ntimes):

    # train model with 80% of the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # prediction using entropy
    # Note: You can replace 'entropy' by 'gini' to get the classifier to use the gini index criterion.
    model = DecisionTreeClassifier(criterion='entropy')
    model.fit(X_train,y_train)
    predictions = model.predict(X_test)

    # compute model accuracy
    avg_score += accuracy_score(y_test, predictions)

avg_score /= ntimes

ValueError: could not convert string to float: 'deploy'

**5. print outputs**

In [33]:
print('normalized entropy value: %.3f'% normalizedE)
print('average accuracy score: %.3f' % avg_score)


normalized entropy value: 0.639
average accuracy score: 0.000


**Output the tree dot file**

In [14]:
# output visual (can be visualized with visual code)
tree.export_graphviz(model, out_file='SupervisedLearning/caraccidentgraph.dot',
                    feature_names=['airbag', 'seatbelt'],
                    class_names=sorted(y.unique()),
                    label='all',
                    rounded=True,
                    filled=True)

AttributeError: 'DecisionTreeClassifier' object has no attribute 'n_features_in_'

**6. Earn Your Wings: Do it yourself & Gini index test**

Test this algorithm on your own data set.
Repeat steps 4 and 5 with the gini index criterion, and save the visualization with *gini*. 

Open the dot files in VS Code, take screenshots of both trees and insert them in the text cell below.