In [34]:
import pandas as pd

In [35]:
# We re trying to predict whether or not a baseball player
# is inducted into the hall of fame (hof) using a classification tree

df = pd.read_csv("C:/Users/BARIS/Desktop/500hits.csv", encoding = 'latin-1')

In [36]:
df.head()

Unnamed: 0,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA,HOF
0,Ty Cobb,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,178,0.366,1
1,Stan Musial,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,31,0.331,1
2,Tris Speaker,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,129,0.345,1
3,Derek Jeter,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,97,0.31,1
4,Honus Wagner,21,2792,10430,1736,3430,640,252,101,0,963,327,722,15,0.329,1


In [37]:
df.shape

(465, 16)

In [38]:
df = df.drop(columns = ['PLAYER', 'CS'])

In [39]:
# HOF column is my target variable
# so all other features will form my inputs
# for my input matrix X include all rows and every column except column 14 
# which is the HOF  column
# remember python begins indexing from 0

X = df.iloc[:,0:13]

In [40]:
y = df.iloc[:,13]

In [41]:
# we will split up our date between a training and a testing set

from sklearn.model_selection import train_test_split


In [42]:
import sklearn as sk

In [43]:
# random_state argument shuffles the data before applying the split
# It also makes the model more reproducible

x_train, x_test, y_train, y_test = train_test_split(X,y,random_state = 17, test_size=0.2)
# 80% of data is trained
# 20% of data is tested

In [44]:
x_train.shape

(372, 13)

In [45]:
x_test.shape

# to make sure the train test split happened correctly

(93, 13)

In [46]:
from sklearn.tree import DecisionTreeClassifier

In [47]:
dtc = DecisionTreeClassifier()

In [48]:
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [49]:
 # now we will fit model to this data

dtc.fit(x_train, y_train)



In [50]:
y_pred = dtc.predict(x_test)

In [51]:
# we will do a confusion matrix (error matrix) to test this model
# a confusion matrix is a matrix where entry(1,1) is true positive between actual and predicted values 
# entry (1,2) is false negative between actual (rows) and (col.s) predicted values
# entry (2,1) is false positive and
# entry (2,2) is a true negative

from sklearn.metrics import confusion_matrix as cm

print(cm(y_test, y_pred))


[[51 10]
 [12 20]]


In [52]:
# 52 true positives
# 9 false negatives
# 11 false positives
# 21 true negatives
from sklearn.metrics import classification_report

In [53]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.84      0.82        61
           1       0.67      0.62      0.65        32

    accuracy                           0.76        93
   macro avg       0.74      0.73      0.73        93
weighted avg       0.76      0.76      0.76        93



In [54]:
 # precision = true positive / (true pos + false positive)
# recall = true positive / (true pos + false negative)

# WHAT IS F1-SCORE ?????
# we will look at which features have had the biggest impact on our model

In [55]:
dtc.feature_importances_
# the importance of each feature is calculated using a measure called
# the gini importance

array([0.04465915, 0.0154869 , 0.04574729, 0.02765362, 0.38738161,
       0.04325963, 0.04400595, 0.00816891, 0.07925991, 0.10006703,
       0.03934693, 0.05632516, 0.10863792])

In [56]:
features = pd.DataFrame(dtc.feature_importances_, index = X.columns )

In [57]:
features.head(15)
# a dataframe showing importance of each feature

Unnamed: 0,0
YRS,0.044659
G,0.015487
AB,0.045747
R,0.027654
H,0.387382
2B,0.04326
3B,0.044006
HR,0.008169
RBI,0.07926
BB,0.100067


In [58]:
# it makes sense that hits are the most important
# the least important feature appears to be 3B in this case
# were going to run the model again

In [59]:
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': None,
 'splitter': 'best'}

In [60]:
# criterion argument is the function used to measure the quality of a split
# Default (criterion) = "gini" for Gini impurity
# were going to choose a criterion called "entropy"
dtc2 = DecisionTreeClassifier(criterion='entropy', ccp_alpha=0.04)

# we chose ccp_alpha = 0.04 because we think the model overfits the data

In [63]:
dtc2.fit(x_train,y_train)

In [65]:
y_pred2 = dtc2.predict(x_test)

In [67]:
print(cm(y_test,y_pred2))

[[50 11]
 [ 9 23]]


In [68]:
print(classification_report(y_test,y_pred2))

              precision    recall  f1-score   support

           0       0.85      0.82      0.83        61
           1       0.68      0.72      0.70        32

    accuracy                           0.78        93
   macro avg       0.76      0.77      0.77        93
weighted avg       0.79      0.78      0.79        93



In [69]:
# this change didn't make too much of an impact
# but we re still going to look at feature importances
# by creating a dataframe of where every element is the importance
# of a feature

features2 = pd.DataFrame(dtc2.feature_importances_, index = X.columns)

In [70]:
features2.head(15)

Unnamed: 0,0
YRS,0.0
G,0.0
AB,0.0
R,0.0
H,0.837977
2B,0.0
3B,0.0
HR,0.0
RBI,0.0
BB,0.0


In [None]:
# this change is due to ccp.
# ccp removes part of the tree in order to stop overfitting