In [5]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [6]:
data = pd.read_csv('500hits.csv', encoding = 'latin-1')
data.head()

Unnamed: 0,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA,HOF
0,Ty Cobb,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,178,0.366,1
1,Stan Musial,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,31,0.331,1
2,Tris Speaker,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,129,0.345,1
3,Derek Jeter,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,97,0.31,1
4,Honus Wagner,21,2792,10430,1736,3430,640,252,101,0,963,327,722,15,0.329,1


In [7]:
df = data.drop(columns = ['PLAYER', 'CS'])
df.head()

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA,HOF
0,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,0.366,1
1,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,0.331,1
2,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,0.345,1
3,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,0.31,1
4,21,2792,10430,1736,3430,640,252,101,0,963,327,722,0.329,1


In [8]:
X = df.iloc[:, 0:13]
y = df.iloc[:, 13]

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [11]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)


In [22]:
dtc.get_params()

{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'monotonic_cst': None,
 'random_state': None,
 'splitter': 'best'}

In [14]:
dtc.score(X_test, y_test)

0.8387096774193549

In [17]:
y_pred = dtc.predict(X_test)
print('pridected', y_pred)

pridected [1 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 0 1 0 1 0 0 1 1 1 0 0 1 0 1 1 1
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 1 1
 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 1 0]


In [19]:
print(confusion_matrix(y_test, y_pred))

[[54  8]
 [ 7 24]]


In [20]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88        62
           1       0.75      0.77      0.76        31

    accuracy                           0.84        93
   macro avg       0.82      0.82      0.82        93
weighted avg       0.84      0.84      0.84        93



In [25]:
dtc.feature_importances_

array([0.01044965, 0.02803169, 0.03836323, 0.32387975, 0.06333245,
       0.0353301 , 0.02989079, 0.04532669, 0.1070903 , 0.05955122,
       0.05432816, 0.04268329, 0.16174268])

In [26]:
X.columns

Index(['YRS', 'G', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'BB', 'SO', 'SB',
       'BA'],
      dtype='object')

In [31]:
features = pd.DataFrame(dtc.feature_importances_, index = X.columns)
features

Unnamed: 0,0
YRS,0.01045
G,0.028032
AB,0.038363
R,0.32388
H,0.063332
2B,0.03533
3B,0.029891
HR,0.045327
RBI,0.10709
BB,0.059551


In [34]:
dtc2 = DecisionTreeClassifier(criterion = 'entropy', ccp_alpha = 0.04)

In [35]:
dtc2.fit(X_train, y_train)

In [38]:
dtc.score(X_test, y_test)

0.8387096774193549

In [40]:
y_pred2 = dtc.predict(X_test)
print(y_pred2)

[1 0 1 0 0 0 0 1 0 0 0 1 0 1 0 1 0 0 1 1 1 0 1 0 1 0 0 1 1 1 0 0 1 0 1 1 1
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 1 0 1 0 1 1
 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 1 0]


In [42]:
print(confusion_matrix(y_test,y_pred2))

[[54  8]
 [ 7 24]]


In [43]:
print(classification_report(y_test, y_pred2))

              precision    recall  f1-score   support

           0       0.89      0.87      0.88        62
           1       0.75      0.77      0.76        31

    accuracy                           0.84        93
   macro avg       0.82      0.82      0.82        93
weighted avg       0.84      0.84      0.84        93

