# RandomForestClassifier

In [10]:
! pip install scikit-learn



In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [17]:
data = pd.read_csv('500hits.csv', encoding = 'latin=1')
data.head()

Unnamed: 0,PLAYER,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,CS,BA,HOF
0,Ty Cobb,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,178,0.366,1
1,Stan Musial,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,31,0.331,1
2,Tris Speaker,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,129,0.345,1
3,Derek Jeter,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,97,0.31,1
4,Honus Wagner,21,2792,10430,1736,3430,640,252,101,0,963,327,722,15,0.329,1


In [19]:
df = data.drop(columns = ['PLAYER', 'CS'])
df.head()

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA,HOF
0,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,0.366,1
1,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,0.331,1
2,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,0.345,1
3,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,0.31,1
4,21,2792,10430,1736,3430,640,252,101,0,963,327,722,0.329,1


In [20]:
X = df.iloc[:, 0:13]
y = df.iloc[: , 13]

In [21]:
X

Unnamed: 0,YRS,G,AB,R,H,2B,3B,HR,RBI,BB,SO,SB,BA
0,24,3035,11434,2246,4189,724,295,117,726,1249,357,892,0.366
1,22,3026,10972,1949,3630,725,177,475,1951,1599,696,78,0.331
2,22,2789,10195,1882,3514,792,222,117,724,1381,220,432,0.345
3,20,2747,11195,1923,3465,544,66,260,1311,1082,1840,358,0.310
4,21,2792,10430,1736,3430,640,252,101,0,963,327,722,0.329
...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,15,1920,6653,1105,1665,285,39,291,964,1224,1427,225,0.250
461,17,1829,6092,900,1664,379,10,275,1065,936,1453,20,0.273
462,15,1834,6499,1062,1661,338,67,210,761,960,1190,315,0.256
463,16,1822,6309,714,1660,254,25,54,593,396,489,74,0.263


In [22]:
y

0      1
1      1
2      1
3      1
4      1
      ..
460    0
461    0
462    0
463    0
464    0
Name: HOF, Length: 465, dtype: int64

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

In [26]:
model  = RandomForestClassifier()
model.fit(X_train, y_train)

In [29]:
y_pred = model.predict(X_test)
y_pred

array([1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0], dtype=int64)

In [30]:
model.score(X_test, y_test)

0.8817204301075269

In [31]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.94      0.91        62
           1       0.86      0.77      0.81        31

    accuracy                           0.88        93
   macro avg       0.87      0.85      0.86        93
weighted avg       0.88      0.88      0.88        93



In [35]:
features = pd.DataFrame(model.feature_importances_, index = X.columns)
features

Unnamed: 0,0
YRS,0.027737
G,0.051129
AB,0.066564
R,0.148363
H,0.117355
2B,0.076797
3B,0.053697
HR,0.048395
RBI,0.104723
BB,0.050196


# HyperParameter

In [49]:
model2 = RandomForestClassifier(n_estimators = 1000, 
                               min_samples_split = 10,
                               criterion = 'entropy',
                                max_depth = 14,
                                random_state = 11
                                
)

In [50]:
model2.fit(X_train, y_train)

In [51]:
model2.score(X_test, y_test)

0.8817204301075269

In [53]:
y_pred = model2.predict(X_test)
y_pred

array([1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0], dtype=int64)

In [54]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.94      0.89      0.91        65
           1       0.77      0.86      0.81        28

    accuracy                           0.88        93
   macro avg       0.85      0.87      0.86        93
weighted avg       0.89      0.88      0.88        93

