<a href="https://colab.research.google.com/github/Christian-Albertini/project-4/blob/main/Project_4_random_forest_with_standings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import tensorflow as tf
import numpy as np
from sklearn.ensemble import RandomForestClassifier



# Use raw URLs from GitHub
combined_url = 'https://raw.githubusercontent.com/Christian-Albertini/project-4/main/Resources/Merged_Player_Stats.csv'

# Read the CSV files
Combined_Player_Stats = pd.read_csv(combined_url)

# Display the first few rows of each DataFrame
Combined_Player_Stats.head()


Unnamed: 0,Player,Season,Age,Team,Lg,G,AS,PA,AB,R,...,OPS,OPS+,TB,GIDP,HBP,SH,SF,IBB,Pos,Standings
0,Adam Duvall,2023,34,BOS,AL,92,0,353,320,45,...,0.834,119,170,0,6,0,5,1,89/H7D,20.0
1,Adam Frazier,2023,31,BAL,AL,141,0,455,412,59,...,0.696,94,163,4,4,4,2,0,*4H/97D,2.0
2,Adley Rutschman,2023,25,BAL,AL,154,1,687,588,84,...,0.809,128,256,14,2,0,5,6,*2D/H,2.0
3,Adolis García,2023,30,TEX,AL,148,1,632,555,108,...,0.836,127,282,12,6,0,6,0,*9/8DH,7.0
4,Akil Baddoo,2023,24,DET,AL,112,0,357,312,40,...,0.682,88,116,4,0,2,1,2,7H/89D,19.0


In [7]:
# Convert categorical data to numeric
Combined_Player_Stats=Combined_Player_Stats.drop(columns=['Team', '1B', '2B', '3B', 'CS', 'Lg', 'GIDP','HBP','SH','SF','IBB','Pos'])
Combined_Player_Stats=Combined_Player_Stats.drop(columns=['OPS', 'Age', 'Player'])
Combined_Player_Stats=Combined_Player_Stats.dropna()

# Review Dataframes
Combined_Player_Stats.head()

Unnamed: 0,Season,G,AS,PA,AB,R,H,HR,RBI,SB,BB,SO,BA,OBP,SLG,OPS+,TB,Standings
0,2023,92,0,353,320,45,79,21,58,4,22,110,0.247,0.303,0.531,119,170,20.0
1,2023,141,0,455,412,59,99,13,60,11,32,68,0.24,0.3,0.396,94,163,2.0
2,2023,154,1,687,588,84,163,20,80,1,92,101,0.277,0.374,0.435,128,256,2.0
3,2023,148,1,632,555,108,136,39,107,9,65,175,0.245,0.328,0.508,127,282,7.0
4,2023,112,0,357,312,40,68,11,34,14,42,89,0.218,0.31,0.372,88,116,19.0


In [16]:
# Identify feature and target arrays
y = Combined_Player_Stats['AS']
X = Combined_Player_Stats.drop(columns=['AS'])
y.head()
# Split the data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Apply SMOTE to the training data to handle class imbalance

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [20]:
# Initialize the Random Forest model
model = RandomForestClassifier(n_estimators=25, random_state=42)

# Train the model on the resampled training data
model.fit(X_train_smote, y_train_smote)

In [21]:
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Calculate ROC AUC score
roc_auc = roc_auc_score(y_test, y_pred)
print(f"\nROC AUC Score: {roc_auc}")

# # Feature Importance
importances = model.feature_importances_
feature_names = X.columns
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importances)

Confusion Matrix:
[[282  62]
 [ 27  54]]

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.82      0.86       344
           1       0.47      0.67      0.55        81

    accuracy                           0.79       425
   macro avg       0.69      0.74      0.71       425
weighted avg       0.83      0.79      0.80       425


ROC AUC Score: 0.7432170542635659

Feature Importances:
      Feature  Importance
7         RBI    0.113251
11         BA    0.094768
14       OPS+    0.088005
15         TB    0.083914
5           H    0.079773
13        SLG    0.066808
4           R    0.057020
6          HR    0.052939
16  Standings    0.051488
12        OBP    0.048176
2          PA    0.046864
3          AB    0.043604
9          BB    0.040646
10         SO    0.039356
1           G    0.035477
8          SB    0.030702
0      Season    0.027208
