In [1]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import tensorflow as tf
import numpy as np
from sklearn.ensemble import RandomForestClassifier



# Use raw URLs from GitHub
combined_url = 'https://raw.githubusercontent.com/Christian-Albertini/project-4/main/Resources/Combined_Player_Stats_Cleaned_3.csv'

# Read the CSV files
Combined_Player_Stats = pd.read_csv(combined_url)

# Display the first few rows of each DataFrame
Combined_Player_Stats.head()


Unnamed: 0,Season,Team,League,Games_Played,All-Star,Plate_Appearances,At_Bats,Runs,Hits,Single,...,Stolen_Bases,Caught_Stealing,Walks,Strikeouts,Batting_Average,On_Base_Percentage,Slugging_Percentage,On_Base_Plus_Slugging_(OPS),On_Base_Plus_Slugging_Plus_(OPS+),Total_Bases
0,2023,BOS,AL,92,0,353,320,45,79,32,...,4,0,22,110,0.247,0.303,0.531,0.834,119,170
1,2023,BAL,AL,141,0,455,412,59,99,63,...,11,4,32,68,0.24,0.3,0.396,0.696,94,163
2,2023,BAL,AL,154,1,687,588,84,163,111,...,1,2,92,101,0.277,0.374,0.435,0.809,128,256
3,2023,TEX,AL,148,1,632,555,108,136,68,...,9,1,65,175,0.245,0.328,0.508,0.836,127,282
4,2023,DET,AL,112,0,357,312,40,68,43,...,14,3,42,89,0.218,0.31,0.372,0.682,88,116


In [2]:
# Convert categorical data to numeric
combined_df = pd.get_dummies(Combined_Player_Stats, columns=['League', 'Team'])

# Review Dataframes
combined_df.head()

Unnamed: 0,Season,Games_Played,All-Star,Plate_Appearances,At_Bats,Runs,Hits,Single,Double,Triple,...,Team_PHI,Team_PIT,Team_SDP,Team_SEA,Team_SFG,Team_STL,Team_TBR,Team_TEX,Team_TOR,Team_WSN
0,2023,92,0,353,320,45,79,32,24,2,...,False,False,False,False,False,False,False,False,False,False
1,2023,141,0,455,412,59,99,63,21,2,...,False,False,False,False,False,False,False,False,False,False
2,2023,154,1,687,588,84,163,111,31,1,...,False,False,False,False,False,False,False,False,False,False
3,2023,148,1,632,555,108,136,68,29,0,...,False,False,False,False,False,False,False,True,False,False
4,2023,112,0,357,312,40,68,43,13,1,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# Identify feature and target arrays
y = combined_df['All-Star']
X = combined_df.drop(columns=['All-Star'])

# Split the data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [10]:
# Apply SMOTE to the training data to handle class imbalance

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [13]:
# Initialize the Random Forest model
model = RandomForestClassifier(n_estimators=50, random_state=42)

# Train the model on the resampled training data
model.fit(X_train_smote, y_train_smote)

In [14]:
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature Importance
importances = model.feature_importances_
feature_names = X.columns
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importances)

Confusion Matrix:
[[809  88]
 [ 94 106]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.90      0.90       897
           1       0.55      0.53      0.54       200

    accuracy                           0.83      1097
   macro avg       0.72      0.72      0.72      1097
weighted avg       0.83      0.83      0.83      1097


Feature Importances:
                              Feature  Importance
18        On_Base_Plus_Slugging_(OPS)    0.089601
20                        Total_Bases    0.073738
19  On_Base_Plus_Slugging_Plus_(OPS+)    0.073376
4                                Runs    0.068741
5                                Hits    0.065432
2                   Plate_Appearances    0.058374
17                Slugging_Percentage    0.051677
15                    Batting_Average    0.046357
10                     Runs_Batted_In    0.042674
3                             At_Bats    0.039941
13                              Walk