In [1]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import tensorflow as tf
import numpy as np
from sklearn.ensemble import RandomForestClassifier



# Use raw URLs from GitHub
combined_url = 'https://raw.githubusercontent.com/Christian-Albertini/project-4/main/Resources/Merged_Stats_Final_Pos.csv'

# Read the CSV files
Merged_Stats = pd.read_csv(combined_url)

# Display the first few rows of each DataFrame
Merged_Stats.head()


Unnamed: 0,Player,Season,Games_Played,All-Star,Plate_Appearances,At_Bats,Runs,Hits,Home_Run,Runs_Batted_In,Stolen_Bases,Walks,Strikeouts,Batting_Average,On_Base_Percentage,Slugging_Percentage,On_Base_Plus_Slugging_Plus_(OPS+),Pos,Standings,WAR
0,Adam Duvall,2023,92,0,353,320,45,79,21,58,4,22,110,0.247,0.303,0.531,119,8,20,1.6
1,Adam Frazier,2023,141,0,455,412,59,99,13,60,11,32,68,0.24,0.3,0.396,94,4,2,1.7
2,Adley Rutschman,2023,154,1,687,588,84,163,20,80,1,92,101,0.277,0.374,0.435,128,2,2,4.3
3,Adolis García,2023,148,1,632,555,108,136,39,107,9,65,175,0.245,0.328,0.508,127,9,7,4.2
4,Akil Baddoo,2023,112,0,357,312,40,68,11,34,14,42,89,0.218,0.31,0.372,88,7,19,0.6


In [2]:
Merged_Stats = Merged_Stats.drop(columns=['Player', 'Games_Played'])
#Merged_Stats = Merged_Stats.drop(Merged_Stats.columns[0], axis=1)
Merged_Stats = Merged_Stats.dropna()
Merged_Stats.head()

Unnamed: 0,Season,All-Star,Plate_Appearances,At_Bats,Runs,Hits,Home_Run,Runs_Batted_In,Stolen_Bases,Walks,Strikeouts,Batting_Average,On_Base_Percentage,Slugging_Percentage,On_Base_Plus_Slugging_Plus_(OPS+),Pos,Standings,WAR
0,2023,0,353,320,45,79,21,58,4,22,110,0.247,0.303,0.531,119,8,20,1.6
1,2023,0,455,412,59,99,13,60,11,32,68,0.24,0.3,0.396,94,4,2,1.7
2,2023,1,687,588,84,163,20,80,1,92,101,0.277,0.374,0.435,128,2,2,4.3
3,2023,1,632,555,108,136,39,107,9,65,175,0.245,0.328,0.508,127,9,7,4.2
4,2023,0,357,312,40,68,11,34,14,42,89,0.218,0.31,0.372,88,7,19,0.6


In [None]:
#Merged_Stats[['Season', 'All-Star', 'Plate_Appearances', 'At_Bats', 'Runs', 'Hits', 'Home_Run', 'Runs_Batted_In', 'Stolen_Bases', 'Walks', 'Strikeouts', 'Pos', 'Standings','On_Base_Plus_Slugging_Plus_(OPS+)']] = Merged_Stats[['Season', 'All-Star', 'Plate_Appearances', 'At_Bats', 'Runs', 'Hits', 'Home_Run', 'Runs_Batted_In', 'Stolen_Bases', 'Walks', 'Strikeouts', 'Pos', 'Standings','On_Base_Plus_Slugging_Plus_(OPS+)']].astype(int)
#Merged_Stats.head()

Unnamed: 0,Season,All-Star,Plate_Appearances,At_Bats,Runs,Hits,Home_Run,Runs_Batted_In,Stolen_Bases,Walks,Strikeouts,Bating_Average,On_Base_Percentage,Slugging_Percentage,On_Base_Plus_Slugging_Plus_(OPS+),Pos,Standings,WAR
0,2023,0,353,320,45,79,21,58,4,22,110,0.247,0.303,0.531,119,8,20,1.6
1,2023,0,455,412,59,99,13,60,11,32,68,0.24,0.3,0.396,94,4,2,1.7
2,2023,1,687,588,84,163,20,80,1,92,101,0.277,0.374,0.435,128,2,2,4.3
4,2023,0,357,312,40,68,11,34,14,42,89,0.218,0.31,0.372,88,7,19,0.6
5,2023,0,611,558,74,153,20,97,4,42,94,0.274,0.327,0.437,106,5,8,0.4


In [None]:
# Convert categorical data to numeric
#combined_df = pd.get_dummies(Combined_Player_Stats, columns=['League'])

# Review Dataframes
#combined_df.head()

In [21]:
# Identify feature and target arrays
y = Merged_Stats['All-Star']
X = Merged_Stats.drop(columns=['All-Star'])

# Split the data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [22]:
# Apply SMOTE to the training data to handle class imbalance

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [23]:
# Initialize the Random Forest model
model = RandomForestClassifier(n_estimators=25, random_state=42, class_weight='balanced')

# Train the model on the resampled training data
model.fit(X_train_smote, y_train_smote)

In [24]:
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature Importance
importances = model.feature_importances_
feature_names = X.columns
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importances)


Confusion Matrix:
[[468  79]
 [ 43  68]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.86      0.88       547
           1       0.46      0.61      0.53       111

    accuracy                           0.81       658
   macro avg       0.69      0.73      0.71       658
weighted avg       0.84      0.81      0.82       658


Feature Importances:
                              Feature  Importance
16                                WAR    0.185649
6                      Runs_Batted_In    0.099902
12                Slugging_Percentage    0.089506
10                    Batting_Average    0.076124
4                                Hits    0.071668
3                                Runs    0.070845
13  On_Base_Plus_Slugging_Plus_(OPS+)    0.063038
11                 On_Base_Percentage    0.039433
2                             At_Bats    0.039079
14                                Pos    0.038013
1                   Plate_Appearance