In [None]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import tensorflow as tf
import numpy as np
from sklearn.ensemble import RandomForestClassifier



# Raw GitHub URL of the cleaned, combined player statistics CSV
combined_url = 'https://raw.githubusercontent.com/Christian-Albertini/project-4/main/Resources/Combined_Player_Stats_Cleaned_3.csv'

# Load the single CSV into a DataFrame (fetched over the network)
Combined_Player_Stats = pd.read_csv(filepath_or_buffer=combined_url)

# Preview the first five rows as the cell's displayed output
Combined_Player_Stats.head(n=5)


Unnamed: 0,Season,Team,League,Games_Played,All-Star,Plate_Appearances,At_Bats,Runs,Hits,Single,...,Stolen_Bases,Caught_Stealing,Walks,Strikeouts,Batting_Average,On_Base_Percentage,Slugging_Percentage,On_Base_Plus_Slugging_(OPS),On_Base_Plus_Slugging_Plus_(OPS+),Total_Bases
0,2023,BOS,AL,92,0,353,320,45,79,32,...,4,0,22,110,0.247,0.303,0.531,0.834,119,170
1,2023,BAL,AL,141,0,455,412,59,99,63,...,11,4,32,68,0.24,0.3,0.396,0.696,94,163
2,2023,BAL,AL,154,1,687,588,84,163,111,...,1,2,92,101,0.277,0.374,0.435,0.809,128,256
3,2023,TEX,AL,148,1,632,555,108,136,68,...,9,1,65,175,0.245,0.328,0.508,0.836,127,282
4,2023,DET,AL,112,0,357,312,40,68,43,...,14,3,42,89,0.218,0.31,0.372,0.682,88,116


In [None]:
# Remove the columns that are not used as model inputs.
# The result is assigned back to the same name, so this cell is re-run against
# an already-trimmed frame whenever it is executed twice in one kernel session.
# `errors='ignore'` makes that second run a no-op instead of a KeyError.
# Trade-off: a mistyped column name in this list is silently skipped as well.
columns_to_drop = [
    'Team',
    'Single',
    'Double',
    'Triple',
    'Caught_Stealing',
    'On_Base_Plus_Slugging_(OPS)',
    'League',
]
Combined_Player_Stats = Combined_Player_Stats.drop(columns=columns_to_drop, errors='ignore')

# Preview the trimmed frame
Combined_Player_Stats.head()

Unnamed: 0,Season,Games_Played,All-Star,Plate_Appearances,At_Bats,Runs,Hits,Home_Runs,Runs_Batted_In,Stolen_Bases,Walks,Strikeouts,Batting_Average,On_Base_Percentage,Slugging_Percentage,On_Base_Plus_Slugging_Plus_(OPS+),Total_Bases
0,2023,92,0,353,320,45,79,21,58,4,22,110,0.247,0.303,0.531,119,170
1,2023,141,0,455,412,59,99,13,60,11,32,68,0.24,0.3,0.396,94,163
2,2023,154,1,687,588,84,163,20,80,1,92,101,0.277,0.374,0.435,128,256
3,2023,148,1,632,555,108,136,39,107,9,65,175,0.245,0.328,0.508,127,282
4,2023,112,0,357,312,40,68,11,34,14,42,89,0.218,0.31,0.372,88,116


In [None]:
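# Convert categorical data to numeric
# (Disabled: 'League' is dropped from Combined_Player_Stats in the cell above,
# so this one-hot encoding step would fail if re-enabled as is.)
#combined_df = pd.get_dummies(Combined_Player_Stats, columns=['League'])

# Review the DataFrame
#combined_df.head()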

In [None]:
# Identify feature and target arrays
y = Combined_Player_Stats['All-Star']
X = Combined_Player_Stats.drop(columns=['All-Star'])

# Split the data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
# Apply SMOTE to the training data to handle class imbalance

# Seeded SMOTE oversampler; only the training split is resampled, the test
# split is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# Initialize the Random Forest model
# Random forest with 25 trees and a fixed seed
model = RandomForestClassifier(random_state=42, n_estimators=25)

# Fit on the SMOTE-resampled training split (left as the cell's last expression
# so the fitted estimator is still displayed)
model.fit(X=X_train_smote, y=y_train_smote)

In [None]:
# Predict on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature Importance
importances = model.feature_importances_
feature_names = X.columns
feature_importances = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
feature_importances = feature_importances.sort_values(by='Importance', ascending=False)

print("\nFeature Importances:")
print(feature_importances)

Confusion Matrix:
[[160  16]
 [ 18  26]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.91      0.90       176
           1       0.62      0.59      0.60        44

    accuracy                           0.85       220
   macro avg       0.76      0.75      0.75       220
weighted avg       0.84      0.85      0.84       220


Feature Importances:
                              Feature  Importance
15                        Total_Bases    0.149395
13                Slugging_Percentage    0.100077
14  On_Base_Plus_Slugging_Plus_(OPS+)    0.085029
7                      Runs_Batted_In    0.082373
4                                Runs    0.082000
11                    Batting_Average    0.074725
2                   Plate_Appearances    0.070375
12                 On_Base_Percentage    0.054260
5                                Hits    0.052307
10                         Strikeouts    0.044952
9                               Walk