In [14]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import tensorflow as tf
import numpy as np



# Use raw URLs from GitHub
combined_url = 'https://raw.githubusercontent.com/Christian-Albertini/project-4/main/Resources/Combined_Player_Stats_Cleaned_3.csv'

# Read the CSV files
Combined_Player_Stats = pd.read_csv(combined_url)

# Display the first few rows of each DataFrame
Combined_Player_Stats.head()


Unnamed: 0,Season,Team,League,Games_Played,All-Star,Plate_Appearances,At_Bats,Runs,Hits,Single,...,Stolen_Bases,Caught_Stealing,Walks,Strikeouts,Batting_Average,On_Base_Percentage,Slugging_Percentage,On_Base_Plus_Slugging_(OPS),On_Base_Plus_Slugging_Plus_(OPS+),Total_Bases
0,2023,BOS,AL,92,0,353,320,45,79,32,...,4,0,22,110,0.247,0.303,0.531,0.834,119,170
1,2023,BAL,AL,141,0,455,412,59,99,63,...,11,4,32,68,0.24,0.3,0.396,0.696,94,163
2,2023,BAL,AL,154,1,687,588,84,163,111,...,1,2,92,101,0.277,0.374,0.435,0.809,128,256
3,2023,TEX,AL,148,1,632,555,108,136,68,...,9,1,65,175,0.245,0.328,0.508,0.836,127,282
4,2023,DET,AL,112,0,357,312,40,68,43,...,14,3,42,89,0.218,0.31,0.372,0.682,88,116


In [2]:
# Convert categorical data to numeric
combined_df = pd.get_dummies(Combined_Player_Stats, columns=['League', 'Team'])

# Review Dataframes
combined_df.head()

Unnamed: 0,Season,Games_Played,All-Star,Plate_Appearances,At_Bats,Runs,Hits,Single,Double,Triple,...,Team_PHI,Team_PIT,Team_SDP,Team_SEA,Team_SFG,Team_STL,Team_TBR,Team_TEX,Team_TOR,Team_WSN
0,2023,92,0,353,320,45,79,32,24,2,...,False,False,False,False,False,False,False,False,False,False
1,2023,141,0,455,412,59,99,63,21,2,...,False,False,False,False,False,False,False,False,False,False
2,2023,154,1,687,588,84,163,111,31,1,...,False,False,False,False,False,False,False,False,False,False
3,2023,148,1,632,555,108,136,68,29,0,...,False,False,False,False,False,False,False,True,False,False
4,2023,112,0,357,312,40,68,43,13,1,...,False,False,False,False,False,False,False,False,False,False


In [3]:
# Identify feature and target arrays
y = combined_df['All-Star']
X = combined_df.drop(columns=['All-Star'])

# Split the data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [9]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit and scale the data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to balance the dataset
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [15]:
# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_resampled),
    y=y_train_resampled
)
class_weight_dict = dict(enumerate(class_weights))

# Define model
number_input_features = len(X_train_resampled[0])
hidden_nodes_layer1 = 30
hidden_nodes_layer2 = 20
hidden_nodes_layer3 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation='relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='relu'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation='relu'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [16]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [17]:
# Train the model
fit_model = nn.fit(
    X_train_resampled, y_train_resampled,
    epochs=50,
    validation_split=0.2,
    class_weight=class_weight_dict,
    verbose=1
)

Epoch 1/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.6172 - loss: 0.6475 - val_accuracy: 0.7740 - val_loss: 0.5073
Epoch 2/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8130 - loss: 0.4497 - val_accuracy: 0.7740 - val_loss: 0.5186
Epoch 3/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8259 - loss: 0.3857 - val_accuracy: 0.8180 - val_loss: 0.4363
Epoch 4/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8446 - loss: 0.3655 - val_accuracy: 0.8240 - val_loss: 0.4325
Epoch 5/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8535 - loss: 0.3344 - val_accuracy: 0.8200 - val_loss: 0.4181
Epoch 6/50
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8340 - loss: 0.3539 - val_accuracy: 0.8480 - val_loss: 0.3791
Epoch 7/50
[1m63/63[0m [32m━━━━━━━━━━

In [18]:
# Predict probabilities
y_pred_proba = nn.predict(X_test_scaled)

# Convert probabilities to binary class labels
y_pred = (y_pred_proba > 0.5).astype('int32').flatten()

# Generate and print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print ROC-AUC score
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {roc_auc}")

[1m21/21[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.89      0.89       540
           1       0.48      0.47      0.47       119

    accuracy                           0.81       659
   macro avg       0.68      0.68      0.68       659
weighted avg       0.81      0.81      0.81       659

ROC-AUC Score: 0.7875194522253347
