<a href="https://colab.research.google.com/github/Christian-Albertini/project-4/blob/main/Project_4_with_standings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import tensorflow as tf
import numpy as np



# Use raw URLs from GitHub
combined_url = 'https://raw.githubusercontent.com/Christian-Albertini/project-4/main/Resources/Merged_Player_Stats.csv'

# Read the CSV files
Combined_Player_Stats = pd.read_csv(combined_url)

# Display the first few rows of each DataFrame
Combined_Player_Stats.head()


Unnamed: 0,Player,Season,Age,Team,Lg,G,AS,PA,AB,R,...,OPS,OPS+,TB,GIDP,HBP,SH,SF,IBB,Pos,Standings
0,Adam Duvall,2023,34,BOS,AL,92,0,353,320,45,...,0.834,119,170,0,6,0,5,1,89/H7D,20.0
1,Adam Frazier,2023,31,BAL,AL,141,0,455,412,59,...,0.696,94,163,4,4,4,2,0,*4H/97D,2.0
2,Adley Rutschman,2023,25,BAL,AL,154,1,687,588,84,...,0.809,128,256,14,2,0,5,6,*2D/H,2.0
3,Adolis García,2023,30,TEX,AL,148,1,632,555,108,...,0.836,127,282,12,6,0,6,0,*9/8DH,7.0
4,Akil Baddoo,2023,24,DET,AL,112,0,357,312,40,...,0.682,88,116,4,0,2,1,2,7H/89D,19.0


In [3]:
# Convert categorical data to numeric
combined_df = pd.get_dummies(Combined_Player_Stats, columns=['Lg'])
combined_df=combined_df.drop(columns=['Team', '1B', '2B', '3B', 'CS', 'GIDP','HBP','SH','SF','IBB','Pos'])
combined_df=combined_df.drop(columns=['OPS', 'Age', 'Player'])
combined_df=combined_df.dropna()
# Review Dataframes
combined_df.head()

Unnamed: 0,Season,G,AS,PA,AB,R,H,HR,RBI,SB,BB,SO,BA,OBP,SLG,OPS+,TB,Standings,Lg_AL,Lg_NL
0,2023,92,0,353,320,45,79,21,58,4,22,110,0.247,0.303,0.531,119,170,20.0,True,False
1,2023,141,0,455,412,59,99,13,60,11,32,68,0.24,0.3,0.396,94,163,2.0,True,False
2,2023,154,1,687,588,84,163,20,80,1,92,101,0.277,0.374,0.435,128,256,2.0,True,False
3,2023,148,1,632,555,108,136,39,107,9,65,175,0.245,0.328,0.508,127,282,7.0,True,False
4,2023,112,0,357,312,40,68,11,34,14,42,89,0.218,0.31,0.372,88,116,19.0,True,False


In [6]:
# Identify feature and target arrays
combined_df=combined_df.drop(columns=['Lg_AL', 'Lg_NL'])
y = combined_df['AS']
X = combined_df.drop(columns=['AS'])

# Split the data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [7]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit and scale the data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to balance the dataset
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

In [12]:
# Compute class weights
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_resampled),
    y=y_train_resampled
)
class_weight_dict = dict(enumerate(class_weights))

# Define model
number_input_features = len(X_train_resampled[0])
hidden_nodes_layer1 = 30
hidden_nodes_layer2 = 20
hidden_nodes_layer3 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation='relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation='sigmoid'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer3, activation='sigmoid'))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation='sigmoid'))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
# Compile the model
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
# Train the model
fit_model = nn.fit(
    X_train_resampled, y_train_resampled,
    epochs=50,
    validation_split=0.2,
    class_weight=class_weight_dict,
    verbose=1
)

Epoch 1/50
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6084 - loss: 0.6578 - val_accuracy: 0.0000e+00 - val_loss: 0.8871
Epoch 2/50
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6555 - loss: 0.5799 - val_accuracy: 0.6296 - val_loss: 0.7330
Epoch 3/50
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7863 - loss: 0.5056 - val_accuracy: 0.7222 - val_loss: 0.6825
Epoch 4/50
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7831 - loss: 0.4790 - val_accuracy: 0.7551 - val_loss: 0.6325
Epoch 5/50
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7926 - loss: 0.4590 - val_accuracy: 0.7819 - val_loss: 0.5767
Epoch 6/50
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7755 - loss: 0.4570 - val_accuracy: 0.7840 - val_loss: 0.5766
Epoch 7/50
[1m61/61[0m [32m━━━━━━

In [15]:
# Predict probabilities
y_pred_proba = nn.predict(X_test_scaled)

# Convert probabilities to binary class labels
y_pred = (y_pred_proba > 0.5).astype('int32').flatten()

# Generate and print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print ROC-AUC score
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {roc_auc}")

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.83      0.88       516
           1       0.50      0.71      0.59       121

    accuracy                           0.81       637
   macro avg       0.71      0.77      0.73       637
weighted avg       0.84      0.81      0.82       637

ROC-AUC Score: 0.8551316548145301
