In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

# Specify the folder path containing CSV files
folder_path = '/content/PL_data'

# Read and combine all CSV files into one DataFrame
all_files = [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith('.csv')]
dataframes = [pd.read_csv(f) for f in all_files]
combined_df = pd.concat(dataframes, ignore_index=True)




In [2]:

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    # Check if the file is a CSV (you can modify this if your files have a different format)
    if filename.endswith('.csv'):
        file_path = os.path.join(folder_path, filename)

        # Load the data into a DataFrame
        df = pd.read_csv(file_path)

        # Calculate the count of null values in each column
        null_counts = df.isnull().sum()

        # Display the results for this file
        print(f"File: {filename}")
        print("Count of null cells in each column:")
        print(null_counts)
        print("\n" + "-"*40 + "\n")  # Separator for readability


File: player_clean_sheets.csv
Count of null cells in each column:
Rank              0
Player            0
Team              0
Clean Sheets      0
Goals Conceded    0
Minutes           0
Matches           0
Country           0
dtype: int64

----------------------------------------

File: accurate_pass_team.csv
Count of null cells in each column:
Rank                         0
Team                         0
Accurate Passes per Match    0
Pass Success (%)             0
Matches                      0
Country                      0
dtype: int64

----------------------------------------

File: penalty_won_team.csv
Count of null cells in each column:
Rank                   0
Team                   0
Penalties Won          0
Conversion Rate (%)    0
Matches                0
Country                0
dtype: int64

----------------------------------------

File: won_tackle_team.csv
Count of null cells in each column:
Rank                            0
Team                            0
Successful T

In [3]:
# Preprocessing
# Define target column and separate features and labels
from sklearn.preprocessing import LabelEncoder, StandardScaler


target_column = 'Goals'  # Example target, adjust as needed
X = combined_df.drop([target_column, 'Team'], axis=1)
y = combined_df[target_column]

# One-hot encode categorical features
X = pd.get_dummies(X, columns=['Country', 'Player'], drop_first=True)  # Adjust for all non-numeric columns if needed

# Encode target variable if it's categorical
if y.dtype == 'object' or np.issubdtype(y.dtype, np.number):
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y.fillna(0))  # Fill NaNs in y if they exist

# Replace any remaining NaNs in X
X = X.fillna(0)

# Scale numeric features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [4]:

# Handle NaN values in y
y = np.nan_to_num(y, nan=-1)  # Replace NaNs in y with a placeholder, like -1

# Ensure no NaN values are in y before splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check if any NaN values remain in y_train or y_test after splitting
assert not np.isnan(y_train).any(), "NaN values found in y_train after split"
assert not np.isnan(y_test).any(), "NaN values found in y_test after split"


In [5]:
# Step 1: Balance the Dataset
# Calculate class weights to handle any imbalance in classes
from sklearn.utils import class_weight

class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(enumerate(class_weights))

In [6]:
# Step 3: Define the model with improvements

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, Input
from tensorflow.keras.optimizers import Adam

# Verify unique class count
num_classes = len(np.unique(y))

input_shape = X_train.shape[1]

model = Sequential([
    Input(shape=(input_shape,)),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')  # Set output layer dynamically
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

print("Model compiled with output layer size:", num_classes)




Model compiled with output layer size: 40


In [7]:
# Train the model with early stopping
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.utils import class_weight

# Compute class weights
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))

# Set up early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_data=(X_test, y_test),
    class_weight=class_weight_dict,
    callbacks=[early_stopping],
    verbose=1
)



Epoch 1/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 6ms/step - accuracy: 5.3207e-04 - loss: 4.6195 - val_accuracy: 0.0000e+00 - val_loss: 3.9376
Epoch 2/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0013 - loss: 3.6681 - val_accuracy: 5.7274e-04 - val_loss: 3.8580
Epoch 3/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0024 - loss: 3.6824 - val_accuracy: 0.0023 - val_loss: 3.7619
Epoch 4/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.0012 - loss: 3.2830 - val_accuracy: 0.0034 - val_loss: 3.7076
Epoch 5/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0040 - loss: 3.4659 - val_accuracy: 0.0057 - val_loss: 3.6491
Epoch 6/100
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.0053 - loss: 2.7816 - val_accuracy: 0.0097 - val_loss: 3.6015
Epoch 7/10

In [8]:
#TESTING
# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

# Make predictions on the test data
predictions = model.predict(X_test)

# Convert predicted values from one-hot encoded to class labels
predicted_classes = predictions.argmax(axis=1)  # Get the class with the highest probability

# If your labels were encoded, convert predictions back to original labels
predicted_labels = label_encoder.inverse_transform(predicted_classes)

# Convert y_test to original labels for comparison
true_labels = label_encoder.inverse_transform(y_test)

# Compare the predictions with the true labels
comparison = pd.DataFrame({'True Label': true_labels, 'Predicted Label': predicted_labels})
print(comparison.head())  # Display the first few rows of predictions vs actual values


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.0874 - loss: 2.9168
Test Loss: 2.9230129718780518
Test Accuracy: 0.08762886375188828
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
   True Label  Predicted Label
0         0.0              6.0
1         0.0             35.0
2         0.0             91.0
3         0.0              2.0
4         0.0             12.0
