<a href="https://colab.research.google.com/github/Elisha-Kiplangat/Machine-Learning-Model/blob/main/PremierLeague.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam

# Specify the folder path containing CSV files
folder_path = '/content/PL_data'

# Collect the CSV files in sorted order: os.listdir() returns names in
# arbitrary order, which would make the combined row order (and therefore the
# seeded train/test split further down) differ between runs and machines.
all_files = sorted(
    os.path.join(folder_path, f)
    for f in os.listdir(folder_path)
    if f.endswith('.csv')
)
if not all_files:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate" ValueError.
    raise FileNotFoundError(f"No .csv files found in {folder_path!r}")

# Read every file and stack them into one DataFrame with a fresh 0..n-1 index
dataframes = [pd.read_csv(f) for f in all_files]
combined_df = pd.concat(dataframes, ignore_index=True)




In [None]:
# Define target column name (replace with your actual target column)
target_column = 'Team'

# Split into features and target
X = combined_df.drop(target_column, axis=1)
y = combined_df[target_column]

# Encode target labels as integers (0 .. n_classes-1)
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Every column that is not already numeric/boolean has to be one-hot encoded,
# otherwise the float cast below would fail on it.
non_numeric_columns = X_train.select_dtypes(exclude=['number', 'bool']).columns

# Apply one-hot encoding to categorical columns
X_train = pd.get_dummies(X_train, columns=non_numeric_columns)
X_test = pd.get_dummies(X_test, columns=non_numeric_columns)

# Align columns of X_test to match X_train: categories only seen in the test
# split are dropped, categories missing from it are added as all-zero columns.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Give both frames one homogeneous float dtype. After get_dummies/reindex the
# columns are a mix of int64, float64 and bool (and int64 where reindex filled
# a missing dummy with 0).
X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

# Impute missing feature values. Any NaN in the inputs turns the loss into NaN
# and training never recovers. The fill values come from the training split
# only, so no information from the test split leaks into preprocessing; a
# column that is entirely missing in training falls back to 0.
fill_values = X_train.median().fillna(0.0)
X_train = X_train.fillna(fill_values)
X_test = X_test.fillna(fill_values)

# Sanity check: the model must never see a missing value
assert not X_train.isna().any().any(), "X_train still contains NaN"
assert not X_test.isna().any().any(), "X_test still contains NaN"

Non-numeric columns: Index(['Country', 'Player'], dtype='object')
X_train types:
 Rank                         int64
Big Chances Missed         float64
Goals                      float64
Matches                      int64
Possession (%)             float64
                            ...   
Player_Youssef Chermiti       bool
Player_Yves Bissouma          bool
Player_Zack Nelson            bool
Player_Zeki Amdouni           bool
Player_Álex Moreno            bool
Length: 696, dtype: object
X_test types:
 Rank                         int64
Big Chances Missed         float64
Goals                      float64
Matches                      int64
Possession (%)             float64
                            ...   
Player_Youssef Chermiti       bool
Player_Yves Bissouma          bool
Player_Zack Nelson           int64
Player_Zeki Amdouni           bool
Player_Álex Moreno            bool
Length: 696, dtype: object


In [None]:
# Size the network from the prepared data.
# Number of feature columns after one-hot encoding:
input_shape = X_train.shape[1]
# Number of distinct target classes. Taken from the fitted encoder rather than
# from y_train so that a class which happens to be absent from the training
# split still gets an output unit. (This name was previously used without
# ever being defined, which raised NameError on a fresh kernel.)
num_classes = len(label_encoder.classes_)

# Feed-forward classifier: three hidden ReLU layers with dropout in between,
# and a softmax output giving one probability per class.
model = Sequential([
    Input(shape=(input_shape,)),  # Set input shape to match X_train after encoding
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(16, activation='relu'),
    Dense(num_classes, activation='softmax')  # One unit per target class
])

# Compile the model. The sparse loss is used because the targets are integer
# class ids produced by LabelEncoder, not one-hot vectors.
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])


In [None]:
# Fit the network for 50 epochs in mini-batches of 32 samples.
# The held-out split is passed as validation data, so Keras reports
# val_loss / val_accuracy after every epoch; `history` keeps those curves.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    validation_data=(X_test, y_test),
    verbose=1,
)


Epoch 1/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - accuracy: 0.0496 - loss: nan - val_accuracy: 0.0510 - val_loss: nan
Epoch 2/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.0535 - loss: nan - val_accuracy: 0.0510 - val_loss: nan
Epoch 3/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0536 - loss: nan - val_accuracy: 0.0510 - val_loss: nan
Epoch 4/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0527 - loss: nan - val_accuracy: 0.0510 - val_loss: nan
Epoch 5/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0518 - loss: nan - val_accuracy: 0.0510 - val_loss: nan
Epoch 6/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.0529 - loss: nan - val_accuracy: 0.0510 - val_loss: nan
Epoch 7/50
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

In [None]:
# TESTING
# Score the trained network on the held-out split
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
print(f'Test Loss: {test_loss}')
print(f'Test Accuracy: {test_accuracy}')

# Ground-truth labels, decoded from their integer codes back to the original values
true_labels = label_encoder.inverse_transform(y_test)

# The softmax layer yields one probability per class for every row;
# take the index of the largest one and decode it the same way.
predictions = model.predict(X_test)
predicted_classes = predictions.argmax(axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_classes)

# Side-by-side table of actual vs. predicted labels; show the first few rows
comparison = pd.DataFrame({
    'True Label': true_labels,
    'Predicted Label': predicted_labels,
})
print(comparison.head())


[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0636 - loss: nan
Test Loss: nan
Test Accuracy: 0.05097365379333496
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step
                 True Label  Predicted Label
0                 Brentford  AFC Bournemouth
1                 Brentford  AFC Bournemouth
2                 Brentford  AFC Bournemouth
3  Brighton and Hove Albion  AFC Bournemouth
4           West Ham United  AFC Bournemouth
