In [1]:
# Import pandas for data manipulation (e.g., loading CSVs, creating DataFrames)
import pandas as pd
# Import numpy for numerical operations (though it's mainly used by pandas here)
import numpy as np

# ---
# This block loads the dataset.
# The read is wrapped in 'try...except' so that a missing file produces a
# clear, human-readable hint about where the CSV is expected to live.
# The error is then re-raised on purpose: every later cell needs 'df', so
# stopping here is far easier to diagnose than a NameError further down.
# ---
try:
    # Relative path: '../' goes up one folder from the notebook's working
    # directory, then 'Dataset/' goes down into the data folder that holds
    # 'SPL_raw_normalized.csv'.
    df = pd.read_csv("../Dataset/SPL_raw_normalized.csv")
except FileNotFoundError:
    print("ERROR: File not found.")
    print("Please check that 'SPL_raw_normalized.csv' is inside the 'Dataset' folder,")
    print("and that the 'Dataset' folder is in the main project directory.")
    # Re-raise so the notebook halts at the real cause of the problem.
    raise
else:
    # Only reached when the read succeeded.
    print("Data loaded successfully.")
    # .head() shows the first 5 rows as a quick sanity check of the load.
    print(df.head())

Data loaded successfully.
   match_id        date   time    home_team  away_team  home_score  \
0       1.0  02.11.2000  20:00   Al-Ettifaq  Al Riyadh         1.0   
1       2.0  02.11.2000  20:00     Al Najma   Al-Nassr         0.0   
2       3.0  03.11.2000  20:00   Al Ahli SC   Al Ansar         3.0   
3       4.0  07.11.2000  20:00  Al Qadisiya      Sdoos         0.0   
4       5.0  08.11.2000  20:00     Al Wehda  Al-Shabab         1.0   

   away_score stadium city round referee_name  attendance season_label  \
0         0.0     NaN  NaN   NaN          NaN         NaN    2000-2001   
1         1.0     NaN  NaN   NaN          NaN         NaN    2000-2001   
2         0.0     NaN  NaN   NaN          NaN         NaN    2000-2001   
3         3.0     NaN  NaN   NaN          NaN         NaN    2000-2001   
4         0.0     NaN  NaN   NaN          NaN         NaN    2000-2001   

   season_start_year  season_end_year stage           source_file  
0               2000             2001   

In [2]:
# 'def' is the keyword to define a new function in Python.
# We name it 'derive_outcome' and it takes one argument, 'row'.
def derive_outcome(row):
    """Classify one match as "Home Win", "Away Win" or "Draw".

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys "home_score" and "away_score"
        (for example a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    str or pd.NA
        The outcome label, or ``pd.NA`` when either score cannot be turned
        into an integer (missing, NaN, infinite, or non-numeric text).

    Notes
    -----
    Scores are converted with ``int()``, so a fractional value is truncated
    toward zero before the comparison.
    """
    try:
        hs = int(row["home_score"])
        as_ = int(row["away_score"])
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / pd.NA; ValueError: NaN or non-numeric text;
        # OverflowError: +/- infinity. All mean "no usable score".
        return pd.NA

    # After a successful int() both values are plain integers, so no further
    # missing-value check is needed before comparing them.
    if hs > as_:
        return "Home Win"
    if hs < as_:
        return "Away Win"
    # Neither side scored more, so the scores are equal.
    return "Draw"

# Label every match: run 'derive_outcome' once per row (axis=1) and keep
# the results in a new 'match_outcome' column on the DataFrame.
outcome_per_match = df.apply(derive_outcome, axis=1)
df['match_outcome'] = outcome_per_match

# A row without an outcome has no "answer" to learn from, so discard any
# row whose 'match_outcome' is still missing.
df = df.dropna(subset=['match_outcome'])

# The target variable 'y' is simply the outcome column of the cleaned data.
y = df['match_outcome']

print("Target variable 'y' (the 'answer key') created successfully.")
# .value_counts() reports how many matches fall into each class, which
# makes any imbalance between the classes visible.
print(y.value_counts())

Target variable 'y' (the 'answer key') created successfully.
match_outcome
Home Win    1395
Away Win    1039
Draw         823
Name: count, dtype: int64


In [3]:
# Names of the input columns handed to the model. Per the Phase 1
# exploration (not shown in this notebook), these were chosen because
# they have complete data.
features = ['home_team', 'away_team', 'season_start_year']

# 'X' holds the "questions": every row of 'df', restricted to the
# feature columns listed above.
X = df.loc[:, features]

print("Feature DataFrame 'X' (the 'questions') created successfully:")
print(X.head())

Feature DataFrame 'X' (the 'questions') created successfully:
     home_team  away_team  season_start_year
0   Al-Ettifaq  Al Riyadh               2000
1     Al Najma   Al-Nassr               2000
2   Al Ahli SC   Al Ansar               2000
3  Al Qadisiya      Sdoos               2000
4     Al Wehda  Al-Shabab               2000


In [4]:
# One-Hot Encoding with 'pd.get_dummies': each distinct team name in the
# listed columns becomes its own indicator column, set for the rows where
# that team played in that role. 'season_start_year' is passed through
# unchanged because it is not listed.
# 'drop_first=True' omits the first category of each encoded column; it is
# redundant because it is implied when all the other indicators are off.
team_columns = ['home_team', 'away_team']
X_encoded = pd.get_dummies(data=X, columns=team_columns, drop_first=True)

print("Features encoded successfully (converted text to numbers):")
# The column count grows from 3 to one per remaining team indicator,
# plus the untouched year column.
print(f"Shape changed from {X.shape} to {X_encoded.shape}")
print(X_encoded.head())

Features encoded successfully (converted text to numbers):
Shape changed from (3257, 3) to (3257, 72)
   season_start_year  home_team_Al Ahli SC  home_team_Al Ansar  \
0               2000                 False               False   
1               2000                 False               False   
2               2000                  True               False   
3               2000                 False               False   
4               2000                 False               False   

   home_team_Al Batin  home_team_Al Feiha  home_team_Al Khaleej  \
0               False               False                 False   
1               False               False                 False   
2               False               False                 False   
3               False               False                 False   
4               False               False                 False   

   home_team_Al Najma  home_team_Al Orubah  home_team_Al Qadisiya  \
0               False        

In [5]:
# We import the function from scikit-learn (sklearn)
from sklearn.model_selection import train_test_split

# Split the encoded features and the target into a training part and a
# held-back test part.
#
# Arguments passed to train_test_split:
#   test_size=0.2   -> 20% of the rows go to the test set, 80% to training.
#   random_state=42 -> fixes the shuffle so the same split is produced on
#                      every run.
#   stratify=y      -> keeps the share of "Home Win", "Away Win" and "Draw"
#                      the same in both parts, so neither part ends up
#                      short of one outcome.
split_parts = train_test_split(
    X_encoded,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)
# The function returns the pieces in this fixed order.
X_train, X_test, y_train, y_test = split_parts

print("Data split into training and test sets successfully:")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

Data split into training and test sets successfully:
Training set size: 2605 samples
Test set size: 652 samples


In [6]:
# Import the model we want to use from scikit-learn
from sklearn.ensemble import RandomForestClassifier

# Build and train the classifier in one step.
# A Random Forest is an ensemble: it grows many decision trees and lets
# them vote on the answer. 'random_state=42' makes its internal randomness
# repeatable. .fit() learns from the training features (X_train) and the
# matching answers (y_train), and returns the fitted model itself, which
# is what gets stored in 'rf_model'.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

print("Model trained successfully!")

Model trained successfully!


In [7]:
# We import the function to create the report
from sklearn.metrics import classification_report

print("--- Step 7: Evaluating the Model ---")

# Ask the trained forest for its predicted outcome on every held-back match.
rf_preds = rf_model.predict(X_test)

# Score the predictions against the true outcomes (y_test) and show the
# per-class precision / recall / f1 summary.
print("\nClassification Report (Random Forest):")
rf_report = classification_report(y_test, rf_preds)
print(rf_report)

--- Step 7: Evaluating the Model ---

Classification Report (Random Forest):
              precision    recall  f1-score   support

    Away Win       0.54      0.49      0.51       208
        Draw       0.29      0.25      0.27       165
    Home Win       0.57      0.66      0.61       279

    accuracy                           0.50       652
   macro avg       0.47      0.46      0.46       652
weighted avg       0.49      0.50      0.49       652



### 9. Results Interpretation

As required by the project handbook, this section interprets the results of our supervised learning model.

**Model Used:** Random Forest Classifier

**Classification Report Analysis:**
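The model achieved an **overall accuracy of 50%**. On a 3-class problem (Home Win, Draw, Away Win), a uniformly random guess would score about 33%, while always predicting the most common class ("Home Win", 279 of the 652 test matches) would score about 43%. An accuracy of 50% is above both baselines, which suggests the model is learning a real pattern, although the margin over the majority-class baseline is modest.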

* **"Home Win" (f1-score: 0.61):** The model is best at predicting a Home Win.
* **"Away Win" (f1-score: 0.51):** The model is decent at predicting an Away Win.
* **"Draw" (f1-score: 0.27):** The model struggles the most with predicting a Draw. This is likely because "Draw" is the least common outcome (the minority class) in our dataset.

**Key Finding:**
The model's performance is limited by the features we could use. As we discovered in Phase 1, potentially predictive features like `stadium`, `city`, and `attendance` were unusable because 85-95% of their data was missing. As a result, the model has only the team names and the season year to work with.