In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay

In [5]:
matches_train = pd.read_csv('cleaned_training_matches.csv')
matches_test = pd.read_csv('cleaned_matches_test.csv')
players = pd.read_csv('sorted__players.csv')
teams = pd.read_csv('cleaned_teams.csv')


In [6]:
for df in [matches_train, matches_test]:
    for col in ["home_team_id", "away_team_id"]:
        if col not in df.columns:
            raise ValueError(f"{col} is missing from dataset.")
        df[col] = df[col].fillna(0).astype(int)

# Encode match results
matches_train["match_result_encoded"] = matches_train.apply(
    lambda row: 1 if row["home_team_goal"] > row["away_team_goal"]
    else (-1 if row["home_team_goal"] < row["away_team_goal"] else 0), axis=1
)

# Rename team attributes to specify home/away teams
home_team_attributes = teams.rename(columns=lambda x: f"home_{x}" if x != "team_id" else "home_team_id")
away_team_attributes = teams.rename(columns=lambda x: f"away_{x}" if x != "team_id" else "away_team_id")

# Merge team attributes
matches_train = matches_train.merge(home_team_attributes, on="home_team_id", how="left")
matches_train = matches_train.merge(away_team_attributes, on="away_team_id", how="left")
matches_test = matches_test.merge(home_team_attributes, on="home_team_id", how="left")
matches_test = matches_test.merge(away_team_attributes, on="away_team_id", how="left")


home_players = matches_train.melt(
    id_vars=["home_team_id"],
    value_vars=[col for col in matches_train.columns if "home_player" in col],
    var_name="player_role", value_name="player_id"
).drop(columns=["player_role"])

away_players = matches_train.melt(
    id_vars=["away_team_id"],
    value_vars=[col for col in matches_train.columns if "away_player" in col],
    var_name="player_role", value_name="player_id"
).drop(columns=["player_role"])

home_players = home_players.merge(players, on="player_id", how="left").drop(columns=["player_id"])
away_players = away_players.merge(players, on="player_id", how="left").drop(columns=["player_id"])

home_team_stats = home_players.groupby("home_team_id").mean().add_prefix("home_").reset_index()
away_team_stats = away_players.groupby("away_team_id").mean().add_prefix("away_").reset_index()

matches_train = matches_train.merge(home_team_stats, on="home_team_id", how="left")
matches_train = matches_train.merge(away_team_stats, on="away_team_id", how="left")
matches_test = matches_test.merge(home_team_stats, on="home_team_id", how="left")
matches_test = matches_test.merge(away_team_stats, on="away_team_id", how="left")

# List of categorical columns that need encoding
categorical_cols = [
    "home_buildUpPlaySpeedClass", "home_buildUpPlayDribblingClass", "home_buildUpPlayPassingClass",
    "home_buildUpPlayPositioningClass", "home_chanceCreationPassingClass", "home_chanceCreationCrossingClass",
    "home_chanceCreationShootingClass", "home_chanceCreationPositioningClass", "home_defencePressureClass",
    "home_defenceAggressionClass", "home_defenceTeamWidthClass", "home_defenceDefenderLineClass",
    "away_buildUpPlaySpeedClass", "away_buildUpPlayDribblingClass", "away_buildUpPlayPassingClass",
    "away_buildUpPlayPositioningClass", "away_chanceCreationPassingClass", "away_chanceCreationCrossingClass",
    "away_chanceCreationShootingClass", "away_chanceCreationPositioningClass", "away_defencePressureClass",
    "away_defenceAggressionClass", "away_defenceTeamWidthClass", "away_defenceDefenderLineClass"
]

# Apply Label Encoding
label_encoder = LabelEncoder()
for col in categorical_cols:
    if col in matches_train.columns:
        matches_train[col] = label_encoder.fit_transform(matches_train[col])
        matches_test[col] = label_encoder.transform(matches_test[col])

# Handle missing values
matches_train = matches_train.fillna(0)
matches_test = matches_test.fillna(0)

# Select features for training
features = matches_train.columns.tolist()
print(features)
features.remove("match_result_encoded")

X = matches_train[features]
y = matches_train["match_result_encoded"]

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest Classifier
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
report = classification_report(y_val, y_pred, target_names=["Loss", "Draw", "Win"])

print(f"Model Accuracy: {accuracy:.2f}")
print(report)

# Predict match results for 2015-2016 season
X_test = matches_test[features]
matches_test["predicted_result"] = model.predict(X_test)
matches_test["predicted_result"] = matches_test["predicted_result"].map({1: "Win", 0: "Draw", -1: "Loss"})


ConfusionMatrixDisplay.from_predictions(y_val, y_pred, display_labels=["Loss", "Draw", "Win"])
plt.title("Random Forest Confusion Matrix")
plt.show()

matches_test.to_csv("predicted_matches_2015_2016.csv", index=False)

print("Predictions saved to predicted_matches_2015_2016.csv")


['season', 'stage', 'date', 'match_id', 'home_team_id', 'away_team_id', 'on_target_shot_home_team', 'on_target_shot_away_team', 'off_target_shot_home_team', 'off_target_shot_away_team', 'foul_home_team', 'foul_away_team', 'yellow_card_home_team', 'yellow_card_away_team', 'red_card_home_team', 'red_card_away_team', 'crosses_home_team', 'crosses_away_team', 'corner_home_team', 'corner_away_team', 'possession_home_team', 'possession_away_team', 'home_team_goal', 'away_team_goal', 'match_result_encoded', 'home_buildUpPlaySpeed', 'home_buildUpPlaySpeedClass', 'home_buildUpPlayDribbling', 'home_buildUpPlayDribblingClass', 'home_buildUpPlayPassing', 'home_buildUpPlayPassingClass', 'home_buildUpPlayPositioningClass', 'home_chanceCreationPassing', 'home_chanceCreationPassingClass', 'home_chanceCreationCrossing', 'home_chanceCreationCrossingClass', 'home_chanceCreationShooting', 'home_chanceCreationShootingClass', 'home_chanceCreationPositioningClass', 'home_defencePressure', 'home_defencePressu

  matches_train = matches_train.fillna(0)
  matches_test = matches_test.fillna(0)


ValueError: could not convert string to float: '2013/2014'