In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_absolute_error
import joblib

In [19]:
# Load the match dataset that the goal models are trained on.
matches_csv = "clean_fifa_worldcup_matches.csv"
df = pd.read_csv(matches_csv)

In [20]:
# Quick sanity check: show the first five rows as plain text.
print(df.head(n=5))


    HomeTeam AwayTeam  Year  HomeGoals  AwayGoals  TotalGoals
0     France   Mexico  1930          4          1           5
1  Argentina   France  1930          1          0           1
2      Chile   Mexico  1930          3          0           3
3      Chile   France  1930          1          0           1
4  Argentina   Mexico  1930          6          3           9


In [21]:
# Fail fast if the loaded CSV lacks any column the later cells read, and name
# the absent columns so the problem can be fixed without guesswork.
required_columns = {'HomeTeam', 'AwayTeam', 'Year', 'HomeGoals', 'AwayGoals', 'TotalGoals'}
missing_columns = required_columns.difference(df.columns)
if missing_columns:
    # Sorted so the message is stable from run to run (sets have no fixed order).
    raise ValueError(
        "Dataset is missing required columns! Missing: {}".format(sorted(missing_columns))
    )

In [22]:
# Pool the home and away columns (home first) and keep each team name once,
# so a team that only ever appears on one side is still included.
team_names = pd.concat([df['HomeTeam'], df['AwayTeam']])
all_teams = team_names.unique()


In [23]:
# Fit one encoder on the pooled team names so the home and away columns are
# mapped to integers by the same encoder.
team_encoder = LabelEncoder()
team_encoder.fit(all_teams)


In [24]:
# Add an integer-encoded companion column for each team column, using the
# shared encoder fitted on all team names.
for source_column, encoded_column in (
    ('HomeTeam', 'HomeTeamEncoded'),
    ('AwayTeam', 'AwayTeamEncoded'),
):
    df[encoded_column] = team_encoder.transform(df[source_column])

In [25]:
# Model inputs: the two encoded teams plus the tournament year.
feature_columns = ['HomeTeamEncoded', 'AwayTeamEncoded', 'Year']
X = df[feature_columns]

# Two separate regression targets, one per side.
y_home, y_away = df['HomeGoals'], df['AwayGoals']

In [27]:
# Split the features and BOTH targets in a single call so the home and away
# targets are guaranteed to be partitioned on exactly the same rows as X.
# Two separate calls only line up when they happen to share a random_state,
# and each call overwrites X_train / X_test from the other.
(X_train, X_test,
 y_train_home, y_test_home,
 y_train_away, y_test_away) = train_test_split(
    X, y_home, y_away, test_size=0.2, random_state=42
)
# NOTE: X_test / y_test_home / y_test_away are the held-out 20%; they are not
# consumed by the cells visible in this notebook.

# One independent regressor per target, seeded for reproducibility.
home_model = RandomForestRegressor(n_estimators=100, random_state=42)
away_model = RandomForestRegressor(n_estimators=100, random_state=42)

home_model.fit(X_train, y_train_home)
away_model.fit(X_train, y_train_away)

In [29]:
# Persist both fitted regressors and the team encoder; the encoder is saved
# alongside the models because predictions need the same team-to-integer mapping.
artifacts = {
    "home_goal_model.pkl": home_model,
    "away_goal_model.pkl": away_model,
    "team_encoder.pkl": team_encoder,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)
print("Models trained and saved successfully!")


Models trained and saved successfully!


In [31]:
# Time for prediction: load the fixture list that will be scored.
fixtures_csv = "clean_fifa_worldcup_fixture.csv"
fixtures_2026 = pd.read_csv(fixtures_csv)


In [32]:
def encode_team(team):
    """Return the integer label of ``team`` from the module-level ``team_encoder``.

    Parameters
    ----------
    team : object
        Team name to look up among ``team_encoder.classes_``.

    Returns
    -------
    int
        The encoder's label for ``team``, or ``-1`` when ``team`` is not one
        of the classes the encoder was fitted on.
    """
    # Guard first: transform() is only called for names the encoder knows.
    if team not in team_encoder.classes_:
        return -1
    return team_encoder.transform([team])[0]

In [43]:
# Align the fixture file's team column names with the ones used for training.
column_renames = {'Home': 'HomeTeam', 'Away': 'AwayTeam'}
fixtures_2026.rename(columns=column_renames, inplace=True)

# Show the resulting header so the rename can be checked by eye.
print("Columns after renaming:", fixtures_2026.columns.tolist())

# Add an integer-encoded column per side; encode_team yields -1 for any team
# the encoder was not fitted on.
for source_column, encoded_column in (
    ('HomeTeam', 'HomeTeamEncoded'),
    ('AwayTeam', 'AwayTeamEncoded'),
):
    fixtures_2026[encoded_column] = fixtures_2026[source_column].apply(encode_team)


Columns after renaming: ['HomeTeam', 'Score', 'AwayTeam', 'Year']


In [49]:
# Encode each side's team name; encode_team returns -1 for unseen teams.
fixtures_2026['HomeTeamEncoded'] = fixtures_2026['HomeTeam'].apply(encode_team)
fixtures_2026['AwayTeamEncoded'] = fixtures_2026['AwayTeam'].apply(encode_team)

# Keep only fixtures where BOTH teams are known to the encoder. The models
# were never trained on the -1 placeholder, so those rows cannot be scored.
known_teams = (
    (fixtures_2026['HomeTeamEncoded'] != -1) & (fixtures_2026['AwayTeamEncoded'] != -1)
)

# Report the number of removed rows instead of discarding them silently, so a
# shorter-than-expected prediction file can be traced back to this filter.
dropped_count = int((~known_teams).sum())
if dropped_count:
    print(
        "Dropping {} fixture(s) involving teams absent from the training data.".format(
            dropped_count
        )
    )

fixtures_2026 = fixtures_2026[known_teams].copy()  # Ensure we create a new copy

# Prepare dataset for prediction. Column order (home, away, year) matches the
# feature order used when the models were fitted.
X_2026 = fixtures_2026[['HomeTeamEncoded', 'AwayTeamEncoded']].copy()  # Create a copy
X_2026.loc[:, 'Year'] = 2026  # Modify safely to avoid SettingWithCopyWarning

# Load trained models.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
home_model = joblib.load("home_goal_model.pkl")
away_model = joblib.load("away_goal_model.pkl")

In [50]:
# Predict each side's goals, then round the continuous regressor output to
# whole goals.
for goal_model, goal_column in (
    (home_model, 'PredictedHomeGoals'),
    (away_model, 'PredictedAwayGoals'),
):
    fixtures_2026[goal_column] = goal_model.predict(X_2026)
    fixtures_2026[goal_column] = fixtures_2026[goal_column].round().astype(int)

# Render the scoreline as "<home> - <away>" in the 'Score' column.
home_goals_text = fixtures_2026['PredictedHomeGoals'].astype(str)
away_goals_text = fixtures_2026['PredictedAwayGoals'].astype(str)
fixtures_2026['Score'] = home_goals_text + " - " + away_goals_text

In [51]:
# Write only the fixture-facing columns; the encoded and per-side predicted
# goal columns are left out of the export.
output_columns = ['HomeTeam', 'Score', 'AwayTeam', 'Year']
predictions_2026 = fixtures_2026[output_columns]
predictions_2026.to_csv("fifa_worldcup_2026_Score_predictions.csv", index=False)

print("Predictions for World Cup 2026 saved successfully!")

Predictions for World Cup 2026 saved successfully!
