In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [10]:
# Load the historical match table and discard every row that has a missing value.
df = pd.read_csv("clean_fifa_worldcup_matches.csv").dropna()


In [11]:
# Fit the encoder on every team that appears on EITHER side of a match.
# Fitting on HomeTeam alone leaves teams that only ever played "away" without
# a code of their own, so all of them would collapse into the same -1 value.
team_encoder = LabelEncoder()
team_encoder.fit(pd.concat([df['HomeTeam'], df['AwayTeam']], ignore_index=True))

# Every name in both columns is now a known class, so a single vectorised
# transform per column replaces the per-row transform/membership check.
df['HomeTeamEncoded'] = team_encoder.transform(df['HomeTeam'])
df['AwayTeamEncoded'] = team_encoder.transform(df['AwayTeam'])


In [12]:
# Features: the two encoded team ids. Targets: goals scored by each side.
feature_columns = ['HomeTeamEncoded', 'AwayTeamEncoded']
X = df[feature_columns]
y_home, y_away = df['HomeGoals'], df['AwayGoals']

In [13]:
# Split features and BOTH targets in one call so the train/test rows are
# aligned by construction. The previous two separate calls overwrote
# X_train/X_test and only stayed aligned because the seed and test_size matched.
(
    X_train, X_test,
    y_train_home, y_test_home,
    y_train_away, y_test_away,
) = train_test_split(X, y_home, y_away, test_size=0.2, random_state=42)


In [15]:
# One regressor per target: goals for the home side and goals for the away side.
# fit() returns the estimator itself, so construction and training are chained.
home_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train_home)
away_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train_away)

# Score both models on the held-out rows.
y_pred_home, y_pred_away = home_model.predict(X_test), away_model.predict(X_test)

# Mean absolute error of each model on the test split.
home_mae = mean_absolute_error(y_test_home, y_pred_home)
away_mae = mean_absolute_error(y_test_away, y_pred_away)

In [33]:
fixtures = pd.read_csv("clean_fifa_worldcup_fixture.csv")

# Debugging: Print column names before renaming
print("Columns before renaming:", list(fixtures.columns))

# The fixture file uses lowercase 'home'/'away'; rename them to the column
# names used for training. Assign the result instead of mutating in place.
fixtures = fixtures.rename(columns={'home': 'HomeTeam', 'away': 'AwayTeam'})

# Debugging: Print column names after renaming
print("Columns after renaming:", list(fixtures.columns))

# Fail early with a clear message if the expected columns are absent
if 'HomeTeam' not in fixtures.columns or 'AwayTeam' not in fixtures.columns:
    raise KeyError("Columns 'HomeTeam' or 'AwayTeam' not found after renaming!")

# Missing team names become the placeholder 'Unknown'
fixtures['HomeTeam'] = fixtures['HomeTeam'].fillna('Unknown')
fixtures['AwayTeam'] = fixtures['AwayTeam'].fillna('Unknown')

# Encode team names. The name -> code table is built once from the fitted
# encoder (a class's code is its position in classes_), so each column is a
# single dictionary lookup rather than one transform() call and one scan of
# classes_ per row. Names the encoder never saw get the sentinel -1.
team_codes = {team: code for code, team in enumerate(team_encoder.classes_)}
for side in ('HomeTeam', 'AwayTeam'):
    fixtures[f'{side}Encoded'] = fixtures[side].map(team_codes).fillna(-1).astype('int64')

# Predict goals; both models consume the same two feature columns
fixture_features = fixtures[['HomeTeamEncoded', 'AwayTeamEncoded']]
fixtures['PredictedHomeGoals'] = home_model.predict(fixture_features)
fixtures['PredictedAwayGoals'] = away_model.predict(fixture_features)

# Determine winner: the side with more predicted goals, 'Draw' only on an exact tie.
# NOTE(review): the predictions are continuous regressor outputs, so exact ties
# are presumably rare -- confirm whether rounding to whole goals is intended.
fixtures['Winner'] = np.where(
    fixtures['PredictedHomeGoals'] > fixtures['PredictedAwayGoals'], fixtures['HomeTeam'],
    np.where(fixtures['PredictedHomeGoals'] < fixtures['PredictedAwayGoals'], fixtures['AwayTeam'], 'Draw')
)


Columns before renaming: ['home', 'score', 'away', 'year']
Columns after renaming: ['HomeTeam', 'score', 'AwayTeam', 'year']


In [34]:
# Write the fixture table to disk without the row index.
predictions_csv = "fifa_worldcup_2026_predictions.csv"
fixtures.to_csv(predictions_csv, index=False)


In [35]:
group_stage_results = pd.read_csv("fifa_worldcup_2026_predictions.csv")

# Accumulate points and goal difference per team across every predicted match.
# Points: 3 for the predicted winner, 1 each when neither side is the winner.
# GD: predicted goals scored minus predicted goals conceded. This was previously
# initialised to 0 and never updated, so it had no effect as a sort key.
standings = {}
for _, row in group_stage_results.iterrows():
    home, away, winner = row['HomeTeam'], row['AwayTeam'], row['Winner']

    standings.setdefault(home, {'Points': 0, 'GD': 0})
    standings.setdefault(away, {'Points': 0, 'GD': 0})

    # Margin from the home side's point of view; the away side gets the negative.
    margin = row['PredictedHomeGoals'] - row['PredictedAwayGoals']
    standings[home]['GD'] += margin
    standings[away]['GD'] -= margin

    if winner == home:
        standings[home]['Points'] += 3
    elif winner == away:
        standings[away]['Points'] += 3
    else:
        standings[home]['Points'] += 1
        standings[away]['Points'] += 1

# One row per team, ranked by points and then by goal difference (both descending).
standings_df = (
    pd.DataFrame.from_dict(standings, orient='index')
    .reset_index()
    .rename(columns={'index': 'Team'})
    .sort_values(by=['Points', 'GD'], ascending=False)
)



In [36]:
# Write the ranked standings table to disk without the row index, then confirm.
standings_csv = "fifa_worldcup_2026_standings.csv"
standings_df.to_csv(standings_csv, index=False)
print("Predictions for World Cup 2026 saved successfully!")

Predictions for World Cup 2026 saved successfully!
