In [1]:
# Parse the date columns so both frames can be ordered chronologically
df_fifa['date'] = pd.to_datetime(df_fifa['date'])
df_results['date'] = pd.to_datetime(df_results['date'])

# merge_asof requires both inputs to be sorted on the as-of key
df_fifa = df_fifa.sort_values('date')
df_results = df_results.sort_values('date')

# Attach to every match the home team's most recent ranking row dated on or
# before the match (direction='backward').
# NOTE(review): suffixes only apply to column names present in both frames, so
# the '_fifa_home' names used downstream presumably come from the second merge,
# where the ranking columns collide with themselves -- confirm against the schemas.
df_merged = pd.merge_asof(
    df_results,
    df_fifa,
    on='date',
    left_by='home_team',
    right_by='team',
    suffixes=('_results', '_fifa_home'),
    direction='backward',
)

# Repeat the as-of lookup for the away team
df_merged = pd.merge_asof(
    df_merged,
    df_fifa,
    on='date',
    left_by='away_team',
    right_by='team',
    suffixes=('_fifa_home', '_fifa_away'),
    direction='backward',
)


# Feature Engineering (including score_difference and home_win)

# Matches without a recorded score cannot be labelled: the comparison below
# would silently mark them as "not a home win" (NaN > NaN is False) and their
# score_difference would be NaN. Drop them before deriving the targets; the
# copy avoids chained-assignment warnings on the column writes that follow.
df_merged = df_merged.dropna(subset=['home_score', 'away_score']).copy()

# Home-minus-away gaps in FIFA rank and ranking points
df_merged['rank_difference'] = df_merged['rank_fifa_home'] - df_merged['rank_fifa_away']
df_merged['points_difference'] = df_merged['total.points_fifa_home'] - df_merged['total.points_fifa_away']

# Targets: binary home win for classification, goal difference for regression
df_merged['home_win'] = (df_merged['home_score'] > df_merged['away_score']).astype(int)
df_merged['score_difference'] = df_merged['home_score'] - df_merged['away_score']

# One-hot encode the 'tournament' column into 'tournament_<value>' columns
df_merged = pd.get_dummies(df_merged, columns=['tournament'], prefix='tournament')

# Convert 'neutral' to integer
df_merged['neutral'] = df_merged['neutral'].astype(int)

# Feature matrix: the engineered columns followed by every one-hot tournament column
engineered_features = ['rank_difference', 'points_difference', 'neutral']
tournament_features = [name for name in df_merged.columns if name.startswith('tournament_')]
features = [*engineered_features, *tournament_features]

X = df_merged[features]
y_classification = df_merged['home_win']
y_regression = df_merged['score_difference']

# Both splits use the same test_size and random_state
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X,
    y_classification,
    test_size=0.2,
    random_state=42,
)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X,
    y_regression,
    test_size=0.2,
    random_state=42,
)

# Impute missing feature values with column means learned from the TRAINING
# split only. Filling the test split with its own means would leak test-set
# statistics into evaluation and would not match what the model saw in training.

# Classification split
train_means_clf = X_train_clf.mean()
X_train_clf = X_train_clf.fillna(train_means_clf)
X_test_clf = X_test_clf.fillna(train_means_clf)

# Regression split
train_means_reg = X_train_reg.mean()
X_train_reg = X_train_reg.fillna(train_means_reg)
X_test_reg = X_test_reg.fillna(train_means_reg)

# Fit the home-win classifier (fit() returns the fitted estimator itself)
model_clf = LogisticRegression(random_state=42).fit(X_train_clf, y_train_clf)

# Fit the goal-difference regressor
model_reg = RandomForestRegressor(random_state=42).fit(X_train_reg, y_train_reg)

# Model Evaluation: predict on the held-out splits first, then score them
y_pred_clf = model_clf.predict(X_test_clf)
y_pred_reg = model_reg.predict(X_test_reg)

# Classification metrics for the home-win model
conf_matrix = confusion_matrix(y_test_clf, y_pred_clf)
accuracy = accuracy_score(y_test_clf, y_pred_clf)
precision = precision_score(y_test_clf, y_pred_clf)
recall = recall_score(y_test_clf, y_pred_clf)

# Regression errors for the score-difference model
mse = mean_squared_error(y_test_reg, y_pred_reg)
mae = mean_absolute_error(y_test_reg, y_pred_reg)

# Prediction Function
def _latest_ranking(team):
    """Return the most recent ``df_fifa`` row for *team*.

    Raises:
        IndexError: If ``df_fifa`` contains no row for *team*.
    """
    team_rows = df_fifa[df_fifa['team'] == team]
    if team_rows.empty:
        # Same exception type a bare ``.iloc[0]`` raises, but naming the team
        raise IndexError(f"No FIFA ranking data found for team {team!r}")
    return team_rows.sort_values(by='date', ascending=False).iloc[0]


def _describe_result(home_team, away_team, home_win_predicted, score_difference):
    """Combine both model outputs into a consistent (winner, scoreline) pair.

    The classifier decides whether the home team wins; the rounded regression
    output only sizes the margin. The loser is given one goal, except in the
    two "models disagree" cases, which fall back to fixed scorelines.
    """
    if home_win_predicted:
        if score_difference < 0:
            # Models disagree: treat it as a narrow home win
            return f"{home_team} Wins", "1-0"
        # A declared winner needs a margin of at least one goal; a rounded
        # difference of 0 would otherwise yield a "1-1" win.
        margin = max(score_difference, 1)
        return f"{home_team} Wins", f"{margin + 1}-1"

    if score_difference > 0:
        # Regression favours the home side but the classifier does not: call it a draw
        return "Draw", "0-0"
    if score_difference == 0:
        return "Draw", "1-1"
    return f"{away_team} Wins", f"1-{abs(score_difference) + 1}"


def predict_match_outcome(home_team, away_team, tournament, neutral):
    """
    Predicts the winner and scoreline of a football match.

    Uses the module-level ``df_fifa`` rankings, the fitted ``model_clf`` and
    ``model_reg`` estimators, and the column layout of ``X_train_clf``.

    Args:
        home_team (str): The name of the home team.
        away_team (str): The name of the away team.
        tournament (str): The name of the tournament. A tournament that has no
            matching one-hot column in the training data leaves all tournament
            columns at 0.
        neutral (bool): True if the match is played on neutral ground, False otherwise.

    Returns:
        tuple: A tuple containing the predicted winner (str) and predicted scoreline (str).

    Raises:
        IndexError: If either team has no row in ``df_fifa``.
    """
    # 1. Latest available ranking row for each side
    home_latest = _latest_ranking(home_team)
    away_latest = _latest_ranking(away_team)

    # 2. Build a single feature row; reindex puts the columns in training
    #    order, zero-fills every other training column and drops an unknown
    #    tournament column, all in one step (no per-column insertion).
    feature_row = {
        'rank_difference': home_latest['rank'] - away_latest['rank'],
        'points_difference': home_latest['total.points'] - away_latest['total.points'],
        'neutral': int(neutral),
        f'tournament_{tournament}': 1,
    }
    input_df = pd.DataFrame([feature_row]).reindex(columns=X_train_clf.columns, fill_value=0)

    # 3. Missing ranking values are imputed with training means, mirroring the
    #    mean imputation applied to the training features.
    if input_df.isna().to_numpy().any():
        input_df = input_df.fillna(X_train_clf.mean())

    # 4. Classifier decides the winner; regressor gives the goal difference.
    #    float() then int() guarantees a plain int regardless of the NumPy
    #    scalar type returned by the model.
    home_win_predicted = model_clf.predict(input_df)[0] == 1
    predicted_score_difference = int(round(float(model_reg.predict(input_df)[0])))

    # 5. Translate both predictions into the winner label and scoreline
    return _describe_result(home_team, away_team, home_win_predicted, predicted_score_difference)

# Optional Visualizations

# --- Feature Importance for Regression Model ---
# Plot only when the fitted regressor exposes feature_importances_; otherwise say so
if not hasattr(model_reg, 'feature_importances_'):
    print("Regression model does not have a 'feature_importances_' attribute.")
else:
    feature_importances_reg = model_reg.feature_importances_

    # Label each importance with its training column and rank from largest to smallest
    feature_importance_series_reg = pd.Series(
        feature_importances_reg,
        index=X_train_reg.columns,
    ).sort_values(ascending=False)

    # Horizontal bar chart of the 15 largest importances
    top_importances_reg = feature_importance_series_reg.head(15)
    plt.figure(figsize=(12, 6))
    sns.barplot(x=top_importances_reg.values, y=top_importances_reg.index)
    plt.title('Top 15 Feature Importances (Random Forest Regressor)')
    plt.xlabel('Importance')
    plt.ylabel('Feature')
    plt.show()

# --- Most Influential Coefficients for Classification Model ---
# Plot only when the fitted classifier exposes coef_; otherwise say so
if not hasattr(model_clf, 'coef_'):
    print("Classification model does not have a 'coef_' attribute for extracting coefficients.")
else:
    # First row of coef_ holds one coefficient per training column
    coefficients_clf = model_clf.coef_[0]

    # Label each coefficient with its column and order by magnitude, keeping the sign
    coefficient_series_clf = pd.Series(
        coefficients_clf,
        index=X_train_clf.columns,
    ).sort_values(key=abs, ascending=False)

    # Horizontal bar chart of the 15 coefficients with the largest absolute value
    top_coefficients_clf = coefficient_series_clf.head(15)
    plt.figure(figsize=(12, 6))
    sns.barplot(x=top_coefficients_clf.values, y=top_coefficients_clf.index)
    plt.title('Top 15 Most Influential Coefficients (Logistic Regression Classifier)')
    plt.xlabel('Coefficient Value')
    plt.ylabel('Feature')
    plt.show()


# --- Confusion Matrix for Classification Model ---
# Annotated heatmap of conf_matrix with integer cell labels (fmt='d')
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix (Logistic Regression Classifier)')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()


# Example usage of the prediction function: a non-neutral friendly
home_team_example = "Netherlands"
away_team_example = "Australia"
tournament_example = "Friendly"
neutral_example = False  # Set to True for a match on neutral ground

predicted_winner, predicted_scoreline = predict_match_outcome(
    home_team=home_team_example,
    away_team=away_team_example,
    tournament=tournament_example,
    neutral=neutral_example,
)

# Report the inputs, then the predicted result
print(
    f"Predicting match between {home_team_example} and {away_team_example} "
    f"({tournament_example}, Neutral: {neutral_example})"
)
print(f"Predicted Outcome: {predicted_winner}, Predicted Scoreline: {predicted_scoreline}")


NameError: name 'pd' is not defined