In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv("results.csv",encoding='latin-1')
df2=df.copy()

In [135]:
'''
FTHG full time home team goal
FTAG full time away team goal
FTR full time results(H home win A away win D draw)
HTHG
HTAG
HTR
HS home team shots
AS away team shots
HST home team shots on target
AST away team shots on target
HC home team corners
AC away team corners
HF home team fouls
AF away team fouls
HY home team yellow card
AY
HR home team red card
AR
'''

'\nFTHG full time home team goal\nFTAG full time away team goal\nFTR full time results(H home win A away win D draw)\nHTHG\nHTAG\nHTR\nHS home team shots\nAS away team shots\nHST home team shots on target\nAST away team shots on target\nHC home team corners\nAC away team corners\nHF home team fouls\nAF away team fouls\nHY home team yellow card\nAY\nHR home team red card\nAR\n'

In [136]:
df.shape

(11113, 23)

In [3]:
df['DateTime'] = df['DateTime'].apply(lambda x: x.split('T')[0])
df['DateTime'] = pd.to_datetime(df['DateTime'])
df['Year'] = df['DateTime'].dt.year
df['Month'] = df['DateTime'].dt.month
df['Day'] = df['DateTime'].dt.day
df = df.drop(columns=['Season', 'DateTime'])

In [138]:
from ydata_profiling import ProfileReport
pf=ProfileReport(df)
pf.to_file("Profile Report.html")


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|                                                                                           | 0/24 [00:00<?, ?it/s][A
  4%|███▍                                                                               | 1/24 [00:01<00:32,  1.40s/it][A
 38%|███████████████████████████████▏                                                   | 9/24 [00:01<00:02,  6.29it/s][A
 67%|██████████████████████████████████████████████████████▋                           | 16/24 [00:02<00:00, 11.76it/s][A
100%|██████████████████████████████████████████████████████████████████████████████████| 24/24 [00:03<00:00,  7.44it/s][A
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(
  discretized_df.loc[:, column] = self._discretize_column(


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [139]:
df.columns

Index(['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR',
       'Referee', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY',
       'HR', 'AR', 'Year', 'Month', 'Day'],
      dtype='object')

In [4]:
def pts(pt): 
    if pt=='H':
        return 3
    elif pt=='D':
        return 2
    elif pt=='A': 
        return 1
    else:
        return 0

df['HTR']=df['HTR'].apply(pts)
df['FTR']=df['FTR'].apply(pts)

        

In [5]:
df[['AR', 'HR']] = SimpleImputer(strategy='most_frequent').fit_transform(pd.DataFrame(df[['AR', 'HR']]))
df[['HTHG', 'HTAG']] = KNNImputer(n_neighbors=1, weights='distance').fit_transform(pd.DataFrame(df[['HTHG', 'HTAG']]))
df['HTR'] = KNNImputer(n_neighbors=1, weights='distance').fit_transform(pd.DataFrame(df['HTR']))
df[['HS', 'AS', 'HST', 'AST', 'HY', 'AY', 'HF', 'AF', 'HC', 'AC']] = SimpleImputer(strategy='mean').fit_transform(
    pd.DataFrame(df[['HS', 'AS', 'HST', 'AST', 'HY', 'AY', 'HF', 'AF', 'HC', 'AC']]))

In [6]:
x = df[['HomeTeam', 'AwayTeam', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']]
y = df['FTR']

In [7]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y,test_size=0.2,random_state=13)

In [8]:
categorical_features = ['HomeTeam', 'AwayTeam']
numerical_features = ['HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features)
    ])

In [19]:
def evaluate_model(model, xtrain, ytrain, xtest, ytest):
    # Create pipeline
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Fit the model
    pipeline.fit(xtrain, ytrain)
    
    # Make predictions
    ypred = pipeline.predict(xtest)
    
    # Calculate metrics
    r2 = r2_score(ytest, ypred)
    mae = mean_absolute_error(ytest, ypred)
    rmse = mean_squared_error(ytest, ypred, squared=False)
    
    # Cross-validation
    cv_scores = cross_val_score(pipeline, xtrain, ytrain, 
                              cv=5, scoring='neg_mean_squared_error')
    cv_rmse = (-cv_scores).mean()
    
    return {
        'model': model.__class__.__name__,
        'r2_score': r2,
        'mae': mae,
        'rmse': rmse,
        'cv_rmse': cv_rmse,
        'pipeline': pipeline
    }

In [40]:
models = [
    LinearRegression(),
    DecisionTreeRegressor(random_state=42),
    # RandomForestRegressor(random_state=42),
    XGBRegressor(random_state=42)
]

# Evaluate all models
results = []
for model in models:
    result = evaluate_model(model, xtrain, ytrain, xtest, ytest)
    results.append(result)
    print(f"\n{'-'*50}")
    print(f"Model: {result['model']}")
    print(f"R2 Score: {result['r2_score']:.4f}")
    print(f"MAE: {result['mae']:.4f}")
    print(f"RMSE: {result['rmse']:.4f}")
    print(f"CV RMSE: {result['cv_rmse']:.4f}")

# Create a results dataframe
results_df = pd.DataFrame(results)
print("\nSummary of Results:")
print(results_df[['model', 'r2_score', 'mae', 'rmse', 'cv_rmse']])


--------------------------------------------------
Model: LinearRegression
R2 Score: 0.2093
MAE: 0.6430
RMSE: 0.7557
CV RMSE: 0.5514

--------------------------------------------------
Model: DecisionTreeRegressor
R2 Score: -0.5140
MAE: 0.7540
RMSE: 1.0457
CV RMSE: 1.0312

--------------------------------------------------
Model: XGBRegressor
R2 Score: 0.1552
MAE: 0.6516
RMSE: 0.7812
CV RMSE: 0.5960

Summary of Results:
                   model  r2_score       mae      rmse   cv_rmse
0       LinearRegression  0.209342  0.643045  0.755719  0.551439
1  DecisionTreeRegressor -0.513957  0.753978  1.045737  1.031216
2           XGBRegressor  0.155156  0.651628  0.781186  0.595981


In [41]:
new_data={'HomeTeam':'Manchester United', 'AwayTeam':'Liverpool','HS':14, 'AS':12, 'HST':5, 'AST':4, 'HC':6, 'AC':4
          , 'HF':11, 'AF':14, 'HY':2, 'AY':3,
       'HR':0, 'AR':0}
new_data_df=pd.DataFrame([new_data])

In [42]:
def make_prediction(pred): 
    try:
        if pred>=2.5: 
            return "Home win",3
        elif pred>=1.5:
            return "draw",2
        else:
            return "away win",1
    except Exception as e:
        return "unknown",pred

In [53]:
print("new match prediction:\n")
print(f"{new_data['HomeTeam']} vs {new_data['AwayTeam']}")
predictions=[]
for result in results:
    try :
        model_name=result['model']
        pipeline=result['pipeline']
        pred=pipeline.predict(new_data_df[0])
        outcome,num_outcome=make_prediction(pred)
        try:
            if hasattr(pipeline.named_steps['model'],predict_proba):
                proba=pipeline.predict_proba(new_data_df)[0]
                if len(proba)==3:
                    home_prob=proba[2]
                    draw_prob=proba[1]
                    away_prob=proba[0]
                    
                elif len(proba)==2:
                    home_prob=proba[1]
                    
                    away_prob=proba[0]
        except Exception as e:
            print("cant get probability")
        
        pred_data = {
            'model': model_name,
            'prediction': outcome,
            'numeric_prediction': numeric_outcome,
            'raw_value': float(pred),
            'home_prob': home_prob,
            'draw_prob': draw_prob,
            'away_prob': away_prob
        }
        predictions.append(pred_data)
        print(f"{model_name}:")
        print(f"- Predicted outcome: {outcome} ({numeric_outcome})")
        print(f"- Raw prediction value: {pred:.2f}")
        
        if home_prob is not None:
            print("- Probability estimates:")
            if home_prob is not None:
                print(f"  Home Win: {home_prob*100:.1f}%")
            if draw_prob is not None:
                print(f"  Draw: {draw_prob*100:.1f}%")
            if away_prob is not None:
                print(f"  Away Win: {away_prob*100:.1f}%")
                
    except Exception as e:
        print(f"Error making prediction with {result['model']}: {str(e)}")
        continue



    
            

new match prediction:

Manchester United vs Liverpool
Error making prediction with LinearRegression: 0
Error making prediction with DecisionTreeRegressor: 0
Error making prediction with XGBRegressor: 0


In [54]:
# 4. Save predictions to DataFrame
predictions_df = pd.DataFrame(predictions)

# 5. Get best model prediction
try:
    best_model_idx = results_df['rmse'].idxmin()
    best_model = results[best_model_idx]
    best_pred = best_model['pipeline'].predict(new_data_df)[0]
    best_outcome, _ = make_prediction(best_pred)
    
    print("\nDETAILED PREDICTION FROM BEST MODEL:", best_model['model'])
    print(f"Final Prediction: {best_outcome} ({best_pred:.2f})")
    
    # Add best model flag to predictions
    predictions_df['is_best'] = predictions_df['model'] == best_model['model']
    
except Exception as e:
    print(f"Error getting best model prediction: {str(e)}")

# 6. Save predictions to CSV
try:
    predictions_df.to_csv('match_predictions.csv', index=False)
    print("\nPredictions saved to match_predictions.csv")
    print(predictions_df[['model', 'prediction', 'numeric_prediction', 'raw_value']])
except Exception as e:
    print(f"Error saving predictions: {str(e)}")


DETAILED PREDICTION FROM BEST MODEL: LinearRegression
Final Prediction: draw (1.82)
Error getting best model prediction: 'model'

Predictions saved to match_predictions.csv
Error saving predictions: "None of [Index(['model', 'prediction', 'numeric_prediction', 'raw_value'], dtype='object')] are in the [columns]"
