## Prediction models

In [1]:
import sqlite3
import pandas as pd

In [14]:
# Path to the database file
# NOTE(review): absolute, machine-specific path; this cell only runs where the file exists.
db_path = 'C:/Users/Eliud/Desktop/Repositories/Soccer-Analytics/data/soccermatches.db'

# Select every match where HomePower or AwayPower exceeds 100, newest first.
# There is no LIMIT clause, so all matching rows are returned.
# "Powerdiff" is the absolute difference between the two power columns.
query = '''
SELECT "Home Team", "Away Team", "League",
"Score", "Result", "Pick", "HomePower", 
"AwayPower", "H1.5", "H2.5", "H3.5", "A3.5",
"A1.5", "A2.5", "HomeBTTS", "AwayBTTS",
"Home Form", "Away Form", "1", "2", "Home GD",
"HomeGP", "AwayGP", "Home Position", "Away Position",
"Home Avg Scored", "Home Avg Conceded", 
"Away Avg Scored", "Away Avg Conceded", 
ABS("HomePower" - "AwayPower") AS "Powerdiff",
"Match Date"
FROM footballresults 
WHERE ("HomePower" > 100 OR "AwayPower" > 100)
ORDER BY "Match Date" DESC'''

# Load the query results into a DataFrame. The try/finally guarantees the
# connection is closed even when the query raises.
conn = sqlite3.connect(db_path)
try:
    df = pd.read_sql_query(query, conn)
finally:
    conn.close()

# Calculate total goals scored per match from the "Score" text.
# Each score is split on whitespace and its first and last tokens are summed,
# e.g. "1 - 2" -> ['1', '-', '2'] -> 3.
scores = df['Score'].str.split()
all_goals = [int(tokens[0]) + int(tokens[-1]) for tokens in scores]
df['Total Goals'] = all_goals

# Number of rows with a non-null "Home Team"
print(df['Home Team'].count())

3399


In [15]:
# Preview the first rows of the loaded matches, including the derived "Total Goals" column
df.head()

Unnamed: 0,Home Team,Away Team,League,Score,Result,Pick,HomePower,AwayPower,H1.5,H2.5,...,AwayGP,Home Position,Away Position,Home Avg Scored,Home Avg Conceded,Away Avg Scored,Away Avg Conceded,Powerdiff,Match Date,Total Goals
0,Nacional Potosí,Guabirá,Bolivia Primera División,0 - 0,X,1,117.2,77.1,90.0,80.0,...,18.0,8.0,15.0,2.0,1.6,1.4,2.1,40.1,2024-10-25 01:00:00,0
1,Huracan,Central Cordoba de Santiago,Argentina Copa Argentina,1 - 2,2,1,112.4,128.9,50.0,20.0,...,,,,1.1,0.6,1.5,0.6,16.5,2024-10-25 01:00:00,3
2,Always Ready,Jorge Wilstermann,Bolivia Primera División,1 - 0,1,1,102.6,103.9,60.0,50.0,...,17.0,9.0,6.0,1.4,1.1,1.3,0.7,1.3,2024-10-24 23:00:00,1
3,MC Alger,Oued Akbou,Algeria Ligue 1,0 - 0,X,1,125.0,113.0,60.0,10.0,...,5.0,1.0,3.0,1.2,0.4,1.5,0.8,12.0,2024-10-24 22:30:00,0
4,FC Porto,TSG Hoffenheim,UEFA Europa League,2 - 0,1,1,135.2,83.4,100.0,70.0,...,3.0,15.0,18.0,2.3,1.0,1.6,2.0,51.8,2024-10-24 22:00:00,2


### Over 1.5

In [60]:
# Candidate matches for the "Over 1.5" pick: a power gap of at least 30,
# high H1.5/H2.5 and A1.5/A2.5 values, and a "Home GD" above 1.
# Rows with a missing value in any compared column fail the comparison and are excluded.
over15_mask = (
    (df['Powerdiff'] >= 30)
    & (df['H1.5'] >= 90) & (df['H2.5'] >= 80)
    & (df['A1.5'] >= 80) & (df['A2.5'] >= 70)
    & (df['Home GD'] > 1)
)
tdf = df[over15_mask].copy()

# Hits finished with 2+ total goals; misses ("out") finished with 0 or 1
tts = tdf[tdf['Total Goals'] > 1].copy()
out = tdf[tdf['Total Goals'] <= 1].copy()

orig = tdf['Result'].count()
pred = tts['Result'].count()
outliers = orig - pred

if orig:
    accuracy = (pred / orig) * 100
    # Integer floor division gives the exact truncated percentage;
    # int(accuracy) can land one below it through float error (29/100 -> 28).
    print(f'{pred * 100 // orig}% accurate model with {pred} teams & {outliers} outliers')
else:
    # No row passed the filter: avoid 0/0 and the int(NaN) ValueError that follows
    accuracy = float('nan')
    print('No matches satisfy the Over 1.5 filters')

# Display the misses for inspection
out

100% accurate model with 25 teams & 0 outliers


Unnamed: 0,Home Team,Away Team,League,Score,Result,Pick,HomePower,AwayPower,H1.5,H2.5,...,AwayGP,Home Position,Away Position,Home Avg Scored,Home Avg Conceded,Away Avg Scored,Away Avg Conceded,Powerdiff,Match Date,Total Goals


### Winners

In [57]:
# Home prediction based on top 3 teams:
#   - more than 10 in "HomeGP" and a "Home Position" of 3 or better
#   - away side placed more than 3 positions below the home side
#   - HomePower at least 20 above AwayPower
#   - at least 4 'W' in "Home Form" and at most 3 'W' in "Away Form"
# Rows with a missing value in any compared column fail the comparison and are excluded.
top3_home_mask = (
    (df['HomeGP'] > 10)
    & (df['Home Position'] <= 3)
    & (df['Away Position'] - df['Home Position'] > 3)
    & (df['HomePower'] - df['AwayPower'] >= 20)
    & (df['Home Form'].str.count('W') >= 4)
    & (df['Away Form'].str.count('W') <= 3)
)
pos_home = df[top3_home_mask].copy()

# Calculate accuracy: share of selected matches whose "Result" is '1'
h_ct = pos_home['Home Team'].count()
h_win = pos_home[pos_home['Result'] == '1'].copy()
p_ct = h_win['Home Team'].count()

if h_ct:
    model_acc = (p_ct / h_ct) * 100
    # Integer floor division gives the exact truncated percentage;
    # int(model_acc) can land one below it through float error (29/100 -> 28).
    print(f'{p_ct * 100 // h_ct}% accurate model with {h_ct} teams')
else:
    # No row passed the filter: avoid 0/0 and the int(NaN) ValueError that follows
    model_acc = float('nan')
    print('No matches satisfy the top-3 home filters')

100% accurate model with 50 teams


In [58]:
# Querying specific features
home_df = df[((df['HomePower']) - (df['AwayPower']) >= 30)
             & (df['HomePower'] >= 140)
             & (df['Home Form'].str.count('W') >= 4)
             & (df['Away Form'].str.count('L') >= 3)
            ].copy()


# Home team predictive accuracy 
filt_h = home_df[(home_df['Result'] == '1')].copy()
h_count = filt_h['Result'].count()
home_acc = h_count / home_df['Home Team'].count()
model_acc = int(home_acc * 100)

print(f'\n{int(model_acc)}% accurate model with {h_count} teams')


100% accurate model with 63 teams


## Top leagues

In [59]:
# Total goals per league, one row per league with "League" kept as a column
league_df = df.groupby('League', as_index=False)['Total Goals'].sum()

# Matches per league, aligned to the row order of league_df
matches_per_league = df['League'].value_counts()
league_df['Counts'] = matches_per_league.reindex(league_df['League']).to_numpy()

# Average goals per match, rounded to two decimals
league_df['Avg Goals'] = (league_df['Total Goals'] / league_df['Counts']).round(2)

# Keep leagues with at least 60 matches and rank them by average goals, highest first
has_enough_matches = league_df['Counts'] >= 60
league_goals = (
    league_df[has_enough_matches]
    .sort_values(by='Avg Goals', ascending=False)
    .reset_index(drop=True)
)
league_goals.head(10)

Unnamed: 0,League,Total Goals,Counts,Avg Goals
0,World Friendlies Clubs,222,63,3.52
1,Germany Bundesliga II,216,65,3.32
2,Netherlands Eerste Divisie,214,66,3.24
3,England EFL Cup,197,63,3.13
4,UEFA Champions League,386,127,3.04
5,Norway Eliteserien,180,61,2.95
6,UEFA Nations League,212,75,2.83
7,UEFA Conference League,649,232,2.8
8,UEFA Europa League,322,117,2.75
9,England Premier League,164,61,2.69
