### This notebook contains the data visualization.

#### Notebook 1: Extract-Transform-Load
#### Notebook 2: Data Visualization
#### Notebook 3: Feature Engineering, Hyperparameter tuning and Modelling
#### Notebook 4: Result Evaluation

In [None]:
import pandas as pd
import numpy as np

In [None]:
%matplotlib inline 

import matplotlib as mpl
import matplotlib.pyplot as plt


In [None]:
# Data import: some raw datapoints are excluded in the modelling, so the file
# read here is the one that comes from Notebook 3.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_file = r"C:\Users\### LOCAL PATH ###\Data_for_visualization.txt"
Raw_data = pd.read_csv(data_file, sep="\t")
Raw_data.shape

In [None]:
# Show the first five rows as a quick sanity check of the loaded columns.
Raw_data.head(n=5)

##### Adding some columns with potentially informative content

In [None]:
# Label each match with the side the odds favour (the side with the lower odds).
# np.select evaluates the three comparisons column-wise, so no Python-level row
# loop is needed and the result no longer relies on Raw_data having a default
# 0..n-1 index. Rows where none of the comparisons hold (e.g. missing odds -
# every comparison against NaN is False) receive the string "nan", which is the
# value a later cell filters on.
home_odds = Raw_data["Odds H"]
away_odds = Raw_data["Odds A"]
Raw_data["Favorite"] = np.select(
    [home_odds < away_odds, home_odds > away_odds, home_odds == away_odds],
    ["Home Team", "Away Team", "Even"],
    default="nan",
)
Raw_data["Favorite"] = Raw_data["Favorite"].astype(str)

# One-hot encode the match result into "Win: <result code>" indicator columns.
# dtype=int keeps the indicators as 0/1 integers independent of the default
# dummy dtype of the installed pandas version.
Onehot = pd.get_dummies(Raw_data[["Result"]], prefix='', prefix_sep='', dtype=int)
Onehot.columns = ["Win: " + str(col) for col in Onehot.columns]

# Drop indicator columns left over from a previous run of this cell first, so
# that re-running the cell does not append duplicate "Win: ..." columns.
Raw_data = pd.concat(
    [Raw_data.drop(columns=list(Onehot.columns), errors="ignore"), Onehot],
    axis=1,
)

##### Goals scored statistics

In [None]:
# Box plot of the goals scored by the home and the away side.
goal_columns = ["Home Goals", "Away Goals"]
Raw_data[goal_columns].plot.box(figsize=(6, 6))
plt.title("Goals Boxplot")
plt.xlabel("Goals - Location")
plt.ylabel("Goals - Number")
plt.show()

##### Odds

In [None]:
# Box plot of the three odds columns (home, draw, away).
odds_columns = ["Odds H", "Odds D", "Odds A"]
Raw_data[odds_columns].plot.box(figsize=(6, 6))

plt.title("Odds Boxplot")
plt.ylabel("Odds")
plt.show()

In [None]:
colors_list = ["green","coral"]

# Mean goals per team, computed separately for home and for away fixtures.
# Each side is aggregated first and the two per-team means are then joined.
# This avoids a row-level merge on "Team", which pairs every home game of a
# team with every one of its away games before averaging (same means, but a
# quadratic number of intermediate rows per team).
home_mean = Raw_data.groupby("HomeTeam")["Home Goals"].mean()
away_mean = Raw_data.groupby("AwayTeam")["Away Goals"].mean()

# Left join on the team index: the set of teams is taken from the home
# fixtures; a team without any away fixture gets NaN for "Away Goals".
df = home_mean.to_frame().join(away_mean, how="left")
df.index.name = "Team"
df = df.sort_values(["Home Goals","Away Goals"],ascending=[False, False])

df.plot(kind='bar', figsize=(18,6), color = colors_list)
plt.xlabel("Team", size = 13)
plt.ylabel("Mean Goals", size = 13)
plt.title("Avg. Goals scored by Team", size = 16)
plt.ylim(0.,4.)
plt.show()

##### Wins statistics

In [None]:
colors_list = ["yellowgreen","gold","orangered"]
result_columns = ["Win: H", "Win: D", "Win: A"]

# Look at every match once from the home side and once from the away side:
# a home win is a "Win" for the home team and a "Loss" for the away team,
# and vice versa for an away win.
home_view = Raw_data[["HomeTeam"] + result_columns].rename(
    columns={"HomeTeam":"Team", "Win: H":"Win", "Win: D":"Draw", "Win: A":"Loss"}
)
away_view = Raw_data[["AwayTeam"] + result_columns].rename(
    columns={"AwayTeam":"Team", "Win: H":"Loss", "Win: D":"Draw", "Win: A":"Win"}
)

# Stack both views (home rows first) and average the indicators per team.
df = pd.concat([home_view, away_view])[["Team","Win","Draw","Loss"]]
df = df.groupby("Team", as_index=True).mean()
df = df.sort_values(["Win","Loss"], ascending=[False, True])

df.plot(kind='bar', figsize=(18,6), color = colors_list)
plt.title("Avg. Game Result by Team", size = 16)
plt.xlabel("Team", size = 13)
plt.ylabel("Mean Winner", size = 13)
plt.xticks(rotation=70)
plt.ylim(0.,1.)
plt.show()

In [None]:
colors_list = ["pink", "grey","yellowgreen"]

# Share of away wins / draws / home wins, grouped by which side was the favorite.
# Rows whose "Favorite" is the string "nan" (no favorite determined) are excluded.
df = Raw_data[["Favorite", "Win: A", "Win: D", "Win: H"]].rename(columns={"Win: H":"Home Team W", "Win: D":"Draw", "Win: A":"Away Team W"})
df = df.loc[df["Favorite"] != "nan"]
df = df.groupby("Favorite", as_index=True).mean()

ax = df.plot(kind='barh', figsize =(10,7), color = colors_list, edgecolor='w')

ax.set_alpha(0.8)
ax.set_title("Avg. Game Result by which Team was the Favorite", size = 16)

ax.set_xlabel("Winner %", size = 13)
ax.set_ylabel("Favorite", size = 13)
ax.set_xlim(0.,1.)
# Take the tick labels from the grouped index instead of a hard-coded list:
# a fixed three-item list mislabels (or fails on) data in which one of the
# categories, e.g. "Even", does not occur.
ax.set_yticklabels(df.index)

# Annotate every bar with its value as a percentage, slightly right of the bar end.
for bar in ax.patches:
    ax.text(bar.get_width()+0.01, bar.get_y()+0.11, (str(round((bar.get_width())*100,2))+"%"), fontsize=11, color='black')

ax.invert_yaxis()

In [None]:
colors_list = ["yellowgreen","grey","pink"]
explode_list =[0.03,0.03,0.03]

# Translate the result codes with an explicit mapping rather than chained
# substring replacements, which only work as long as no replacement text
# contains a later search letter.
result_labels = {"H": "Home Team Win", "D": "Draw", "A": "Away Team Win"}
legend_order = ["Home Team Win", "Draw", "Away Team Win"]
dfx = Raw_data["Result"].map(result_labels)

# value_counts() orders by frequency, while colors and legend labels are given
# in a fixed order. Reindexing pins the wedge order to the legend order so the
# legend stays correct whatever the relative frequencies are.
result_counts = dfx.value_counts().reindex(legend_order, fill_value=0)
result_counts.plot(kind='pie', figsize =(10,6), colors = colors_list,
                   autopct='%1.1f%%',
                   fontsize= 12,
                   shadow = True,
                   startangle = 281,
                   pctdistance = 1.15,
                   labels = None,
                   explode = explode_list)

plt.axis('equal')
plt.ylabel("")
plt.legend(labels=legend_order, loc = 'upper right', prop={'size':12})
plt.title("Game results", y = 1.05, size = 16)
plt.show()

In [None]:
df = Raw_data.loc[:,["Favorite","Result"]]
# Keep only matches with a definite favorite. This drops "Even" matches and
# also rows whose favorite is unknown (the string "nan"), which would otherwise
# fall through every condition below and be counted as "Draw".
df = df.loc[df["Favorite"].isin(["Home Team", "Away Team"])].reset_index(drop=True)

# Column-wise classification instead of a row-by-row loop.
home_won = df["Result"] == "H"
away_won = df["Result"] == "A"
home_favored = df["Favorite"] == "Home Team"
away_favored = df["Favorite"] == "Away Team"
favorite_won = (home_won & home_favored) | (away_won & away_favored)
underdog_won = (home_won & away_favored) | (away_won & home_favored)
# Any remaining result is labelled "Draw".
df["Winner"] = np.select([favorite_won, underdog_won], ["Favorite Win", "Underdog Win"], default="Draw")
df["Winner"] = df["Winner"].astype(str)

colors_list = ["yellowgreen","grey","orange"]
explode_list =[0.03,0.03,0.03]
legend_order = ["Favorite Win", "Draw", "Underdog Win"]

# value_counts() sorts by frequency; reindex so the wedges follow the fixed
# order that the colors and the legend labels assume.
winner_counts = df["Winner"].value_counts().reindex(legend_order, fill_value=0)
winner_counts.plot(kind='pie', figsize =(10,6), colors = colors_list,
                   autopct='%1.1f%%',
                   fontsize= 12,
                   shadow = True,
                   startangle = 264,
                   pctdistance = 1.15,
                   labels = None,
                   explode = explode_list)

plt.axis('equal')
plt.ylabel("")
plt.legend(labels=legend_order, loc = 'upper right', prop={'size':12})
plt.title("Did the favorite win?", y = 1.05, size = 16)
plt.show()

In [None]:
# Build one row per team per match: goals scored and points earned
# (3 for a win, 1 for a draw). The points of both sides are derived directly
# from the match-level result indicators, so the away frame no longer depends
# on columns of the home frame happening to share its index.
matches = Raw_data[["HomeTeam","AwayTeam", "Home Goals","Away Goals","Win: A","Win: D","Win: H"]]
df1 = pd.DataFrame({
    "Team": matches["HomeTeam"],
    "Goals": matches["Home Goals"],
    "Points": matches["Win: D"] + matches["Win: H"] * 3,
})
df2 = pd.DataFrame({
    "Team": matches["AwayTeam"],
    "Goals": matches["Away Goals"],
    "Points": matches["Win: D"] + matches["Win: A"] * 3,
})

# Average goals and points per team over all of its matches.
df = pd.concat([df1, df2])
df = df.groupby("Team", as_index=False).mean()

# Least-squares straight line through the per-team averages: fit[0] is the slope, fit[1] the intercept.
fit = np.polyfit(df["Goals"], df["Points"], deg=1)

df.plot(kind="scatter", x="Goals", y="Points", figsize = (12,6))
plt.plot(df["Goals"], fit[0] * df["Goals"] + fit[1], color = 'r')
plt.title("Goals-Points Averages entire sample", size=16)
plt.xlabel("Avg. Goals", size = 13)
plt.ylabel("Avg. Points", size = 13)


plt.show()

In [None]:
# Five rows with the highest "Points" value.
df.sort_values(by="Points", ascending=False).head(5)

In [None]:
# One row per team per match with the season, goals scored and points earned
# (3 for a win, 1 for a draw). Points for both sides come straight from the
# match-level indicators, so the away frame does not borrow columns from the
# home frame.
matches = Raw_data[["Season start","HomeTeam","AwayTeam", "Home Goals","Away Goals","Win: A","Win: D","Win: H"]]
df1 = pd.DataFrame({
    "Season start": matches["Season start"],
    "Team": matches["HomeTeam"],
    "Goals": matches["Home Goals"],
    "Points": matches["Win: D"] + matches["Win: H"] * 3,
})
df2 = pd.DataFrame({
    "Season start": matches["Season start"],
    "Team": matches["AwayTeam"],
    "Goals": matches["Away Goals"],
    "Points": matches["Win: D"] + matches["Win: A"] * 3,
})

# Average goals and points per team and season.
df = pd.concat([df1, df2])
df = df.groupby(["Season start","Team"], as_index=False).mean()

# Shift every season's averages forward by one season so that, after the merge,
# each (season, team) row also carries that team's values from the season before.
df3 = df.rename(columns={"Goals":"prev. season Goals","Points":"prev. season Points"})
df3["Season start"] = df3["Season start"]+1

df = df.merge(df3, how="left", on=["Season start","Team"])
# Rows without a previous season for that team (no match in the merge) are dropped.
df = df.dropna()

In [None]:
# Scatter of goals against points with a least-squares straight line on top.
fit = np.polyfit(df["Goals"], df["Points"], deg=1)
slope, intercept = fit[0], fit[1]

ax = df.plot.scatter(x="Goals", y="Points", figsize=(12, 6))
ax.plot(df["Goals"], slope * df["Goals"] + intercept, color='r')
ax.set_title("Goals & Points in a season", size=16)
ax.set_xlabel("Avg. Goals", size=13)
ax.set_ylabel("Avg. Points", size=13)

plt.show()

In [None]:
# Scatter of previous-season goals against goals with a least-squares straight line.
fit = np.polyfit(df["prev. season Goals"], df["Goals"], deg=1)
slope, intercept = fit[0], fit[1]

ax = df.plot.scatter(x="prev. season Goals", y="Goals", figsize=(12, 6))
ax.plot(df["prev. season Goals"], slope * df["prev. season Goals"] + intercept, color='r')
ax.set_title("Goals & prev. season's Goals", size=16)
ax.set_xlabel("Avg. Goals prior season", size=13)
ax.set_ylabel("Avg. Goals", size=13)

plt.show()

In [None]:
# Scatter of previous-season points against points with a least-squares straight line.
fit = np.polyfit(df["prev. season Points"], df["Points"], deg=1)
slope, intercept = fit[0], fit[1]

ax = df.plot.scatter(x="prev. season Points", y="Points", figsize=(12, 6))
ax.plot(df["prev. season Points"], slope * df["prev. season Points"] + intercept, color='r')
ax.set_title("Points season(t-1) & Points season(t)", size=20)
ax.set_xlabel("Avg. Points prior season", size=16)
ax.set_ylabel("Avg. Points", size=16)

plt.show()

In [None]:
# Scatter of previous-season goals against points with a least-squares straight line.
fit = np.polyfit(df["prev. season Goals"], df["Points"], deg=1)
slope, intercept = fit[0], fit[1]

ax = df.plot.scatter(x="prev. season Goals", y="Points", figsize=(12, 6))
ax.plot(df["prev. season Goals"], slope * df["prev. season Goals"] + intercept, color='r')
ax.set_title("Goals season(t-1) & Points season(t)", size=20)
ax.set_xlabel("Avg. Goals prior season", size=16)
ax.set_ylabel("Avg. Points", size=16)

plt.show()

In [None]:
# One row per team per match with the season, goals scored and points earned
# (3 for a win, 1 for a draw). Points for both sides are taken from the
# match-level indicators rather than from the other side's frame.
matches = Raw_data[["Season start","HomeTeam","AwayTeam", "Home Goals","Away Goals","Win: A","Win: D","Win: H"]]
df1 = pd.DataFrame({
    "Season start": matches["Season start"],
    "Team": matches["HomeTeam"],
    "Goals": matches["Home Goals"],
    "Points": matches["Win: D"] + matches["Win: H"] * 3,
})
df2 = pd.DataFrame({
    "Season start": matches["Season start"],
    "Team": matches["AwayTeam"],
    "Goals": matches["Away Goals"],
    "Points": matches["Win: D"] + matches["Win: A"] * 3,
})

# Average goals and points per season. Only the numeric columns are averaged;
# the "Team" column is left out of the aggregation explicitly.
df = pd.concat([df1, df2])
df = df.groupby(["Season start"], as_index=False)[["Goals","Points"]].mean()

# x must be a single column label; DataFrame.plot rejects a list here.
df.plot(kind='line', x = "Season start", y = ["Goals","Points"], figsize=(20,6))


plt.xlabel("Year season startet", size = 13)
plt.ylabel("Avg. Goals & Points", size = 13)
plt.title("Avg. Goals & Points per Season", size = 16)
plt.legend(prop={'size':12})

plt.show()