# Data Frames

### Lo primero que voy a hacer es web scraping para obtener la información de una página web; en este caso, los resultados de los mundiales de fútbol desde 1930 hasta 2022.

In [85]:
#!pip install bs4

In [27]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [28]:
#!pip install lxml

In [29]:
# Tournament years to scrape: the three pre-1950 editions are listed by hand,
# then every fourth year from 1950 through 2022 is added.
years = [1930, 1934, 1938]
print(years)
years.extend(range(1950, 2023, 4))

def get_matches(year):
    """Scrape the match results of one FIFA World Cup from its Wikipedia page.

    Args:
        year: Tournament year; it is interpolated into the article URL.

    Returns:
        A DataFrame with one row per complete ``footballbox`` element found
        on the page and the columns ``home``, ``score``, ``away`` and ``year``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    website = f'https://en.wikipedia.org/wiki/{year}_FIFA_World_Cup'
    # A timeout keeps one stalled request from hanging the whole notebook.
    respuesta = requests.get(website, timeout=30)
    # Fail loudly instead of parsing an error page into an empty table.
    respuesta.raise_for_status()
    soup = BeautifulSoup(respuesta.text, 'lxml')

    home = []
    score = []
    away = []

    for match in soup.find_all('div', class_='footballbox'):
        home_cell = match.find('th', class_='fhome')
        score_cell = match.find('th', class_='fscore')
        away_cell = match.find('th', class_='faway')
        # Skip boxes missing any of the three cells; calling get_text() on
        # None would raise AttributeError and abort the whole scrape.
        if home_cell is None or score_cell is None or away_cell is None:
            continue
        home.append(home_cell.get_text())
        score.append(score_cell.get_text())
        away.append(away_cell.get_text())

    df_football = pd.DataFrame({'home': home, 'score': score, 'away': away})
    df_football['year'] = year
    return df_football

# Scrape every tournament, stack the per-year frames into one table and save it
fifa = list(map(get_matches, years))
df_fifa_cups = pd.concat(fifa, ignore_index=True)
df_fifa_cups.to_csv('historical_data_world_cups.csv', index=False)


[1930, 1934, 1938]


### Una vez adquirida la información desde las páginas de Wikipedia donde reposan los resultados de los diferentes partidos, se procede a limpiar los datos.

In [30]:
# Load the scraped matches and trim the whitespace around both team names
df_fifa = pd.read_csv('historical_data_world_cups.csv')
df_fifa[['home', 'away']] = df_fifa[['home', 'away']].apply(lambda names: names.str.strip())

# Discard every row that has a missing value in any column
df_fifa.dropna(inplace=True)
df_fifa

Unnamed: 0,home,score,away,year
0,France,4–1,Mexico,1930
1,Argentina,1–0,France,1930
2,Chile,3–0,Mexico,1930
3,Chile,1–0,France,1930
4,Argentina,6–3,Mexico,1930
...,...,...,...,...
960,England,1–2,France,2022
961,Argentina,3–0,Croatia,2022
962,France,2–0,Morocco,2022
963,Croatia,2–1,Morocco,2022


In [31]:
# Remove repeated rows, then order the matches by tournament year
df_fifa = df_fifa.drop_duplicates().sort_values(by='year')

In [32]:
df_fifa

Unnamed: 0,home,score,away,year
0,France,4–1,Mexico,1930
17,Uruguay,4–2,Argentina,1930
16,Uruguay,6–1,Yugoslavia,1930
15,Argentina,6–1,United States,1930
14,Paraguay,1–0,Belgium,1930
...,...,...,...,...
927,Japan,0–1,Costa Rica,2022
928,Spain,1–1,Germany,2022
929,Japan,2–1,Spain,2022
931,Morocco,0–0,Croatia,2022


In [33]:
# Index labels of every match with Sweden at home and Austria away.
# NOTE(review): the filter is not restricted to a year; presumably it targets a single fixture, confirm.
index_eliminar = df_fifa.index[
    df_fifa['home'].str.contains('Sweden') & df_fifa['away'].str.contains('Austria')
]

In [34]:
# Drop the selected rows, then show that no Sweden (home) vs Austria (away) match is left
df_fifa = df_fifa.drop(index=index_eliminar)
df_fifa.loc[df_fifa['home'].str.contains('Sweden') & df_fifa['away'].str.contains('Austria')]

Unnamed: 0,home,score,away,year


In [35]:
# Show the scores that contain anything other than digits or the en dash, e.g. "(a.e.t.)".
# The pattern is a raw string because "\d" in a normal literal is an invalid escape sequence.
# NOTE(review): the original note here said this was not working yet; confirm against the output.
df_fifa[df_fifa['score'].str.contains(r'[^\d–]')]

Unnamed: 0,home,score,away,year
27,Italy,1–1 (a.e.t.),Spain,1934
34,Italy,2–1 (a.e.t.),Czechoslovakia,1934
24,Austria,3–2 (a.e.t.),France,1934
48,Brazil,1–1 (a.e.t.),Czechoslovakia,1938
42,Czechoslovakia,3–0 (a.e.t.),Netherlands,1938
...,...,...,...,...
953,Japan,1–1 (a.e.t.),Croatia,2022
955,Morocco,0–0 (a.e.t.),Spain,2022
957,Croatia,1–1 (a.e.t.),Brazil,2022
958,Netherlands,2–2 (a.e.t.),Argentina,2022


In [36]:
# Keep only digits and the en dash in each score (removes suffixes such as "(a.e.t.)").
# The pattern is a raw string because "\d" in a normal literal is an invalid escape sequence.
df_fifa['score'] = df_fifa['score'].str.replace(r'[^\d–]', '', regex=True)

In [37]:
# Trim surrounding whitespace from both team-name columns
df_fifa[['home', 'away']] = df_fifa[['home', 'away']].apply(lambda names: names.str.strip())

In [38]:
df_fifa

Unnamed: 0,home,score,away,year
0,France,4–1,Mexico,1930
17,Uruguay,4–2,Argentina,1930
16,Uruguay,6–1,Yugoslavia,1930
15,Argentina,6–1,United States,1930
14,Paraguay,1–0,Belgium,1930
...,...,...,...,...
927,Japan,0–1,Costa Rica,2022
928,Spain,1–1,Germany,2022
929,Japan,2–1,Spain,2022
931,Morocco,0–0,Croatia,2022


In [39]:

# Rename the columns to the names used by the following cells.
# The score column is split into HomeGoals / AwayGoals in a later cell.
df_fifa = df_fifa.rename(columns={'home': 'HomeTeam', 'away': 'AwayTeam', 'year': 'Year'})
df_fifa

Unnamed: 0,HomeTeam,score,AwayTeam,Year
0,France,4–1,Mexico,1930
17,Uruguay,4–2,Argentina,1930
16,Uruguay,6–1,Yugoslavia,1930
15,Argentina,6–1,United States,1930
14,Paraguay,1–0,Belgium,1930
...,...,...,...,...
927,Japan,0–1,Costa Rica,2022
928,Spain,1–1,Germany,2022
929,Japan,2–1,Spain,2022
931,Morocco,0–0,Croatia,2022


In [41]:
df_fifa

Unnamed: 0,HomeTeam,score,AwayTeam,Year
0,France,4–1,Mexico,1930
17,Uruguay,4–2,Argentina,1930
16,Uruguay,6–1,Yugoslavia,1930
15,Argentina,6–1,United States,1930
14,Paraguay,1–0,Belgium,1930
...,...,...,...,...
927,Japan,0–1,Costa Rica,2022
928,Spain,1–1,Germany,2022
929,Japan,2–1,Spain,2022
931,Morocco,0–0,Croatia,2022


In [54]:
# Split "H–A" scores on the en dash into one goal column per side
df_fifa[['HomeGoals', 'AwayGoals']] = df_fifa['score'].str.split('–', expand=True)

# Drop rows with any missing value, then cast the goal and year columns to int
df_fifa = (
    df_fifa
    .dropna(axis=0)
    .astype({'HomeGoals': int, 'AwayGoals': int, 'Year': int})
)
df_fifa

Unnamed: 0,HomeTeam,score,AwayTeam,Year,HomeGoals,AwayGoals
0,France,4–1,Mexico,1930,4,1
17,Uruguay,4–2,Argentina,1930,4,2
16,Uruguay,6–1,Yugoslavia,1930,6,1
15,Argentina,6–1,United States,1930,6,1
14,Paraguay,1–0,Belgium,1930,1,0
...,...,...,...,...,...,...
927,Japan,0–1,Costa Rica,2022,0,1
928,Spain,1–1,Germany,2022,1,1
929,Japan,2–1,Spain,2022,2,1
931,Morocco,0–0,Croatia,2022,0,0


In [48]:
# Goals scored by both sides in each match
df_fifa['TotalGoals'] = df_fifa['HomeGoals'].add(df_fifa['AwayGoals'])

In [53]:
df_fifa

Unnamed: 0,HomeTeam,score,AwayTeam,Year,HomeGoals,AwayGoals
0,France,4–1,Mexico,1930,4,1
17,Uruguay,4–2,Argentina,1930,4,2
16,Uruguay,6–1,Yugoslavia,1930,6,1
15,Argentina,6–1,United States,1930,6,1
14,Paraguay,1–0,Belgium,1930,1,0
...,...,...,...,...,...,...
927,Japan,0–1,Costa Rica,2022,0,1
928,Spain,1–1,Germany,2022,1,1
929,Japan,2–1,Spain,2022,2,1
931,Morocco,0–0,Croatia,2022,0,0


In [51]:
# Remove the TotalGoals column and write the cleaned matches to disk
df_fifa = df_fifa.drop(columns='TotalGoals')
df_fifa.to_csv('clean_fifa_worlcup_matches.csv', index=False)

In [55]:
df_fifa.dtypes

HomeTeam     object
score        object
AwayTeam     object
Year          int32
HomeGoals     int32
AwayGoals     int32
dtype: object

### Team Strength

In [58]:
# Home-side view of every match: the team plus both goal counts
df_home = df_fifa.loc[:, ['HomeTeam', 'HomeGoals', 'AwayGoals']]

In [59]:
# Away-side view of every match: the team plus both goal counts
df_away = df_fifa.loc[:, ['AwayTeam', 'HomeGoals', 'AwayGoals']]

### Renombrando Columnas

In [63]:
# Express both views from the team's own perspective: goals it scored and goals it conceded
df_home = df_home.rename(
    columns={'HomeTeam': 'Team', 'HomeGoals': 'GoalsScored', 'AwayGoals': 'GoalsConceded'}
)
df_away = df_away.rename(
    columns={'AwayTeam': 'Team', 'HomeGoals': 'GoalsConceded', 'AwayGoals': 'GoalsScored'}
)

# Stack home and away rows and average per team
df_team_strength = (
    pd.concat([df_home, df_away], ignore_index=True)
    .groupby('Team')
    .mean()
)

#### Promedio de goles recibidos vs goles anotados

In [65]:
df_team_strength

Unnamed: 0_level_0,GoalsScored,GoalsConceded
Team,Unnamed: 1_level_1,Unnamed: 2_level_1
Algeria,1.000000,1.461538
Angola,0.333333,0.666667
Argentina,1.727273,1.147727
Australia,0.850000,1.850000
Austria,1.482759,1.620690
...,...,...
Uruguay,1.508475,1.288136
Wales,0.625000,1.250000
West Germany,2.098361,1.213115
Yugoslavia,1.666667,1.272727


## Prediccion mediante funcion de Poisson

In [66]:
from scipy.stats import poisson

### La distribución de Poisson es una distribución discreta que describe el número de eventos que ocurren en un intervalo de tiempo fijo o región de oportunidad.

##### En este caso un gol puede ocurrir en los 90 minutos de un partido de futbol

### La distribucion de Poisson tiene ciertas condiciones

* El número de eventos se puede contar. (Puede haber 1, 2, 3 o 5 goles, pero no puede haber 2.5 goles.)
* Las ocurrencias de los eventos son independientes. (La ocurrencia de un gol no afecta directamente la probabilidad de que ocurran otros.)
* La tasa a la que ocurren los eventos es constante. (La probabilidad de que ocurra un gol en un partido de 90 minutos es la misma en todos los partidos.)
* Dos eventos no pueden ocurrir en el mismo instante de tiempo. (Dos goles no pueden ocurrir en el mismo segundo de un partido.)
![image.png](attachment:image.png)

In [86]:
def predict_points(home, away, max_goals=10):
    """Estimate the points each team is expected to take from one match.

    Each side's goal count is modelled as an independent Poisson variable.
    The rates are read from the module-level ``df_team_strength`` table:
    the home rate is the home team's ``GoalsScored`` times the away team's
    ``GoalsConceded``, and the away rate is the mirror image.

    Args:
        home: Index label of the home team in ``df_team_strength``.
        away: Index label of the away team in ``df_team_strength``.
        max_goals: Highest goal count per team included in the score grid.
            The default of 10 covers 0-0 through 10-10; probability mass
            beyond the grid is ignored.

    Returns:
        A tuple ``(points_home, points_away)`` where each value is
        ``3 * P(win) + P(draw)``, or ``(0, 0)`` when either team is missing
        from ``df_team_strength``.
    """
    if home in df_team_strength.index and away in df_team_strength.index:
        print("Los equipos estan en la tabla de fortalezas y debilidades.")
        # lamb_home = home attacking strength * away defensive weakness
        lamb_home = df_team_strength.at[home,'GoalsScored'] * df_team_strength.at[away,'GoalsConceded']
        # lamb_away = away attacking strength * home defensive weakness
        lamb_away = df_team_strength.at[away,'GoalsScored'] * df_team_strength.at[home,'GoalsConceded']
        print(f"Lamba_home: {lamb_home},Lamba_away: {lamb_away}")

        # Each pmf value depends on one team's goal count only, so compute
        # it once per count instead of once per cell of the score grid.
        goal_counts = range(0, max_goals + 1)
        home_pmf = [poisson.pmf(goals, lamb_home) for goals in goal_counts]
        away_pmf = [poisson.pmf(goals, lamb_away) for goals in goal_counts]

        prob_home, prob_away, prob_draw = 0, 0, 0
        # Walk every scoreline from 0-0 to max_goals-max_goals
        for x in goal_counts:
            for y in goal_counts:
                p = home_pmf[x] * away_pmf[y]
                if x == y:
                    prob_draw += p
                elif x > y:
                    prob_home += p
                else:
                    prob_away += p
        points_home = 3 * prob_home + prob_draw
        points_away = 3 * prob_away + prob_draw
        return (points_home, points_away)

    else:
        print("Alguno de los dos equipos no esta en la tabla de fortalezas y debilidades.")
        return (0, 0)
# Example: expected points for Colombia (home) against Bolivia (away)
prediccion_home_away = predict_points(home='Colombia', away='Bolivia')
prediccion_home_away

Los equipos estan en la tabla de fortalezas y debilidades.
Lamba_home: 4.848484848484849,Lamba_away: 0.22727272727272724


(2.9281852017553005, 0.023091305497698875)