### 1. Imports

In [129]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [130]:
############### Caminho das Bases ###############
goals_scorers = 'https://raw.githubusercontent.com/ErikHenning927/project_modulo3/main/goalscorers.csv'
results = 'https://raw.githubusercontent.com/ErikHenning927/project_modulo3/main/results.csv'
shootouts = 'https://raw.githubusercontent.com/ErikHenning927/project_modulo3/main/shootouts.csv'

############### Lendo as Bases ###############

df_goals = pd.read_csv(goals_scorers)
df_results = pd.read_csv(results)
df_shootouts = pd.read_csv(shootouts)

In [131]:
print("Colunas do df_goals:")
print(df_goals.columns)

print("\nColunas do df_results:")
print(df_results.columns)

print("\nColunas do df_shootouts:")
print(df_shootouts.columns)

Colunas do df_goals:
Index(['date', 'home_team', 'away_team', 'team', 'scorer', 'minute',
       'own_goal', 'penalty'],
      dtype='object')

Colunas do df_results:
Index(['date', 'home_team', 'away_team', 'home_score', 'away_score',
       'tournament', 'city', 'country', 'neutral'],
      dtype='object')

Colunas do df_shootouts:
Index(['date', 'home_team', 'away_team', 'winner', 'first_shooter'], dtype='object')


### 2. Renomeando as colunas

In [132]:
df_goals.rename(columns={
    'date': 'Data',
    'home_team': 'time_casa',
    'away_team': 'time_visitante',
    'team': 'Time_marcador',
    'scorer': 'artilheiro',
    'minute': 'minuto',
    'own_goal': 'gol_contra',
    'penalty': 'penalti'
}, inplace=True)

df_results.rename(columns={
    'date': 'Data',
    'home_team': 'Time_casa',
    'away_team': 'Time_visitante',
    'home_score': 'Gol_emcasa',
    'away_score': 'Gol_fora',
    'tournament': 'Torneio',
    'city': 'Cidade',
    'country': 'País',
    'neutral': 'Neutro'
 
}, inplace=True)
df_shootouts.rename(columns={
    'date': 'Data',
    'home_team': 'Time_casa',
    'away_team': 'Time_visitante',
    'winner': 'Vencedor',
    'first_shooter': 'Primeiro_Time_a_chutar_ao_Gol'
   
}, inplace=True)


In [133]:
## Função para verificar o formato de data de uma coluna
def verificar_formato_data(df, coluna):
    primeiros_valores = df[coluna].head().tolist()
    # Tenta inferir o formato da data com base nos primeiros valores
    formatos = ["%Y-%m-%d", "%d-%m-%Y", "%m-%d-%Y", "%Y/%m/%d", "%d/%m/%Y", "%m/%d/%Y"]
    for formato in formatos:
        try:
            pd.to_datetime(primeiros_valores, format=formato)
            return formato
        except ValueError:
            continue
    return "Formato não reconhecido"

# Verificar o formato de data para cada coluna
print("Formato de data para df_goals['data']: ", verificar_formato_data(df_goals, 'Data'))
print("Formato de data para df_results['Data']: ", verificar_formato_data(df_results, 'Data'))
print("Formato de data para df_shootouts['Data']: ", verificar_formato_data(df_shootouts, 'Data'))


Formato de data para df_goals['data']:  %Y-%m-%d
Formato de data para df_results['Data']:  %Y-%m-%d
Formato de data para df_shootouts['Data']:  %Y-%m-%d


### 2. Tratamento de Dados (Conversão para tipos adequados)

In [137]:
# Tratamento para a coluna 'data' em df_goals
df_goals['Data'] = pd.to_datetime(df_goals['Data'], format='%Y-%m-%d').dt.strftime('%d-%m-%Y')

# Tratamento para a coluna 'Data' em df_results
df_results['Data'] = pd.to_datetime(df_results['Data'], format='%Y-%m-%d').dt.strftime('%d-%m-%Y')

# Tratamento para a coluna 'Data' em df_shootouts
df_shootouts['Data'] = pd.to_datetime(df_shootouts['Data'], format='%Y-%m-%d').dt.strftime('%d-%m-%Y')

# Conversão da coluna 'minuto' para float em df_goals
df_goals['minuto'] = df_goals['minuto'].astype(float)

# Conversão das colunas 'Gol_emcasa' e 'Gol_fora' para int em df_results
df_results['Gol_emcasa'] = df_results['Gol_emcasa'].astype(int)
df_results['Gol_fora'] = df_results['Gol_fora'].astype(int)

# Imprimir as colunas tratadas
print(df_goals['Data'])
print(df_results['Data'])


0        02-07-1916
1        02-07-1916
2        02-07-1916
3        02-07-1916
4        06-07-1916
            ...    
44105    10-02-2024
44106    10-02-2024
44107    11-02-2024
44108    11-02-2024
44109    11-02-2024
Name: Data, Length: 44110, dtype: object
0        30-11-1872
1        08-03-1873
2        07-03-1874
3        06-03-1875
4        04-03-1876
            ...    
46284    07-02-2024
46285    07-02-2024
46286    10-02-2024
46287    10-02-2024
46288    11-02-2024
Name: Data, Length: 46289, dtype: object


### 3. Tratamento de Dados (removendo duplicados e NaN)

In [138]:
df_goals.dropna(inplace=True)
df_results.dropna(inplace=True)
df_shootouts.dropna(inplace=True)


df_goals.drop_duplicates(inplace=True)
df_results.drop_duplicates(inplace=True)
df_shootouts.drop_duplicates(inplace=True)


In [139]:
df_results[df_results['Gol_emcasa'] == df_results['Gol_fora']].shape[0]

10555

In [144]:
##Dataframe tratado
df_goals.head(3)

Unnamed: 0,Data,Time_casa,Time_visitante,Gol_emcasa,Gol_fora,Torneio,Cidade,País,Neutro
0,30-11-1872,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,08-03-1873,England,Scotland,4,2,Friendly,London,England,False
2,07-03-1874,Scotland,England,2,1,Friendly,Glasgow,Scotland,False


In [141]:
##Dataframe tratado
df_results.head(3)

Unnamed: 0,Data,Time_casa,Time_visitante,Gol_emcasa,Gol_fora,Torneio,Cidade,País,Neutro
0,30-11-1872,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,08-03-1873,England,Scotland,4,2,Friendly,London,England,False
2,07-03-1874,Scotland,England,2,1,Friendly,Glasgow,Scotland,False


In [142]:
##Dataframe tratado
df_shootouts.head(3)

Unnamed: 0,Data,Time_casa,Time_visitante,Vencedor,Primeiro_Time_a_chutar_ao_Gol
26,20-06-1976,Czechoslovakia,Germany,Czechoslovakia,Czechoslovakia
36,22-05-1979,Argentina,Netherlands,Argentina,Argentina
40,21-06-1980,Italy,Czechoslovakia,Czechoslovakia,Italy


## Levantamento de Inferências com base nos Dados:

**Questões:**

1. Qual é a média de gols marcados por jogo?

2. Quem são os 5 maiores artilheiros e quantos gols cada um marcou?

3. Qual país sediou o maior número de partidas?


In [145]:
df_results

Unnamed: 0,Data,Time_casa,Time_visitante,Gol_emcasa,Gol_fora,Torneio,Cidade,País,Neutro
0,30-11-1872,Scotland,England,0,0,Friendly,Glasgow,Scotland,False
1,08-03-1873,England,Scotland,4,2,Friendly,London,England,False
2,07-03-1874,Scotland,England,2,1,Friendly,Glasgow,Scotland,False
3,06-03-1875,England,Scotland,2,2,Friendly,London,England,False
4,04-03-1876,Scotland,England,3,0,Friendly,Glasgow,Scotland,False
...,...,...,...,...,...,...,...,...,...
46284,07-02-2024,Ivory Coast,DR Congo,1,0,African Cup of Nations,Ebimpé,Ivory Coast,False
46285,07-02-2024,Nigeria,South Africa,1,1,African Cup of Nations,Bouaké,Ivory Coast,True
46286,10-02-2024,Qatar,Jordan,3,1,AFC Asian Cup,Lusail,Qatar,False
46287,10-02-2024,South Africa,DR Congo,0,0,African Cup of Nations,Abidjan,Ivory Coast,True


In [146]:
(df_results['Gol_emcasa'].sum() + df_results['Gol_fora'].sum()) / df_results.shape[0]

2.9370908855235585

In [53]:
df_goals['artilheiro'].value_counts().head(5)

artilheiro
Cristiano Ronaldo     108
Robert Lewandowski     61
Romelu Lukaku          60
Harry Kane             55
Lionel Messi           54
Name: count, dtype: int64

In [55]:
df_results['País'].value_counts().idxmax()

'United States'

Qual o desempenho do brasil em casa e fora ao longo do tempo?

In [10]:
brazil_home_games = df_results[df_results['home_team'] == 'Brazil']
brazil_away_games = df_results[df_results['away_team'] == 'Brazil']

avg_goals_home = brazil_home_games.groupby('date')['home_score'].mean()
avg_goals_away = brazil_away_games.groupby('date')['away_score'].mean()

In [11]:
avg_goals_home

date
1916-07-08    1.0
1916-07-12    1.0
1917-10-12    5.0
1919-05-11    6.0
1919-05-18    3.0
             ... 
2023-06-17    4.0
2023-06-20    2.0
2023-09-08    5.0
2023-10-12    1.0
2023-11-21    0.0
Name: home_score, Length: 597, dtype: float64

In [12]:
avg_goals_away

date
1914-09-20    0.0
1914-09-27    1.0
1916-07-10    1.0
1916-07-18    1.0
1917-10-03    2.0
             ... 
2022-12-09    1.0
2023-03-25    1.0
2023-09-12    1.0
2023-10-17    0.0
2023-11-16    1.0
Name: away_score, Length: 432, dtype: float64

Existe uma correlação entre o time que inicia a cobrança de pênaltis e a probabilidade de vitória?

In [13]:
first_shooter_counts = df_shootouts['first_shooter'].value_counts()

win_rate_by_first_shooter = df_shootouts.groupby('first_shooter')['winner'].apply(lambda x: (x == x.name).mean())

In [14]:
win_rate_by_first_shooter

first_shooter
Algeria          1.000000
Argentina        0.571429
Australia        0.800000
Bahrain          1.000000
Belgium          1.000000
                   ...   
United States    0.500000
Uruguay          0.571429
Uzbekistan       0.000000
Vietnam          1.000000
Åland            1.000000
Name: winner, Length: 77, dtype: float64