In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline

In [None]:
# Load the match-level table from WorldCupMatches.csv (path is relative to the
# notebook's working directory) and preview its first rows.
matches = pd.read_csv('WorldCupMatches.csv')
matches.head()

In [None]:
# Load the player-level table from WorldCupPlayers.csv (relative path) and
# preview its first rows.
players = pd.read_csv('WorldCupPlayers.csv')
players.head()

In [None]:
# Load the per-tournament table from WorldCups.csv (relative path) and preview
# its first rows.
worldcups = pd.read_csv('WorldCups.csv')
worldcups.head()

In [None]:
# Column names, dtypes and non-null counts for the raw matches table.
matches.info()

In [None]:
# Column names, dtypes and non-null counts for the raw players table.
players.info()

In [None]:
# Column names, dtypes and non-null counts for the raw tournaments table.
worldcups.info()

In [None]:
# Summary statistics for the matches table (pandas defaults: numeric columns).
matches.describe()

In [None]:
# Summary statistics for the players table (pandas defaults: numeric columns).
players.describe()

In [None]:
# Summary statistics for the tournaments table (pandas defaults: numeric columns).
worldcups.describe()

In [None]:
# (rows, columns) of the matches table before any rows are dropped.
matches.shape

In [None]:
# (rows, columns) of the players table before any rows are dropped.
players.shape

In [None]:
# (rows, columns) of the tournaments table.
worldcups.shape

In [None]:
# Number of missing values in each column of the matches table.
matches.isna().sum()

In [None]:
# Number of missing values in each column of the players table.
players.isna().sum()

In [None]:
# Number of missing values in each column of the tournaments table.
worldcups.isna().sum()

In [None]:
# Count of distinct values per column in the matches table.
matches.nunique()

In [None]:
# Count of distinct values per column in the players table.
players.nunique()

In [None]:
# Count of distinct values per column in the tournaments table.
worldcups.nunique()

In [None]:
# Discard every match row that has a missing value in any column.
matches = matches.dropna()

In [None]:
# (rows, columns) of the matches table after incomplete rows were dropped.
matches.shape

In [None]:
# Discard every player row that has a missing value in any column, then show
# the remaining (rows, columns).
# NOTE(review): dropna() with default arguments removes a row if ANY column is
# missing - confirm that sparse columns do not discard most of the table.
players = players.dropna()
players.shape

In [None]:
# How many matches each team played as the home side.
matches['Home Team Name'].value_counts()

In [None]:
# Home-team names that contain the stray 'rn">' fragment, with how often each
# one occurs.
has_markup = matches['Home Team Name'].str.contains('rn">')
names = matches.loc[has_markup, 'Home Team Name'].value_counts()
names

In [None]:
# The malformed team names themselves (the index of the counts above).
wrong_countries = names.index.tolist()
wrong_countries

In [None]:
# For each malformed name keep the segment that follows the first '>'.
corrected_form = []
for bad_name in wrong_countries:
    corrected_form.append(bad_name.split('>')[1])
corrected_form

In [None]:
# Extra hand-written replacements, paired by position: old_name[i] -> new_name[i].
# NOTE(review): the second literal contains U+FFFD replacement characters, which
# suggests the text was decoded with the wrong encoding somewhere upstream -
# confirm the CSV encoding; these literals must match the loaded data exactly
# for the replacement to take effect.
old_name = ['Germany FR', 'Maracan� - Est�dio Jornalista M�rio Filho', 'Estadio do Maracana']
new_name = ['Germany', 'Maracan Stadium', 'Maracan Stadium']

In [None]:
# Append the hand-written replacements to the automatically detected ones,
# keeping both lists aligned by position.
# Re-running this cell appends the extras again.
wrong_countries = [*wrong_countries, *old_name]
corrected_form = [*corrected_form, *new_name]

In [None]:
# Show the full (old values, new values) pairing before applying it.
wrong_countries, corrected_form

In [None]:
# Apply every old -> new replacement, in list order, to all three tables.
# DataFrame.replace with scalar arguments swaps whole cell values that equal
# the old value, in any column.
for wrong, fixed in zip(wrong_countries, corrected_form):
    worldcups = worldcups.replace(wrong, fixed)
    matches = matches.replace(wrong, fixed)
    players = players.replace(wrong, fixed)

In [None]:
# Re-run the 'rn">' search on home-team names to see what is left after the
# replacements.
has_markup = matches['Home Team Name'].str.contains('rn">')
names = matches.loc[has_markup, 'Home Team Name'].value_counts()
names

In [None]:
# Number of tournaments won per team (from the 'Winner' column).
winner = worldcups['Winner'].value_counts()
winner

In [None]:
# Bar chart of tournament wins per team.
winner.plot.bar()

In [None]:
# Number of second-place finishes per team (from the 'Runners-Up' column).
runnersup = worldcups['Runners-Up'].value_counts()
runnersup

In [None]:
# Bar chart of second-place finishes per team.
runnersup.plot.bar()

In [None]:
# Number of third-place finishes per team (from the 'Third' column).
# The variable name is spelled 'thrid_place' and is referenced by later cells.
thrid_place = worldcups['Third'].value_counts()
thrid_place

In [None]:
# Bar chart of third-place finishes per team.
thrid_place.plot.bar()

In [None]:
# Turn the win counts into a two-column frame: 'Team' and 'First'.
winner_df = winner.rename_axis('Team').reset_index(name='First')
winner_df

In [None]:
# Turn the runner-up counts into a two-column frame: 'Team' and 'Second'.
runners_df = runnersup.rename_axis('Team').reset_index(name='Second')
runners_df

In [None]:
# Turn the third-place counts into a two-column frame: 'Team' and 'Third'.
thrid_df = thrid_place.rename_axis('Team').reset_index(name='Third')
thrid_df

In [None]:
# Combine the three count frames on 'Team'. Outer joins keep every team that
# appears in at least one of them.
merged_df = (
    winner_df
    .merge(runners_df, on='Team', how='outer')
    .merge(thrid_df, on='Team', how='outer')
)

In [None]:
# Teams missing from one of the merged frames have NaN after the outer joins;
# treat those as zero finishes.
merged_df = merged_df.fillna(0)

In [None]:
# Display the combined First / Second / Third counts per team.
merged_df

In [None]:
# Grouped bars of the First / Second / Third counts for each team.
# x='Team' makes pandas use the team names as tick labels; without it the bars
# are labelled with the frame's integer row index even though the axis is
# titled 'Team'.
ax = merged_df.plot(x='Team', kind='bar', figsize=(12, 8))
ax.set_xlabel('Team')
ax.set_ylabel('Count')
ax.set_title('FIFA World Cup Winning Count')
plt.show()

In [None]:
# Split each match into its home side and away side: team name plus goals.
home = matches.loc[:, ['Home Team Name', 'Home Team Goals']]
away = matches.loc[:, ['Away Team Name', 'Away Team Goals']]

In [None]:
# Give both frames the same generic headers.
for side in (home, away):
    side.columns = ['Team', 'Goals']

In [None]:
# Display the home-side (team, goals) frame.
home

In [None]:
# Display the away-side (team, goals) frame.
away

In [None]:
# Rename the team column to 'Countries' on both frames so they share one
# schema.
home.columns = ['Countries', 'Goals']
away.columns = list(home.columns)

In [None]:
# Stack home and away rows into one long frame; ignore_index=True renumbers the
# rows from 0 instead of keeping the duplicated original labels.
goals = pd.concat([home, away], ignore_index=True)

In [None]:
# Preview the stacked (Countries, Goals) rows.
goals.head()

In [None]:
# Total goals per country, returned as a flat frame with 'Countries' and
# 'Goals' columns.
goals = goals.groupby('Countries', as_index=False)['Goals'].sum()

In [None]:
# Display total goals per country.
goals

In [None]:
# Order countries from most to fewest total goals and display the result.
goals = goals.sort_values(by = 'Goals', ascending=False)
goals

In [None]:
# Sorted copy used for the top-N selection below.
# NOTE(review): `goals` was already reassigned to this same descending order in
# the preceding cell, so this sort looks redundant.
goals_sorted = goals.sort_values(by='Goals', ascending=False)

In [None]:
# How many countries to keep in the ranking chart.
top_n = 10
# First top_n rows of the descending ranking.
top_countries = goals_sorted.iloc[:top_n]
top_countries

In [None]:
# Bar chart of total goals for the top_n highest-scoring countries.
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(top_countries['Countries'], top_countries['Goals'], color='black')
ax.set_xlabel('Countries')
ax.set_ylabel('Goals')
ax.set_title(f'Top {top_n} Countries by Goals')
# Tilt the country names so long labels do not collide.
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
def get_labels(matches):
    """Classify one match by comparing its home and away goal counts.

    Args:
        matches: A single match record supporting key lookup (e.g. a DataFrame
            row) with 'Home Team Goals' and 'Away Team Goals' entries.

    Returns:
        'Home Team Win' when the home side scored more, 'Away Team Win' when
        the away side scored more, otherwise 'DRAW'.
    """
    home_goals = matches['Home Team Goals']
    away_goals = matches['Away Team Goals']
    if home_goals > away_goals:
        return 'Home Team Win'
    # Anything that is neither strictly greater nor strictly smaller is a draw.
    return 'Away Team Win' if home_goals < away_goals else 'DRAW'

In [None]:
# Label every match row with its result; axis=1 hands get_labels one row at a
# time.
matches['outcome'] = matches.apply(get_labels, axis=1)

In [None]:
# Display the matches table, now including the 'outcome' column.
matches

In [None]:
# Number of matches per outcome label.
match_s = matches['outcome'].value_counts()
match_s

In [None]:
# Pie chart of the share of each match outcome.
match_s.plot.pie()

In [None]:
# Parse the 'Datetime' column into timestamps; errors='coerce' turns values
# that cannot be parsed into NaT instead of raising.
matches['Datetime'] = pd.to_datetime(matches['Datetime'], errors='coerce')

In [None]:
# Render each timestamp as text like '13 Jul, 30' (day, abbreviated month,
# two-digit year); missing timestamps become an empty string.
matches['Datetime'] = [
    stamp.strftime('%d %b, %y') if pd.notnull(stamp) else ''
    for stamp in matches['Datetime']
]

In [None]:
# Ten best-attended matches. The explicit copy makes top10 an independent
# frame, so adding the 'vs' column below does not raise SettingWithCopyWarning
# for writing into a slice of the sorted table.
top10 = matches.sort_values(by='Attendance', ascending=False).head(10).copy()
top10['vs'] = top10['Home Team Name'] + " vs " + top10['Away Team Name']

plt.figure(figsize=(12, 10))

# Horizontal bars: one per 'vs' label, length = attendance.
# NOTE(review): if the same pairing appears twice in the top ten, seaborn
# aggregates the duplicates into one bar and the stadium annotations below will
# no longer line up with the bars - confirm against the data.
ax = sns.barplot(y=top10['vs'], x=top10['Attendance'])
sns.despine(right=True)

plt.ylabel('Match Teams')
plt.xlabel('Attendance')
plt.title('Matches with the highest number of Attendance')

# Write the stadium name inside each bar; x=2000 is in attendance units, i.e.
# near the left edge of the bar, and i is the bar's row position.
for i, s in enumerate("Stadium: " + top10['Stadium']):
    ax.text(2000, i, s, fontsize=12, color='white')
plt.show()

In [None]:
# Store the tournament year as an integer.
matches['Year'] = matches['Year'].astype(int)

# Mean attendance per (stadium, city) pair, highest first.
std = (
    matches.groupby(['Stadium', 'City'])['Attendance']
    .mean()
    .reset_index()
    .sort_values(by='Attendance', ascending=False)
)

# Ten stadiums with the highest mean attendance (read-only slice).
top10 = std[:10]

plt.figure(figsize=(12, 9))
ax = sns.barplot(y=top10['Stadium'], x=top10['Attendance'])
sns.despine(right=True)

plt.ylabel('Stadium Names')
plt.xlabel('Attendance')
plt.title('Stadium with the highest number of attendance')
# Write the city name inside each bar; x=2000 is in attendance units and i is
# the bar's row position.
for i, s in enumerate("City: " + top10['City']):
    ax.text(2000, i, s, fontsize=12, color='b')

plt.show()

In [None]:
# Bar chart of the 20 cities that hosted the most matches.
matches['City'].value_counts().head(20).plot.bar()

In [None]:
# Remove the '.' thousands separators from the attendance strings.
# regex=False treats '.' as a literal dot. With regex matching (the default in
# pandas before 2.0) '.' matches every character and the whole value would be
# erased. The column stays text after this step.
worldcups['Attendance'] = worldcups['Attendance'].str.replace(".", "", regex=False)

In [None]:
# Preview the tournaments table after cleaning the 'Attendance' column.
worldcups.head()

In [None]:
# Bar chart of the number of qualified teams at each tournament.
fig, ax = plt.subplots(figsize=(10, 5))
sns.despine(right=True)
g = sns.barplot(data=worldcups, x='Year', y='QualifiedTeams', ax=ax)
# Rotate the year labels so they do not overlap.
g.set_xticklabels(g.get_xticklabels(), rotation=80)
g.set_title('Qualified Teams Per Year')

In [None]:
# Total goals (home + away) scored in each tournament year, as a line chart.
goals_by_year = matches.groupby('Year')[['Home Team Goals', 'Away Team Goals']].sum()
(goals_by_year['Home Team Goals'] + goals_by_year['Away Team Goals']).plot(kind='line')