In [10]:
# Win Stacked Bar Data Processing

import pandas as pd

results = pd.read_csv("results.csv")
shootouts = pd.read_csv("shootouts.csv")

wins = pd.DataFrame(columns=['year', 'team'])
neutrals = pd.DataFrame(columns = ['year', 'team'])

for index, row in results.iterrows():
    year = pd.to_datetime(row['date']).year
    home_team = row['home_team']
    away_team = row['away_team']
    home_score = row['home_score']
    away_score = row['away_score']

    if home_score > away_score:
        wins = wins.append({'year': year, 'team': home_team}, ignore_index=True)
    elif home_score < away_score:
        wins = wins.append({'year': year, 'team': away_team}, ignore_index=True)
    else:
        
        shootout = shootouts[
            (shootouts['date'] == row['date']) & 
            (shootouts['home_team'] == home_team) & 
            (shootouts['away_team'] == away_team)
        ]
        if not shootout.empty:
            winner = shootout.iloc[0]['winner']
            wins = wins.append({'year': year, 'team': winner}, ignore_index=True)
        else:
            neutrals = neutrals.append({'year': year, 'team': home_team}, ignore_index=True)
            neutrals = neutrals.append({'year': year, 'team': away_team}, ignore_index=True)


In [12]:
win_counts = wins.groupby(['year', 'team']).size().unstack(fill_value=0)
neutral_counts = neutrals.groupby(['year', 'team']).size().unstack(fill_value=0)

In [21]:
win_counts.to_csv('win_counts.csv')
neutral_counts.to_csv('neutral_counts.csv')

In [None]:
# Network Data Processing
import csv
import json

team_counts = {}
matches = []
with open('results.csv', 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        home_team = row['home_team']
        away_team = row['away_team']

        if home_team in team_counts:
            team_counts[home_team] += 1
        else:
            team_counts[home_team] = 1

        if away_team in team_counts:
            team_counts[away_team] += 1
        else:
            team_counts[away_team] = 1

# Filter teams that appear more than 250 times
filtered_teams = [team for team, count in team_counts.items() if count >= 250]


teams = {team: index + 1 for index, team in enumerate(filtered_teams)}


match_values = {}
with open('results.csv', 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        home_team = row['home_team']
        away_team = row['away_team']

        if home_team in teams and away_team in teams:
            match = (teams[home_team], teams[away_team])

        if match in match_values:
            match_values[match] += 1
        else:
            match_values[match] = 1

# Create links with values
links = []
processed_matches = set()

for match, value in match_values.items():
    source, target = match
    reverse_match = (target, source)

    if reverse_match in processed_matches:
        continue

    combined_value = value
    if reverse_match in match_values:
        combined_value += match_values[reverse_match]

    links.append({"source": source, "target": target, "value": combined_value})
    links.append({"source": target, "target": source, "value": combined_value})
    processed_matches.add(match)


# Create JSON structure
data = {
    'nodes': [{'id': id, 'name': name} for name, id in teams.items()],
    'links': links
}

# Write JSON to file
with open('data.json', 'w') as file:
    json.dump(data, file)

In [13]:
# Map Data Processing

import pandas as pd

data = pd.read_csv("results.csv")

host_counts = data['country'].value_counts()

host_counts = host_counts.reset_index()
host_counts.columns = ['country', 'count']

host_counts.to_csv("host_counts.csv", index=False)

In [19]:
# Replace country name in host_counts.csv by GeoJSON country name

import pandas as pd
import requests
import json

data = pd.read_csv("host_counts.csv")
csv_countries = set(data['country'])

url = 'https://raw.githubusercontent.com/holtzy/D3-graph-gallery/master/DATA/world.geojson'
response = requests.get(url)
geojson_data = json.loads(response.text)

geojson_countries = set(feature['properties']['name'] for feature in geojson_data['features'])

missing_countries = csv_countries.difference(geojson_countries)

lst = []
for country in missing_countries:
    lst.append(country)

Countries in the CSV but not in the GeoJSON:


In [1]:
# Drop rows where any element is NaN

import pandas as pd

file_path = 'host_counts.csv'
df = pd.read_csv(file_path)

df = df.dropna()

new_file_path = 'host_counts.csv'
df.to_csv(new_file_path, index=False)

In [None]:
# Word Cloud Data Processing
import csv
import json

data = []
with open('goalscorers.csv', 'r') as file:
    reader = csv.DictReader(file)
    rows = list(reader)
    
    scorer_counts = {}
    
    for row in rows:
        scorer = row['scorer']
        team = row['team']
        
        scorer_counts[scorer] = scorer_counts.get(scorer, 0) + 1
    
    data = [{'scorer': scorer, 'team': team, 'time': time} for scorer, time in scorer_counts.items()]

print(data)


with open('goalscorers.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)

[{'scorer': 'José Piendibene', 'team': 'France', 'time': 4}, {'scorer': 'Isabelino Gradín', 'team': 'France', 'time': 5}, {'scorer': 'Alberto Ohaco', 'team': 'France', 'time': 4}, {'scorer': 'Telésforo Báez', 'team': 'France', 'time': 1}, {'scorer': 'Juan Domingo Brown', 'team': 'France', 'time': 2}, {'scorer': 'Alberto Marcovecchio', 'team': 'France', 'time': 2}, {'scorer': 'Demóstenes Correia de Syllos', 'team': 'France', 'time': 1}, {'scorer': 'Hernando Salazar', 'team': 'France', 'time': 1}, {'scorer': 'José Durand Laguna', 'team': 'France', 'time': 1}, {'scorer': 'Manoel Alencar Monte', 'team': 'France', 'time': 1}, {'scorer': 'Arthur Friedenreich', 'team': 'France', 'time': 7}, {'scorer': 'Jose Tognola', 'team': 'France', 'time': 1}, {'scorer': 'Carlos Scarone', 'team': 'France', 'time': 6}, {'scorer': 'Ángel Romano', 'team': 'France', 'time': 12}, {'scorer': 'Neco', 'team': 'France', 'time': 8}, {'scorer': 'Pedro Calomino', 'team': 'France', 'time': 1}, {'scorer': 'Silvio Lagrec