# Network Analysis of Competitiveness NOCs

In [30]:
import os.path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
%matplotlib inline
from bs4 import BeautifulSoup
import webbrowser
import urllib.request
from lxml import html
import zipfile
import re
import string
import sys, os
from IPython.display import Image
import itertools

# Create Dictionary for NOC codes and Country Names

In [31]:
# Relative path of the combined dataset from which the NOC -> Country/Region lookup is built.
filepath = '../../data/analysis/complete_data.csv'

In [32]:
# Load the combined dataset that the NOC -> Country/Region lookup is derived from.
if not os.path.exists(filepath):
    # Fail fast: every later cell needs `dicdf`. Merely printing a message would
    # let the notebook carry on and either crash with an unrelated NameError or,
    # in a live kernel, silently reuse a stale `dicdf` from an earlier run.
    raise FileNotFoundError("Missing dataset file: " + str(filepath))
dicdf = pd.read_csv(filepath, encoding="ISO-8859-1")
print("File Read")

File Read


In [33]:
# Collapse the dataset to one row per NOC code, keeping the last Country/Region
# reported for that code. The result is indexed by NOC and is used below as the
# code -> name/region lookup table.
dicdf = dicdf.groupby('NOC')[['Country','Region']].last()
dicdf.head()

Unnamed: 0_level_0,Country,Region
NOC,Unnamed: 1_level_1,Unnamed: 2_level_1
AFG,Afghanistan,West and Central Asia
AHO,,
ALG,Algeria,North Africa
ARG,Argentina,South America
ARM,Armenia,West and Central Asia


In [34]:
# NOC codes with no country name get explicit placeholder labels for both the
# country and the region, assigned in one label-based step per column.
indexs = dicdf[dicdf.Country.isnull()].index
dicdf.loc[indexs, 'Country'] = 'Unknown Nation'
dicdf.loc[indexs, 'Region'] = 'Unknown Region'
# Sanity check: number of NOC codes that still lack a region.
len(dicdf[dicdf.Region.isnull()])

0

# Summer

In [35]:
# Relative path of the Summer Games edge list. Forward slashes are used because
# they resolve on Windows, Linux and macOS alike, whereas backslash separators
# only work on Windows.
filepath = '../../data/analysis/Summer_Games_Competitive_Network/SummerEdges.csv'

In [36]:
# Load the Summer Games edge list (one Source/Target pair of NOC codes per row).
if not os.path.exists(filepath):
    # Fail fast: every later cell needs `df`. Merely printing a message would let
    # the notebook carry on and either crash with an unrelated NameError or, in a
    # live kernel, silently reuse a stale `df` from an earlier run.
    raise FileNotFoundError("Missing dataset file: " + str(filepath))
df = pd.read_csv(filepath, encoding="ISO-8859-1")
print("File Read")

File Read


In [37]:
# Preview the first rows of the edge list.
df.head()

Unnamed: 0,Source,Target
0,URS,EUA
1,EUA,AUS
2,EUA,USA
3,USA,GBR
4,USA,USA


To deal with the period during which Germany was split, we amalgamated the two halves of the country and used the NOC code GERS to represent them; this makes the GDR and FRG codes defunct.
The EUN NOC code also causes an issue: to differentiate between the two Unified Teams sent to the Summer and Winter Olympics that year, we used EUNS for the Summer team and EUNW for the Winter team.

In [38]:
def replaceCodes(x):
    """Map a defunct or ambiguous NOC code onto the code used in this analysis.

    'FRG' and 'GDR' both become 'GERS', 'EUN' becomes 'EUNS', and every other
    value is returned unchanged.
    """
    if x in ('FRG', 'GDR'):
        return 'GERS'
    return 'EUNS' if x == 'EUN' else x

In [39]:
# Normalise the NOC codes in every cell of the edge list.
df = df.applymap(replaceCodes)
df.head()

Unnamed: 0,Source,Target
0,URS,EUA
1,EUA,AUS
2,EUA,USA
3,USA,GBR
4,USA,USA


To make the network more user-friendly, we replace the NOC codes with the countries' names.

In [40]:
def getName(x, lookup=None):
    """Translate an NOC code into its country name.

    Parameters
    ----------
    x
        The NOC code to translate.
    lookup : pandas.DataFrame, optional
        Table indexed by NOC code with a 'Country' column. Defaults to the
        notebook-level ``dicdf``.

    Returns
    -------
    The country name stored for ``x``. If ``x`` is not a key of the table,
    "Error - <x>" is printed and ``x`` itself is returned.
    """
    table = dicdf if lookup is None else lookup
    try:
        return table.loc[x].Country
    except (KeyError, TypeError):
        # Only a failed label lookup is treated as "unknown code". Anything else
        # (e.g. a missing lookup table or a missing 'Country' column) is a real
        # problem and must surface instead of being reported once per cell.
        print("Error - " + str(x))
        return x

In [41]:
# Swap every NOC code in the edge list for its country name.
df = df.applymap(getName)
df.head()

Unnamed: 0,Source,Target
0,Soviet Union,United Team of Germany
1,United Team of Germany,Australia
2,United Team of Germany,United States
3,United States,United Kingdom
4,United States,United States


## Top Competitors
Evaluates the top 5 (if available) opponents for each country

In [42]:
# Empty summary table: country, its edge count, its number of distinct
# opponents, and one column for each of its five most frequent opponents.
competitor_cols = ['Competitor' + str(rank) for rank in range(1, 6)]
new_df = pd.DataFrame(columns=['Country', 'Occurances', 'Number_Competitors'] + competitor_cols)
new_df.head()

Unnamed: 0,Country,Occurances,Number_Competitors,Competitor1,Competitor2,Competitor3,Competitor4,Competitor5


Count the number of occurrences of each value in the Source column

In [43]:
# Add one row per Source country with its edge count; the remaining six
# columns are left empty for now.
counts = df.Source.value_counts()
blank_cells = [np.nan] * 6
for country, occ in counts.items():
    new_df.loc[len(new_df)] = [country, occ] + blank_cells
new_df.head()

Unnamed: 0,Country,Occurances,Number_Competitors,Competitor1,Competitor2,Competitor3,Competitor4,Competitor5
0,United States,1064,,,,,,
1,Soviet Union,607,,,,,,
2,China,415,,,,,,
3,Germany (Split),413,,,,,,
4,United Kingdom,298,,,,,,


In [44]:
# Index the summary table by country so rows can be addressed by name.
# The guard keeps the cell re-runnable once 'Country' is already the index.
if 'Country' in new_df.columns:
    new_df = new_df.set_index('Country')

for country in df.Source.unique():
    # Opponents of this country, most frequent first. Computed once per country
    # instead of once per use.
    ranked_opponents = df.loc[df.Source == country, 'Target'].value_counts().index
    new_df.loc[country, 'Number_Competitors'] = len(ranked_opponents)
    # Slicing to five covers both cases: countries with fewer than five
    # opponents simply fill fewer Competitor columns.
    for rank, comp in enumerate(ranked_opponents[:5], start=1):
        new_df.loc[country, 'Competitor' + str(rank)] = comp
new_df.head()

Unnamed: 0_level_0,Occurances,Number_Competitors,Competitor1,Competitor2,Competitor3,Competitor4,Competitor5
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
United States,1064,79,United States,Australia,Germany (Split),United Kingdom,Soviet Union
Soviet Union,607,48,Soviet Union,Germany (Split),Hungary,United States,Bulgaria
China,415,60,China,Russian Federation,United States,"Korea, Rep.",Japan
Germany (Split),413,36,Germany (Split),Soviet Union,United States,Bulgaria,United Kingdom
United Kingdom,298,54,United States,Australia,France,Soviet Union,Germany (Split)


# Region
The dataframe holds each country and the number of competitions it has, along with the percentage of those competitions that are against countries from each region (including its own).

Create the dataframe with the appropriate columns

In [182]:
# Three descriptive columns followed by one column per distinct region found in
# the lookup table; the frame starts with no rows.
columns = ['Country','Region','Occurances']+dicdf.Region.unique().tolist()
region_df = pd.DataFrame(columns=columns)
region_df

Unnamed: 0,Country,Region,Occurances,West and Central Asia,Unknown Region,North Africa,South America,Oceania,Europe,Caribbean Islands,Sub-Saharan Africa,North Asia,Brtitsh West Indies,North America,East Asia,Central America,South and Southeast Asia


Initially fill the dataframe with the name, region and out-degree (i.e. the number of times a country competes with others) of each country in the edge list

In [183]:
counts = df.Source.value_counts()

# Country -> Region lookup, built once rather than once per loop iteration.
# Duplicate country names (several NOC codes sharing one name) are collapsed to
# their first entry so that each lookup yields a single region, not a Series.
region_by_country = dicdf.drop_duplicates('Country').set_index('Country').Region

# Every column after Country/Region/Occurances is a per-region percentage that
# is filled in later; derive the count so it cannot drift from the columns.
n_region_cols = len(region_df.columns) - 3

# NOTE(review): rows are appended at position len(region_df), so running this
# cell twice adds every country a second time.
for country, occ in counts.items():
    if country == "Unified Team":
        reg = 'North Asia'
    elif country == 'Unknown Nation':
        reg = 'Unknown Region'
    else:
        reg = region_by_country.loc[country]
    region_df.loc[len(region_df)] = [country, reg, occ] + [np.nan] * n_region_cols
region_df.head()

Unnamed: 0,Country,Region,Occurances,West and Central Asia,Unknown Region,North Africa,South America,Oceania,Europe,Caribbean Islands,Sub-Saharan Africa,North Asia,Brtitsh West Indies,North America,East Asia,Central America,South and Southeast Asia
0,United States,North America,1064,,,,,,,,,,,,,,
1,Soviet Union,North Asia,607,,,,,,,,,,,,,,
2,China,East Asia,415,,,,,,,,,,,,,,
3,Germany (Split),Europe,413,,,,,,,,,,,,,,
4,United Kingdom,Europe,298,,,,,,,,,,,,,,


In [184]:
def convertToRegion(country, lookup=None):
    """Return the region a country belongs to.

    Parameters
    ----------
    country
        Country name as stored in the 'Country' column of the lookup table.
    lookup : pandas.DataFrame, optional
        Table with 'Country' and 'Region' columns. Defaults to the
        notebook-level ``dicdf``.

    Returns
    -------
    The region name. "Unified Team" always maps to 'North Asia' and
    'Unknown Nation' to 'Unknown Region', whatever the table holds.

    Raises
    ------
    KeyError
        If ``country`` is neither a special case nor present in the table.
    """
    # Special cases first: they do not depend on the table, so they must not
    # fail (or pay for a lookup) when the table has no matching row.
    if country == "Unified Team":
        return 'North Asia'
    if country == 'Unknown Nation':
        return 'Unknown Region'

    table = dicdf if lookup is None else lookup
    # Boolean selection avoids re-indexing the whole table on every call.
    matches = table.loc[table['Country'] == country, 'Region']
    if matches.empty:
        raise KeyError(country)
    # Several NOC codes may share one country name; take the first region so a
    # scalar is always returned rather than a Series.
    return matches.iloc[0]

In [185]:
# Copy of the edge list in which each Target country is replaced by its region;
# the Source column keeps the country name.
df_withRegion = df.assign(Target=df.Target.apply(convertToRegion))
df_withRegion.head()

Unnamed: 0,Source,Target
0,Soviet Union,Europe
1,United Team of Germany,Oceania
2,United Team of Germany,North America
3,United States,Europe
4,United States,North America


In [186]:
region_df = region_df.set_index('Country')

# For every country, express the number of edges pointing at each target region
# as a percentage of that country's total edge count.
for country in region_df.index:
    total = region_df.loc[country].Occurances
    region_counts = df_withRegion.loc[df_withRegion.Source == country, 'Target'].value_counts()
    for region, n_edges in region_counts.items():
        region_df.loc[country, region] = n_edges * 100 / total

# Restore 'Country' as an ordinary column.
region_df = region_df.reset_index()
region_df.head()


Unnamed: 0,Country,Region,Occurances,West and Central Asia,Unknown Region,North Africa,South America,Oceania,Europe,Caribbean Islands,Sub-Saharan Africa,North Asia,Brtitsh West Indies,North America,East Asia,Central America,South and Southeast Asia
0,United States,North America,1064,1.40977,,0.56391,2.63158,8.08271,36.4662,5.26316,2.16165,10.5263,,23.0263,9.21053,0.37594,0.281955
1,Soviet Union,North Asia,607,0.658979,,0.329489,1.4827,2.30643,62.1087,1.4827,0.658979,15.6507,,7.24876,6.75453,0.988468,0.329489
2,China,East Asia,415,1.68675,,0.481928,1.20482,2.89157,25.5422,2.16867,0.963855,14.9398,,10.1205,32.5301,2.16867,5.3012
3,Germany (Split),Europe,413,0.242131,,,1.45278,3.1477,54.2373,1.21065,0.242131,19.8547,,15.2542,3.38983,0.968523,
4,United Kingdom,Europe,298,1.34228,,0.671141,3.69128,10.4027,44.2953,3.69128,3.02013,11.0738,,15.4362,6.04027,,0.33557


In [187]:
# Count the missing values per column: a region column is only filled for
# countries that have at least one edge into that region.
region_df.isnull().sum()

Country                       0
Region                        0
Occurances                    0
West and Central Asia        71
Unknown Region              123
North Africa                 99
South America                86
Oceania                      77
Europe                       28
Caribbean Islands            70
Sub-Saharan Africa           73
North Asia                   56
Brtitsh West Indies         126
North America                46
East Asia                    51
Central America             107
South and Southeast Asia     99
dtype: int64

Replace the null values with 0

In [188]:
# Regions that occur at least once as a target; only these columns are zero-filled.
# NOTE(review): a region column that never occurs as a target would keep its
# NaNs here — confirm that no such column exists for this dataset.
cols = df_withRegion.Target.unique().tolist()
region_df[cols] = region_df[cols].fillna(0)
region_df.head()

Unnamed: 0,Country,Region,Occurances,West and Central Asia,Unknown Region,North Africa,South America,Oceania,Europe,Caribbean Islands,Sub-Saharan Africa,North Asia,Brtitsh West Indies,North America,East Asia,Central America,South and Southeast Asia
0,United States,North America,1064,1.409774,0.0,0.56391,2.631579,8.082707,36.466165,5.263158,2.161654,10.526316,0.0,23.026316,9.210526,0.37594,0.281955
1,Soviet Union,North Asia,607,0.658979,0.0,0.329489,1.482702,2.306425,62.108731,1.482702,0.658979,15.650741,0.0,7.248764,6.75453,0.988468,0.329489
2,China,East Asia,415,1.686747,0.0,0.481928,1.204819,2.891566,25.542169,2.168675,0.963855,14.939759,0.0,10.120482,32.53012,2.168675,5.301205
3,Germany (Split),Europe,413,0.242131,0.0,0.0,1.452785,3.1477,54.237288,1.210654,0.242131,19.854722,0.0,15.254237,3.389831,0.968523,0.0
4,United Kingdom,Europe,298,1.342282,0.0,0.671141,3.691275,10.402685,44.295302,3.691275,3.020134,11.073826,0.0,15.436242,6.040268,0.0,0.33557


## Average percentage of competition that countries have with other nations in their own region

In [194]:
def percentSameRegion(x):
    """Return the value a row holds in the column named after its own region.

    ``x`` is a single row: its 'Region' field gives the name of the column
    whose value is returned.
    """
    return x[x.Region]

In [197]:
# Unweighted mean, over all countries, of each country's own-region percentage,
# shown as a string with a trailing '%'.
str(region_df.apply(percentSameRegion,axis=1).mean())+'%'

'25.79594427414235%'

In [192]:
# Inspect the rows whose region is the 'Unknown Region' placeholder.
region_df[region_df.Region == 'Unknown Region']

Unnamed: 0,Country,Region,Occurances,West and Central Asia,Unknown Region,North Africa,South America,Oceania,Europe,Caribbean Islands,Sub-Saharan Africa,North Asia,Brtitsh West Indies,North America,East Asia,Central America,South and Southeast Asia
86,Unknown Nation,Unknown Region,4,0.0,0.0,0.0,0.0,0.0,75.0,0.0,0.0,0.0,0.0,25.0,0.0,0.0,0.0


## General competition with other nations

In [204]:
# Average every target-region percentage over the countries of each source region.
grouped_region = region_df.groupby('Region')[df_withRegion.Target.unique().tolist()].mean()
# Keep only the columns named after a source region, in the same order as the
# rows, so that rows and columns line up.
# NOTE(review): this selection cannot succeed cleanly if some source region never
# occurs as a target (missing column label) — confirm that is impossible here.
grouped_region = grouped_region[grouped_region.index]
grouped_region.head()

Unnamed: 0_level_0,Caribbean Islands,Central America,East Asia,Europe,North Africa,North America,North Asia,Oceania,South America,South and Southeast Asia,Sub-Saharan Africa,Unknown Region,West and Central Asia
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Caribbean Islands,20.343176,1.635599,10.68126,23.479532,0.333516,20.663638,3.682383,0.219298,1.858814,4.08443,3.199013,0.0,9.81934
Central America,1.515152,1.515152,27.272727,32.575758,0.757576,6.060606,3.787879,0.0,1.515152,0.0,25.0,0.0,0.0
East Asia,5.717825,1.696622,20.113135,31.985197,0.523981,8.135121,10.481172,8.356772,1.274566,7.384968,0.265016,0.0,4.065624
Europe,1.798358,0.474356,6.02668,57.428041,0.455148,9.923451,12.114247,5.108578,1.144284,0.481675,1.81412,0.144676,2.947866
North Africa,2.5,0.0,2.5,35.151515,5.0,7.272727,2.5,13.106061,0.0,0.0,22.19697,0.0,9.772727


In [213]:
def highlight_max(s):
    '''
    Build per-cell CSS that highlights the maximum of a Series in orange.

    Returns a list with one entry per element of ``s``: an orange
    background rule where the element equals the maximum (ties included)
    and an empty string everywhere else.
    '''
    css_for = {True: 'background-color: orange', False: ''}
    return [css_for[bool(flag)] for flag in s.eq(s.max())]

In [214]:
# Highlight the largest value of each column (Styler.apply works column-wise by
# default), i.e. the source region that competes most with that target region.
grouped_region.style.apply(highlight_max)

Unnamed: 0_level_0,Caribbean Islands,Central America,East Asia,Europe,North Africa,North America,North Asia,Oceania,South America,South and Southeast Asia,Sub-Saharan Africa,Unknown Region,West and Central Asia
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Caribbean Islands,20.3432,1.6356,10.6813,23.4795,0.333516,20.6636,3.68238,0.219298,1.85881,4.08443,3.19901,0.0,9.81934
Central America,1.51515,1.51515,27.2727,32.5758,0.757576,6.06061,3.78788,0.0,1.51515,0.0,25.0,0.0,0.0
East Asia,5.71782,1.69662,20.1131,31.9852,0.523981,8.13512,10.4812,8.35677,1.27457,7.38497,0.265016,0.0,4.06562
Europe,1.79836,0.474356,6.02668,57.428,0.455148,9.92345,12.1142,5.10858,1.14428,0.481675,1.81412,0.144676,2.94787
North Africa,2.5,0.0,2.5,35.1515,5.0,7.27273,2.5,13.1061,0.0,0.0,22.197,0.0,9.77273
North America,2.63158,0.18797,10.3904,40.9604,0.695178,20.1908,8.56894,8.58681,2.14224,0.140977,2.3205,0.0,3.18423
North Asia,2.01363,0.93956,18.0878,45.0951,0.984904,10.0797,14.1774,1.56152,1.28423,0.398168,0.828749,0.203252,4.34597
Oceania,0.532915,0.0766284,2.8248,40.101,0.30303,8.38732,11.7555,3.35075,1.14246,0.153257,30.7628,0.30303,0.306513
South America,1.48718,0.0,5.76923,33.7308,0.153846,18.7436,16.5897,18.0897,1.61538,2.0,1.16667,0.0,0.653846
South and Southeast Asia,17.1498,0.0,27.8891,17.4483,0.0,6.64062,3.97758,0.0,3.51562,10.4552,2.5,0.0,10.4238


In [212]:
# NOTE(review): this import sits mid-notebook; moving it to the import cell at
# the top would make the notebook's dependencies visible in one place.
import seaborn as sns

# Light-to-red colour map used to shade the table.
cm = sns.light_palette("red", as_cmap=True)

# Shade each cell of the region-by-region table according to its value.
s = grouped_region.style.background_gradient(cmap=cm)
s

Unnamed: 0_level_0,Caribbean Islands,Central America,East Asia,Europe,North Africa,North America,North Asia,Oceania,South America,South and Southeast Asia,Sub-Saharan Africa,Unknown Region,West and Central Asia
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
Caribbean Islands,20.3432,1.6356,10.6813,23.4795,0.333516,20.6636,3.68238,0.219298,1.85881,4.08443,3.19901,0.0,9.81934
Central America,1.51515,1.51515,27.2727,32.5758,0.757576,6.06061,3.78788,0.0,1.51515,0.0,25.0,0.0,0.0
East Asia,5.71782,1.69662,20.1131,31.9852,0.523981,8.13512,10.4812,8.35677,1.27457,7.38497,0.265016,0.0,4.06562
Europe,1.79836,0.474356,6.02668,57.428,0.455148,9.92345,12.1142,5.10858,1.14428,0.481675,1.81412,0.144676,2.94787
North Africa,2.5,0.0,2.5,35.1515,5.0,7.27273,2.5,13.1061,0.0,0.0,22.197,0.0,9.77273
North America,2.63158,0.18797,10.3904,40.9604,0.695178,20.1908,8.56894,8.58681,2.14224,0.140977,2.3205,0.0,3.18423
North Asia,2.01363,0.93956,18.0878,45.0951,0.984904,10.0797,14.1774,1.56152,1.28423,0.398168,0.828749,0.203252,4.34597
Oceania,0.532915,0.0766284,2.8248,40.101,0.30303,8.38732,11.7555,3.35075,1.14246,0.153257,30.7628,0.30303,0.306513
South America,1.48718,0.0,5.76923,33.7308,0.153846,18.7436,16.5897,18.0897,1.61538,2.0,1.16667,0.0,0.653846
South and Southeast Asia,17.1498,0.0,27.8891,17.4483,0.0,6.64062,3.97758,0.0,3.51562,10.4552,2.5,0.0,10.4238
