2. Leyendo y pre-procesando los datos

In [470]:
import numpy as np #importa la biblioteca numpy
import pandas as pd #importa la biblioteca pandas
from datetime import datetime


In [471]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from plotly.offline import init_notebook_mode, iplot 
import plotly.graph_objs as go
import plotly.offline as py
import pycountry
import folium 
from folium import plugins


In [472]:
import numpy as np
from PIL import Image


import matplotlib.ticker as ticker
import matplotlib.animation as animation
from IPython.display import HTML

# Graphics in retina format 
%config InlineBackend.figure_format = 'retina' 

# Increase the default plot size and set the color scheme
plt.rcParams['figure.figsize'] = 8, 5


# Disable warnings in Anaconda
import warnings
warnings.filterwarnings('ignore')
import os

Una vez que hemos importado y configurado dependencias como Matplotlib o Numpy, es momento de leer el dataset usando Pandas de la siguiente manera:

In [473]:
df = pd.read_csv('Forbes Richest Atheletes (Forbes Richest Athletes 1990-2020).csv')
df.head()

Unnamed: 0,S.NO,Name,Nationality,Current Rank,Previous Year Rank,Sport,Year,earnings ($ million)
0,1,Mike Tyson,USA,1,,boxing,1990,28.6
1,2,Buster Douglas,USA,2,,boxing,1990,26.0
2,3,Sugar Ray Leonard,USA,3,,boxing,1990,13.0
3,4,Ayrton Senna,Brazil,4,,auto racing,1990,10.0
4,5,Alain Prost,France,5,,auto racing,1990,9.0


Después vamos a convertir la columna "Year" a un tipo de datos comprensible por Python como datetime.

In [474]:
# Creating a copy of the original dataframe- df
df1 = df.copy()
df1.drop('S.NO',axis=1,inplace=True)
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 7 columns):
Name                    301 non-null object
Nationality             301 non-null object
Current Rank            301 non-null int64
Previous Year Rank      277 non-null object
Sport                   301 non-null object
Year                    301 non-null int64
earnings ($ million)    301 non-null float64
dtypes: float64(1), int64(2), object(4)
memory usage: 16.5+ KB


Finalmente debemos convertir el texto en la columna "Sport" en mayúsculas o minúsculas.

In [475]:
#Convert string to datetime64
df1['Year'] = df1['Year'].apply(pd.to_datetime,format='%Y')

#Set Date column as the index column.
df1['year'] = pd.DatetimeIndex(df1['Year']).year  
df1.set_index('year', inplace=True)
df1.drop('Year',axis=1,inplace=True)

# Converting the sport column to uppercase
df1['Sport'] = df1['Sport'].str.upper()
df1.head()

# df is the original dataframe while df1 is a copy where the Date has been set as an index column

Unnamed: 0_level_0,Name,Nationality,Current Rank,Previous Year Rank,Sport,earnings ($ million)
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1990,Mike Tyson,USA,1,,BOXING,28.6
1990,Buster Douglas,USA,2,,BOXING,26.0
1990,Sugar Ray Leonard,USA,3,,BOXING,13.0
1990,Ayrton Senna,Brazil,4,,AUTO RACING,10.0
1990,Alain Prost,France,5,,AUTO RACING,9.0


3. Los atletas mejor pagados en 2020

In [476]:
data_2020 = df1[df1.index == 2020]

trace = go.Bar(
                    x = data_2020["earnings ($ million)"],
                    y = data_2020['Name'] ,
                    orientation='h',
                    marker = dict(color='pink',
                                 line=dict(color='black',width=1)),
                    )
data = [trace]
layout = go.Layout(barmode = "group",title="World's Highest-Paid Athletes in 2020",width=800, height=500, 
                       #xaxis= dict(title='No of times ranked higest'),
                       yaxis=dict(autorange="reversed"),
                       showlegend=False)
fig = go.Figure(data = data, layout = layout)
iplot(fig)



4. Análisis de los atletas mejor pagados entre 1990 y 2020.

In [477]:
# Top Paid Athlete for Each Year
Top_paid_each_year = df1[df1['Current Rank'] == 1].sort_values(by='year',ascending=False)

z = Top_paid_each_year[['Name','Sport','Nationality','earnings ($ million)']]

z.style.background_gradient(cmap='Reds')  

Unnamed: 0_level_0,Name,Sport,Nationality,earnings ($ million)
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020,Roger Federer,TENNIS,Switzerland,106.3
2019,Lionel Messi,SOCCER,Argentina,127.0
2018,Floyd Mayweather,BOXING,USA,285.0
2017,Cristiano Ronaldo,SOCCER,Portugal,93.0
2016,Cristiano Ronaldo,SOCCER,Portugal,88.0
2015,Floyd Mayweather,BOXING,USA,300.0
2014,Floyd Mayweather,BOXING,USA,105.0
2013,Tiger Woods,GOLF,USA,78.1
2012,Floyd Mayweather,BOXING,USA,85.0
2011,Tiger Woods,GOLF,USA,75.0


Atletas con mayor ingreso y máximo número de apariciones en la lista Forbes

In [478]:
counts_top = Top_paid_each_year['Name'].value_counts().to_frame()


trace = go.Bar(
                    y = counts_top.index,
                    x = counts_top['Name'] ,
                    orientation='h',
                    marker = dict(color='pink',
                                 line=dict(color='black',width=1)),
                    )
data = [trace]
layout = go.Layout(barmode = "group",title='Athlete earning the most,maximum number of times',width=800, height=500, 
                       xaxis= dict(title='No of times ranked higest'),
                       yaxis=dict(autorange="reversed"),
                       showlegend=False)
fig = go.Figure(data = data, layout = layout)
iplot(fig)


Podemos producir gráficas más sofisticadas utilizando las siguientes piezas de código:

In [479]:
# Top Paid Athlete for Each Year
total_earnings = Top_paid_each_year.groupby('Name')['earnings ($ million)'].sum().to_frame().reset_index()
top_ranks = Top_paid_each_year['Name'].value_counts().to_frame().reset_index()
top_ranks.rename(columns={'index':'Name',
                          'Name':'Rank_counts'}, inplace=True)
df_compare = total_earnings.merge(top_ranks, on='Name')

In [480]:
#source: https://www.kaggle.com/kanncaa1/plotly-tutorial-for-beginners#Bar-Charts

import plotly.graph_objs as go
from plotly import tools
trace0 = go.Bar(
                y=df_compare['Name'],
                x=df_compare['Rank_counts'],
                marker=dict(color='rgba(171, 50, 96, 0.6)',line=dict(color='rgba(171, 50, 96, 1.0)',width=1)),
                name='Top Ranks',
                orientation='h',
)

trace1 = go.Scatter(
                y=df_compare['Name'],
                x=df_compare['earnings ($ million)'],
                mode='lines+markers',
                line=dict(color='rgb(63, 72, 204)'),
                name='income',
)

layout = dict(
                title='Income and Top Ranks',
                yaxis=dict(showticklabels=True,domain=[0, 0.85]),
                yaxis2=dict(showline=True,showticklabels=False,linecolor='rgba(102, 102, 102, 0.8)',linewidth=2,domain=[0, 0.85]),
                xaxis=dict(zeroline=False,showline=False,showticklabels=True,showgrid=True,domain=[0, 0.42]),
                xaxis2=dict(zeroline=False,showline=False,showticklabels=False,showgrid=True,domain=[0.47, 1],side='top',dtick=25),
                legend=dict(x=0.029,y=1.038,font=dict(size=10) ),
                margin=dict(l=200, r=20,t=70,b=70),
                paper_bgcolor='rgb(248, 248, 255)',
                plot_bgcolor='rgb(248, 248, 255)',
)

annotations = []
y_s = df_compare['Rank_counts']
y_nw = np.rint(df_compare['earnings ($ million)'])
# Adding labels
for ydn, yd, xd in zip(y_nw, y_s, df_compare['Name']):
    # labeling the scatter savings
    annotations.append(dict(xref='x2', yref='y2', y=xd, x=ydn-1,text='{:,}'.format(ydn),font=dict(family='Arial', size=12,color='rgb(63, 72, 204)'),showarrow=False))
    # labeling the bar net worth
    annotations.append(dict(xref='x1', yref='y1', y=xd, x=yd+1,text=str(yd),font=dict(family='Arial', size=12,color='rgb(171, 50, 96)'),showarrow=False))

layout['annotations'] = annotations


# Creating two subplots
fig = tools.make_subplots(rows=1, cols=2, specs=[[{}, {}]], shared_xaxes=True,
                          shared_yaxes=False, vertical_spacing=0.001)

fig.append_trace(trace0, 1, 1)
fig.append_trace(trace1, 1, 2)

fig['layout'].update(layout)
fig.show()


Pais que produce los máximos generadores de ingreso en deporte.


In [481]:
counts_top = Top_paid_each_year['Nationality'].value_counts().to_frame()


trace = go.Bar(
                    x = counts_top.index,
                    y = counts_top['Nationality'] ,
                    orientation='v',
                    marker = dict(color='pink',
                                 line=dict(color='black',width=1)),
                    )
data = [trace]
layout = go.Layout(barmode = "group",title='Country which produces the maximum earners in Sports',width=800, height=500, 
                       xaxis= dict(title='No of times ranked higest'),
                       #yaxis=dict(autorange="reversed"),
                       showlegend=False)
fig = go.Figure(data = data, layout = layout)
iplot(fig)

¿Cuánto ingresan los mejores atletas cada año?

In [482]:
trace = go.Scatter(
                    x = Top_paid_each_year.index,
                    y = Top_paid_each_year['earnings ($ million)'] ,
                    orientation='v',
                    marker = dict(color='red',
                                 line=dict(color='royalblue',width=2)),
                    )
data = [trace]
layout = go.Layout(title='How much did the Top Paid Athlete for Each Year, earn? ',width=800, height=500, 
                       xaxis= dict(title='Years'),
                       yaxis=dict(title="Earning in US Dollars(million)"),
                       showlegend=False)
fig = go.Figure(data = data, layout = layout)
iplot(fig)

5. Análisis de los 10 atletas mejores pagados cada año entre 1990 y 2020.


In [483]:
df['Sport'] = df['Sport'].str.upper() # Converting the text to uppercase
max_sport = df['Sport'].value_counts().to_frame()

trace = go.Bar(
                    y = max_sport.index,
                    x = max_sport['Sport'] ,
                    orientation='h',
                    marker = dict(color='pink',
                                 line=dict(color='black',width=1)),
                    )
data = [trace]
layout = go.Layout(barmode = "group",title='Sport which dominates in earnings',width=800, height=500, 
                       xaxis= dict(title='No of times ranked highest'),
                       yaxis=dict(autorange="reversed"),
                       showlegend=False)
fig = go.Figure(data = data, layout = layout)
iplot(fig)

Pais que más ingresos tiene en deportes


In [484]:
max_sport = df['Nationality'].value_counts().to_frame()


trace = go.Bar(
                    y = max_sport.index,
                    x = max_sport['Nationality'] ,
                    orientation='h',
                    marker = dict(color='pink',
                                 line=dict(color='black',width=1)),
                    )
data = [trace]
layout = go.Layout(barmode = "group",title='Country which dominates in Sports earningss',width=800, height=500, 
                       xaxis= dict(title='No of times ranked highest'),
                       yaxis=dict(autorange="reversed"),
                       showlegend=False)
fig = go.Figure(data = data, layout = layout)
iplot(fig)

Atletas que aparecen más frecuentemente en la lista


In [485]:
s = df['Name'].value_counts().to_frame()[:5]
s.style.background_gradient(cmap='Reds')  

Unnamed: 0,Name
Tiger Woods,19
Michael Jordan,19
Kobe Bryant,14
LeBron James,13
Michael Schumacher,13


6. ¿Dónde se encuentran las mujeres?

In [486]:
# People who have appeared once on the list.
names = df['Name'].value_counts().to_frame()
names[names['Name']==1].index

# On scanning the list, we find the name of a sole women athlete- monica seles
monica = df[df['Name'] == 'Monica Seles']
monica.style.set_properties(**{'background-color': 'pink',
                            'color': 'black',
                            'border-color': 'black'})

Unnamed: 0,S.NO,Name,Nationality,Current Rank,Previous Year Rank,Sport,Year,earnings ($ million)
29,30,Monica Seles,USA,10,12,TENNIS,1992,8.5


7. Analizando a los 3 mejores pagados de todos los tiempos.


In [487]:
top_earners_alltime = pd.pivot_table(df, index='Name',values="earnings ($ million)", aggfunc='sum')
top3_earners_all = top_earners_alltime.sort_values(by="earnings ($ million)",ascending=False)[:3]

top3_earners_all.style.background_gradient(cmap='Reds') 

Unnamed: 0_level_0,earnings ($ million)
Name,Unnamed: 1_level_1
Tiger Woods,1373.8
LeBron James,844.8
Floyd Mayweather,840.0


Analizando los ingresos de Tiger Woods a lo largo de los años


In [488]:
def earnings_plot(dataframe,athlete,image_path,opacity):
    """
    function that creates a plotly line chart with image of the athlete in the background
    
    
    """
    athlete_df = df1[df1['Name'] == athlete]

    trace = go.Scatter(
                    x = athlete_df.index,
                    y = athlete_df['earnings ($ million)'] ,
                    orientation='v',
                    marker = dict(color='red',
                                 line=dict(color='red',width=6)),
                    )
    data = [trace]
    layout= go.Layout(title= f'{athlete}' +"'s earnings over the Years",
                  xaxis=dict(title='Year'),
                  yaxis=dict(title="Earnings in US$ (millions)"),
                  images= [dict(
                          source=Image.open(image_path),
                          xref= "paper",
                          yref= "paper",
                          x= 0.5,
                          y= 0.5,
                          sizex= 1,
                          sizey= 1,
                          sizing= "stretch",
                          opacity= opacity,
                          xanchor= "center", 
                          yanchor="middle",
                          layer= "below")])
    fig = go.Figure(data = data, layout = layout)
    iplot(fig)

Tiger Woods

In [489]:
image_path = "tiger-woods.jpeg"
earnings_plot(df1,'Tiger Woods',image_path,opacity=0.3)

Analizando los ingresos de Floyd Mayweather a lo largo de los años

In [490]:
image_path = "floyd-mayweather.jpg"
earnings_plot(df1,'Floyd Mayweather',image_path,opacity=0.2)

Analizando los ingresos de Lebron James a lo largo de los años


In [491]:
image_path = "lebron-james.jpg"
earnings_plot(df1,'LeBron James',image_path,opacity=0.2)

# Deportistas cuyo ranking ha subido al menos dos lugares entre 2010 y 2020.


### Primero, hacemos los encabezados más sencillos

In [496]:
df.rename(columns = {'Index':'index', 'Name':'name', 'Nationality':'nationality',
'Current Rank': 'current-rank', 'Previous Year Rank': 'previous-year-rank', 'Sport':'sport',
'Year':'year', 'earnings ($ million)':'earnings'}, inplace = True)

In [498]:
ranking2010_2020 = df[df["year"] >= 2010]
ranking2010_2020.tail()

Unnamed: 0,S.NO,name,nationality,current-rank,previous-year-rank,sport,year,earnings
296,297,Stephen Curry,USA,6,9,BASKETBALL,2020,74.4
297,298,Kevin Durant,USA,7,10,BASKETBALL,2020,63.9
298,299,Tiger Woods,USA,8,11,GOLF,2020,62.3
299,300,Kirk Cousins,USA,9,>100,AMERICAN FOOTBALL,2020,60.5
300,301,Carson Wentz,USA,10,>100,AMERICAN FOOTBALL,2020,59.1


### número de deportistas en el conjunto de datos

In [499]:
len(set(ranking2010_2020['name']))

36

In [494]:
earnings_1990 = df1[df1["year"] < 2000] 


KeyError: 'Year'

*Atleta con mayores ganancias por deporte por década*

In [None]:
df1.loc[df1.groupby('sport')['earnings'].idxmax(), :].reset_index()

#First, you are grouping your dataframe by column Sport. Then you get earnings ($ million) column and calculate an 
# index of (first) maximal element of this column in each group. Then you use these indexes to filter your initial dataframe. 
# Finally you resets indexes (if you need it).

Unnamed: 0,year,name,nationality,current-rank,previous-year-rank,sport,earnings
0,2019,Lionel Messi,Argentina,1,2,SOCCER,127.0
1,2019,Cristiano Ronaldo,Portugal,2,3,SOCCER,109.0
2,2019,Neymar,Brazil,3,5,SOCCER,105.0
3,2019,Canelo Alvarez,Mexico,4,15,BOXING,94.0
4,2019,Roger Federer,Switzerland,5,7,TENNIS,93.4
5,2019,Russell Wilson,USA,6,??,AMERICAN FOOTBALL,89.5
6,2019,Aaron Rogers,USA,7,??,AMERICAN FOOTBALL,89.3
7,2019,LeBron James,USA,8,6,BASKETBALL,89.0
8,2019,Stephen Curry,USA,9,8,BASKETBALL,79.8
9,2019,Kevin Durant,USA,10,11,BASKETBALL,65.4


KeyError: 'year'

_Ganancia total por cada deporte por cada año._

In [None]:
af = [df[df['sport'] == 'AMERICAN FOOTBALL'][['name','sport','year','earnings']]]

df.groupby(by='year')
#tips[['sum(total_bill)', 'tip', 'sex']]
print(af)
#pd.Series(af['earnings ($ million)'].sum())


[                 name              sport  year  earnings
133    Peyton Manning  AMERICAN FOOTBALL  2004      42.0
144      Michael Vick  AMERICAN FOOTBALL  2005      37.5
160         Tom Brady  AMERICAN FOOTBALL  2006      29.0
199       Eli Manning  AMERICAN FOOTBALL  2010      39.9
200     Terrell Suggs  AMERICAN FOOTBALL  2010      38.3
220    Peyton Manning  AMERICAN FOOTBALL  2012      42.4
225        Drew Brees  AMERICAN FOOTBALL  2013      51.0
226     Aaron Rodgers  AMERICAN FOOTBALL  2013      49.0
240         Matt Ryan  AMERICAN FOOTBALL  2014      43.8
257        Cam Newton  AMERICAN FOOTBALL  2016      53.1
266       Andrew Luck  AMERICAN FOOTBALL  2017      50.0
279         Matt Ryan  AMERICAN FOOTBALL  2018      67.3
280  Matthew Stafford  AMERICAN FOOTBALL  2018      59.5
286    Russell Wilson  AMERICAN FOOTBALL  2019      89.5
287      Aaron Rogers  AMERICAN FOOTBALL  2019      89.3
299      Kirk Cousins  AMERICAN FOOTBALL  2020      60.5
300      Carson Wentz  AMERICA

In [None]:
ar = [df[df['sport'] == 'AUTO RACING'][['name','sport','year','earnings']]]
print(ar)


[                   name        sport  year  earnings
3          Ayrton Senna  AUTO RACING  1990      10.0
4           Alain Prost  AUTO RACING  1990       9.0
14         Ayrton Senna  AUTO RACING  1991      13.0
15          Alain Prost  AUTO RACING  1991      11.0
18        Nigel Mansell  AUTO RACING  1991       9.0
22         Ayrton Senna  AUTO RACING  1992      22.0
23        Nigel Mansell  AUTO RACING  1992      14.5
32         Ayrton Senna  AUTO RACING  1993      18.5
33          Alain Prost  AUTO RACING  1993      16.0
44       Gerhard Berger  AUTO RACING  1994      13.5
49        Nigel Mansell  AUTO RACING  1994      11.3
58   Michael Schumacher  AUTO RACING  1995      15.0
90   Michael Schumacher  AUTO RACING  1999      49.0
98       Dale Earnhardt  AUTO RACING  1999      26.5
100  Michael Schumacher  AUTO RACING  2000      59.0
119  Jacques Villeneuve  AUTO RACING  2002      20.0
130  Jacques Villeneuve  AUTO RACING  2003      23.0
270      Lewis Hamilton  AUTO RACING  2017   

In [None]:
#AUTO RACING (NASCAR)	
arn = [df[df['sport'] == 'AUTO RACING (NASCAR)'][['name','sport','year','earnings']]]
print(ar)

[                   name        sport  year  earnings
3          Ayrton Senna  AUTO RACING  1990      10.0
4           Alain Prost  AUTO RACING  1990       9.0
14         Ayrton Senna  AUTO RACING  1991      13.0
15          Alain Prost  AUTO RACING  1991      11.0
18        Nigel Mansell  AUTO RACING  1991       9.0
22         Ayrton Senna  AUTO RACING  1992      22.0
23        Nigel Mansell  AUTO RACING  1992      14.5
32         Ayrton Senna  AUTO RACING  1993      18.5
33          Alain Prost  AUTO RACING  1993      16.0
44       Gerhard Berger  AUTO RACING  1994      13.5
49        Nigel Mansell  AUTO RACING  1994      11.3
58   Michael Schumacher  AUTO RACING  1995      15.0
90   Michael Schumacher  AUTO RACING  1999      49.0
98       Dale Earnhardt  AUTO RACING  1999      26.5
100  Michael Schumacher  AUTO RACING  2000      59.0
119  Jacques Villeneuve  AUTO RACING  2002      20.0
130  Jacques Villeneuve  AUTO RACING  2003      23.0
270      Lewis Hamilton  AUTO RACING  2017   