# Loading and Cleaning the data sete

## Import the mean libraries that we going to be using

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Specifying the path and loading the archive with pandas 

In [18]:
path = "C:/Users/ewill/OneDrive/Desktop/Richest/richest.csv"
df = pd.read_csv(path, delimiter=";")

In [8]:
df.head()

Unnamed: 0,Index,Name,Nationality,CurrentRank,PreviousYearRank,Sport,Year,earnings
0,1,Mike Tyson,USA,1,,boxing,1990,28.6
1,2,Buster Douglas,USA,2,,boxing,1990,26.0
2,3,Sugar Ray Leonard,USA,3,,boxing,1990,13.0
3,4,Ayrton Senna,Brazil,4,,auto racing,1990,10.0
4,5,Alain Prost,France,5,,auto racing,1990,9.0


## Areas to clean
- Set the idex column as index
- Delete previousyearrank column
- Guarantee consistency on the name entrys on sports  

### Set the index column as index

In [19]:
df = df.set_index("Index")

In [11]:
df.head()

Unnamed: 0_level_0,Name,Nationality,CurrentRank,PreviousYearRank,Sport,Year,earnings
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,Mike Tyson,USA,1,,boxing,1990,28.6
2,Buster Douglas,USA,2,,boxing,1990,26.0
3,Sugar Ray Leonard,USA,3,,boxing,1990,13.0
4,Ayrton Senna,Brazil,4,,auto racing,1990,10.0
5,Alain Prost,France,5,,auto racing,1990,9.0


### Delete previousyearrank column

In [20]:
df = df.drop("PreviousYearRank", axis=1)

In [14]:
df.head()

Unnamed: 0_level_0,Name,Nationality,CurrentRank,Sport,Year,earnings
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Mike Tyson,USA,1,boxing,1990,28.6
2,Buster Douglas,USA,2,boxing,1990,26.0
3,Sugar Ray Leonard,USA,3,boxing,1990,13.0
4,Ayrton Senna,Brazil,4,auto racing,1990,10.0
5,Alain Prost,France,5,auto racing,1990,9.0


### Guarantee consistency on the name entrys on sports

In [24]:
df["Sport"].unique()

array(['boxing', 'auto racing', 'golf', 'basketball', 'tennis', 'nfl',
       'nba', 'baseball', 'ice hockey', 'american football / baseball',
       'f1 motorsports', 'nascar', 'hockey', 'auto racing (nascar)',
       'f1 racing', 'american football', 'soccer', 'cycling',
       'motorcycle gp', 'mma'], dtype=object)

#### First let's fix some captalizations issues  

In [25]:
df["Sport"] = df["Sport"].str.lower()

In [26]:
df.head()

Unnamed: 0_level_0,Name,Nationality,CurrentRank,Sport,Year,earnings
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Mike Tyson,USA,1,boxing,1990,28.6
2,Buster Douglas,USA,2,boxing,1990,26.0
3,Sugar Ray Leonard,USA,3,boxing,1990,13.0
4,Ayrton Senna,Brazil,4,auto racing,1990,10.0
5,Alain Prost,France,5,auto racing,1990,9.0


In [27]:
df["Sport"].unique()

array(['boxing', 'auto racing', 'golf', 'basketball', 'tennis', 'nfl',
       'nba', 'baseball', 'ice hockey', 'american football / baseball',
       'f1 motorsports', 'nascar', 'hockey', 'auto racing (nascar)',
       'f1 racing', 'american football', 'soccer', 'cycling',
       'motorcycle gp', 'mma'], dtype=object)

 #### It looks like some sports has differentes names for example "nba " also is called "basketball". 
 #### Let's create a function to fix this problem

In [28]:
def unify_names(sport):
    if sport == 'auto racing' or sport == 'nascar' or sport == 'auto racing (nascar)':
        return 'car racing'
    elif sport == 'nba':
        return 'basketball'
    elif sport == 'nfl':
        return 'american football'
    elif sport == 'ice hockey':
        return 'hockey'
    elif sport == 'motorcycle gp' or sport == 'f1 motorsports':
        return 'motor racing'
    else:
        return sport 

In [29]:
df["Sport"] = df["Sport"].apply(unify_names)

In [30]:
df.head()

Unnamed: 0_level_0,Name,Nationality,CurrentRank,Sport,Year,earnings
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,Mike Tyson,USA,1,boxing,1990,28.6
2,Buster Douglas,USA,2,boxing,1990,26.0
3,Sugar Ray Leonard,USA,3,boxing,1990,13.0
4,Ayrton Senna,Brazil,4,car racing,1990,10.0
5,Alain Prost,France,5,car racing,1990,9.0


In [31]:
df["Sport"].unique()

array(['boxing', 'car racing', 'golf', 'basketball', 'tennis',
       'american football', 'baseball', 'hockey',
       'american football / baseball', 'motor racing', 'f1 racing',
       'soccer', 'cycling', 'mma'], dtype=object)

#### It's looks like we have an entry call  'american football / baseball'. let's look which player is 

In [32]:
df[df["Sport"] == 'american football / baseball']

Unnamed: 0_level_0,Name,Nationality,CurrentRank,Sport,Year,earnings
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
53,Deion Sanders,USA,3,american football / baseball,1995,22.5


#### it's looks like is Deion Sanders let's search online to see his real Sport 

#### He was an American football player accordingly to wikipedia you can see [here](https://en.wikipedia.org/wiki/Deion_Sanders)  

#### Let's fix this 

In [33]:
df["Sport"] = df["Sport"].apply(lambda sport: 'american football' if sport=='american football / baseball' else sport)

In [35]:
df["Sport"].unique()

array(['boxing', 'car racing', 'golf', 'basketball', 'tennis',
       'american football', 'baseball', 'hockey', 'motor racing',
       'f1 racing', 'soccer', 'cycling', 'mma'], dtype=object)

#### Just a quick note first we use a def function because we have multiple entrys to fix, since we only have to fix one entry on the last one we decide to use a lamda function

### Saving as csv file 

In [36]:
df.to_csv("C:/Users/ewill/OneDrive/Desktop/Richest/richest_cleaned_data.csv")