In [1]:
import pandas as pd
import numpy as np

In [3]:
data = pd.DataFrame({
    'name': ['Messi', 'Ronaldo', 'Neymar', 'Mbappe', 'Lingard', 'Messi'],
    'goals': [45, 40, 32.5, 30, 25, 10],
    'assists': [20, np.nan, 10, 5, 0, 20,],
    'position': ['Forward', 'Forward', 'Forward', 'FWD', 'Midfielder', 'FWD'],
    'birth_date': ['1987-06-24 00:00:00', '1985-02-05', '1992-02-05', '1998-12-20', '1992-12-15', '1987-06-24 00:00:00']
})

In [5]:
#create two of the same dataframe
data = pd.concat([data, data])

In [6]:
data

Unnamed: 0,name,goals,assists,position,birth_date
0,Messi,45.0,20.0,Forward,1987-06-24 00:00:00
1,Ronaldo,40.0,,Forward,1985-02-05
2,Neymar,32.5,10.0,Forward,1992-02-05
3,Mbappe,30.0,5.0,FWD,1998-12-20
4,Lingard,25.0,0.0,Midfielder,1992-12-15
5,Messi,10.0,20.0,FWD,1987-06-24 00:00:00
0,Messi,45.0,20.0,Forward,1987-06-24 00:00:00
1,Ronaldo,40.0,,Forward,1985-02-05
2,Neymar,32.5,10.0,Forward,1992-02-05
3,Mbappe,30.0,5.0,FWD,1998-12-20


In [7]:
#remove duplicates
data = data.drop_duplicates()

In [8]:
data

Unnamed: 0,name,goals,assists,position,birth_date
0,Messi,45.0,20.0,Forward,1987-06-24 00:00:00
1,Ronaldo,40.0,,Forward,1985-02-05
2,Neymar,32.5,10.0,Forward,1992-02-05
3,Mbappe,30.0,5.0,FWD,1998-12-20
4,Lingard,25.0,0.0,Midfielder,1992-12-15
5,Messi,10.0,20.0,FWD,1987-06-24 00:00:00


In [9]:
#drop specific duplicates
data = data.drop_duplicates(subset=['name'], keep='first')

In [10]:
data

Unnamed: 0,name,goals,assists,position,birth_date
0,Messi,45.0,20.0,Forward,1987-06-24 00:00:00
1,Ronaldo,40.0,,Forward,1985-02-05
2,Neymar,32.5,10.0,Forward,1992-02-05
3,Mbappe,30.0,5.0,FWD,1998-12-20
4,Lingard,25.0,0.0,Midfielder,1992-12-15


In [11]:
#fill null rows with 0
data['assists'] = data['assists'].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['assists'] = data['assists'].fillna(0)


In [12]:
data

Unnamed: 0,name,goals,assists,position,birth_date
0,Messi,45.0,20.0,Forward,1987-06-24 00:00:00
1,Ronaldo,40.0,0.0,Forward,1985-02-05
2,Neymar,32.5,10.0,Forward,1992-02-05
3,Mbappe,30.0,5.0,FWD,1998-12-20
4,Lingard,25.0,0.0,Midfielder,1992-12-15


In [13]:
#cast columns to integers
data[['goals', 'assists']] = data[['goals', 'assists']].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[['goals', 'assists']] = data[['goals', 'assists']].astype(int)


In [14]:
data

Unnamed: 0,name,goals,assists,position,birth_date
0,Messi,45,20,Forward,1987-06-24 00:00:00
1,Ronaldo,40,0,Forward,1985-02-05
2,Neymar,32,10,Forward,1992-02-05
3,Mbappe,30,5,FWD,1998-12-20
4,Lingard,25,0,Midfielder,1992-12-15


In [15]:
#replace strings 'FWD' with 'Forward'
data['position'] = data['position'].str.replace('FWD', 'Forward')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['position'] = data['position'].str.replace('FWD', 'Forward')


In [16]:
data

Unnamed: 0,name,goals,assists,position,birth_date
0,Messi,45,20,Forward,1987-06-24 00:00:00
1,Ronaldo,40,0,Forward,1985-02-05
2,Neymar,32,10,Forward,1992-02-05
3,Mbappe,30,5,Forward,1998-12-20
4,Lingard,25,0,Midfielder,1992-12-15


In [17]:
#make dates more consistent
data['birth_date'] = pd.to_datetime(data['birth_date'], format='mixed')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['birth_date'] = pd.to_datetime(data['birth_date'], format='mixed')


In [18]:
data

Unnamed: 0,name,goals,assists,position,birth_date
0,Messi,45,20,Forward,1987-06-24
1,Ronaldo,40,0,Forward,1985-02-05
2,Neymar,32,10,Forward,1992-02-05
3,Mbappe,30,5,Forward,1998-12-20
4,Lingard,25,0,Midfielder,1992-12-15


In [19]:
#capitalise names
data['name'] = data['name'].apply(lambda x: x.upper())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['name'] = data['name'].apply(lambda x: x.upper())


In [20]:
data

Unnamed: 0,name,goals,assists,position,birth_date
0,MESSI,45,20,Forward,1987-06-24
1,RONALDO,40,0,Forward,1985-02-05
2,NEYMAR,32,10,Forward,1992-02-05
3,MBAPPE,30,5,Forward,1998-12-20
4,LINGARD,25,0,Midfielder,1992-12-15
