In [1]:
import pandas as pd

# DataFrames I

## Table of Contents


1. Methods and Attributes between Series and DataFrames  
   - a. Methods Series & Dataframe have in common  
   - b. Methods specific to Series vs Dataframes  
2. Differences between Shared Methods  
3. Select One Column from a DataFrame  
4. Select Multiple Columns from a DataFrame  
5. Add New Column to a DataFrame  
6. A Review of the `value_counts` Method  
7. Drop Rows with Missing Values  
8. Fill in Missing Values with the `.fillna()` Method  
9. The `astype` Method I  
10. The `astype` Method II  
11. Sort a DataFrame with the `.sort_values()` Method I  
12. Sort a DataFrame with the `.sort_values()` Method II  
13. Sort a DataFrame by its Index  
14. Rank Values with the `.rank()` Method





## 1. Methods and Attributes between Series and DataFrames

- A **DataFrame** is a 2-dimensional table consisting of rows and columns: a sequence of Series stitched together with common index
- Pandas uses a `NaN` designation for cells that have a missing value. It is short for "not a number". Most operations on `NaN` values will produce `NaN` values.
- Like with a **Series**, Pandas assigns an index position/label to each **DataFrame** row.
- The **DataFrame** and **Series** have common and exclusive methods/attributes.
  - The `hasnans` attribute exists only on a **Series**. The `columns` attribute exists only on a **DataFrame**.
  - Some methods/attributes will return different types of data.
  - The `info` method returns a summary of the pandas object.


In [2]:
nba = pd.read_csv("nba.csv") # read_csv returns df by default
nba

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0
...,...,...,...,...,...
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0
449,Ricky Rubio,Phoenix Suns,PG,10/21/90,16200000.0


In [3]:
s = pd.Series([1,2,3,4,5])
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

### a. Methods Series & Dataframe have in common

In [4]:
nba.head() # head() method works same for Series and Dataframe
nba.tail() # idem

Unnamed: 0,Name,Team,Position,Birthday,Salary
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0
449,Ricky Rubio,Phoenix Suns,PG,10/21/90,16200000.0
450,,,,,


In [5]:
nba.index # idem

RangeIndex(start=0, stop=451, step=1)

In [6]:
nba.values # idem: a Series of Series

array([['Shake Milton', 'Philadelphia 76ers', 'SG', '9/26/96', 1445697.0],
       ['Christian Wood', 'Detroit Pistons', 'PF', '9/27/95', 1645357.0],
       ['PJ Washington', 'Charlotte Hornets', nan, '8/23/98', nan],
       ...,
       ['Collin Sexton', 'Cleveland Cavaliers', 'PG', nan, 4764960.0],
       ['Ricky Rubio', 'Phoenix Suns', 'PG', '10/21/90', 16200000.0],
       [nan, nan, nan, nan, nan]], dtype=object)

In [7]:
nba.shape # idem

(451, 5)

In [8]:
nba.dtypes # return value is Series with datatype per column

Name         object
Team         object
Position     object
Birthday     object
Salary      float64
dtype: object

In [9]:
s.axes # row index

[RangeIndex(start=0, stop=5, step=1)]

In [10]:
nba.axes # row & column index

[RangeIndex(start=0, stop=451, step=1),
 Index(['Name', 'Team', 'Position', 'Birthday', 'Salary'], dtype='object')]

In [11]:
s.info()

<class 'pandas.core.series.Series'>
RangeIndex: 5 entries, 0 to 4
Series name: None
Non-Null Count  Dtype
--------------  -----
5 non-null      int64
dtypes: int64(1)
memory usage: 172.0 bytes


In [12]:
nba.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 451 entries, 0 to 450
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      450 non-null    object 
 1   Team      450 non-null    object 
 2   Position  449 non-null    object 
 3   Birthday  449 non-null    object 
 4   Salary    449 non-null    float64
dtypes: float64(1), object(4)
memory usage: 17.7+ KB


### b. Methods specific to Series vs Dataframes

In [13]:
s.hasnans # does Series have NaNs?

False

In [14]:
nba.columns # complement to row index: column index

Index(['Name', 'Team', 'Position', 'Birthday', 'Salary'], dtype='object')

## 2. Differences between Shared Methods

- The `sum` method adds a `Series`'s values.
- On a `DataFrame`, the `sum` method defaults to adding the values by traversing the index (row values).
- The `axis` parameter customizes the direction that we add across. Pass `"columns"` or `1` to add "across" the columns.

In [15]:
pd.read_csv("revenue.csv")

Unnamed: 0,Date,New York,Los Angeles,Miami
0,1/1/16,985,122,499
1,1/2/16,738,788,534
2,1/3/16,14,20,933
3,1/4/16,730,904,885
4,1/5/16,114,71,253
5,1/6/16,936,502,497
6,1/7/16,123,996,115
7,1/8/16,935,492,886
8,1/9/16,846,954,823
9,1/10/16,54,285,216


In [16]:
revenue = pd.read_csv("revenue.csv", index_col="Date") # set 'Date' column as index label
revenue

Unnamed: 0_level_0,New York,Los Angeles,Miami
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1/1/16,985,122,499
1/2/16,738,788,534
1/3/16,14,20,933
1/4/16,730,904,885
1/5/16,114,71,253
1/6/16,936,502,497
1/7/16,123,996,115
1/8/16,935,492,886
1/9/16,846,954,823
1/10/16,54,285,216


In [17]:
s = pd.Series([1,2,3]) # in Series there is just 1 column
s.sum() # so there is only 1 way to sum these up: by column

6

In [18]:
revenue.sum() # here there are several columns and several rows, so there are 3 ways to sum up!

# Way 1: by column, sum along rows (axis = 0): default

# by default the sum() adds along rows index/downwards/axis = 0 while the column stays the same
# question: 'how much did we earn in each city?'

New York       5475
Los Angeles    5134
Miami          5641
dtype: int64

In [19]:
# Way 2: by row, sum along columns (axis = 1)

revenue.sum(axis = 1) # we can change the direction to add along the columns while row stays same
# question: 'how much did we earn each day?'

Date
1/1/16     1606
1/2/16     2060
1/3/16      967
1/4/16     2519
1/5/16      438
1/6/16     1935
1/7/16     1234
1/8/16     2313
1/9/16     2623
1/10/16     555
dtype: int64

In [20]:
# Way 3: by row and by column, to arrive at total sum

print(revenue.sum(axis = 1).sum()) # we can chain method to take sum of sum
print(revenue.sum(axis = 1).sum())


16250
16250


## 3. Select One Column from a DataFrame

- We can use attribute syntax (`df.column_name`) to select a column from a DataFrame. The syntax will not work if the column name has spaces.
- We can also use square bracket syntax (`df["column name"]`), which will work for any column name.
- Pandas extracts a column from a DataFrame as a Series.
- The Series is a view, so changes to the Series **will** affect the DataFrame.
- Pandas will display a warning if you mutate the Series. Use the `copy` method to create a duplicate.


In [21]:
# import dataset

nba = pd.read_csv("nba.csv")
nba.head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0


In [22]:
# Two ways to select 1 column of df

# Way 1: column as attribute

# we can enter subcomponents of composed object as attributes
# df is a Series of Series, composed of several Series
# we can access these components in themselves as attributes
# we use attribute notation `df.column_name`

nba.Team

0       Philadelphia 76ers
1          Detroit Pistons
2        Charlotte Hornets
3          Detroit Pistons
4       Philadelphia 76ers
              ...         
446       Sacramento Kings
447        Milwaukee Bucks
448    Cleveland Cavaliers
449           Phoenix Suns
450                    NaN
Name: Team, Length: 451, dtype: object

In [23]:
# However: not ideal, because:
# a) does not work if columns has spaces
# b) might conflict with Df methods, for example when column name is 'sum'

# Therefore: preferred way to select column of Df which works 100% of time:

# Way 2: by subsetting Df

nba["Team"]

0       Philadelphia 76ers
1          Detroit Pistons
2        Charlotte Hornets
3          Detroit Pistons
4       Philadelphia 76ers
              ...         
446       Sacramento Kings
447        Milwaukee Bucks
448    Cleveland Cavaliers
449           Phoenix Suns
450                    NaN
Name: Team, Length: 451, dtype: object

In [24]:
# Remark:

# PROBLEM
# be cautious however: both methods give a view, not a copy

names = nba["Name"]
names

0        Shake Milton
1      Christian Wood
2       PJ Washington
3        Derrick Rose
4       Marial Shayok
            ...      
446       Harry Giles
447       Robin Lopez
448     Collin Sexton
449       Ricky Rubio
450               NaN
Name: Name, Length: 451, dtype: object

In [25]:
# now we might think this Series is a separate variable 
# & change it without realizing we also change the Df

names.iloc[1] = "Whatever"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  names.iloc[1] = "Whatever"


In [26]:
# Pandas also warns us for this in the message above 

names

0       Shake Milton
1           Whatever
2      PJ Washington
3       Derrick Rose
4      Marial Shayok
           ...      
446      Harry Giles
447      Robin Lopez
448    Collin Sexton
449      Ricky Rubio
450              NaN
Name: Name, Length: 451, dtype: object

In [27]:
# indeed: the name of the second player has been changed
# to 'Whatever'

# But this is ALSO the case for the Df:

nba

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Whatever,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0
...,...,...,...,...,...
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0
449,Ricky Rubio,Phoenix Suns,PG,10/21/90,16200000.0


In [28]:
# SOLUTION:
# make a copy of the view to get a separate Series 

names_copy = nba["Name"].copy()

In [29]:
names_copy.iloc[0] = "Distinct from Df we have another Whatever!"
print(nba) # original Df not changed
print(names_copy) # copy of view is changed

              Name                 Team Position  Birthday      Salary
0     Shake Milton   Philadelphia 76ers       SG   9/26/96   1445697.0
1         Whatever      Detroit Pistons       PF   9/27/95   1645357.0
2    PJ Washington    Charlotte Hornets      NaN   8/23/98         NaN
3     Derrick Rose      Detroit Pistons       PG   10/4/88   7317074.0
4    Marial Shayok   Philadelphia 76ers        G   7/26/95     79568.0
..             ...                  ...      ...       ...         ...
446    Harry Giles     Sacramento Kings       PF   4/22/98   2578800.0
447    Robin Lopez      Milwaukee Bucks        C    4/1/88   4767000.0
448  Collin Sexton  Cleveland Cavaliers       PG       NaN   4764960.0
449    Ricky Rubio         Phoenix Suns       PG  10/21/90  16200000.0
450            NaN                  NaN      NaN       NaN         NaN

[451 rows x 5 columns]
0      Distinct from Df we have another Whatever!
1                                        Whatever
2                       

## 4. Select Multiple Columns from a DataFrame

- Use square brackets with a list of column names to extract multiple columns from a DataFrame.
- Pandas stores the result in a new DataFrame (a copy).


In [30]:
# import dataset

nba = pd.read_csv("nba.csv")
nba.head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0


In [31]:
# to extract multiple columns: use a list nested within square subsetting brackets

nba[["Name","Team"]] 


Unnamed: 0,Name,Team
0,Shake Milton,Philadelphia 76ers
1,Christian Wood,Detroit Pistons
2,PJ Washington,Charlotte Hornets
3,Derrick Rose,Detroit Pistons
4,Marial Shayok,Philadelphia 76ers
...,...,...
446,Harry Giles,Sacramento Kings
447,Robin Lopez,Milwaukee Bucks
448,Collin Sexton,Cleveland Cavaliers
449,Ricky Rubio,Phoenix Suns


In [32]:
# Remarks:
# 1) confusingly, contrary to subsetting just 1 column, this procedure to select multiple columns 
# creates a copy and not a view ..
# 2) the order in which you specify the columns in the list matters
# 3) to make this syntax more readable and avoid double square brackets [[ ]]: 
# first create a variable for the list, then use variable in subsetting brackets

columns_to_select = ["Name","Team"]
nba[columns_to_select]

Unnamed: 0,Name,Team
0,Shake Milton,Philadelphia 76ers
1,Christian Wood,Detroit Pistons
2,PJ Washington,Charlotte Hornets
3,Derrick Rose,Detroit Pistons
4,Marial Shayok,Philadelphia 76ers
...,...,...
446,Harry Giles,Sacramento Kings
447,Robin Lopez,Milwaukee Bucks
448,Collin Sexton,Cleveland Cavaliers
449,Ricky Rubio,Phoenix Suns


## 5. Add New Column to a DataFrame

- Use square bracket notation with an equal sign to add a new Series to a DataFrame.
- The `insert` method allows us to insert an element at a specific column index.
- On the right-hand side, we can reference an existing DataFrame column and perform a broadcasting operation on it to create the new Series.



In [33]:
# import dataset

nba = pd.read_csv("nba.csv")
nba.head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0


In [34]:
# 2 methods to add a new colummn:

# Way 1: with assignment

nba["Sport"] = "Basketball" # Pandas broadcasts single consistent value automatically
nba


# Remark:
# this method always adds new column at the end of Df;
# if we want to insert new column elsewhere, we can use method 2

Unnamed: 0,Name,Team,Position,Birthday,Salary,Sport
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0,Basketball
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0,Basketball
2,PJ Washington,Charlotte Hornets,,8/23/98,,Basketball
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0,Basketball
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0,Basketball
...,...,...,...,...,...,...
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0,Basketball
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0,Basketball
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0,Basketball
449,Ricky Rubio,Phoenix Suns,PG,10/21/90,16200000.0,Basketball


In [35]:
# reload Df
nba = pd.read_csv("nba.csv")

# Way 2: with insert()-method

nba.insert(loc=4, column = "Sport", value = "Basketball")
nba

# Remark:
# confusingly, while for data selection .loc[] is used to reference existing row/column LABELS
# and .iloc[] for referencing row/column indices;
# here, the argument .insert(loc = ) indicates the index number at which to insert the new column

Unnamed: 0,Name,Team,Position,Birthday,Sport,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,Basketball,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,Basketball,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,Basketball,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,Basketball,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,Basketball,79568.0
...,...,...,...,...,...,...
446,Harry Giles,Sacramento Kings,PF,4/22/98,Basketball,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,Basketball,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,Basketball,4764960.0
449,Ricky Rubio,Phoenix Suns,PG,10/21/90,Basketball,16200000.0


In [36]:
# mostly, we will not want to add column with 1 single consistent value
# many times, values of new column will come from existing column & performing operation on it

# for example: what would salary be if we doubled it by 2?

nba["Salary Doubled"] = nba["Salary"] * 2 # broadcasting applies to each value of column automatically
# or we can use method to arrive at same result: nba["Salary Doubled"] = nba["Salary"].mul(2)

nba

Unnamed: 0,Name,Team,Position,Birthday,Sport,Salary,Salary Doubled
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,Basketball,1445697.0,2891394.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,Basketball,1645357.0,3290714.0
2,PJ Washington,Charlotte Hornets,,8/23/98,Basketball,,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,Basketball,7317074.0,14634148.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,Basketball,79568.0,159136.0
...,...,...,...,...,...,...,...
446,Harry Giles,Sacramento Kings,PF,4/22/98,Basketball,2578800.0,5157600.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,Basketball,4767000.0,9534000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,Basketball,4764960.0,9529920.0
449,Ricky Rubio,Phoenix Suns,PG,10/21/90,Basketball,16200000.0,32400000.0


 ## 6. A Review of the `value_counts` Method

- The `value_counts` method counts the number of times that each unique value occurs in a Series.


In [37]:
# import dataset

nba = pd.read_csv("nba.csv")
nba.head()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0


In [38]:
nba["Team"].value_counts() # Philadelphia 76ers are largest team

Team
Philadelphia 76ers        17
Minnesota Timberwolves    17
Utah Jazz                 17
Brooklyn Nets             17
Toronto Raptors           16
Boston Celtics            16
Chicago Bulls             16
Detroit Pistons           16
New York Knicks           16
New Orleans Pelicans      16
Los Angeles Lakers        16
Charlotte Hornets         16
Atlanta Hawks             15
Denver Nuggets            15
Memphis Grizzlies         15
Phoenix Suns              15
Sacramento Kings          15
Miami Heat                15
Golden State Warriors     15
Los Angeles Clippers      14
Washington Wizards        14
Portland Trail Blazers    14
Milwaukee Bucks           14
Houston Rockets           14
San Antonio Spurs         14
Dallas Mavericks          13
Oklahoma City Thunder     13
Orlando Magic             13
Cleveland Cavaliers       13
Indiana Pacers            13
Name: count, dtype: int64

In [39]:
nba["Team"].value_counts(normalize = True) * 100 # their team makes up 3.8 % of total amount of players

Team
Philadelphia 76ers        3.777778
Minnesota Timberwolves    3.777778
Utah Jazz                 3.777778
Brooklyn Nets             3.777778
Toronto Raptors           3.555556
Boston Celtics            3.555556
Chicago Bulls             3.555556
Detroit Pistons           3.555556
New York Knicks           3.555556
New Orleans Pelicans      3.555556
Los Angeles Lakers        3.555556
Charlotte Hornets         3.555556
Atlanta Hawks             3.333333
Denver Nuggets            3.333333
Memphis Grizzlies         3.333333
Phoenix Suns              3.333333
Sacramento Kings          3.333333
Miami Heat                3.333333
Golden State Warriors     3.333333
Los Angeles Clippers      3.111111
Washington Wizards        3.111111
Portland Trail Blazers    3.111111
Milwaukee Bucks           3.111111
Houston Rockets           3.111111
San Antonio Spurs         3.111111
Dallas Mavericks          2.888889
Oklahoma City Thunder     2.888889
Orlando Magic             2.888889
Cleveland Caval

## 7. Drop Rows with Missing Values

- Pandas uses a `NaN` designation for cells that have a missing value.
- The `.dropna()` method deletes rows with missing values. Its default behavior is to remove a row if it has **any** missing values.
- Pass the `.dropna(how = )` parameter an argument of `"all"` to delete rows where **all** the values are `NaN`.
- The `.dropna(subset = )` parameter customizes/limits the columns that Pandas will use to drop rows with missing values.


In [40]:
# import dataset

nba = pd.read_csv("nba.csv")
nba

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0
...,...,...,...,...,...
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0
449,Ricky Rubio,Phoenix Suns,PG,10/21/90,16200000.0


In [41]:
nba.dropna() # drops a row with any missing value: 3 rows with NaN get dropped

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0
5,Draymond Green,Golden State Warriors,PF,3/4/90,18539130.0
...,...,...,...,...,...
444,Melvin Frazier,Orlando Magic,SG,8/30/96,1416852.0
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310.0
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0


In [42]:
nba.dropna(how = 'any') # this is the default setting

nba.dropna(how= 'all') # we can change it to "all" if we only want to remove the rows with ALL NaN values
# now only 1 row, the last one, gets dropped

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0
...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310.0
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0


In [43]:
nba.dropna(subset=["Position"]) # we can also specify in which column
# nba.dropna(subset=["Position","Birthday"]) # if multiple columns: an OR relation

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0
5,Draymond Green,Golden State Warriors,PF,3/4/90,18539130.0
...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310.0
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0


## 8. Fill in Missing Values with the `.fillna()` Method

- The `.fillna()` method replaces missing `NaN` values with its argument.
- The `.fillna()` method is available on both DataFrames and Series.
- An extracted Series is a view on the original DataFrame, but the `.fillna()` method returns a copy.


In [44]:
# import dataset

nba = pd.read_csv("nba.csv").dropna(how = 'all') # we already drop the last row with all NaN values
nba

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0
...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310.0
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0


In [45]:
nba.fillna(0) # every missing value replaced by same fill value
# probably not accurate, as '0' is appropriate for Salary, but not so much for Position or Birthday

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,0,8/23/98,0.0
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0
...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310.0
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,0,4764960.0


In [46]:
# we can target a specific column and specify its fitting filling value
# .fillna() method makes a copy, not a view
# so we can overwrite the original column with the copy generated by the .fillna() method 
# to adjust the Df

nba["Salary"].fillna(0) # this is copy of the Series view 'Salary'
nba["Salary"] = nba["Salary"].fillna(0) # we can overwrite original column with this copy
nba
# now we see that only the NaN value in the Salary column is filled in with value '0',
# and not Position or Birthday

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,0.0
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0
...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310.0
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0


In [47]:
# We can also do this for other columns with missing values of other datatype
# for example: 'unknown' for NaN in columns of Position and Birthday
nba[["Position","Birthday"]] = nba[["Position","Birthday"]].fillna(value="Unknown")
nba
# now we have filled in value '0' for numerical col 'Salary' and 'Unknown' for other col

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,Unknown,8/23/98,0.0
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0
...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310.0
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,Unknown,4764960.0


## 9. The `astype` Method I

- The `.astype()` method converts a Series's values to a specified type.
- Pass in the specified type as either a string or the core Python data type.
- Pandas cannot convert `NaN` values to numeric types, so we need to eliminate/replace them before we perform the conversion.
- The `dtypes` attribute returns a Series with the DataFrame's columns and their types.


In [48]:
# import dataset

nba = pd.read_csv("nba.csv").dropna(how = 'all') # we already drop the last row with all NaN values
nba

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0
...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310.0
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0


In [49]:
# we can check current datatypes for different columns with .dtypes attribute

nba.dtypes

Name         object
Team         object
Position     object
Birthday     object
Salary      float64
dtype: object

In [50]:
# say, we want to convert values Salary from float to integer
# remember: in fact in dataset these values are already integers
# but when there is at least 1 NaN value in a column, these integers become floats
# SO: first thing we have to do is get rid off NaN value in 'Salary' column

nba["Salary"] = nba["Salary"].fillna(0) # step 1: replace NaN values

nba["Salary"].astype("int")   # step 2: convert float to integer
nba["Salary"].astype(int)     # either as string or as dtype

0       1445697
1       1645357
2             0
3       7317074
4         79568
         ...   
445     2174310
446     2578800
447     4767000
448     4764960
449    16200000
Name: Salary, Length: 450, dtype: int64

In [51]:
nba["Salary"] = nba["Salary"].astype(int) # and overwrite existing column 
nba

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357
2,PJ Washington,Charlotte Hornets,,8/23/98,0
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568
...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960


In [52]:
nba.dtypes # when we check dataframe & dtypes, we see floats are changed to int

Name        object
Team        object
Position    object
Birthday    object
Salary       int64
dtype: object

## 10. The `astype` Method II

- The `category` type is ideal for columns with a limited number of unique values.
- The `nunique` method will return a Series with the number of unique values in each column.
- With categories, Pandas does not create a separate value in memory for each "cell." Rather, the cells point to a single copy for each unique value.


In [53]:
# import dataset

nba = pd.read_csv("nba.csv").dropna(how = 'all') # we already drop the last row with all NaN values
nba.tail()

Unnamed: 0,Name,Team,Position,Birthday,Salary
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310.0
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0
449,Ricky Rubio,Phoenix Suns,PG,10/21/90,16200000.0


In [54]:
# we have 449 rows in our dataframe
# however the 'teams' column only has small number of unique values that often repeat
# this makes it an ideal column to convert it to be a category to reduce memory load
# in essence, not for every cell a value is saved, but the limited amount of values are saved
# & for every cell there is a reference to one of these saved values
# other examples where category might be helpful: gender, US state, ...

nba["Team"].nunique() # applied to Series: counts number 'n' of unique values column
nba.nunique() # applied to Df: counts number of unique values in all columns of Df

Name        450
Team         30
Position      9
Birthday    429
Salary      268
dtype: int64

In [55]:
# so we see: good candidates to apply category type are 'Team' and 'Position'

# to check memory space, we can use .info() method

nba.info()

<class 'pandas.core.frame.DataFrame'>
Index: 450 entries, 0 to 449
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      450 non-null    object 
 1   Team      450 non-null    object 
 2   Position  449 non-null    object 
 3   Birthday  449 non-null    object 
 4   Salary    449 non-null    float64
dtypes: float64(1), object(4)
memory usage: 21.1+ KB


In [56]:
nba["Position"] = nba["Position"].astype("category") # replace 'Position' column, first argument in astype is dtype, which we set to 'category'
nba["Team"] = nba["Team"].astype("category") # replace 'Team' column

In [57]:
nba.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 450 entries, 0 to 449
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   Name      450 non-null    object  
 1   Team      450 non-null    category
 2   Position  449 non-null    category
 3   Birthday  449 non-null    object  
 4   Salary    449 non-null    float64 
dtypes: category(2), float64(1), object(2)
memory usage: 16.6+ KB


In [58]:
16.6/21.1 # and we see the memory space has reduced quite substantially: with more than 20%

0.7867298578199052

## 11. Sort a DataFrame with the `.sort_values()` Method I

- The `.sort_values()` method sorts a DataFrame by the values in **one** (or more: see later) **column**(s). The default sort is ascending (alphabetical for strings).
- The first parameter (`.sort_values(by= )`) expects the column(s) to sort by.
- If sorting by a single column, pass a string with its name.
- The `.sort_values(ascending = )` parameter customizes the sort order.
- The `.sort_values(na_position = )` parameter customizes where Pandas places `NaN` values.


In [59]:
nba = pd.read_csv("nba.csv")
nba.tail()

Unnamed: 0,Name,Team,Position,Birthday,Salary
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0
449,Ricky Rubio,Phoenix Suns,PG,10/21/90,16200000.0
450,,,,,


In [60]:
# with Series .sort_values() method is straightforward as there is only 1 column

nba["Name"].sort_values() # we see values sorted in ascending order a-z

52          Aaron Gordon
101        Aaron Holiday
437          Abdel Nader
81           Adam Mokoka
399    Admiral Schofield
             ...        
302         Zach Norvell
312         Zhaire Smith
137      Zion Williamson
248       Zylan Cheatham
450                  NaN
Name: Name, Length: 451, dtype: object

In [61]:
# with Df .sort_values() method needs an indication of which column(s) to sort by
# if not, Pandas does not know what to do, and will throw an error

nba.sort_values()

TypeError: DataFrame.sort_values() missing 1 required positional argument: 'by'

In [62]:
# with Df we need at least to specify 1 argument: .sort_values(by = ) which specifies the row(s) by which to sort

nba.sort_values(by = "Name")

Unnamed: 0,Name,Team,Position,Birthday,Salary
52,Aaron Gordon,Orlando Magic,PF,9/16/95,19863636.0
101,Aaron Holiday,Indiana Pacers,PG,9/30/96,2239200.0
437,Abdel Nader,Oklahoma City Thunder,SF,9/25/93,1618520.0
81,Adam Mokoka,Chicago Bulls,G,7/18/98,79568.0
399,Admiral Schofield,Washington Wizards,SF,3/30/97,1000000.0
...,...,...,...,...,...
302,Zach Norvell,Los Angeles Lakers,SG,12/9/97,79568.0
312,Zhaire Smith,Philadelphia 76ers,SG,6/4/99,3058800.0
137,Zion Williamson,New Orleans Pelicans,F,7/6/00,9757440.0
248,Zylan Cheatham,New Orleans Pelicans,SF,11/17/95,79568.0


In [63]:
nba.sort_values(by = "Salary", ascending = False) # we can also sort in descending order

Unnamed: 0,Name,Team,Position,Birthday,Salary
205,Stephen Curry,Golden State Warriors,PG,3/14/88,40231758.0
219,Russell Westbrook,Houston Rockets,PG,11/12/88,38506482.0
38,Chris Paul,Oklahoma City Thunder,PG,5/6/85,38506482.0
264,James Harden,Houston Rockets,PG,8/26/89,38199000.0
251,John Wall,Washington Wizards,PG,9/6/90,38199000.0
...,...,...,...,...,...
218,Jared Harper,Phoenix Suns,PG,9/14/97,79568.0
411,Chris Clemons,Houston Rockets,SG,7/23/97,79568.0
283,Garrison Mathews,Washington Wizards,SG,10/24/96,79568.0
2,PJ Washington,Charlotte Hornets,,8/23/98,


In [64]:
# we see that, no matter the order we sort in, by default Pandas puts NaN at the very end
# however, we can also change the parameter na_position

nba.sort_values(by = "Salary", ascending = False, na_position = "first")

Unnamed: 0,Name,Team,Position,Birthday,Salary
2,PJ Washington,Charlotte Hornets,,8/23/98,
450,,,,,
205,Stephen Curry,Golden State Warriors,PG,3/14/88,40231758.0
219,Russell Westbrook,Houston Rockets,PG,11/12/88,38506482.0
38,Chris Paul,Oklahoma City Thunder,PG,5/6/85,38506482.0
...,...,...,...,...,...
227,Amir Coffey,Los Angeles Clippers,G,6/17/97,79568.0
221,Oshae Brissett,Toronto Raptors,SF,6/20/98,79568.0
218,Jared Harper,Phoenix Suns,PG,9/14/97,79568.0
411,Chris Clemons,Houston Rockets,SG,7/23/97,79568.0


## 12. Sort a DataFrame with the `sort_values` Method II

- To sort by **multiple columns**, pass the `by` parameter a list of column names. Pandas will sort in the specified column order (first to last).
- Pass the `ascending` parameter a Boolean to sort all columns in a consistent order (all ascending or all descending).
- Pass `ascending` a list to customize the sort order **per column**. The `ascending` list length must match the `by` list.


In [65]:
# load dataset

nba = pd.read_csv("nba.csv")
nba.tail()

Unnamed: 0,Name,Team,Position,Birthday,Salary
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0
449,Ricky Rubio,Phoenix Suns,PG,10/21/90,16200000.0
450,,,,,


In [66]:
nba.sort_values(by = ["Team","Name"]) # sequential sort: provide a list that tells Pandas to first sort by 'Team', then by 'Name'

Unnamed: 0,Name,Team,Position,Birthday,Salary
359,Alex Len,Atlanta Hawks,C,6/16/93,4160000.0
167,Allen Crabbe,Atlanta Hawks,SG,4/9/92,18500000.0
276,Brandon Goodwin,Atlanta Hawks,PG,10/2/95,79568.0
438,Bruno Fernando,Atlanta Hawks,C,8/15/98,1400000.0
194,Cam Reddish,Atlanta Hawks,SF,9/1/99,4245720.0
...,...,...,...,...,...
273,Justin Robinson,Washington Wizards,PG,10/12/97,898310.0
428,Moritz Wagner,Washington Wizards,C,4/26/97,2063520.0
21,Rui Hachimura,Washington Wizards,PF,2/8/98,4469160.0
36,Thomas Bryant,Washington Wizards,C,7/31/97,8000000.0


In [67]:
nba.sort_values(by = ["Team","Name"], ascending = True) # if we only give 1 boolean value for ascending ALL columns will be sorted thus
nba.sort_values(by = ["Team","Name"], ascending = [False, True]) # if you want one column to be sorted one way, other column other way: pass another list

Unnamed: 0,Name,Team,Position,Birthday,Salary
399,Admiral Schofield,Washington Wizards,SF,3/30/97,1000000.0
35,Bradley Beal,Washington Wizards,SG,6/28/93,27093018.0
353,Chris Chiozza,Washington Wizards,PG,11/21/95,79568.0
226,Davis Bertans,Washington Wizards,PF,11/12/92,7000000.0
283,Garrison Mathews,Washington Wizards,SG,10/24/96,79568.0
...,...,...,...,...,...
84,John Collins,Atlanta Hawks,PF,9/23/97,2686560.0
20,Kevin Huerter,Atlanta Hawks,SG,8/27/98,2636280.0
290,Tyrone Wallace,Atlanta Hawks,PG,6/10/94,1620564.0
98,Vince Carter,Atlanta Hawks,PF,1/26/77,2564753.0


## 13. Sort a DataFrame by its Index

- The `sort_index` method sorts the DataFrame by its index positions/labels.


In [68]:
# load dataset

nba = pd.read_csv("nba.csv")
nba = nba.sort_values(["Team", "Name"]) # make nba sorted Df
nba

Unnamed: 0,Name,Team,Position,Birthday,Salary
359,Alex Len,Atlanta Hawks,C,6/16/93,4160000.0
167,Allen Crabbe,Atlanta Hawks,SG,4/9/92,18500000.0
276,Brandon Goodwin,Atlanta Hawks,PG,10/2/95,79568.0
438,Bruno Fernando,Atlanta Hawks,C,8/15/98,1400000.0
194,Cam Reddish,Atlanta Hawks,SF,9/1/99,4245720.0
...,...,...,...,...,...
273,Justin Robinson,Washington Wizards,PG,10/12/97,898310.0
428,Moritz Wagner,Washington Wizards,C,4/26/97,2063520.0
21,Rui Hachimura,Washington Wizards,PF,2/8/98,4469160.0
36,Thomas Bryant,Washington Wizards,C,7/31/97,8000000.0


In [69]:
# to return back to the original unsorted Df we can re-open csv
# but we can also use .sort_index() method

nba.sort_index()

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0
...,...,...,...,...,...
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0
449,Ricky Rubio,Phoenix Suns,PG,10/21/90,16200000.0


## 14. Rank Values with the `.rank()` Method

- The `.rank()` method assigns a numeric ranking to each Series value.
- Pandas will assign the same rank to equal values and create a "gap" in the dataset for the ranks.


In [70]:
# load dataset

nba = pd.read_csv("nba.csv").dropna(how = "all")
nba

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697.0
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357.0
2,PJ Washington,Charlotte Hornets,,8/23/98,
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074.0
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568.0
...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310.0
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800.0
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000.0
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960.0


In [71]:
# we want to rank the players by salary
# so we want to rank by 'Salary' column

# Step 1: remove and replace NaN with 0
# Step 2: convert floats to integers
# Step 3: re-assign Salary column

nba["Salary"] = nba["Salary"].fillna(0).astype(int)
nba

Unnamed: 0,Name,Team,Position,Birthday,Salary
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357
2,PJ Washington,Charlotte Hornets,,8/23/98,0
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568
...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960


In [72]:
# now we can apply .rank() method to Salary column

nba["Salary"].rank(ascending = False) # we can set parameter ascending = False to have highest salaries at top

0      350.5
1      325.5
2      450.0
3      150.0
4      425.0
       ...  
445    281.0
446    247.0
447    184.0
448    189.0
449     74.0
Name: Salary, Length: 450, dtype: float64

In [73]:
nba["Salary"].rank(ascending = False).astype(int) # this again returns float values, so we set back to integers

0      350
1      325
2      450
3      150
4      425
      ... 
445    281
446    247
447    184
448    189
449     74
Name: Salary, Length: 450, dtype: int64

In [74]:
# now we add this rank as a new column to our Df
nba["Salary Rank"] = nba["Salary"].rank(ascending = False).astype(int)
nba

Unnamed: 0,Name,Team,Position,Birthday,Salary,Salary Rank
0,Shake Milton,Philadelphia 76ers,SG,9/26/96,1445697,350
1,Christian Wood,Detroit Pistons,PF,9/27/95,1645357,325
2,PJ Washington,Charlotte Hornets,,8/23/98,0,450
3,Derrick Rose,Detroit Pistons,PG,10/4/88,7317074,150
4,Marial Shayok,Philadelphia 76ers,G,7/26/95,79568,425
...,...,...,...,...,...,...
445,Austin Rivers,Houston Rockets,PG,8/1/92,2174310,281
446,Harry Giles,Sacramento Kings,PF,4/22/98,2578800,247
447,Robin Lopez,Milwaukee Bucks,C,4/1/88,4767000,184
448,Collin Sexton,Cleveland Cavaliers,PG,,4764960,189


In [75]:
# we can check whether this is correct, of course, by sorting on Salary column, and then corresponding Rank should be 1

nba.sort_values("Salary", ascending = False).head(10)

Unnamed: 0,Name,Team,Position,Birthday,Salary,Salary Rank
205,Stephen Curry,Golden State Warriors,PG,3/14/88,40231758,1
219,Russell Westbrook,Houston Rockets,PG,11/12/88,38506482,2
38,Chris Paul,Oklahoma City Thunder,PG,5/6/85,38506482,2
251,John Wall,Washington Wizards,PG,9/6/90,38199000,4
264,James Harden,Houston Rockets,PG,8/26/89,38199000,4
408,LeBron James,Los Angeles Lakers,PF,12/30/84,37436858,6
95,Kevin Durant,Brooklyn Nets,PF,9/29/88,37199000,7
317,Blake Griffin,Detroit Pistons,PF,3/16/89,34449964,8
323,Kyle Lowry,Toronto Raptors,PG,3/25/86,33296296,9
397,Paul George,Los Angeles Clippers,SF,5/2/90,33005556,10
