In [66]:
import pandas as pd

# Ranking with rank()

In [67]:
# Daily sales keyed by weekday label; the key order becomes the index order.
sales = pd.Series(
    {
        "Mo": 15,
        "Tue": 32,
        "Wed": 45,
        "Thu": 21,
        "Fri": 55,
        "Sat": 15,
        "Sun": 0,
    }
)

In [68]:
# Display the Series (weekday label -> sales value).
sales

Mo     15
Tue    32
Wed    45
Thu    21
Fri    55
Sat    15
Sun     0
dtype: int64

In [69]:
# Sort the sales from the highest value down to the lowest.
sales.sort_values(ascending = False)

Fri    55
Wed    45
Tue    32
Thu    21
Mo     15
Sat    15
Sun     0
dtype: int64

In [70]:
# Default ranking: the smallest value gets rank 1.0, and tied values share the
# average of their ranks (Mo and Sat both get 2.5).
sales.rank()

Mo     2.5
Tue    5.0
Wed    6.0
Thu    4.0
Fri    7.0
Sat    2.5
Sun    1.0
dtype: float64

In [71]:
# Same ranks as above, listed from the highest rank down to the lowest.
sales.rank().sort_values(ascending = False)

Fri    7.0
Wed    6.0
Tue    5.0
Thu    4.0
Mo     2.5
Sat    2.5
Sun    1.0
dtype: float64

In [72]:
# ascending=False gives the largest value rank 1.0; sorting the ranks lists the
# best day first. The tied days (Mo, Sat) still share the average rank 5.5.
sales.rank(ascending=False).sort_values(ascending = True)

Fri    1.0
Wed    2.0
Tue    3.0
Thu    4.0
Mo     5.5
Sat    5.5
Sun    7.0
dtype: float64

In [73]:
# 5.5 is not an error: by default, tied values share the average of the ranks
# they occupy (5 and 6 here). method="min" instead assigns every tied element
# the lowest rank of its group, so Mo and Sat both get 5.0 and rank 6 is skipped.
sales.rank(ascending=False, method = "min").sort_values(ascending = True)

Fri    1.0
Wed    2.0
Tue    3.0
Thu    4.0
Mo     5.0
Sat    5.0
Sun    7.0
dtype: float64

In [74]:
# pct=True returns each rank as a fraction of the number of values
# (rank / 7 here), e.g. 5 / 7 = 0.714286 for the two tied days.
sales.rank(ascending=False, method = "min", pct=True).sort_values(ascending = True)

Fri    0.142857
Wed    0.285714
Tue    0.428571
Thu    0.571429
Mo     0.714286
Sat    0.714286
Sun    1.000000
dtype: float64

In [75]:
# Load the Titanic passenger data; every later `titanic` cell depends on this one.
titanic = pd.read_csv("titanic.csv")

In [76]:
# Rank fares in descending order (higher fare -> lower rank number). With the
# default tie handling, equal fares share an averaged rank such as 659.5.
titanic.fare.rank(ascending = False)

0      815.0
1      103.0
2      659.5
3      144.0
4      628.0
       ...  
886    484.5
887    237.5
888    345.5
889    237.5
890    768.5
Name: fare, Length: 891, dtype: float64

In [77]:
# Store the descending fare rank as a new column. method="min" gives tied fares
# the lowest rank of their group, so the ranks stay whole numbers.
# NOTE: this adds a column to `titanic` in place; later cells rely on it.
titanic["fare_rank"] = titanic.fare.rank(ascending = False, method="min")
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,fare_rank
0,0,3,male,22.0,1,0,7.2500,S,,809.0
1,1,1,female,38.0,1,0,71.2833,C,C,103.0
2,1,3,female,26.0,0,0,7.9250,S,,651.0
3,1,1,female,35.0,1,0,53.1000,S,C,142.0
4,0,3,male,35.0,0,0,8.0500,S,,607.0
...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,,464.0
887,1,1,female,19.0,0,0,30.0000,S,B,235.0
888,0,3,female,,1,2,23.4500,S,,345.0
889,1,1,male,26.0,0,0,30.0000,C,C,235.0


In [78]:
# Most expensive fares first: the three top fares tie at fare_rank 1.0,
# so the next fare gets rank 4.0.
titanic.sort_values("fare", ascending = False)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,fare_rank
258,1,1,female,35.0,0,0,512.3292,C,,1.0
737,1,1,male,35.0,0,0,512.3292,C,B,1.0
679,1,1,male,36.0,0,1,512.3292,C,B,1.0
88,1,1,female,23.0,3,2,263.0000,S,C,4.0
27,0,1,male,19.0,3,2,263.0000,S,C,4.0
...,...,...,...,...,...,...,...,...,...,...
633,0,1,male,,0,0,0.0000,S,,877.0
413,0,2,male,,0,0,0.0000,S,,877.0
822,0,1,male,38.0,0,0,0.0000,S,,877.0
732,0,2,male,,0,0,0.0000,S,,877.0


# nunique(), nlargest() and nsmallest() with DataFrames

• The nunique() method in pandas is used to **count the number of unique** elements in a Series or DataFrame.

• The nlargest() method in pandas is used to **get the n largest** elements from a Series or DataFrame.

• The nsmallest() method in pandas is used to **get the n smallest** elements from a Series or DataFrame.

In [79]:
# Array of the distinct age values; NaN (missing age) appears as one of them.
titanic.age.unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [80]:
# Number of distinct values per column (axis=0). dropna=False counts NaN as a
# value too, which is why age reports 89 -- the length of the unique() array above.
titanic.nunique(axis = 0, dropna=False)

survived       2
pclass         3
sex            2
age           89
sibsp          7
parch          7
fare         248
embarked       4
deck           8
fare_rank    248
dtype: int64

In [81]:
# The 5 rows with the highest fare.
titanic.nlargest(n = 5, columns = "fare")

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,fare_rank
258,1,1,female,35.0,0,0,512.3292,C,,1.0
679,1,1,male,36.0,0,1,512.3292,C,B,1.0
737,1,1,male,35.0,0,0,512.3292,C,B,1.0
27,0,1,male,19.0,3,2,263.0,S,C,4.0
88,1,1,female,23.0,3,2,263.0,S,C,4.0


In [82]:
# Full descending sort for comparison: its first five rows are the same
# passengers nlargest() returned (only the order within ties differs).
titanic.sort_values("fare", ascending = False)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,fare_rank
258,1,1,female,35.0,0,0,512.3292,C,,1.0
737,1,1,male,35.0,0,0,512.3292,C,B,1.0
679,1,1,male,36.0,0,1,512.3292,C,B,1.0
88,1,1,female,23.0,3,2,263.0000,S,C,4.0
27,0,1,male,19.0,3,2,263.0000,S,C,4.0
...,...,...,...,...,...,...,...,...,...,...
633,0,1,male,,0,0,0.0000,S,,877.0
413,0,2,male,,0,0,0.0000,S,,877.0
822,0,1,male,38.0,0,0,0.0000,S,,877.0
732,0,2,male,,0,0,0.0000,S,,877.0


In [83]:
# The single row with the lowest age.
titanic.nsmallest(n = 1, columns = "age")

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck,fare_rank
803,1,3,male,0.42,0,1,8.5167,C,,598.0


In [1]:
# idxmin() gives the index label of the smallest age; .loc then returns that
# entire row.
# NOTE: this cell needs `titanic` in the namespace. After a kernel restart,
# re-run the read_csv cell first, otherwise it fails with a NameError.
titanic.loc[titanic.age.idxmin()]

NameError: name 'titanic' is not defined

# Summary Statistics

In [85]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,fare_rank
count,891.0,891.0,714.0,891.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208,439.961841
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429,254.525014
min,0.0,1.0,0.42,0.0,0.0,0.0,1.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104,223.0
50%,0.0,3.0,28.0,0.0,0.0,14.4542,445.0
75%,1.0,3.0,38.0,1.0,0.0,31.0,660.0
max,1.0,3.0,80.0,8.0,6.0,512.3292,877.0


In [86]:
# Checking for missing values: count() reports the non-null entries per column,
# so anything below 891 (the number of rows) has gaps -- here age, embarked and deck.
titanic.count(axis = "index")

survived     891
pclass       891
sex          891
age          714
sibsp        891
parch        891
fare         891
embarked     889
deck         203
fare_rank    891
dtype: int64

In [87]:
# Row-wise mean (axis=1): the average of the numeric columns for each passenger.
# Note that this is mean(), not a sum.
titanic.mean(axis = 1, numeric_only=True)

0      120.321429
1       30.754757
2       98.417857
3       33.300000
4       93.292857
          ...    
886     72.285714
887     40.857143
888     62.408333
889     41.857143
890    113.535714
Length: 891, dtype: float64

In [88]:
# Column-wise sum (axis=0): one total per numeric column.
titanic.sum(axis = 0, numeric_only = True)

survived        342.0000
pclass         2057.0000
age           21205.1700
sibsp           466.0000
parch           340.0000
fare          28693.9493
fare_rank    392006.0000
dtype: float64

In [89]:
# cumsum() returns the running total of the fares down the rows; the last value
# (28693.9493) equals the overall fare sum computed above.
titanic.fare.cumsum(axis = 0)

0          7.2500
1         78.5333
2         86.4583
3        139.5583
4        147.6083
          ...    
886    28602.7493
887    28632.7493
888    28656.1993
889    28686.1993
890    28693.9493
Name: fare, Length: 891, dtype: float64

In [90]:
# Pairwise correlation between the numeric columns.
titanic.corr(numeric_only = True)

#  1 indicates a perfect positive correlation,
# -1 indicates a perfect negative correlation, and
#  0 indicates no linear correlation between the variables.

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,fare_rank
survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307,-0.322046
pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495,0.70015
age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067,-0.116334
sibsp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651,-0.357585
parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225,-0.36602
fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0,-0.63949
fare_rank,-0.322046,0.70015,-0.116334,-0.357585,-0.36602,-0.63949,1.0


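As a rule of thumb, judge a correlation by its absolute value: 0.1 to 0.3 is considered weak, 0.3 to 0.5 moderate, and above 0.5 strong. The sign only tells you the direction of the relationship (for example, pclass vs. fare at about -0.55 is a strong negative correlation).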

# apply(), map()

1. apply() method:
apply() is a DataFrame method (Series has an apply() of its own, too) that applies a function along an axis of the DataFrame.

* With axis=0 (the default) the function receives each column in turn; with axis=1 it receives each row. For element-wise work on an entire DataFrame, use DataFrame.map() instead.
* It is very versatile and can take any function as an argument, including custom functions.
* It is more commonly used for complex operations that require applying a function to each row or column of a DataFrame.

2. map() method:
map() is a Series method that applies a function to each element of a Series; since pandas 2.1, DataFrame.map() does the same for every element of a DataFrame.
* It is primarily used for element-wise operations on Series objects.
* It can take a function, a dictionary, or a Series as an argument.
* It is often used for simple transformations like replacing values or mapping values to other values.

In [91]:
# Load the weekly sales table, using the first CSV column (the names) as the row index.
# NOTE: this rebinds `sales`, which was a Series in the ranking section, to a DataFrame.
sales = pd.read_csv("sales.csv", index_col = 0)
sales

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,34,27,15,,33
Mike,45,9,74,87.0,12
Andi,17,33,54,8.0,29
Paul,87,67,27,45.0,7


In [92]:
# Column dtypes and non-null counts: Thu has one missing value, which is why
# it is the only float64 column.
sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Steven to Paul
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Mon     4 non-null      int64  
 1   Tue     4 non-null      int64  
 2   Wed     4 non-null      int64  
 3   Thu     3 non-null      float64
 4   Fri     4 non-null      int64  
dtypes: float64(1), int64(4)
memory usage: 192.0+ bytes


In [93]:
# Smallest value in each column (axis=0).
sales.min(axis = 0)

Mon    17.0
Tue     9.0
Wed    15.0
Thu     8.0
Fri     7.0
dtype: float64

In [94]:
def range(series):
    """Return the spread of ``series``: its maximum minus its minimum.

    Works on anything exposing ``.max()`` and ``.min()``; when used with
    ``DataFrame.apply`` it is called once per column (or per row).
    """
    # NOTE: this name shadows Python's built-in range() for the rest of the
    # session; it is kept because later cells call it by this name.
    highest = series.max()
    lowest = series.min()
    return highest - lowest

In [95]:
# Apply the custom range() function (defined above) to every column (axis=0).
sales.apply(range, axis = 0)

Mon    70.0
Tue    58.0
Wed    59.0
Thu    79.0
Fri    26.0
dtype: float64

In [96]:
# Same result with an inline lambda instead of a named function.
sales.apply(lambda x: x.max() - x.min(), axis = 0)

Mon    70.0
Tue    58.0
Wed    59.0
Thu    79.0
Fri    26.0
dtype: float64

In [97]:
# Load the Summer Olympics medal data used in the apply()/map() examples below.
summer = pd.read_csv("summer.csv")

In [98]:
# Preview the first five rows.
summer.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [99]:
# First character of every athlete name, computed element by element with Series.apply().
summer.Athlete.apply(lambda x: x[0])

0        H
1        H
2        D
3        M
4        C
        ..
31160    J
31161    R
31162    T
31163    A
31164    L
Name: Athlete, Length: 31165, dtype: object

In [100]:
# Series.map() gives the same element-wise result as apply() above.
summer.Athlete.map(lambda x: x[0])

0        H
1        H
2        D
3        M
4        C
        ..
31160    J
31161    R
31162    T
31163    A
31164    L
Name: Athlete, Length: 31165, dtype: object

In [101]:
# DataFrame.map() applies the function to every element of the selected columns
# (positions 1 and 2: City and Sport). DataFrame.map() requires pandas >= 2.1;
# older versions call it applymap().
summer.iloc[:,1:3].map(lambda x: x[0])

Unnamed: 0,City,Sport
0,A,A
1,A,A
2,A,A
3,A,A
4,A,A
...,...,...
31160,L,W
31161,L,W
31162,L,W
31163,L,W
