# Appendix 3: Python Libraries Crash Course

## Part 5: Pandas Intermediate 2

## Filtering DataFrames with one Condition

In [3]:
import pandas as pd

In [4]:
titanic = pd.read_csv("titanic.csv")

In [5]:
titanic.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,
5,0,3,male,,0,0,8.4583,Q,
6,0,1,male,54.0,0,0,51.8625,S,E
7,0,3,male,2.0,3,1,21.075,S,
8,1,3,female,27.0,0,2,11.1333,S,
9,1,2,female,14.0,1,0,30.0708,C,


In [6]:
titanic.sex.head(10)

0      male
1    female
2    female
3    female
4      male
5      male
6      male
7      male
8    female
9    female
Name: sex, dtype: object

In [7]:
titanic.sex == "male"

0       True
1      False
2      False
3      False
4       True
       ...  
886     True
887    False
888    False
889     True
890     True
Name: sex, Length: 891, dtype: bool

In [8]:
titanic[titanic.sex == "male"]["fare"]

0       7.2500
4       8.0500
5       8.4583
6      51.8625
7      21.0750
        ...   
883    10.5000
884     7.0500
886    13.0000
889    30.0000
890     7.7500
Name: fare, Length: 577, dtype: float64

In [9]:
titanic.loc[titanic.sex == "male", "fare"]

0       7.2500
4       8.0500
5       8.4583
6      51.8625
7      21.0750
        ...   
883    10.5000
884     7.0500
886    13.0000
889    30.0000
890     7.7500
Name: fare, Length: 577, dtype: float64

In [10]:
mask1 = titanic.sex == "male"
mask1

0       True
1      False
2      False
3      False
4       True
       ...  
886     True
887    False
888    False
889     True
890     True
Name: sex, Length: 891, dtype: bool

In [11]:
titanic_male = titanic.loc[mask1]

In [12]:
titanic_male.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
4,0,3,male,35.0,0,0,8.05,S,
5,0,3,male,,0,0,8.4583,Q,
6,0,1,male,54.0,0,0,51.8625,S,E
7,0,3,male,2.0,3,1,21.075,S,


In [13]:
titanic.dtypes# == object

survived      int64
pclass        int64
sex          object
age         float64
sibsp         int64
parch         int64
fare        float64
embarked     object
deck         object
dtype: object

In [14]:
mask2 = titanic.dtypes == object
mask2

survived    False
pclass      False
sex          True
age         False
sibsp       False
parch       False
fare        False
embarked     True
deck         True
dtype: bool

In [15]:
titanic.loc[:, ~mask2]

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,3,22.0,1,0,7.2500
1,1,1,38.0,1,0,71.2833
2,1,3,26.0,0,0,7.9250
3,1,1,35.0,1,0,53.1000
4,0,3,35.0,0,0,8.0500
...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000
887,1,1,19.0,0,0,30.0000
888,0,3,,1,2,23.4500
889,1,1,26.0,0,0,30.0000


In [16]:
titanic.loc[mask1, ~mask2]

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
0,0,3,22.0,1,0,7.2500
4,0,3,35.0,0,0,8.0500
5,0,3,,0,0,8.4583
6,0,1,54.0,0,0,51.8625
7,0,3,2.0,3,1,21.0750
...,...,...,...,...,...,...
883,0,2,28.0,0,0,10.5000
884,0,3,25.0,0,0,7.0500
886,0,2,27.0,0,0,13.0000
889,1,1,26.0,0,0,30.0000


## Filtering DataFrames with many Conditions (AND)

In [17]:
import pandas as pd

In [18]:
titanic = pd.read_csv("titanic.csv")

In [19]:
titanic.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,
5,0,3,male,,0,0,8.4583,Q,
6,0,1,male,54.0,0,0,51.8625,S,E
7,0,3,male,2.0,3,1,21.075,S,
8,1,3,female,27.0,0,2,11.1333,S,
9,1,2,female,14.0,1,0,30.0708,C,


In [20]:
mask1 = titanic.sex == "male"
mask1.head()

0     True
1    False
2    False
3    False
4     True
Name: sex, dtype: bool

In [21]:
mask2 = titanic.age > 14
mask2.head()

0    True
1    True
2    True
3    True
4    True
Name: age, dtype: bool

In [22]:
(mask1 & mask2).head()

0     True
1    False
2    False
3    False
4     True
dtype: bool

In [23]:
male_adult = titanic.loc[mask1 % mask2, ["survived", "pclass", "sex", "age"]]
male_adult.head(20)

Unnamed: 0,survived,pclass,sex,age
0,0,3,male,22.0
0,0,3,male,22.0
0,0,3,male,22.0
0,0,3,male,22.0
0,0,3,male,22.0
0,0,3,male,22.0
0,0,3,male,22.0
0,0,3,male,22.0
0,0,3,male,22.0
0,0,3,male,22.0


In [24]:
male_adult.info()

<class 'pandas.core.frame.DataFrame'>
Index: 891 entries, 0 to 0
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       891 non-null    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 34.8+ KB


In [25]:
male_adult.describe()

Unnamed: 0,survived,pclass,age
count,891.0,891.0,891.0
mean,0.0,3.0,22.0
std,0.0,0.0,0.0
min,0.0,3.0,22.0
25%,0.0,3.0,22.0
50%,0.0,3.0,22.0
75%,0.0,3.0,22.0
max,0.0,3.0,22.0


In [26]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


## Filtering DataFrames with many Conditions (OR)

In [27]:
import pandas as pd

In [28]:
titanic = pd.read_csv("titanic.csv")

In [29]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [30]:
mask1 = titanic.sex == "female"
mask1.head(20)

0     False
1      True
2      True
3      True
4     False
5     False
6     False
7     False
8      True
9      True
10     True
11     True
12    False
13    False
14     True
15     True
16    False
17    False
18     True
19     True
Name: sex, dtype: bool

In [31]:
mask2 = titanic.age < 14
mask2.head(20)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7      True
8     False
9     False
10     True
11    False
12    False
13    False
14    False
15    False
16     True
17    False
18    False
19    False
Name: age, dtype: bool

In [32]:
(mask1 | mask2).head(11)

0     False
1      True
2      True
3      True
4     False
5     False
6     False
7      True
8      True
9      True
10     True
dtype: bool

In [33]:
titanic.loc[mask1 | mask2]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.9250,S,
3,1,1,female,35.0,1,0,53.1000,S,C
7,0,3,male,2.0,3,1,21.0750,S,
8,1,3,female,27.0,0,2,11.1333,S,
...,...,...,...,...,...,...,...,...,...
880,1,2,female,25.0,0,1,26.0000,S,
882,0,3,female,22.0,0,0,10.5167,S,
885,0,3,female,39.0,0,5,29.1250,Q,
887,1,1,female,19.0,0,0,30.0000,S,B


In [34]:
wom_or_chi = titanic.loc[mask1 | mask2, ["survived", "pclass", "sex", "age"]]

In [35]:
wom_or_chi.head()

Unnamed: 0,survived,pclass,sex,age
1,1,1,female,38.0
2,1,3,female,26.0
3,1,1,female,35.0
7,0,3,male,2.0
8,1,3,female,27.0


In [36]:
wom_or_chi.info()

<class 'pandas.core.frame.DataFrame'>
Index: 351 entries, 1 to 888
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  351 non-null    int64  
 1   pclass    351 non-null    int64  
 2   sex       351 non-null    object 
 3   age       298 non-null    float64
dtypes: float64(1), int64(2), object(1)
memory usage: 13.7+ KB


In [37]:
wom_or_chi.describe()

Unnamed: 0,survived,pclass,age
count,351.0,351.0,298.0
mean,0.723647,2.205128,25.039161
std,0.447832,0.847232,15.314631
min,0.0,1.0,0.42
25%,0.0,1.0,14.125
50%,1.0,2.0,24.0
75%,1.0,3.0,35.0
max,1.0,3.0,63.0


In [38]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


## Advanced Filtering with between(), isin() and ~

In [39]:
import pandas as pd

In [40]:
summer = pd.read_csv("summer.csv")

In [41]:
summer.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [42]:
og_1988 = summer.loc[summer.Year == 1988]

In [43]:
og_1988.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
18051,1988,Seoul,Aquatics,Diving,"MENA CAMPOS, Jesus",MEX,Men,10M Platform,Bronze
18052,1988,Seoul,Aquatics,Diving,"LOUGANIS, Gregory",USA,Men,10M Platform,Gold
18053,1988,Seoul,Aquatics,Diving,"XIONG, Ni",CHN,Men,10M Platform,Silver
18054,1988,Seoul,Aquatics,Diving,"WYLAND-WILLIAMS, Wendy Lian",USA,Women,10M Platform,Bronze
18055,1988,Seoul,Aquatics,Diving,"XU, Yan-Mei",CHN,Women,10M Platform,Gold


In [44]:
og_1988.tail()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
19592,1988,Seoul,Wrestling,Wrestling Gre-R,"KOMCHEV, Atanas Slavov",BUL,Men,82 - 90KG (Light-Heavyweight),Gold
19593,1988,Seoul,Wrestling,Wrestling Gre-R,"KOSKELA, Harri Matias",FIN,Men,82 - 90KG (Light-Heavyweight),Silver
19594,1988,Seoul,Wrestling,Wrestling Gre-R,"KOSLOWSKI, Dennis Marvin",USA,Men,90 - 100KG (Heavyweight),Bronze
19595,1988,Seoul,Wrestling,Wrestling Gre-R,"WRONSKI, Andrzej",POL,Men,90 - 100KG (Heavyweight),Gold
19596,1988,Seoul,Wrestling,Wrestling Gre-R,"HIMMEL, Gerhard",FRG,Men,90 - 100KG (Heavyweight),Silver


In [45]:
og_1988.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1546 entries, 18051 to 19596
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Year        1546 non-null   int64 
 1   City        1546 non-null   object
 2   Sport       1546 non-null   object
 3   Discipline  1546 non-null   object
 4   Athlete     1546 non-null   object
 5   Country     1546 non-null   object
 6   Gender      1546 non-null   object
 7   Event       1546 non-null   object
 8   Medal       1546 non-null   object
dtypes: int64(1), object(8)
memory usage: 120.8+ KB


In [46]:
og_since1992 = summer.loc[summer.Year >= 1992]

In [47]:
og_since1992.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
19597,1992,Barcelona,Aquatics,Diving,"XIONG, Ni",CHN,Men,10M Platform,Bronze
19598,1992,Barcelona,Aquatics,Diving,"SUN, Shuwei",CHN,Men,10M Platform,Gold
19599,1992,Barcelona,Aquatics,Diving,"DONIE, Scott R.",USA,Men,10M Platform,Silver
19600,1992,Barcelona,Aquatics,Diving,"CLARK, Mary Ellen",USA,Women,10M Platform,Bronze
19601,1992,Barcelona,Aquatics,Diving,"FU, Mingxia",CHN,Women,10M Platform,Gold


In [48]:
og_since1992.tail()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
31160,2012,London,Wrestling,Wrestling Freestyle,"JANIKOWSKI, Damian",POL,Men,Wg 84 KG,Bronze
31161,2012,London,Wrestling,Wrestling Freestyle,"REZAEI, Ghasem Gholamreza",IRI,Men,Wg 96 KG,Gold
31162,2012,London,Wrestling,Wrestling Freestyle,"TOTROV, Rustam",RUS,Men,Wg 96 KG,Silver
31163,2012,London,Wrestling,Wrestling Freestyle,"ALEKSANYAN, Artur",ARM,Men,Wg 96 KG,Bronze
31164,2012,London,Wrestling,Wrestling Freestyle,"LIDBERG, Jimmy",SWE,Men,Wg 96 KG,Bronze


In [49]:
summer.Year.between(1960, 1969).head()

0    False
1    False
2    False
3    False
4    False
Name: Year, dtype: bool

In [50]:
#og_60s = summer.loc[summer.Year.between(1960, 1969, inclusive=True)] # old

In [51]:
og_60s = summer.loc[summer.Year.between(1960, 1969, inclusive="both")] # new

In [52]:
og_60s.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
9792,1960,Rome,Aquatics,Diving,"PHELPS, Brian Eric",GBR,Men,10M Platform,Bronze
9793,1960,Rome,Aquatics,Diving,"WEBSTER, Robert David",USA,Men,10M Platform,Gold
9794,1960,Rome,Aquatics,Diving,"TOBIAN, Gary Milburn",USA,Men,10M Platform,Silver
9795,1960,Rome,Aquatics,Diving,"KRUTOVA, Ninel",URS,Women,10M Platform,Bronze
9796,1960,Rome,Aquatics,Diving,"KRÄMER-ENGEL-GULBIN, Ingrid",EUA,Women,10M Platform,Gold


In [53]:
og_60s.tail()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
12710,1968,Mexico,Wrestling,Wrestling Gre-R,"METZ, Lothar",GDR,Men,78 - 87KG (Middleweight),Gold
12711,1968,Mexico,Wrestling,Wrestling Gre-R,"OLEINIK, Valentin",URS,Men,78 - 87KG (Middleweight),Silver
12712,1968,Mexico,Wrestling,Wrestling Gre-R,"MARTINESCU, Nicolae",ROU,Men,87 - 97KG (Light-Heavyweight),Bronze
12713,1968,Mexico,Wrestling,Wrestling Gre-R,"RADEV, Boyan Aleksandrov",BUL,Men,87 - 97KG (Light-Heavyweight),Gold
12714,1968,Mexico,Wrestling,Wrestling Gre-R,"YAKOVENKO, Nikolai",URS,Men,87 - 97KG (Light-Heavyweight),Silver


In [54]:
my_favourite_games = [1972, 1996]

In [55]:
summer.Year.isin(my_favourite_games).head()

0    False
1    False
2    False
3    False
4    False
Name: Year, dtype: bool

In [56]:
og_72_96 = summer.loc[summer.Year.isin(my_favourite_games)]

In [57]:
og_72_96.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
12715,1972,Munich,Aquatics,Diving,"CAGNOTTO, Giorgio Franco",ITA,Men,10M Platform,Bronze
12716,1972,Munich,Aquatics,Diving,"DIBIASI, Klaus",ITA,Men,10M Platform,Gold
12717,1972,Munich,Aquatics,Diving,"RYDZE, Richard Anthony",USA,Men,10M Platform,Silver
12718,1972,Munich,Aquatics,Diving,"JANICKE, Marina",GDR,Women,10M Platform,Bronze
12719,1972,Munich,Aquatics,Diving,"KNAPE-LINDBERGH, Ulrika",SWE,Women,10M Platform,Gold


In [58]:
og_72_96.tail()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
23156,1996,Atlanta,Wrestling,Wrestling Gre-R,"OLEYNYK, Vyacheslav",UKR,Men,82 - 90KG (Light-Heavyweight),Gold
23157,1996,Atlanta,Wrestling,Wrestling Gre-R,"FAFINSKI, Jacek",POL,Men,82 - 90KG (Light-Heavyweight),Silver
23158,1996,Atlanta,Wrestling,Wrestling Gre-R,"LJUNGBERG, Mikael",SWE,Men,90 - 100KG (Heavyweight),Bronze
23159,1996,Atlanta,Wrestling,Wrestling Gre-R,"WRONSKI, Andrzej",POL,Men,90 - 100KG (Heavyweight),Gold
23160,1996,Atlanta,Wrestling,Wrestling Gre-R,"LISHTVAN, Sergey",BLR,Men,90 - 100KG (Heavyweight),Silver


In [59]:
og_not_72_96 = summer.loc[~summer.Year.isin(my_favourite_games)]

In [60]:
og_not_72_96.head()

Unnamed: 0,Year,City,Sport,Discipline,Athlete,Country,Gender,Event,Medal
0,1896,Athens,Aquatics,Swimming,"HAJOS, Alfred",HUN,Men,100M Freestyle,Gold
1,1896,Athens,Aquatics,Swimming,"HERSCHMANN, Otto",AUT,Men,100M Freestyle,Silver
2,1896,Athens,Aquatics,Swimming,"DRIVAS, Dimitrios",GRE,Men,100M Freestyle For Sailors,Bronze
3,1896,Athens,Aquatics,Swimming,"MALOKINIS, Ioannis",GRE,Men,100M Freestyle For Sailors,Gold
4,1896,Athens,Aquatics,Swimming,"CHASAPIS, Spiridon",GRE,Men,100M Freestyle For Sailors,Silver


In [61]:
og_not_72_96.Year.unique()

array([1896, 1900, 1904, 1908, 1912, 1920, 1924, 1928, 1932, 1936, 1948,
       1952, 1956, 1960, 1964, 1968, 1976, 1980, 1984, 1988, 1992, 2000,
       2004, 2008, 2012])

## Intro to NA Values

In [62]:
import pandas as pd
import numpy as np

In [63]:
sales = pd.read_csv("sales.csv", index_col = 0)

In [64]:
sales

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,34,27,15,,33
Mike,45,9,74,87.0,12
Andi,17,33,54,8.0,29
Paul,87,67,27,45.0,7


In [65]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Steven to Paul
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Mon     4 non-null      int64  
 1   Tue     4 non-null      int64  
 2   Wed     4 non-null      int64  
 3   Thu     3 non-null      float64
 4   Fri     4 non-null      int64  
dtypes: float64(1), int64(4)
memory usage: 192.0+ bytes


In [66]:
sales.loc["Steven", "Thu"]

nan

In [67]:
sales.iloc[1,1] = None

In [68]:
sales

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,34,27.0,15,,33
Mike,45,,74,87.0,12
Andi,17,33.0,54,8.0,29
Paul,87,67.0,27,45.0,7


In [69]:
sales.iloc[2,2] = np.nan

In [70]:
sales

Unnamed: 0,Mon,Tue,Wed,Thu,Fri
Steven,34,27.0,15.0,,33
Mike,45,,74.0,87.0,12
Andi,17,33.0,,8.0,29
Paul,87,67.0,27.0,45.0,7


In [71]:
sales.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, Steven to Paul
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Mon     4 non-null      int64  
 1   Tue     3 non-null      float64
 2   Wed     3 non-null      float64
 3   Thu     3 non-null      float64
 4   Fri     4 non-null      int64  
dtypes: float64(3), int64(2)
memory usage: 364.0+ bytes


## Handling NA Values / missing Values

In [72]:
import pandas as pd

In [73]:
titanic = pd.read_csv("titanic.csv")

In [74]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [75]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
 8   deck      203 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.8+ KB


In [76]:
titanic.isna().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
deck        688
dtype: int64

In [77]:
titanic.notna().sum()

survived    891
pclass      891
sex         891
age         714
sibsp       891
parch       891
fare        891
embarked    889
deck        203
dtype: int64

In [78]:
titanic.loc[titanic.embarked.isna()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
61,1,1,female,38.0,0,0,80.0,,B
829,1,1,female,62.0,0,0,80.0,,B


In [79]:
titanic.shape

(891, 9)

In [80]:
titanic.dropna()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
1,1,1,female,38.0,1,0,71.2833,C,C
3,1,1,female,35.0,1,0,53.1000,S,C
6,0,1,male,54.0,0,0,51.8625,S,E
10,1,3,female,4.0,1,1,16.7000,S,G
11,1,1,female,58.0,0,0,26.5500,S,C
...,...,...,...,...,...,...,...,...,...
871,1,1,female,47.0,1,1,52.5542,S,D
872,0,1,male,33.0,0,0,5.0000,S,B
879,1,1,female,56.0,0,1,83.1583,C,C
887,1,1,female,19.0,0,0,30.0000,S,B


In [81]:
titanic.dropna().shape

(182, 9)

In [82]:
titanic.dropna(how = "all").shape

(891, 9)

In [83]:
titanic.dropna(axis = 1, how = "any").shape

(891, 6)

In [84]:
titanic.dropna(axis = 1, thresh = 500).shape

(891, 8)

In [85]:
titanic.dropna(axis = 1, thresh = 500, inplace = True)

In [86]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


In [87]:
titanic.loc[titanic.age.isna()]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
5,0,3,male,,0,0,8.4583,Q
17,1,2,male,,0,0,13.0000,S
19,1,3,female,,0,0,7.2250,C
26,0,3,male,,0,0,7.2250,C
28,1,3,female,,0,0,7.8792,Q
...,...,...,...,...,...,...,...,...
859,0,3,male,,0,0,7.2292,C
863,0,3,female,,8,2,69.5500,S
868,0,3,male,,0,0,9.5000,S
878,0,3,male,,0,0,7.8958,S


In [88]:
mean_age = titanic.age.mean()
mean_age

29.69911764705882

In [89]:
# titanic.age.fillna(value = mean_age, inplace = True) # old

In [90]:
titanic["age"] = titanic["age"].fillna(value = mean_age) # new

In [91]:
titanic.age

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: age, Length: 891, dtype: float64

In [92]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       891 non-null    float64
 4   sibsp     891 non-null    int64  
 5   parch     891 non-null    int64  
 6   fare      891 non-null    float64
 7   embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 55.8+ KB


## Exporting DataFrames to csv

In [93]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [94]:
titanic.to_csv("clean_df.csv", index = False)

In [95]:
pd.read_csv("clean_df.csv")

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.000000,1,0,7.2500,S
1,1,1,female,38.000000,1,0,71.2833,C
2,1,3,female,26.000000,0,0,7.9250,S
3,1,1,female,35.000000,1,0,53.1000,S
4,0,3,male,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S
887,1,1,female,19.000000,0,0,30.0000,S
888,0,3,female,29.699118,1,2,23.4500,S
889,1,1,male,26.000000,0,0,30.0000,C


## Summary Statistics and Accumulations

In [96]:
import pandas as pd

In [97]:
titanic = pd.read_csv("titanic.csv")

In [98]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [99]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [100]:
titanic.count(axis = "columns")

0      8
1      9
2      8
3      9
4      8
      ..
886    8
887    9
888    7
889    9
890    8
Length: 891, dtype: int64

In [101]:
titanic.count(axis = 1)

0      8
1      9
2      8
3      9
4      8
      ..
886    8
887    9
888    7
889    9
890    8
Length: 891, dtype: int64

In [102]:
#titanic.mean(axis = 1) # old

In [103]:
titanic.mean(axis = 1, numeric_only=True) # new

0       5.541667
1      18.713883
2       6.320833
3      15.183333
4       7.675000
         ...    
886     7.000000
887     8.500000
888     5.890000
889     9.666667
890     7.125000
Length: 891, dtype: float64

In [104]:
#titanic.sum(axis = 0) # old

In [105]:
titanic.sum(axis = 0, numeric_only=True) # new

survived      342.0000
pclass       2057.0000
age         21205.1700
sibsp         466.0000
parch         340.0000
fare        28693.9493
dtype: float64

In [106]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,deck
0,0,3,male,22.0,1,0,7.25,S,
1,1,1,female,38.0,1,0,71.2833,C,C
2,1,3,female,26.0,0,0,7.925,S,
3,1,1,female,35.0,1,0,53.1,S,C
4,0,3,male,35.0,0,0,8.05,S,


In [107]:
titanic.fare.cumsum(axis = 0)

0          7.2500
1         78.5333
2         86.4583
3        139.5583
4        147.6083
          ...    
886    28602.7493
887    28632.7493
888    28656.1993
889    28686.1993
890    28693.9493
Name: fare, Length: 891, dtype: float64

In [108]:
#titanic.corr() # old

In [109]:
titanic.corr(numeric_only=True) # new

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
sibsp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [110]:
titanic.survived.corr(titanic.pclass)

-0.3384810359610158