In [1]:
import pandas as pd

# Load the passenger data; the path is relative, so the CSV is resolved
# against the current working directory.
titanic_survival = pd.read_csv("titanic_survival.csv")

In [2]:
# Pull out the "age" column and show the rows labelled 10 through 20
# (.loc slices by label, so both endpoints are included).
age = titanic_survival["age"]
print(age.loc[10:20])

# Keep only the entries whose age is missing, then count them.
age_null_true = age.loc[age.isnull()]
age_null_count = age_null_true.shape[0]

print(age_null_count)

10    47.0
11    18.0
12    24.0
13    26.0
14    80.0
15     NaN
16    24.0
17    50.0
18    32.0
19    36.0
20    37.0
Name: age, dtype: float64
264


In [3]:
# Mean age computed by hand: filter out the missing entries first so that the
# denominator counts only passengers whose age is actually known.
age = titanic_survival["age"]
age_is_null = pd.isnull(age)

# `~mask` is the idiomatic element-wise negation of a boolean mask
# (comparing a mask with `== False` works but is flagged by linters).
age_not_null = age[~age_is_null]

correct_mean_age = age_not_null.sum() / len(age_not_null)

correct_mean_age

29.8811345124283

### Using built-in functions: they ignore NaN values

In [5]:
# Series.mean() skips NaN by default, so no manual filtering is required.
correct_mean_age = titanic_survival.loc[:, "age"].mean()
correct_mean_fare = titanic_survival.loc[:, "fare"].mean()

correct_mean_fare

33.29547928134572

### Summary statistics:
Calculating the mean fare per passenger class

In [6]:
passenger_classes = [1, 2, 3]

# Map each class number to the mean fare of the passengers in that class.
fares_by_class = {
    pclass: titanic_survival.loc[titanic_survival["pclass"] == pclass, "fare"].mean()
    for pclass in passenger_classes
}

fares_by_class

{1: 87.50899164086687, 2: 21.1791963898917, 3: 13.302888700564957}

### Pivot tables
https://en.wikipedia.org/wiki/Pivot_table

In [16]:
import numpy as np
# Mean of "survived" per class. pivot_table aggregates with the mean by
# default; "survived" appears to be coded 0/1 in the displayed rows, in which
# case this mean is the fraction of each class that survived.
passenger_survival = titanic_survival.pivot_table(index="pclass", values="survived")

print(passenger_survival)

# Mean age per class. The aggregation is named by string: passing the NumPy
# callable (np.mean) is deprecated in pandas 2.1+ and emits a FutureWarning,
# while "mean" selects the same built-in aggregation.
passenger_age = titanic_survival.pivot_table(index="pclass", values="age", aggfunc="mean")

passenger_age

pclass
1.0    0.619195
2.0    0.429603
3.0    0.255289
Name: survived, dtype: float64


pclass
1.0    39.159918
2.0    29.506705
3.0    24.816367
Name: age, dtype: float64

In [8]:
import numpy as np

# Total fare paid and total of the "survived" column for each embarkation port.
# The aggregation is named by string: passing the NumPy callable (np.sum) is
# deprecated in pandas 2.1+ and emits a FutureWarning, while "sum" selects the
# same built-in aggregation.
port_stats = titanic_survival.pivot_table(index="embarked", values=["fare", "survived"], aggfunc="sum")

port_stats

Unnamed: 0_level_0,fare,survived
embarked,Unnamed: 1_level_1,Unnamed: 2_level_1
C,16830.7922,150.0
Q,1526.3085,44.0
S,25033.3862,304.0


### dropna
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html

In [9]:
# Drop every row that contains at least one missing value.
drop_na_rows = titanic_survival.dropna(axis="index")

# Drop every column that contains at least one missing value.
drop_na_columns = titanic_survival.dropna(axis="columns")

# Drop only the rows where "age" or "sex" is missing; other columns may
# still hold NaN in the rows that remain.
new_titanic_survival = titanic_survival.dropna(axis="index", subset=["age", "sex"])

new_titanic_survival.head(5)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### iloc and loc
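`iloc` stands for integer location. When you sort a DataFrame with `sort_values`, the rows are reordered but keep their original index labels, so labels no longer match positions. `iloc` selects rows by their integer position (0, 1, 2, ...), whereas `loc` selects them by index label.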

In [10]:
# NOTE(review): an earlier comment here said new_titanic_survival had already
# been sorted by age; no sort_values call is visible in this notebook, so
# confirm whether a sort was intended before relying on row order.

# Position-based selection with .iloc (the end of a slice is exclusive).
first_five_rows = new_titanic_survival.iloc[:5]
first_ten_rows = new_titanic_survival.iloc[:10]

# Position 4 is the fifth row.
row_position_fifth = new_titanic_survival.iloc[4]

# Label-based selection with .loc: the row whose index label is 25.
row_index_25 = new_titanic_survival.loc[25]

row_position_fifth

pclass                                                     1
survived                                                   0
name         Allison, Mrs. Hudson J C (Bessie Waldo Daniels)
sex                                                   female
age                                                       25
sibsp                                                      1
parch                                                      2
ticket                                                113781
fare                                                  151.55
cabin                                                C22 C26
embarked                                                   S
boat                                                     NaN
body                                                     NaN
home.dest                    Montreal, PQ / Chesterville, ON
Name: 4, dtype: object

In [11]:
# .iloc[row, column] selects by integer position.
first_row_first_column = new_titanic_survival.iloc[0, 0]
all_rows_first_three_columns = new_titanic_survival.iloc[:, :3]
five_rows_three_cols = new_titanic_survival.iloc[:5, :3]

# .loc[row, column] selects by index label and column name.
row__index_83_age = new_titanic_survival.loc[83, "age"]
# NOTE(review): the variable name says 1000 but the label looked up is 766;
# confirm which one is intended.
row_index_1000_pclass = new_titanic_survival.loc[766, "pclass"]
row_index_1100_age = new_titanic_survival.loc[1100, "age"]
row_index_25_survived = new_titanic_survival.loc[25, "survived"]

### Reset index
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reset_index.html

In [12]:
# Renumber the rows from 0. drop=True discards the old index labels instead
# of inserting them into the frame as a new column.
titanic_reindexed = new_titanic_survival.reset_index(drop=True)

# Preview the first three rows of the re-indexed frame.
titanic_reindexed.head(3)

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,"Allen, Miss. Elisabeth Walton",female,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1.0,1.0,"Allison, Master. Hudson Trevor",male,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1.0,0.0,"Allison, Miss. Helen Loraine",female,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


### Apply method
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html

In [13]:
def get_hundredth_item(column):
    """Return the item at integer position 99 (the hundredth entry) of ``column``.

    Raises IndexError if ``column`` holds fewer than 100 entries.
    """
    return column.iloc[99]

# apply() calls the helper once per column, giving a Series with the hundredth
# value of every column. The helper has its own name so that this assignment
# does not overwrite the function with its result.
hundredth_row = titanic_survival.apply(get_hundredth_item)

def column_null(column):
    """Return, as a plain int, how many entries of ``column`` are missing."""
    missing_mask = column.isnull()
    # True counts as 1, so summing the mask counts the missing entries.
    return int(missing_mask.sum())

# Count the missing values in every column (axis=0 hands apply() one column
# at a time, which is also its default).
column_null_count = titanic_survival.apply(column_null, axis=0)

column_null_count

pclass          1
survived        1
name            1
sex             1
age           264
sibsp           1
parch           1
ticket          1
fare            2
cabin        1015
embarked        3
boat          824
body         1189
home.dest     565
dtype: int64

In [18]:
def is_minor(row):
    """Return True when the row's "age" is below 18, otherwise False.

    A NaN age is not less than 18, so rows with a missing age yield False.
    """
    return bool(row["age"] < 18)

# axis="columns" makes apply() pass each row to is_minor.
minors = titanic_survival.apply(is_minor, axis="columns")

print(minors.head(3))

def is_minor_label(row):
    """Classify a row by its "age" value.

    Returns "unknown" when the age is missing, "minor" when it is below 18,
    and "adult" otherwise.
    """
    passenger_age = row["age"]
    # Check for a missing age first: NaN would otherwise fail the `< 18`
    # comparison and be labelled "adult".
    if pd.isnull(passenger_age):
        return "unknown"
    return "minor" if passenger_age < 18 else "adult"
    
# Label every passenger row as "minor", "adult" or "unknown".
age_labels = titanic_survival.apply(is_minor_label, axis="columns")

# Attach the labels to the frame as a new column (this mutates
# titanic_survival in place).
titanic_survival["age_labels"] = age_labels

age_labels.head(3)

0    False
1     True
2     True
dtype: bool


0    adult
1    minor
2    minor
dtype: object

In [19]:
# Mean of "survived" for each age label; "mean" is pivot_table's default
# aggregation, spelled out here for clarity.
age_group_survival = titanic_survival.pivot_table(
    index="age_labels",
    values="survived",
    aggfunc="mean",
)

age_group_survival

age_labels
adult      0.387892
minor      0.525974
unknown    0.277567
Name: survived, dtype: float64