In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load the 'titanic' example dataset through seaborn into a DataFrame
df = sns.load_dataset('titanic')

In [3]:
# Preview the first five rows to see which columns are available
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## describe()

In [4]:
# With no arguments, describe() summarizes only the numeric columns.
# count excludes missing values, which is why age shows 714 rather than 891.
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
# Pass include= to describe columns of a chosen dtype instead of the numeric default.
# For object columns the summary is count / unique / top (most frequent value) / freq.
df.describe(include = 'object')

Unnamed: 0,sex,embarked,who,embark_town,alive
count,891,889,891,889,891
unique,2,3,3,3,2
top,male,S,man,Southampton,no
freq,577,644,537,644,549


## count()

In [8]:
# count() returns the number of non-NA values in each column
df.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [9]:
# Number of non-missing values in a single column (a scalar)
df['age'].count()

np.int64(714)

## mean()

In [11]:
# Mean of a single column; missing ages are skipped by default
df['age'].mean()

np.float64(29.69911764705882)

In [19]:
# mean() raises on a frame that contains string columns; with numeric_only=True
# the non-numeric columns are left out and the remaining means are returned.
# A cell only renders its last expression, so the first result must be shown
# explicitly with display() — otherwise it is computed and silently discarded.
display(df.mean(numeric_only = True))

# skipna=False keeps missing values in the calculation, so any column that
# contains a NaN (here: age) yields NaN instead of a number.
df.mean(numeric_only = True, skipna = False)

survived       0.383838
pclass         2.308642
age                 NaN
sibsp          0.523008
parch          0.381594
fare          32.204208
adult_male     0.602694
alone          0.602694
dtype: float64

In [14]:
# Look at the first two rows again before computing the average fare of adult males
df.head(2)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False


In [17]:
# Average fare paid by adult males.
# 'adult_male' is already a boolean column, so it can be used as the row mask
# directly; comparing it with `== True` is redundant.
adult_male_mask = df['adult_male']
df.loc[adult_male_mask, 'fare'].mean()

np.float64(24.864181750465548)

In [18]:
# Mean age of first-class passengers who paid a fare between 30 and 40.
# between() is inclusive on both ends by default, i.e. 30 <= fare <= 40.
fare_30_to_40 = df['fare'].between(30, 40)
first_class = df['pclass'] == 1

df.loc[fare_30_to_40 & first_class, 'age'].mean()

np.float64(44.095238095238095)

## median() 

In [22]:
# A small sample with one extreme value (1000), which drags the mean upwards
numbers = [1, 2, 10, 11, 100, 1000]
sample = pd.Series(numbers)
sample.mean()

np.float64(187.33333333333334)

In [23]:
# Median of the same list: with an even number of values it is the average of
# the two middle ones (10 and 11), so the extreme value 1000 has no effect here
pd.Series(numbers).median()

np.float64(10.5)

In [26]:
# Compare the two measures of centre for age, printed as "mean median"
age_mean, age_median = df['age'].mean(), df['age'].median()
print(age_mean, age_median)

29.69911764705882 28.0


## sum(), cumsum(), cumprod()

In [28]:
# Column-wise totals; numeric_only=True restricts the sum to numeric and boolean columns
df.sum(numeric_only=True)

survived        342.0000
pclass         2057.0000
age           21205.1700
sibsp           466.0000
parch           340.0000
fare          28693.9493
adult_male      537.0000
alone           537.0000
dtype: float64

In [29]:
# Total of all fares (a scalar)
df['fare'].sum()

np.float64(28693.9493)

In [30]:
# cumsum(): running total — each row holds the sum of all fares up to and
# including that row, so the last value equals df['fare'].sum()
df['fare'].cumsum()

0          7.2500
1         78.5333
2         86.4583
3        139.5583
4        147.6083
          ...    
886    28602.7493
887    28632.7493
888    28656.1993
889    28686.1993
890    28693.9493
Name: fare, Length: 891, dtype: float64

In [32]:
# cumprod(): running product of the values, row by row.
# fare contains 0.0 (see the min in describe()), so once a zero fare is reached
# every later product is 0 — hence the zeros at the end of the output.
df['fare'].cumprod()

0      7.250000e+00
1      5.168039e+02
2      4.095671e+03
3      2.174801e+05
4      1.750715e+06
           ...     
886    0.000000e+00
887    0.000000e+00
888    0.000000e+00
889    0.000000e+00
890    0.000000e+00
Name: fare, Length: 891, dtype: float64

## var() (분산)

In [42]:
# Population variance of fare computed by hand:
# the sum of squared deviations from the mean, divided by the number of observations
fare = df['fare']
fare_mean = fare.mean()

squared_deviations = (fare - fare_mean) ** 2
total = squared_deviations.sum()
total_count = fare.count()  # dividing by n (not n - 1) gives the population variance

my_var = total / total_count

my_var

np.float64(2466.665311685043)

In [47]:
# var() computes the sample variance by default; passing ddof=0 switches to
# the population variance, which matches the manual calculation
df['fare'].var(ddof=0)

np.float64(2466.665311685043)

## std() (표준편차)
- 표준편차는 분산의 제곱근이다

In [48]:
# Standard deviation by hand: the square root of the (sample) variance.
# numpy is imported once in the notebook's top import cell rather than here.
np.sqrt(df['fare'].var())

np.float64(49.6934285971809)

In [50]:
# std() returns the same value directly as the square root of var()
df['fare'].std()

np.float64(49.6934285971809)

## min(), max()

In [52]:
# Smallest value in the age column
df['age'].min()

np.float64(0.42)

In [53]:
# Largest value in the age column
df['age'].max()

np.float64(80.0)

## agg() (aggregation)

In [55]:
## 여러개 연산이 가능함
# agg() applies several aggregations in one call; the result is a Series
# indexed by the aggregation names
df['age'].agg(['max','min','count','mean'])

max       80.000000
min        0.420000
count    714.000000
mean      29.699118
Name: age, dtype: float64

In [56]:
# On several columns the result is a DataFrame: one row per aggregation,
# one column per input column
df[['age','fare']].agg(['min','max'])

Unnamed: 0,age,fare
min,0.42,0.0
max,80.0,512.3292


## quantile() (분위수)

In [57]:
# 0.1 quantile (10th percentile) of age
df['age'].quantile(0.1)

np.float64(14.0)

In [58]:
# 0.8 quantile (80th percentile) of age
df['age'].quantile(0.8)

np.float64(41.0)

In [59]:
# Median of age, for comparison with the 0.5 quantile
df['age'].median()

np.float64(28.0)

In [60]:
# The 0.5 quantile is the median, so this gives the same value as median()
df['age'].quantile(0.5)

np.float64(28.0)

## unique()

In [61]:
# Array of the distinct values that occur in the column
df['who'].unique()

array(['man', 'woman', 'child'], dtype=object)

In [63]:
# nunique() returns how many distinct values the column has
df['who'].nunique()

3

## mode() (최빈값)

In [64]:
# mode() returns the most frequent value; the result is a Series, not a scalar
df['who'].mode()

0    man
Name: who, dtype: object

In [65]:
# Works on a categorical column too: the result keeps the category dtype,
# and the many missing deck values are not counted as the mode
df['deck'].mode()

0    C
Name: deck, dtype: category
Categories (7, object): ['A', 'B', 'C', 'D', 'E', 'F', 'G']

## corr()
- 각 column 쌍의 상관계수를 -1 ~ 1 사이의 값으로 표현

In [68]:
# Pairwise correlation matrix; numeric_only=True keeps only the numeric and
# boolean columns, so string/categorical columns are left out
df.corr(numeric_only=True)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone
survived,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307,-0.55708,-0.203367
pclass,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495,0.094035,0.135207
age,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067,0.280328,0.19827
sibsp,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651,-0.253586,-0.584471
parch,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225,-0.349943,-0.583398
fare,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0,-0.182024,-0.271832
adult_male,-0.55708,0.094035,0.280328,-0.253586,-0.349943,-0.182024,1.0,0.404744
alone,-0.203367,0.135207,0.19827,-0.584471,-0.583398,-0.271832,0.404744,1.0


In [69]:
# Select one column of the matrix: the correlation of every column with 'survived'
df.corr(numeric_only=True)['survived']

survived      1.000000
pclass       -0.338481
age          -0.077221
sibsp        -0.035322
parch         0.081629
fare          0.257307
adult_male   -0.557080
alone        -0.203367
Name: survived, dtype: float64