# 데이터프레임의 데이터 조작

In [2]:
import numpy as np
import pandas as pd

## 데이터 갯수 세기

In [3]:
s = pd.Series(range(10))
s[3] = np.nan   # ㅜ무 때문에 실수로 바뀜
s

0    0.0
1    1.0
2    2.0
3    NaN
4    4.0
5    5.0
6    6.0
7    7.0
8    8.0
9    9.0
dtype: float64

In [4]:
s.count()   # Nan을 제외한 나머지 개수들

9

In [9]:
df = pd.DataFrame(np.arange(16).reshape(4, 4), dtype=float)
df.iloc[2, 3] = np.nan
df

Unnamed: 0,0,1,2,3
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,
3,12.0,13.0,14.0,15.0


In [10]:
df.count()      # 시리즈 별로 개수

0    4
1    4
2    4
3    3
dtype: int64

In [12]:
import seaborn as sns
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [13]:
titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [15]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


In [17]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


## 카테고리 값 세기
- DF 통으로는 못세고 시리즈별로만 셀 수 있다.

In [19]:
titanic.sex.value_counts()

male      577
female    314
Name: sex, dtype: int64

In [21]:
titanic.embark_town.value_counts()

Southampton    644
Cherbourg      168
Queenstown      77
Name: embark_town, dtype: int64

## 정렬
- sort_index, sort_values

In [26]:
np.random.seed(2021)
s2 = pd.Series(np.random.randint(6, size = 100))
s2.tail()

95    1
96    1
97    5
98    0
99    5
dtype: int64

In [27]:
s2.value_counts()

5    21
1    21
2    19
3    18
4    13
0     8
dtype: int64

In [28]:
s2.value_counts().sort_values()

0     8
4    13
3    18
2    19
5    21
1    21
dtype: int64

In [31]:
# 내림차순
s2.value_counts().sort_values(ascending = False)

1    21
5    21
2    19
3    18
4    13
0     8
dtype: int64

In [34]:
titanic.sort_values(by = 'fare', ascending = False).head()    # by 뒤에 뭘 기준으로 할 건지

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
258,1,1,female,35.0,0,0,512.3292,C,First,woman,False,,Cherbourg,yes,True
737,1,1,male,35.0,0,0,512.3292,C,First,man,True,B,Cherbourg,yes,True
679,1,1,male,36.0,0,1,512.3292,C,First,man,True,B,Cherbourg,yes,False
88,1,1,female,23.0,3,2,263.0,S,First,woman,False,C,Southampton,yes,False
27,0,1,male,19.0,3,2,263.0,S,First,man,True,C,Southampton,no,False


In [35]:
titanic.sort_values(by = ['fare', 'age']).head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
302,0,3,male,19.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
271,1,3,male,25.0,0,0,0.0,S,Third,man,True,,Southampton,yes,True
179,0,3,male,36.0,0,0,0.0,S,Third,man,True,,Southampton,no,True
822,0,1,male,38.0,0,0,0.0,S,First,man,True,,Southampton,no,True
806,0,1,male,39.0,0,0,0.0,S,First,man,True,A,Southampton,no,True


## 행, 열 합계

In [36]:
np.random.seed(2021)
df2 = pd.DataFrame(np.random.randint(10, size = (4, 8)))
df2

Unnamed: 0,0,1,2,3,4,5,6,7
0,4,5,9,0,6,5,8,6
1,6,6,6,1,5,7,1,1
2,5,2,0,3,1,0,2,6
3,4,8,5,1,6,7,5,6


In [38]:
df2.sum()       # axis = 0 가 디폴트

0    19
1    21
2    20
3     5
4    18
5    19
6    16
7    19
dtype: int64

In [39]:
df2.sum(axis = 1)

0    43
1    33
2    19
3    42
dtype: int64

In [40]:
df2['Rowsum'] = df2.sum(axis=1)
df2

Unnamed: 0,0,1,2,3,4,5,6,7,Rowsum
0,4,5,9,0,6,5,8,6,43
1,6,6,6,1,5,7,1,1,33
2,5,2,0,3,1,0,2,6,19
3,4,8,5,1,6,7,5,6,42


In [44]:
df2.loc['ColSum', : ] = df2.sum()
df2

Unnamed: 0,0,1,2,3,4,5,6,7,Rowsum
0,4,5,9,0,6,5,8,6,43
1,6,6,6,1,5,7,1,1,33
2,5,2,0,3,1,0,2,6,19
3,4,8,5,1,6,7,5,6,42
colsum,19,21,20,5,18,19,16,19,137
ColSum,38,42,40,10,36,38,32,38,274


### 연습문제 4.4.3

In [47]:
# (1) 타이타닉호 승객의 평균 나이를 구하라.
titanic.age.mean()

29.69911764705882

In [49]:
# (2) 타이타닉호 승객중 여성 승객의 평균 나이를 구하라.
titanic[titanic.sex == 'female'].head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [50]:
np.round(titanic[titanic.sex == 'female'].age.mean(), 1)

27.9

In [51]:
# (3) 타이타닉호 승객중 1등실 선실의 여성 승객의 평균 나이를 구하라.
titanic[titanic.pclass == 1][titanic.sex == 'female'].head()

  


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True
31,1,1,female,,1,0,146.5208,C,First,woman,False,B,Cherbourg,yes,False
52,1,1,female,49.0,1,0,76.7292,C,First,woman,False,D,Cherbourg,yes,False


In [58]:
import warnings
warnings.filterwarnings('ignore')       # warning 안뜨게 해줌

In [59]:
np.round(titanic[titanic.pclass == 1][titanic.sex == 'female'].age.mean(), 1)

34.6

## apply 변환
- map 이랑 비슷함

In [60]:
df3 = pd.DataFrame({
    'A': [1, 3, 4, 3, 4],
    'B': [2, 3, 1, 2, 3],
    'C': [1, 5, 2, 4, 4]
})
df3

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [61]:
df3.apply(lambda x: x.max() - x.min())      # x = column 개념으로 보면 됨

A    3
B    2
C    4
dtype: int64

In [67]:
df3.apply(lambda x: x.max() - x.min(), axis = 1)

0    1
1    2
2    3
3    2
4    1
dtype: int64

In [64]:
titanic['성년'] = titanic.apply(lambda r: "adult" if r.age >= 20 else "child", axis =1)

In [66]:
titanic.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,성년
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True,adult
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True,child
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False,child
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True,adult
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True,adult


### 연습문제 4.4.4

In [69]:
titanic['category1'] = titanic.apply(lambda r: r.sex if r.age >= 20 else 'child', axis = 1)
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,성년,category1
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False,adult,male
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False,adult,female
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True,adult,female
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False,adult,female
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True,adult,male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True,adult,male
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True,child,child
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False,child,child
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True,adult,male


## **fillna method**
- NaN 을 채우는 method

In [70]:
df3

Unnamed: 0,A,B,C
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [72]:
df3.apply(pd.value_counts)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


In [73]:
df3.apply(pd.value_counts).fillna(0.0)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


### 연습문제 4.4.5
타이타닉호의 승객 중 나이를 명시하지 않은 고객은 나이를 명시한 고객의 평균 나이 값이 되도록 titanic 데이터프레임을 고쳐라.

In [76]:
# titanic.age.fillna(titanic.age.mean())    => 이러면 안바뀜
titanic.age.fillna(titanic.age.mean(), inplace = True)      # inplace = False (기본값) 보여 줄 때만 바뀌는것 처럼 보임 / True = DF의 값을 아예 바꿔버림
titanic.tail()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone,성년,category1
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True,adult,male
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True,child,child
888,0,3,female,29.699118,1,2,23.45,S,Third,woman,False,,Southampton,no,False,child,child
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True,adult,male
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True,adult,male


## astype() method

In [77]:
df3.apply(pd.value_counts).fillna(0.0)

Unnamed: 0,A,B,C
1,1.0,1.0,1.0
2,0.0,2.0,1.0
3,2.0,2.0,0.0
4,2.0,0.0,2.0
5,0.0,0.0,1.0


In [78]:
df3.apply(pd.value_counts).fillna(0.0).astype(int)

Unnamed: 0,A,B,C
1,1,1,1
2,0,2,1
3,2,2,0
4,2,0,2
5,0,0,1


## 실수 값을 카테고리 값으로 변환
- cut: 실수 값의 경계선을 지정하는 경우
- qcut: 갯수가 똑같은 구간으로 나누는 경우
---

In [79]:
ages = [0, 2, 10, 21, 23, 37, 31, 61, 20, 41, 32, 101]

In [82]:
bins = [1, 20, 30, 50, 70, 100]
labels = ["미성년자", "청년", "중년", "장년", "노년"]
cats = pd.cut(ages, bins, labels=labels)

cats

[NaN, '미성년자', '미성년자', '청년', '청년', ..., '장년', '미성년자', '중년', '중년', NaN]
Length: 12
Categories (5, object): ['미성년자' < '청년' < '중년' < '장년' < '노년']

In [83]:
cats.codes

array([-1,  0,  0,  1,  1,  2,  2,  3,  0,  2,  2, -1], dtype=int8)

In [88]:
np.random.seed(202144)
data = np.random.randn(100000)
cats = pd.qcut(data, 4, labels = ['Q1', 'Q2', 'Q3', 'Q4'])
cats

['Q2', 'Q3', 'Q4', 'Q3', 'Q3', ..., 'Q4', 'Q4', 'Q2', 'Q2', 'Q2']
Length: 100000
Categories (4, object): ['Q1' < 'Q2' < 'Q3' < 'Q4']

In [89]:
pd.value_counts(cats)

Q4    25000
Q3    25000
Q2    25000
Q1    25000
dtype: int64