# Pandas

- 파이썬에서 사용하는 데이터분석 라이브러리
- 행과 열로 이루어진 데이터 객체를 만들 수 있음
- 크게 두가지 데이터 타입이 있음 <br>
#### Series
- Index와 Value로 이루어진 데이터 타입 <br>
#### DataFrame
- Index와 Value와 Column으로 이루어진 데이터 타입
- Column은 Series로 이루어짐
- 엑셀의 테이블 형태과 유사함

In [1]:
# !pip install pandas
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings(action='ignore')

---

## 1. Series

### 1-1. 시리즈 생성 - 2가지

#### 1) 값만 넣기

이 경우, index는 자동으로 숫자

In [11]:
series1 = pd.Series([1, 6, 7, 3])
series1

0    1
1    6
2    7
3    3
dtype: int64

In [12]:
series1_ = pd.Series([1, 6, 7, 3], index=["A","B","C","D"])
series1_

A    1
B    6
C    7
D    3
dtype: int64

#### 2) 딕셔너리로 생성

이 경우, 인덱스도 한 번에 원하는 것으로 설정할 수 있다.

In [13]:
series2 = pd.Series({'A' : 90, 'B' : 80, 'C' : 70, 'D': 60})
series2

A    90
B    80
C    70
D    60
dtype: int64

### 1-2. 간단한 확인 메소드

In [14]:
# 값만 확인
series1.values

array([1, 6, 7, 3])

In [15]:
# 인덱스 확인
series1.index

RangeIndex(start=0, stop=4, step=1)

In [16]:
# 자료형 확인
series1.dtypes

dtype('int64')

In [17]:
# 시리즈 이름, 인덱스 이름 설정 가능
series2.name = '학점 기준'
series2.index.name = '학점'
series2

학점
A    90
B    80
C    70
D    60
Name: 학점 기준, dtype: int64

### 1-3. 인덱스 심화

In [18]:
# 인덱스 변경 가능
series1.index = ['W', 'X', 'Y', 'Z']
series1

W    1
X    6
Y    7
Z    3
dtype: int64

In [19]:
series1.index

Index(['W', 'X', 'Y', 'Z'], dtype='object')

#### 인덱싱 방법

(1) 인덱스로 인덱싱 - 2가지 방법

In [20]:
# 1.<index 이름>
series2.A

90

In [21]:
# 2. ['<index 이름>']
series2['A']

90

In [22]:
#여러개 한꺼번에 인덱싱 가능
series2[['A', 'C']]

학점
A    90
C    70
Name: 학점 기준, dtype: int64

In [23]:
series2[[0,2]]

학점
A    90
C    70
Name: 학점 기준, dtype: int64

In [24]:
# offset index를 사용 가능
series2[0:3]

학점
A    90
B    80
C    70
Name: 학점 기준, dtype: int64

In [25]:
series2["A":"C"]

학점
A    90
B    80
C    70
Name: 학점 기준, dtype: int64

### 1-4. Data filtering

In [26]:
# 데이터 필터링
series2[series2 >= 80]

학점
A    90
B    80
Name: 학점 기준, dtype: int64

### 1-5. Series끼리 연산 가능

In [27]:
# series끼리 연산 가능
series_1 = pd.Series([1, 2, 3, 4])
series_2 = pd.Series([5, 6, 7, 8])

print(series_1 + series_2)
print("\n")
print(series_1 / series_2)

0     6
1     8
2    10
3    12
dtype: int64


0    0.200000
1    0.333333
2    0.428571
3    0.500000
dtype: float64


In [28]:
print(series_1 - series_2)
print("")
print(np.subtract(series_1 , series_2))
print("")
print(series_1.sub(series_2))

0   -4
1   -4
2   -4
3   -4
dtype: int64

0   -4
1   -4
2   -4
3   -4
dtype: int64

0   -4
1   -4
2   -4
3   -4
dtype: int64


In [29]:
#인덱스를 기준으로 연산함
series1 + series2
series1 - series2

A   NaN
B   NaN
C   NaN
D   NaN
W   NaN
X   NaN
Y   NaN
Z   NaN
dtype: float64

### 1-6. Series 변형

#### concat

In [30]:
S1 = pd.Series([1,2,3])
S2 = pd.Series([4,5,6])

In [31]:
pd.concat([S1,S2])

0    1
1    2
2    3
0    4
1    5
2    6
dtype: int64

In [32]:
pd.concat([S1,S2], axis=1)

Unnamed: 0,0,1
0,1,4
1,2,5
2,3,6


#### delete

In [33]:
S1

0    1
1    2
2    3
dtype: int64

In [34]:
S = S1.drop(0)
S

1    2
2    3
dtype: int64

In [35]:
S.index = ["a","b"]
S

a    2
b    3
dtype: int64

In [36]:
# S.drop(0) ->error
S.drop("a")

b    3
dtype: int64

### 1-7. Series 결측치 처리

In [37]:
Series = pd.Series([3,4,np.nan,5,6,np.nan])
Series

0    3.0
1    4.0
2    NaN
3    5.0
4    6.0
5    NaN
dtype: float64

In [40]:
Series.notnull()

0     True
1     True
2    False
3     True
4     True
5    False
dtype: bool

In [41]:
Series[Series.notnull()]

0    3.0
1    4.0
3    5.0
4    6.0
dtype: float64

In [42]:
Series.dropna()

0    3.0
1    4.0
3    5.0
4    6.0
dtype: float64

In [43]:
Series.fillna(Series.mean())

0    3.0
1    4.0
2    4.5
3    5.0
4    6.0
5    4.5
dtype: float64

---

## 2. DataFrame

column : 열(세로) / row : 행(가로)

### 2-1. 데이터프레임 생성

In [44]:
df1 = pd.DataFrame(columns=["Food", "Price"])
df1

Unnamed: 0,Food,Price


In [45]:
# 리스트로 생성
df1["Food"] = ["마라샹궈", "마라탕","아이스크림","아이셔"]
df1["Price"] = [15000, 8000,1000,"판매종료"]
df1

Unnamed: 0,Food,Price
0,마라샹궈,15000
1,마라탕,8000
2,아이스크림,1000
3,아이셔,판매종료


In [46]:
# 딕셔너리로 생성
food = ["꿔바로우", "순대국밥"]
price = [16000, 8000]
dic = {"Food": food, "Price": price}
df2 = pd.DataFrame(dic, index=['A','B'])
df2

Unnamed: 0,Food,Price
A,꿔바로우,16000
B,순대국밥,8000


In [59]:
df2.set_index('Price')

Unnamed: 0_level_0,Food
Price,Unnamed: 1_level_1
16000,꿔바로우
8000,순대국밥


In [58]:
# 칼럼을 인덱스로 설정
df2.set_index("Food")

Unnamed: 0_level_0,Price
Food,Unnamed: 1_level_1
꿔바로우,16000
순대국밥,8000


### 2-2. 데이터프레임 인덱싱

In [48]:
df2

Unnamed: 0,Food,Price
A,꿔바로우,16000
B,순대국밥,8000


In [49]:
# 컬럼이름으로 접근
df2['Food']

A    꿔바로우
B    순대국밥
Name: Food, dtype: object

In [39]:
# df2["A"] -> Error / dataframe은 index로 바로 접근 불가능함

In [50]:
df2["A":"A"] #index는 슬라이싱으로만 접근가능

Unnamed: 0,Food,Price
A,꿔바로우,16000


In [51]:
# loc을 통해 인덱스로 접근
df2.loc["A"]

Food      꿔바로우
Price    16000
Name: A, dtype: object

In [52]:
# iloc을 통해 인덱스 자리수로 접근 (0부터 시작)
df2.iloc[0]

Food      꿔바로우
Price    16000
Name: A, dtype: object

In [62]:
df2.Price > 10000

A     True
B    False
Name: Price, dtype: bool

In [60]:
# 조건으로 접근
df2[df2.Price > 10000]

Unnamed: 0,Food,Price
A,꿔바로우,16000


### 2-3. 데이터프레임 추가, 삭제

#### 추가

In [63]:
df1

Unnamed: 0,Food,Price
0,마라샹궈,15000
1,마라탕,8000
2,아이스크림,1000
3,아이셔,판매종료


In [64]:
# loc을 이용해 변경, 추가 가능
df1.loc[3] = ["꿔바로우", "12000"]
df1

Unnamed: 0,Food,Price
0,마라샹궈,15000
1,마라탕,8000
2,아이스크림,1000
3,꿔바로우,12000


In [65]:
df1.loc[4] = ["볶음밥", "7000"]
df1

Unnamed: 0,Food,Price
0,마라샹궈,15000
1,마라탕,8000
2,아이스크림,1000
3,꿔바로우,12000
4,볶음밥,7000


In [66]:
df1["Price"][2] = 1200
df1

Unnamed: 0,Food,Price
0,마라샹궈,15000
1,마라탕,8000
2,아이스크림,1200
3,꿔바로우,12000
4,볶음밥,7000


##### Row(행) 삭제

In [67]:
# drop을 통해 삭제, default는 axis=0(행) -넘파이와 똑같!
df1_ = df1.drop(2)
df1_

Unnamed: 0,Food,Price
0,마라샹궈,15000
1,마라탕,8000
3,꿔바로우,12000
4,볶음밥,7000


In [68]:
# 해당 데이터프레임에서 삭제하는 방법: inplace=True
df1.drop(1, inplace=True)  
df1

Unnamed: 0,Food,Price
0,마라샹궈,15000
2,아이스크림,1200
3,꿔바로우,12000
4,볶음밥,7000


In [69]:
# reset_index를 통해 인덱스 재정렬
# 옵션이 없을 시에는 삭제된 index도 따로 index라는 column으로 저장됨
df1__ = df1.reset_index()
df1__

Unnamed: 0,index,Food,Price
0,0,마라샹궈,15000
1,2,아이스크림,1200
2,3,꿔바로우,12000
3,4,볶음밥,7000


In [70]:
# 다음과 같이 실행하면 저장하지 않음
df1 = df1.reset_index(drop=True)
df1

Unnamed: 0,Food,Price
0,마라샹궈,15000
1,아이스크림,1200
2,꿔바로우,12000
3,볶음밥,7000


##### Column(열) 삭제

In [71]:
# axis=1 로 하면 칼럼 삭제
df1__.drop("index", axis=1)

Unnamed: 0,Food,Price
0,마라샹궈,15000
1,아이스크림,1200
2,꿔바로우,12000
3,볶음밥,7000


#### Concat

In [72]:
df1

Unnamed: 0,Food,Price
0,마라샹궈,15000
1,아이스크림,1200
2,꿔바로우,12000
3,볶음밥,7000


In [73]:
df2

Unnamed: 0,Food,Price
A,꿔바로우,16000
B,순대국밥,8000


In [74]:
# concat으로 이어붙일 수 있음
# default는 axis=0
df3 = pd.concat([df1, df2])
df3

Unnamed: 0,Food,Price
0,마라샹궈,15000
1,아이스크림,1200
2,꿔바로우,12000
3,볶음밥,7000
A,꿔바로우,16000
B,순대국밥,8000


In [75]:
# axis=1로 수평축으로
df4 = pd.concat([df2, df2], axis=1)
df4

Unnamed: 0,Food,Price,Food.1,Price.1
A,꿔바로우,16000,꿔바로우,16000
B,순대국밥,8000,순대국밥,8000


In [76]:
pd.concat([df3,df2], axis=1)

Unnamed: 0,Food,Price,Food.1,Price.1
0,마라샹궈,15000,,
1,아이스크림,1200,,
2,꿔바로우,12000,,
3,볶음밥,7000,,
A,꿔바로우,16000,꿔바로우,16000.0
B,순대국밥,8000,순대국밥,8000.0


In [77]:
pd.concat([df3, df2], axis=1)

Unnamed: 0,Food,Price,Food.1,Price.1
0,마라샹궈,15000,,
1,아이스크림,1200,,
2,꿔바로우,12000,,
3,볶음밥,7000,,
A,꿔바로우,16000,꿔바로우,16000.0
B,순대국밥,8000,순대국밥,8000.0


In [58]:
#결합방법 join으로 지정 default='outer'
pd.concat([df3,df2], axis=1, join='inner')

Unnamed: 0,Food,Price,Food.1,Price.1
A,꿔바로우,16000,꿔바로우,16000
B,순대국밥,8000,순대국밥,8000


#### merge

In [78]:
df3

Unnamed: 0,Food,Price
0,마라샹궈,15000
1,아이스크림,1200
2,꿔바로우,12000
3,볶음밥,7000
A,꿔바로우,16000
B,순대국밥,8000


In [79]:
df5 = pd.DataFrame({'Food': ['마라샹궈', '초밥', '부대찌개', '순대국밥'], 
                    'Country': ['중국', '일본', '한국', '한국']})
df5

Unnamed: 0,Food,Country
0,마라샹궈,중국
1,초밥,일본
2,부대찌개,한국
3,순대국밥,한국


In [80]:
#결합방법 how로 지정 default='inner'
# inner로 하면 key값이 둘 다 있는 것만
pd.merge(df3, df5, how='inner', on='Food')

Unnamed: 0,Food,Price,Country
0,마라샹궈,15000,중국
1,순대국밥,8000,한국


In [81]:
# outer로 하면 key값이 한 데이터프레임에만 있어도 병합
pd.merge(df3, df5, how='outer', on='Food')

Unnamed: 0,Food,Price,Country
0,마라샹궈,15000.0,중국
1,아이스크림,1200.0,
2,꿔바로우,12000.0,
3,꿔바로우,16000.0,
4,볶음밥,7000.0,
5,순대국밥,8000.0,한국
6,초밥,,일본
7,부대찌개,,한국


In [83]:
# left로 하면 왼쪽에 있는 데이터 프레임의 'on' 기준
pd.merge(df3, df5, how='left', on='Food')

Unnamed: 0,Food,Price,Country
0,마라샹궈,15000,중국
1,아이스크림,1200,
2,꿔바로우,12000,
3,볶음밥,7000,
4,꿔바로우,16000,
5,순대국밥,8000,한국


In [84]:
# right로 하면 오른쪽에 있는 데이터 프레임의 'on' 기준
pd.merge(df3, df5, how='right', on='Food')

Unnamed: 0,Food,Price,Country
0,마라샹궈,15000.0,중국
1,초밥,,일본
2,부대찌개,,한국
3,순대국밥,8000.0,한국


In [85]:
pd.merge(df3, df5, how='left', on='Food')

Unnamed: 0,Food,Price,Country
0,마라샹궈,15000,중국
1,아이스크림,1200,
2,꿔바로우,12000,
3,볶음밥,7000,
4,꿔바로우,16000,
5,순대국밥,8000,한국


### concat과 merge의 차이?
concat은 단순 병합/merge는 두 데이터의 공통 항목을 기준으로 병합

#### merge 종류
<img src="https://miro.medium.com/max/1400/1*9eH1_7VbTZPZd9jBiGIyNA.png" height="100px" width="300px">

### 2-5. 데이터 읽기, 저장

In [86]:
df3['Price'] = df3['Price'].apply(lambda x : int(x))

In [87]:
df3.sort_values('Price', ascending = False)

Unnamed: 0,Food,Price
A,꿔바로우,16000
0,마라샹궈,15000
2,꿔바로우,12000
B,순대국밥,8000
3,볶음밥,7000
1,아이스크림,1200


In [88]:
# 데이터 읽기
# 경로 지정해주세요
# path = ''
# titanic = pd.read_csv(path+'titanic.csv')
titanic = pd.read_csv('./titanic.csv')

In [89]:
# head를 통해 데이터 맨 앞부분 확인
# 기본 5개
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [90]:
# shape를 통해 데이터의 row, column 길이 확인 가능
titanic.shape

(891, 12)

In [91]:
# tail을 통해 데이터 뒷부분 확인
# 기본 5개
titanic.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [92]:
# 디폴트는 5개지만 보려는 갯수를 조정할 수 있음
# display를 사용해 print처럼 확인 가능

display(titanic.head(2))
display(titanic.tail(2))

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [93]:
# info를 통해 nan과 데이터 타입, 전체 데이터 수 등 확인 가능
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [94]:
# 데이터 타입 변경 가능
titanic.Survived = titanic.Survived.astype('object')

In [95]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    object 
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(4), object(6)
memory usage: 83.7+ KB


In [96]:
# describe를 이용하여 통계값 확인 (단, 수치형만)
titanic.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [97]:
# 컬럼명 확인
titanic.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [98]:
# 불러오는 설정을 달리해서 가져올 수 있음
titanic2= pd.read_csv("titanic.csv",
                      usecols=["PassengerId","Age", "Survived"], # 사용할 컬럼들
                      nrows=10, #가져올 행의 개수
                      index_col="PassengerId") # 인덱스로 지정할 컬럼명

titanic2

Unnamed: 0_level_0,Survived,Age
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,22.0
2,1,38.0
3,1,26.0
4,1,35.0
5,0,35.0
6,0,
7,0,54.0
8,0,2.0
9,1,27.0
10,1,14.0


In [99]:
# 다시 저장하기: .to_csv
titanic2.to_csv("titanic2.csv", index=False)

# index=False를 하지 않으면 인덱스까지 새로운 컬럼(Unnamed: 0)으로 저장됨
# 이 경우, 인덱스인 PassengerId column이 저장
titanic2.to_csv("titanic3.csv")

In [100]:
titanic2 = pd.read_csv('titanic2.csv')
titanic2

Unnamed: 0,Survived,Age
0,0,22.0
1,1,38.0
2,1,26.0
3,1,35.0
4,0,35.0
5,0,
6,0,54.0
7,0,2.0
8,1,27.0
9,1,14.0


In [101]:
titanic3 = pd.read_csv('titanic3.csv')
titanic3

Unnamed: 0,PassengerId,Survived,Age
0,1,0,22.0
1,2,1,38.0
2,3,1,26.0
3,4,1,35.0
4,5,0,35.0
5,6,0,
6,7,0,54.0
7,8,0,2.0
8,9,1,27.0
9,10,1,14.0


---

### 2-6. 판다스 함수

#### apply

In [102]:
titanic.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [103]:
# apply를 이용해 함수를 적용할 수 있음
def is_adult(age):
    if age < 20 :
        return 'not adult'
    if age >= 20 :
        return 'adult'

In [104]:
titanic['is_adult'] = titanic['Age'].apply(is_adult)
titanic.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_adult
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S,adult
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S,not adult
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S,
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C,adult
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q,adult


#### lambda

In [105]:
# lambda 함수를 통해서도 쓸 수 있음
# 더 간단
titanic['is_adult'] = titanic['Age'].apply(lambda x: 'adult' if x < 20 else 'not_adult')
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_adult
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,not_adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,not_adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,not_adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,not_adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,not_adult


#### sort

In [106]:
# 원하는 칼럼을 선택해 그것을 기준으로 오름차순 정렬: .sort_values()
titanic.sort_values('Pclass').head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_adult
445,446,1,1,"Dodge, Master. Washington",male,4.0,0,2,33638,81.8583,A34,S,adult
310,311,1,1,"Hays, Miss. Margaret Bechstein",female,24.0,0,0,11767,83.1583,C54,C,not_adult
309,310,1,1,"Francatelli, Miss. Laura Mabel",female,30.0,0,0,PC 17485,56.9292,E36,C,not_adult
307,308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (M...",female,17.0,1,0,PC 17758,108.9,C65,C,adult
306,307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C,not_adult


In [107]:
# ascending=False로 하면 내림차순
titanic.sort_values('Pclass', ascending=False).head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_adult
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,not_adult
511,512,0,3,"Webber, Mr. James",male,,0,0,SOTON/OQ 3101316,8.05,,S,not_adult
500,501,0,3,"Calic, Mr. Petar",male,17.0,0,0,315086,8.6625,,S,adult
501,502,0,3,"Canavan, Miss. Mary",female,21.0,0,0,364846,7.75,,Q,not_adult
502,503,0,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q,not_adult


### null 값 확인 및 처리

In [108]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    object 
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  is_adult     891 non-null    object 
dtypes: float64(2), int64(4), object(7)
memory usage: 90.6+ KB


In [109]:
# null 값 확인: .isnull()
titanic[titanic["Age"].isnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_adult
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,not_adult
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S,not_adult
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C,not_adult
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C,not_adult
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,not_adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C,not_adult
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S,not_adult
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S,not_adult
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S,not_adult


In [110]:
# null값 세기: .sum()
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
is_adult         0
dtype: int64

In [111]:
# null 값 아닌 것 확인: .notnull()
titanic[titanic["Age"].notnull()]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_adult
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,not_adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,not_adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,not_adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,not_adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,not_adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,886,0,3,"Rice, Mrs. William (Margaret Norton)",female,39.0,0,5,382652,29.1250,,Q,not_adult
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,not_adult
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,adult
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,not_adult


#### dropna

In [112]:
# 결측치 제거
titanic_notnull = titanic.dropna()
titanic[titanic['Age'].isnull()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_adult
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,not_adult
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S,not_adult
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C,not_adult
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C,not_adult
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,not_adult


In [113]:
titanic_notnull.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
is_adult       0
dtype: int64

In [114]:
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
is_adult         0
dtype: int64

In [115]:
# (inplace = True) 조건은 null drop한 채로 저장
temp = titanic.copy()
temp
temp.dropna(inplace = True)
temp[temp['Age'].isnull()].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_adult


In [116]:
# .dropna(axis=1) 하면 null 값인 column 자체를 다 삭제할 수 있음.
titanic_notnull = titanic.dropna(axis = 1)
titanic_notnull
#titanic_notnull[titanic_notnull['Age'].isnull()] -> Error발생

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,is_adult
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.2500,not_adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,not_adult
2,3,1,3,"Heikkinen, Miss. Laina",female,0,0,STON/O2. 3101282,7.9250,not_adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,1,0,113803,53.1000,not_adult
4,5,0,3,"Allen, Mr. William Henry",male,0,0,373450,8.0500,not_adult
...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,0,0,211536,13.0000,not_adult
887,888,1,1,"Graham, Miss. Margaret Edith",female,0,0,112053,30.0000,adult
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,1,2,W./C. 6607,23.4500,not_adult
889,890,1,1,"Behr, Mr. Karl Howell",male,0,0,111369,30.0000,not_adult


#### unique

In [117]:
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,is_adult
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,not_adult
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,not_adult
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,not_adult
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,not_adult
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,not_adult


In [118]:
# unique값이 뭔지 확인
titanic.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

#### value_counts

In [119]:
# unique값들이 각각 얼마나 있는지 확인
titanic.Survived.value_counts()

0    549
1    342
Name: Survived, dtype: int64

#### groupby

In [120]:
titanic_sample = titanic[['Name', 'Age', 'Pclass', 'Fare', 'Sex']]
titanic_sample.head()

Unnamed: 0,Name,Age,Pclass,Fare,Sex
0,"Braund, Mr. Owen Harris",22.0,3,7.25,male
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,1,71.2833,female
2,"Heikkinen, Miss. Laina",26.0,3,7.925,female
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,1,53.1,female
4,"Allen, Mr. William Henry",35.0,3,8.05,male


In [121]:
# size를 통해 해당 칼럼을 모은 후 개수 확인
titanic_sample.groupby("Pclass").size()

Pclass
1    216
2    184
3    491
dtype: int64

In [122]:
# 두가지 기준으로 groupby한 후에 개수도 확인 가능
titanic_sample.groupby(["Pclass", "Sex"]).size()

Pclass  Sex   
1       female     94
        male      122
2       female     76
        male      108
3       female    144
        male      347
dtype: int64

In [123]:
# Pclass, Sex는 집계 기준이 되는 컬럼, Fare은 mean 구하려는 컬럼
titanic_sample.groupby(["Pclass", "Sex"])['Fare'].mean()

Pclass  Sex   
1       female    106.125798
        male       67.226127
2       female     21.970121
        male       19.741782
3       female     16.118810
        male       12.661633
Name: Fare, dtype: float64

In [124]:
# groupby로 모은 후 .agg를 통해 다양한 통계를 낼 수 있음 (min, max, mean, sum, median)
titanic_sample.groupby("Pclass").agg((["min","max","mean"]))

# outlier, null값 등의 이상치는 EDA 과정에서 제거합니다
# EDA에 대해서도 DA팀이 토요일에 세션 진행하니 많관부!

Unnamed: 0_level_0,Age,Age,Age,Fare,Fare,Fare
Unnamed: 0_level_1,min,max,mean,min,max,mean
Pclass,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
1,0.92,80.0,38.233441,0.0,512.3292,84.154687
2,0.67,70.0,29.87763,0.0,73.5,20.662183
3,0.42,74.0,25.14062,0.0,69.55,13.67555
