### Pandas : '2차원 배열의 테이터'를 처리하기 위한 패키지

### EDA(Exploartory Data Analysis) : 탐색적 데이터분석.

### 데이터를 분석하기 전에 그래프나 통게적인 방법을 이용하여 데이터를 관찰하는 것.

In [25]:
import pandas as pd

In [None]:
# DataFrame 생성
# D와 F가 대문자임에 주의!

df = pd.DataFrame([1,2,4])
df

Unnamed: 0,0
0,1
1,2
2,4


In [None]:
# 다양한 자료를 이용하여 생성 가능
pd.DataFrame([5, "가나", 1, 2, [2,4,6]])

Unnamed: 0,0
0,5
1,가나
2,1
3,2
4,"[2, 4, 6]"


In [None]:
# DataFrame 끼리 연산 가능
# 크기가 맞지 않더라고 연산 가능!

df1 = pd.DataFrame([1,2,3]) + pd.DataFrame([5,6,7])
df2 = pd.DataFrame([1,2,3]) + pd.DataFrame([5,6,7,8])

print(df1)
print(df2) # 3번째 인덱스는 NaN으로 나옴

    0
0   6
1   8
2  10
      0
0   6.0
1   8.0
2  10.0
3   NaN


In [None]:
# 곱하기도 가능

pd.DataFrame([1,5,6]) * pd.DataFrame([5,3,6,1])

Unnamed: 0,0
0,5.0
1,15.0
2,36.0
3,


In [None]:
# list를 DataFrame으로 변경 가능!

list_1 = [1,2,3,4]
pd.DataFrame(list_1)


Unnamed: 0,0
0,1
1,2
2,3
3,4


In [None]:
# 2차원 형태의 데이터도 변경 가능

list2 = [[1,2,3],[4,5,6],[7,8,9]]
pd.DataFrame(list2)

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [None]:
# 여러개의 list를 이용해서 DataFrame 만들기!

food = ["banana", "apple", "soda", "cookie"]
price = [1400, 1000, 500, 1000]
stock = [20, 10, 5, 7]

pd.DataFrame([food, price, stock])

# 영 결과가 이상해...

Unnamed: 0,0,1,2,3
0,banana,apple,soda,cookie
1,1400,1000,500,1000
2,20,10,5,7


In [None]:
# dict를 이용하여 DataFrame 생성하기


food_list = pd.DataFrame({"Food" : food,
                          "Price" : price,
                          "Stock" : stock})
food_list

Unnamed: 0,Food,Price,Stock
0,banana,1400,20
1,apple,1000,10
2,soda,500,5
3,cookie,1000,7


In [None]:
food_df = pd.DataFrame({"Food" : food,
                        "Price" : price,
                        "Stock" : stock})

food_df

Unnamed: 0,Food,Price,Stock
0,banana,1400,20
1,apple,1000,10
2,soda,500,5
3,cookie,1000,7


In [None]:
# 원하는 column을 index로 지정할 수 있다.

food_list.set_index("Food", inplace = True)
# Food 열을 index로 집어넣어준다.
# inplace = True: 원본 객체를 변경해준다!!!!
# inplace = True 는 꽤 많이 쓰이는 듯. 잘 알아두자
food_list

Unnamed: 0_level_0,Price,Stock
Food,Unnamed: 1_level_1,Unnamed: 2_level_1
banana,1400,20
apple,1000,10
soda,500,5
cookie,1000,7


In [None]:
math =[100, 100, 90, 85, 20, 50]
sci = [90, 70, 40, 93, 54, 76]
eng = [99, 97, 23, 35, 56, 93]
final = ["합", "합", "불", "불", "불", "불"]

test_dict = {"math":math,
             "sci":sci,
             "eng":eng,
             "final":final}

test_df = pd.DataFrame(test_dict)
test_df

Unnamed: 0,math,sci,eng,final
0,100,90,99,합
1,100,70,97,합
2,90,40,23,불
3,85,93,35,불
4,20,54,56,불
5,50,76,93,불


In [None]:
import numpy as np
import seaborn as sns

# 분석전에는 항상 데이터가 무엇을 설명하는것인지 필히 확인해야 한다.
# titanic meta data : https://coding-kindergarten.tistory.com/127

titanic = sns.load_dataset("titanic")
titanic

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [None]:
# DataFrame의 정보를 파악

titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [None]:
# DataFrame의 모양 파악
# 행이 891개(index), 열이 15개(column)

titanic.shape

(891, 15)

In [None]:
# DataFrame의 컬럼명 파악

titanic.columns

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [None]:
titanic.index # 인덱스에 대한 정보

RangeIndex(start=0, stop=891, step=1)

In [None]:
# DataFrame 각 행의 Datatype 파악

titanic.dtypes

survived          int64
pclass            int64
sex              object
age             float64
sibsp             int64
parch             int64
fare            float64
embarked         object
class          category
who              object
adult_male         bool
deck           category
embark_town      object
alive            object
alone              bool
dtype: object

In [None]:
# 각 열의 데이터 수 세어보기
# 결측치를 제외한 값들의 갯수를 세어낸다.

titanic.count()

survived       891
pclass         891
sex            891
age            714
sibsp          891
parch          891
fare           891
embarked       889
class          891
who            891
adult_male     891
deck           203
embark_town    889
alive          891
alone          891
dtype: int64

In [None]:
# DataFrame의 상위 n개의 row(행) 출력

titanic.head(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False


In [None]:
# DataFrame의 하위 n개 row 출력
titanic.tail(10)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
881,0,3,male,33.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
882,0,3,female,22.0,0,0,10.5167,S,Third,woman,False,,Southampton,no,True
883,0,2,male,28.0,0,0,10.5,S,Second,man,True,,Southampton,no,True
884,0,3,male,25.0,0,0,7.05,S,Third,man,True,,Southampton,no,True
885,0,3,female,39.0,0,5,29.125,Q,Third,woman,False,,Queenstown,no,False
886,0,2,male,27.0,0,0,13.0,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.45,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0,C,First,man,True,C,Cherbourg,yes,True
890,0,3,male,32.0,0,0,7.75,Q,Third,man,True,,Queenstown,no,True


In [None]:
age = titanic["age"]
age

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [None]:
# age의 타입이 Series
# DataFrame내의 개별 column은 series, 그것들이 모인게 DataFrame

type(age)

pandas.core.series.Series

In [None]:
# .을 이용해서도 조회 가능

titanic.age

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: age, Length: 891, dtype: float64

In [None]:
# Series를 DataFrame 으로 빼내는 방법.

age2 = titanic[["age"]]
age2

Unnamed: 0,age
0,22.0
1,38.0
2,26.0
3,35.0
4,35.0
...,...
886,27.0
887,19.0
888,
889,26.0


In [None]:
type(age2)

pandas.core.frame.DataFrame

In [None]:
# 여러개의 column을 이용하기
# 대괄호가 2개 필요
# column name을 기준으로 추출하며, 입력된 순서로 정렬함

titanic[["age", "survived", "fare"]]

Unnamed: 0,age,survived,fare
0,22.0,0,7.2500
1,38.0,1,71.2833
2,26.0,1,7.9250
3,35.0,1,53.1000
4,35.0,0,8.0500
...,...,...,...
886,27.0,0,13.0000
887,19.0,1,30.0000
888,,0,23.4500
889,26.0,1,30.0000


In [None]:
# index 기준으로 slicing

titanic[1:40:5] # 1번 인덱스에서 40번 인덱스까지 5간격으로

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
11,1,1,female,58.0,0,0,26.55,S,First,woman,False,C,Southampton,yes,True
16,0,3,male,2.0,4,1,29.125,Q,Third,child,False,,Queenstown,no,False
21,1,2,male,34.0,0,0,13.0,S,Second,man,True,D,Southampton,yes,True
26,0,3,male,,0,0,7.225,C,Third,man,True,,Cherbourg,no,True
31,1,1,female,,1,0,146.5208,C,First,woman,False,B,Cherbourg,yes,False
36,1,3,male,,0,0,7.2292,C,Third,man,True,,Cherbourg,yes,True


In [None]:
# index 형태 바꾸기
# python에는 lambda 함수가 있다.

titanic2 = titanic.rename(index = lambda k : f'n{k}') 
# f-string: f'문자열'
# 인덱스 k를 문자열'nk'로 바꿈.
'''

ex)
x = 1
y = 2
f"{x} + {y}는 {x + y}입니다."

=> '1 + 2는 3입니다.'

'''
titanic2

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
n0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
n1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
n2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
n3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
n4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
n886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
n887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
n888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
n889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


(1) pandas DataFrame의 칼럼 이름 바꾸기

    :  df.columns = ['a', 'b']

    :  df.rename(columns = {'old_nm' : 'new_nm'}, inplace = True)


(2) pandas DataFrame의 인덱스 이름 바꾸기

    : df.index = ['a', 'b']

    : df.rename(index = {'old_nm': 'new_nm'}, inplace = True)

In [None]:
# loc method
# 명칭기반 추출

titanic2.loc[['n3','n7']]


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
n3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
n7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


df.loc["인덱싱할 row 이름", "인덱싱할 column 이름"]

df.iloc[인덱싱할 row 인덱스, 인덱싱할 column 인덱스]

In [None]:
# 연속해서 추출 가능
titanic2.loc["n3":"n7"]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
n3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
n4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True
n5,0,3,male,,0,0,8.4583,Q,Third,man,True,,Queenstown,no,True
n6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
n7,0,3,male,2.0,3,1,21.075,S,Third,child,False,,Southampton,no,False


In [None]:
titanic.loc[:,["age", "survived"]]

Unnamed: 0,age,survived
0,22.0,0
1,38.0,1
2,26.0,1
3,35.0,1
4,35.0,0
...,...,...
886,27.0,0
887,19.0,1
888,,0
889,26.0,1


In [None]:
# index와 column을 원하는것만 추출 가능

titanic2.loc[["n1", "n217"], ["age", "survived"]]

Unnamed: 0,age,survived
n1,38.0,1
n217,42.0,0


In [None]:
titanic2.loc[["n2", "n510"], ["age", "class", "survived"]]

Unnamed: 0,age,class,survived
n2,26.0,Third,1
n510,29.0,Third,1


In [None]:
# iloc method 
# 위치기반(인덱스 기반) 추출
# 1개면 row에만 해당

titanic2.iloc[1:4] # 1~3행의 정보 전부 다 

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
n1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
n2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
n3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False


In [None]:
# 범위가 2개면 row와 column 순으로 추출

titanic2.iloc[1:4, 5:8]

Unnamed: 0,parch,fare,embarked
n1,0,71.2833,C
n2,0,7.925,S
n3,0,53.1,S


In [None]:
# column만 추출하고 싶으면 ':' 사용

titanic2.iloc[:, 6:8] # : 를 통해 행 전체를 출력해주었다. 

Unnamed: 0,fare,embarked
n0,7.2500,S
n1,71.2833,C
n2,7.9250,S
n3,53.1000,S
n4,8.0500,S
...,...,...
n886,13.0000,S
n887,30.0000,S
n888,23.4500,S
n889,30.0000,C


In [None]:
titanic2.iloc[:30, [1,4,5]]

Unnamed: 0,pclass,sibsp,parch
n0,3,1,0
n1,1,1,0
n2,3,0,0
n3,1,1,0
n4,3,0,0
n5,3,0,0
n6,1,0,0
n7,3,3,1
n8,3,0,2
n9,2,1,0


In [None]:
# boolean indexing ... 매우 중요!
# bool 타입 데이터 : True or False

titanic["age"]>30

0      False
1       True
2      False
3       True
4       True
       ...  
886    False
887    False
888    False
889    False
890     True
Name: age, Length: 891, dtype: bool

In [None]:
# 나이가 30살이 넘는 승객들의 정보를 추출하라
titanic[titanic["age"]>30]

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
6,0,1,male,54.0,0,0,51.8625,S,First,man,True,E,Southampton,no,True
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,0,3,male,47.0,0,0,9.0000,S,Third,man,True,,Southampton,no,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
881,0,3,male,33.0,0,0,7.8958,S,Third,man,True,,Southampton,no,True
885,0,3,female,39.0,0,5,29.1250,Q,Third,woman,False,,Queenstown,no,False


In [None]:
# 몇 명일까?

titanic[titanic['age'] > 30]["age"].count()

305

In [None]:
# 문제 4 생존한 승객들의 모든 정보를 추출하라

titanic[titanic["survived"]==1]


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
8,1,3,female,27.0,0,2,11.1333,S,Third,woman,False,,Southampton,yes,False
9,1,2,female,14.0,1,0,30.0708,C,Second,child,False,,Cherbourg,yes,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
875,1,3,female,15.0,0,0,7.2250,C,Third,child,False,,Cherbourg,yes,True
879,1,1,female,56.0,0,1,83.1583,C,First,woman,False,C,Cherbourg,yes,False
880,1,2,female,25.0,0,1,26.0000,S,Second,woman,False,,Southampton,yes,False
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True


In [None]:
# multiple boolean indexing
# and : &
# or : |
# 개별 조건들을 ()로 묵어줘야 함

# 나이가 30이 넘고 생존한 승객들의 정보를 추출하라

titanic[(titanic["age"]>30) & (titanic["survived"]==1)] # 각 조건에 () 씌워주기

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
11,1,1,female,58.0,0,0,26.5500,S,First,woman,False,C,Southampton,yes,True
15,1,2,female,55.0,0,0,16.0000,S,Second,woman,False,,Southampton,yes,True
21,1,2,male,34.0,0,0,13.0000,S,Second,man,True,D,Southampton,yes,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857,1,1,male,51.0,0,0,26.5500,S,First,man,True,E,Southampton,yes,True
862,1,1,female,48.0,0,0,25.9292,S,First,woman,False,D,Southampton,yes,True
865,1,2,female,42.0,0,0,13.0000,S,Second,woman,False,,Southampton,yes,True
871,1,1,female,47.0,1,1,52.5542,S,First,woman,False,D,Southampton,yes,False


In [29]:
# 나이가 30이 넘고 생존한 승객들의 운임 요금과 독신여부의 정보를 추출하라

titanic[(titanic["age"]>30) & titanic["survived"]==1][["fare", "alone"]]

Unnamed: 0,fare,alone
1,71.2833,False
3,53.1000,False
11,26.5500,True
15,16.0000,True
21,13.0000,True
...,...,...
857,26.5500,True
862,25.9292,True
865,13.0000,True
871,52.5542,False


In [30]:
# 우리가 필요한 데이터만 추출해보자.
# 작업하려는 녀석을 df로 보통 저장을 한다.

df = titanic[['survived', 'pclass', 'sex', 'age', 'fare', 'alone','deck']]
df.head()

Unnamed: 0,survived,pclass,sex,age,fare,alone,deck
0,0,3,male,22.0,7.25,False,
1,1,1,female,38.0,71.2833,False,C
2,1,3,female,26.0,7.925,True,
3,1,1,female,35.0,53.1,False,C
4,0,3,male,35.0,8.05,True,


In [31]:
# 결측치 처리

# 결측치 여부 파악하기
df.isna()

Unnamed: 0,survived,pclass,sex,age,fare,alone,deck
0,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...
886,False,False,False,False,False,False,True
887,False,False,False,False,False,False,False
888,False,False,False,True,False,False,True
889,False,False,False,False,False,False,False


In [32]:
df.isna().sum()

survived      0
pclass        0
sex           0
age         177
fare          0
alone         0
deck        688
dtype: int64

In [33]:
# 소수의 열만 제거하고 싶다면 drop을 이용

df.drop("deck", axis=1) # aixs=1: 열 제거

Unnamed: 0,survived,pclass,sex,age,fare,alone
0,0,3,male,22.0,7.2500,False
1,1,1,female,38.0,71.2833,False
2,1,3,female,26.0,7.9250,True
3,1,1,female,35.0,53.1000,False
4,0,3,male,35.0,8.0500,True
...,...,...,...,...,...,...
886,0,2,male,27.0,13.0000,True
887,1,1,female,19.0,30.0000,True
888,0,3,female,,23.4500,False
889,1,1,male,26.0,30.0000,True


In [34]:
df
# 원본에서는 제거가 안된 것을 볼수 있다.

Unnamed: 0,survived,pclass,sex,age,fare,alone,deck
0,0,3,male,22.0,7.2500,False,
1,1,1,female,38.0,71.2833,False,C
2,1,3,female,26.0,7.9250,True,
3,1,1,female,35.0,53.1000,False,C
4,0,3,male,35.0,8.0500,True,
...,...,...,...,...,...,...,...
886,0,2,male,27.0,13.0000,True,
887,1,1,female,19.0,30.0000,True,B
888,0,3,female,,23.4500,False,
889,1,1,male,26.0,30.0000,True,C


In [35]:
# 원본 객체도 수정해주려면 inplace=True 를 해준다
df.drop("deck", axis = 1, inplace=True)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,survived,pclass,sex,age,fare,alone
0,0,3,male,22.0,7.2500,False
1,1,1,female,38.0,71.2833,False
2,1,3,female,26.0,7.9250,True
3,1,1,female,35.0,53.1000,False
4,0,3,male,35.0,8.0500,True
...,...,...,...,...,...,...
886,0,2,male,27.0,13.0000,True
887,1,1,female,19.0,30.0000,True
888,0,3,female,,23.4500,False
889,1,1,male,26.0,30.0000,True


In [37]:
# 모든 결측치를 제거!! (행으로 제거)
df.dropna(inplace=True)
df.shape # 891 -> 714

(714, 6)

In [38]:
# Pandas를 활용한 기초통게량 살펴보기

# 데이터 갯수 확인 (결측치를 제외한 상태이다)
df.count()

survived    714
pclass      714
sex         714
age         714
fare        714
alone       714
dtype: int64

In [39]:
# 자료 요약하기!!!
df.describe()

Unnamed: 0,survived,pclass,age,fare
count,714.0,714.0,714.0,714.0
mean,0.406162,2.236695,29.699118,34.694514
std,0.49146,0.83825,14.526497,52.91893
min,0.0,1.0,0.42,0.0
25%,0.0,1.0,20.125,8.05
50%,0.0,2.0,28.0,15.7417
75%,1.0,3.0,38.0,33.375
max,1.0,3.0,80.0,512.3292


In [41]:
# 모드 데이터를 describe 하고 싶다!
df.describe(include="all")

Unnamed: 0,survived,pclass,sex,age,fare,alone
count,714.0,714.0,714,714.0,714.0,714
unique,,,2,,,2
top,,,male,,,True
freq,,,453,,,404
mean,0.406162,2.236695,,29.699118,34.694514,
std,0.49146,0.83825,,14.526497,52.91893,
min,0.0,1.0,,0.42,0.0,
25%,0.0,1.0,,20.125,8.05,
50%,0.0,2.0,,28.0,15.7417,
75%,1.0,3.0,,38.0,33.375,


In [42]:
# 범주형 자료를 볼때는 object
df.describe(include='object')

Unnamed: 0,sex
count,714
unique,2
top,male
freq,453


In [43]:
# 수치형 자료를 볼 때는 number

df.describe(include="number")

Unnamed: 0,survived,pclass,age,fare
count,714.0,714.0,714.0,714.0
mean,0.406162,2.236695,29.699118,34.694514
std,0.49146,0.83825,14.526497,52.91893
min,0.0,1.0,0.42,0.0
25%,0.0,1.0,20.125,8.05
50%,0.0,2.0,28.0,15.7417
75%,1.0,3.0,38.0,33.375
max,1.0,3.0,80.0,512.3292


In [None]:
# 개별적으로도 살펴 볼 수 있다.
df.mean() # 평균
df.std()  # 표준편차
df.max()  # 최댓값
df.min()  # 최솟값
df.median() # 중앙값
df.mode()  # 최빈값

In [47]:
# 남자승객, 여자승객의 각각 평균 연령?

print("남자승객의 평균 연령 : ", df[df["sex"]=="male"]["age"].mean())
print("여자승객의 평균 연령 : ", df[df["sex"]=="female"]["age"].mean())

남자승객의 평균 연령 :  30.72664459161148
여자승객의 평균 연령 :  27.915708812260537


In [48]:
# 성별에 따른 생존자의 비율을 구하시오 
# 남자승객 생존자 / 남자승객 전체

male_count = titanic[titanic["sex"]=="male"]["sex"].count()
female_count = titanic[titanic["sex"]=="female"]["sex"].count()

male_survived = titanic[(titanic["sex"]=="male") & (titanic["survived"]==1)]["sex"].count()
female_survived = titanic[(titanic["sex"]=="female") & (titanic["survived"]==1)]["sex"].count()

print("남자 생존자 비율 : {}".format(male_survived/male_count))
print("여자 생존자 비율 : {}".format(female_survived/female_count))

남자 생존자 비율 : 0.18890814558058924
여자 생존자 비율 : 0.7420382165605095


In [49]:
print("남자 생존자 비율 = {: .2f}%".format(male_survived/male_count*100))
print("여자 생존자 비율 = {: .2f}%".format(female_survived/female_count*100))

남자 생존자 비율 =  18.89%
여자 생존자 비율 =  74.20%


In [None]:
# 문제 6 탑승 등급별(class) 생존자의 비율을 계산하시오

In [53]:
titanic.describe(include="all")

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
count,891.0,891.0,891,714.0,891.0,891.0,891.0,889,891,891,891,203,889,891,891
unique,,,2,,,,,3,3,3,2,7,3,2,2
top,,,male,,,,,S,Third,man,True,C,Southampton,no,True
freq,,,577,,,,,644,491,537,537,59,644,549,537
mean,0.383838,2.308642,,29.699118,0.523008,0.381594,32.204208,,,,,,,,
std,0.486592,0.836071,,14.526497,1.102743,0.806057,49.693429,,,,,,,,
min,0.0,1.0,,0.42,0.0,0.0,0.0,,,,,,,,
25%,0.0,2.0,,20.125,0.0,0.0,7.9104,,,,,,,,
50%,0.0,3.0,,28.0,0.0,0.0,14.4542,,,,,,,,
75%,1.0,3.0,,38.0,1.0,0.0,31.0,,,,,,,,


In [54]:
titanic["class"]

0       Third
1       First
2       Third
3       First
4       Third
        ...  
886    Second
887     First
888     Third
889     First
890     Third
Name: class, Length: 891, dtype: category
Categories (3, object): ['First', 'Second', 'Third']

In [59]:
First_class = titanic[titanic["class"]=="First"]["class"].count()
Second_class = titanic[titanic["class"]=="Second"]["class"].count()
Third_class = titanic[titanic["class"]=="Third"]["class"].count()

First_class_survived = titanic[(titanic["survived"]==1)&(titanic["class"]=="First")]["class"].count()
Second_class_survived = titanic[(titanic["survived"]==1)&(titanic["class"]=="Second")]["class"].count()
Third_class_survived = titanic[(titanic["survived"]==1)&(titanic["class"]=="Third")]["class"].count()

print("일등석 생존자 비율 = {:.2f}%".format(First_class_survived/First_class*100))
print("이등석 생존자 비율 = {:.2f}%".format(Second_class_survived/Second_class*100))
print("삼등석 생존자 비율 = {:.2f}%".format(Third_class_survived/Third_class*100))

일등석 생존자 비율 = 62.96%
이등석 생존자 비율 = 47.28%
삼등석 생존자 비율 = 24.24%
