In [265]:
# 라이브러리 임포트
import pandas as pd

# 데이터 url
url = 'https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/titanic.csv'

# 데이터프레임으로 데이터를 적재
dataframe = pd.read_csv(url)

# 처음 5개 행을 출력
dataframe.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Create Data

In [266]:
# Pandas 데이터 직접 생성
dataframe = pd.DataFrame()

# 열 추가
dataframe['Name'] = ['Jacky Jackson', 'Steven Stevenson']
dataframe['Age'] = [38, 25]
dataframe['Driver'] = [True, False]

# 데이터프레임 확인
dataframe

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False


In [268]:
# 리스트나 튜플로부터 생성
data = [['Jacky Jackson', 38, True], ['Steven Stevenson', 25, False]]

pd.DataFrame(data, columns = ['Name', 'Age', 'Driver'])

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False


In [269]:
# Numpy의 ndarray로 생성
import numpy as np

data = [['Jacky Jackson', 38, True], ['Steven Stevenson', 25, False]]

matrix = np.array(data)

pd.DataFrame(matrix, columns = ['Name', 'Age', 'Driver'])

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False


In [270]:
# 딕셔너리로부터 생성
data = {'Name': ['Jacky Jackson', 'Steven Stevenson'],
        'Age': [38, 25],
        'Driver': [True, False]}

pd.DataFrame(data)

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False


In [271]:
# 딕셔너리로 이루어진 리스트로부터 생성
data = [{'Name': 'Jacky Jackson', 'Age': 38, 'Driver': True},
        {'Name': 'Steven Stevenson', 'Age': 25, 'Driver': False}]

# index 지정도 가능
pd.DataFrame(data, index = ['row1', 'row2'])

Unnamed: 0,Name,Age,Driver
row1,Jacky Jackson,38,True
row2,Steven Stevenson,25,False


# Series

In [272]:
# Series 생성
dat = pd.Series([1, 3, 6, 12])
dat

0     1
1     3
2     6
3    12
dtype: int64

In [273]:
# Series 저장된 데이터 값 확인
dat.values

array([ 1,  3,  6, 12])

In [274]:
# Series는 다양한 객체 데이터로 저장 가능
# 모든 요소가 같은 데이터 타입일 필요 없음
dat2 = pd.Series(np.array([1,3, np.nan, 12]))
dat3 = pd.Series(['aa', 'bb', 'c', 'd'])
dat4 = pd.Series([1, 'aa', 2.34, 'd'])

In [275]:
dat5 = pd.Series([1, 3, 6, 12], index = [1, 10, 20, 33])
dat6 = pd.Series([1, 3, 6, 12], index = ['a', 'b', 'c', 'd'])
dat7 = pd.Series({'a': 1, 'b': 3, 'c': 6, 'd': 12})
print(dat5)
print(dat6)
print(dat7)

1      1
10     3
20     6
33    12
dtype: int64
a     1
b     3
c     6
d    12
dtype: int64
a     1
b     3
c     6
d    12
dtype: int64


In [276]:
# 레이블 값 변경
dat2.index = ['un', 'due', 'trois', 'quatre']
dat2

un         1.0
due        3.0
trois      NaN
quatre    12.0
dtype: float64

# DataFrame

In [277]:
df = pd.DataFrame(np.random.randn(6, 2),
                  index = [[1, 1, 2, 2, 3, 3], [1, 2, 1, 2, 1, 2]],
                           columns = ['item1', 'item2'])
df

Unnamed: 0,Unnamed: 1,item1,item2
1,1,-0.27043,0.442527
1,2,0.757499,-1.775052
2,1,0.467148,0.970303
2,2,-0.85624,-0.481589
3,1,1.423228,-1.978231
3,2,0.375859,0.529611


# Use-Dataframe

In [278]:
# 처음 2개의 행 확인
dataframe.head(2)

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False


In [279]:
# 마지막 3개의 행 확인
dataframe.tail(3)

Unnamed: 0,Name,Age,Driver
0,Jacky Jackson,38,True
1,Steven Stevenson,25,False


In [280]:
# 행과 열의 수 확인
dataframe.shape

(2, 3)

In [281]:
# 수치형 열의 기본 통계값 확인
dataframe.describe()

Unnamed: 0,Age
count,2.0
mean,31.5
std,9.192388
min,25.0
25%,28.25
50%,31.5
75%,34.75
max,38.0


In [282]:
# 첫 번째 행을 선택
dataframe.iloc[0]

Name      Jacky Jackson
Age                  38
Driver             True
Name: 0, dtype: object

In [283]:
# 세 개의 행을 선택
dataframe.iloc[1:4]

Unnamed: 0,Name,Age,Driver
1,Steven Stevenson,25,False


In [284]:
# 4개의 행을 선택
# loc는 마지막 인덱스 포함
dataframe.loc[1:4]

Unnamed: 0,Name,Age,Driver
1,Steven Stevenson,25,False


In [285]:
# 라이브러리 임포트
import pandas as pd

# 데이터 url
url = 'https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/titanic.csv'

# 데이터프레임으로 데이터를 적재
dataframe = pd.read_csv(url)

# 처음 5개 행을 출력
dataframe.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [286]:
# 인덱스 설정
dataframe = dataframe.set_index(['Name'])

# 행 확인
dataframe.loc['Heikkinen, Miss. Laina']

PassengerId                   3
Survived                      1
Pclass                        3
Sex                      female
Age                        26.0
SibSp                         0
Parch                         0
Ticket         STON/O2. 3101282
Fare                      7.925
Cabin                       NaN
Embarked                      S
Name: Heikkinen, Miss. Laina, dtype: object

In [287]:
# Age, Sex 열만 선택한다
dataframe.loc[:'Heikkinen, Miss. Laina', 'Sex': 'Age']

Unnamed: 0_level_0,Sex,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Braund, Mr. Owen Harris",male,22.0
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0
"Heikkinen, Miss. Laina",female,26.0


In [288]:
# dataframe[:3]가 동일
dataframe[:'Heikkinen, Miss. Laina']

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Braund, Mr. Owen Harris",1,0,3,male,22.0,1,0,A/5 21171,7.25,,S
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [289]:
dataframe[['Sex', 'Age']].head(2)

Unnamed: 0_level_0,Sex,Age
Name,Unnamed: 1_level_1,Unnamed: 2_level_1
"Braund, Mr. Owen Harris",male,22.0
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0


In [290]:
# 조건에 따라 행 선택

# 여성 승객만 선택
dataframe[dataframe['Sex'] == 'female'].head(2)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Heikkinen, Miss. Laina",3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [291]:
# 여성이면서 30세 이상인 승객만 선택

dataframe[(dataframe['Sex'] == 'female') & (dataframe['Age'] >=30)]

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C
"Futrelle, Mrs. Jacques Heath (Lily May Peel)",4,1,1,female,35.0,1,0,113803,53.1000,C123,S
"Bonnell, Miss. Elizabeth",12,1,1,female,58.0,0,0,113783,26.5500,C103,S
"Hewlett, Mrs. (Mary D Kingcome)",16,1,2,female,55.0,0,0,248706,16.0000,,S
"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",19,0,3,female,31.0,1,0,345763,18.0000,,S
...,...,...,...,...,...,...,...,...,...,...,...
"Swift, Mrs. Frederick Joel (Margaret Welles Barron)",863,1,1,female,48.0,0,0,17466,25.9292,D17,S
"Bystrom, Mrs. (Karolina)",866,1,2,female,42.0,0,0,236852,13.0000,,S
"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",872,1,1,female,47.0,1,1,11751,52.5542,D35,S
"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",880,1,1,female,56.0,0,1,11767,83.1583,C50,C


In [292]:
# Name 열에 Heikkinen이 포함된 행만 찾기
dataframe = pd.read_csv(url)

dataframe['Name'].str.find('Heikkinen')

0     -1
1     -1
2      0
3     -1
4     -1
      ..
886   -1
887   -1
888   -1
889   -1
890   -1
Name: Name, Length: 891, dtype: int64

In [293]:
# 특정 문자열 포함한 행 선택 가능

dataframe[dataframe['Name'].str.find('Heikkinen') > -1].head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [294]:
# Sex 열이 female인 모든 행을 Woman으로 변경
dataframe['Sex'].replace('female', 'Woman').head(2)

0     male
1    Woman
Name: Sex, dtype: object

In [295]:
# female과 male을 Woman과 Man으로 치환

dataframe['Sex'].replace(["female", "male"], ["Woman", "Man"]).head(5)

0      Man
1    Woman
2    Woman
3    Woman
4      Man
Name: Sex, dtype: object

In [296]:
# 값을 치환하고 두 개의 행을 출력
dataframe.replace(1, "One").head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,One,0,3,"Braund, Mr. Owen Harris",male,22.0,One,0,A/5 21171,7.25,,S
1,2,One,One,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,One,0,PC 17599,71.2833,C85,C


In [297]:
# 한 번에 여러 개의 값을 동일하게 변경
dataframe.replace(["female", "male"], "person").head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",person,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",person,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",person,26.0,0,0,STON/O2. 3101282,7.925,,S


In [298]:
# 딕셔너리로 바꿀 값을 각각 매핑하여 전달가능
dataframe.replace({"female": 1, "male": 0}).head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S


In [299]:
# 열 이름을 바꾸고 행 출력
dataframe.rename(columns = {'Pclass': 'Passenger Class'}).head(2)

Unnamed: 0,PassengerId,Survived,Passenger Class,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [300]:
# 열 이름 2개 바꾸고 행 출력
dataframe.rename(columns = {'Pclass': 'Passenger Class', 'Sex': 'Gender'}).head(2)

Unnamed: 0,PassengerId,Survived,Passenger Class,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [301]:
dataframe.columns = ['PassengerId', 'Survived Class', 'Class', 'Name', 'Gender', 'Age', 'SipSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

In [302]:
# 인덱스 0을 -1로 바꾼다
dataframe.rename(index = {0: -1}).head(2)

Unnamed: 0,PassengerId,Survived Class,Class,Name,Gender,Age,SipSp,Parch,Ticket,Fare,Cabin,Embarked
-1,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [303]:
# 열 이름을 소문자로 바꾼다.
dataframe.rename(str.lower, axis = 'columns').head(2)

Unnamed: 0,passengerid,survived class,class,name,gender,age,sipsp,parch,ticket,fare,cabin,embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [304]:
# 기초통계 구하기
print('최댓값:', dataframe['Age'].max())
print('최솟값:', dataframe['Age'].min())
print('평균:', dataframe['Age'].mean())
print('합:', dataframe['Age'].sum())
print('카운트:', dataframe['Age'].count())

최댓값: 80.0
최솟값: 0.42
평균: 29.69911764705882
합: 21205.17
카운트: 714


In [305]:
# 데이터프레임 전체에도 카운트 출력
dataframe.count()

PassengerId       891
Survived Class    891
Class             891
Name              891
Gender            891
Age               714
SipSp             891
Parch             891
Ticket            891
Fare              891
Cabin             204
Embarked          889
dtype: int64

In [306]:
# 수치형 열의 공분산을 계산
dataframe.cov()

  dataframe.cov()


Unnamed: 0,PassengerId,Survived Class,Class,Age,SipSp,Parch,Fare
PassengerId,66231.0,-0.626966,-7.561798,138.696504,-16.325843,-0.342697,161.883369
Survived Class,-0.626966,0.236772,-0.137703,-0.551296,-0.018954,0.032017,6.221787
Class,-7.561798,-0.137703,0.699015,-4.496004,0.076599,0.012429,-22.830196
Age,138.696504,-0.551296,-4.496004,211.019125,-4.163334,-2.344191,73.84903
SipSp,-16.325843,-0.018954,0.076599,-4.163334,1.216043,0.368739,8.748734
Parch,-0.342697,0.032017,0.012429,-2.344191,0.368739,0.649728,8.661052
Fare,161.883369,6.221787,-22.830196,73.84903,8.748734,8.661052,2469.436846


In [307]:
# 수치형 열의 상관계수를 계산
dataframe.corr()

  dataframe.corr()


Unnamed: 0,PassengerId,Survived Class,Class,Age,SipSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived Class,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Class,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SipSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


In [308]:
# 정렬하기
dataframe.sort_values(by = 'Age', ascending = False)

Unnamed: 0,PassengerId,Survived Class,Class,Name,Gender,Age,SipSp,Parch,Ticket,Fare,Cabin,Embarked
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0000,A23,S
851,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.7750,,S
493,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
116,117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.7500,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [309]:
dataframe = pd.read_csv(url)

# 고유한 값을 찾는다
dataframe['Sex'].unique()

array(['male', 'female'], dtype=object)

In [310]:
# 카운트를 출력
dataframe['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [311]:
dataframe['Pclass'].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [312]:
# 고유한 값의 개수를 출력
dataframe['Pclass'].nunique()

3

In [313]:
dataframe.nunique()

PassengerId    891
Survived         2
Pclass           3
Name           891
Sex              2
Age             88
SibSp            7
Parch            7
Ticket         681
Fare           248
Cabin          147
Embarked         3
dtype: int64

In [314]:
# 누락된 값을 선택하고 2개의 행 출력
dataframe[dataframe['Age'].isnull()].head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S


In [315]:
# NaN으로 값을 바꾼다.
dataframe['Sex'] = dataframe['Sex'].replace('male', np.nan)

In [316]:
# 데이터를 적재하고 누락된 값을 설정함
dataframe = pd.read_csv(url, na_values = [np.nan, 'None', -999])
dataframe

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [317]:
# 데이터를 적재하고 누락된 값을 설정함
dataframe = pd.read_csv(url, na_values = ['female'], keep_default_na = False)
dataframe[12:14]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
12,13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S
13,14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S


In [318]:
# 데이터를 적재하고 누락된 값을 설정함
dataframe = pd.read_csv(url, na_filter = False)
dataframe[12:14]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
12,13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S
13,14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S


In [319]:
# 열 삭제
dataframe.drop('Age', axis = 1).head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,1,0,PC 17599,71.2833,C85,C


In [320]:
# 열 삭제
dataframe.drop(['Age', 'Sex'], axis = 1).head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,0,PC 17599,71.2833,C85,C


In [321]:
# 열 인덱스 지정하여 Pclass 열 삭제
dataframe.drop(dataframe.columns[2], axis = 1).head(2)

Unnamed: 0,PassengerId,Survived,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C


In [322]:
# 행 삭제하고 처음 두 개의 행 출력
dataframe[dataframe['Sex'] != 'male'].head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S


In [323]:
# 중복 행 삭제
dataframe.drop_duplicates().head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C


In [324]:
# 행의 개수 출력
print("원본 데이터프레임 행의 수:", len(dataframe))
print("중복 삭제 후 행의 수:", len(dataframe.drop_duplicates()))

원본 데이터프레임 행의 수: 891
중복 삭제 후 행의 수: 891


In [325]:
# 일부 열만 대상으로 중복 검사 진행, 처음 나타난 것 유지
dataframe.drop_duplicates(subset = ['Sex'])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38,1,0,PC 17599,71.2833,C85,C


In [326]:
# 중복된 행 삭제, 마지막에 나타난 것 유지
dataframe.drop_duplicates(subset = ['Sex'], keep = 'last')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [327]:
# Sex 열의 값으로 행을 그룹핑하고 평균 계산
dataframe.groupby('Sex').mean()

  dataframe.groupby('Sex').mean()


Unnamed: 0_level_0,PassengerId,Survived,Pclass,SibSp,Parch,Fare
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,431.028662,0.742038,2.159236,0.694268,0.649682,44.479818
male,454.147314,0.188908,2.389948,0.429809,0.235702,25.523893


In [328]:
# 행을 그룹핑
dataframe.groupby('Sex')

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7a55353dee00>

In [329]:
# 행 그룹핑하고 카운팅
dataframe.groupby('Survived')['Name'].count()

Survived
0    549
1    342
Name: Name, dtype: int64

In [330]:
dataframe = pd.read_csv(url, na_values = [np.nan, 'None', -999])

# 행 그룹핑하고 평균 계산
dataframe.groupby(['Sex', 'Survived'])['Age'].mean()

Sex     Survived
female  0           25.046875
        1           28.847716
male    0           31.618056
        1           27.276022
Name: Age, dtype: float64

In [331]:
# 처음 두 이름을 대문자로 바꾸어 출력
for name in dataframe['Name'][0:2]:
    print(name.upper())

BRAUND, MR. OWEN HARRIS
CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS THAYER)


In [332]:
# 처음 두 이름을 대문자로 바꾸어 출력
[name.upper() for name in dataframe['Name'][0:2]]

['BRAUND, MR. OWEN HARRIS',
 'CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS THAYER)']

In [333]:
# 함수를 만든다
def uppercase(x):
    return x.upper()

# apply 사용하여 열의 몯느 원소에 내장 하무나 사용자 정의 함수를 적용
dataframe['Name'].apply(uppercase)[0:2]

0                              BRAUND, MR. OWEN HARRIS
1    CUMINGS, MRS. JOHN BRADLEY (FLORENCE BRIGGS TH...
Name: Name, dtype: object

In [334]:
# map 메서드
# Survived 열의 1을 Live로, 0을 Dead로 바꾼다
dataframe['Survived'].map({1: 'Live', 0: 'Dead'})[:5]

0    Dead
1    Live
2    Live
3    Live
4    Dead
Name: Survived, dtype: object

In [335]:
# 함수의 매개변수(age)를 apply 메서드를 호출할 때 전달할 수 있음
dataframe['Age'].apply(lambda x, age:x < age, age = 30)[:5]

0     True
1    False
2     True
3    False
4    False
Name: Age, dtype: bool

In [336]:
# 각 열에서 가장 큰 값을 뽑습니다.
# 숫자로 변환할 수 있는 값만을 남기고 나머지는 NaN으로 설정
df = dataframe.apply(pd.to_numeric, errors='coerce')
df.apply(lambda x: max(x))

PassengerId    891.0000
Survived         1.0000
Pclass           3.0000
Name                NaN
Sex                 NaN
Age             80.0000
SibSp            8.0000
Parch            6.0000
Ticket              NaN
Fare           512.3292
Cabin               NaN
Embarked            NaN
dtype: float64

In [337]:
def truncate_string(x):
    if type(x) == str:
        return x[:20]
    return x

# 문자열의 길이 최대 20자로 줄인다
dataframe.applymap(truncate_string)[:5]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Har",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John B",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Lai",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacqu",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William H",male,35.0,0,0,373450,8.05,,S


In [338]:
# 행을 그룹핑하고 다음 함수를 적용
dataframe.groupby('Sex').apply(lambda x: x.count())

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
female,314,314,314,314,314,261,314,314,314,314,97,312
male,577,577,577,577,577,453,577,577,577,577,107,577


In [339]:
# dataframe_a, dataframe_b 만들기
dataframe_a = pd.DataFrame()
dataframe_a['id'] = [1, 2, 3]
dataframe_a['first'] = ['Alex', 'Amy', 'Allen']
dataframe_a['last'] = ['Anderson', 'Ackerman', 'Ali']
print(dataframe_a)

dataframe_b = pd.DataFrame()
dataframe_b['id'] = [4, 5, 6]
dataframe_b['first'] = ['Billy', 'Brian', 'Bran']
dataframe_b['last'] = ['Bonder', 'Black','Balwner']
print(dataframe_b)

   id  first      last
0   1   Alex  Anderson
1   2    Amy  Ackerman
2   3  Allen       Ali
   id  first     last
0   4  Billy   Bonder
1   5  Brian    Black
2   6   Bran  Balwner


In [340]:
# 행 방향으로 데이터프레임 연결
pd.concat([dataframe_a, dataframe_b], axis = 0)

Unnamed: 0,id,first,last
0,1,Alex,Anderson
1,2,Amy,Ackerman
2,3,Allen,Ali
0,4,Billy,Bonder
1,5,Brian,Black
2,6,Bran,Balwner


In [341]:
# 열 방향으로 데이터프레임 연결
pd.concat([dataframe_a, dataframe_b], axis = 1)

Unnamed: 0,id,first,last,id.1,first.1,last.1
0,1,Alex,Anderson,4,Billy,Bonder
1,2,Amy,Ackerman,5,Brian,Black
2,3,Allen,Ali,6,Bran,Balwner


In [342]:
# dataframe_employees 만들기
employee_data = {'employee_id': ['1', '2', '3', '4'],
                'name': ['Amy Jones', 'Allen Keys', 'Alice Bees', 'Tim Horton']}
dataframe_employees = pd.DataFrame(employee_data, columns = ['employee_id', 'name'])
dataframe_employees

Unnamed: 0,employee_id,name
0,1,Amy Jones
1,2,Allen Keys
2,3,Alice Bees
3,4,Tim Horton


In [343]:
# dataframe_sales 만들기
sales_data = {'employee_id': ['3', '4', '5', '6'],
              'total_sales': [23456, 2512, 2345, 1455]}
dataframe_sales = pd.DataFrame(sales_data, columns = ['employee_id', 'total_sales'])
dataframe_sales

Unnamed: 0,employee_id,total_sales
0,3,23456
1,4,2512
2,5,2345
3,6,1455


In [344]:
# 데이터프레임 병합, 내부조인은 on 매개변수에 병합열 지정하여 merge 메서드 사용
pd.merge(dataframe_employees, dataframe_sales, on = 'employee_id')

Unnamed: 0,employee_id,name,total_sales
0,3,Alice Bees,23456
1,4,Tim Horton,2512


In [345]:
# 외부 조인은 how 매개변수 지정
pd.merge(dataframe_employees, dataframe_sales, on = 'employee_id', how = 'outer')

Unnamed: 0,employee_id,name,total_sales
0,1,Amy Jones,
1,2,Allen Keys,
2,3,Alice Bees,23456.0
3,4,Tim Horton,2512.0
4,5,,2345.0
5,6,,1455.0


In [346]:
# 왼쪽 조인
pd.merge(dataframe_employees, dataframe_sales, on = 'employee_id', how = 'left')

Unnamed: 0,employee_id,name,total_sales
0,1,Amy Jones,
1,2,Allen Keys,
2,3,Alice Bees,23456.0
3,4,Tim Horton,2512.0


In [347]:
# 오른쪽 조인
pd.merge(dataframe_employees, dataframe_sales, on = 'employee_id', how = 'right')

Unnamed: 0,employee_id,name,total_sales
0,3,Alice Bees,23456
1,4,Tim Horton,2512
2,5,,2345
3,6,,1455


In [348]:
# 각 데이터프레임에서 병합하기 위한 열 이름 지정 가능
pd.merge(dataframe_employees, dataframe_sales, left_on = 'employee_id',  right_on = 'employee_id')

Unnamed: 0,employee_id,name,total_sales
0,3,Alice Bees,23456
1,4,Tim Horton,2512


In [349]:
# 각 데이터프레임의 인덱스를 기준으로 병합
pd.merge(dataframe_employees, dataframe_sales, right_index= True, left_index = True)

Unnamed: 0,employee_id_x,name,employee_id_y,total_sales
0,1,Amy Jones,3,23456
1,2,Allen Keys,4,2512
2,3,Alice Bees,5,2345
3,4,Tim Horton,6,1455


In [350]:
# Pandas는 Numpy와 뛰어난 호환성을 자랑한다

# random한 3행 4열의 Dataframe 생성
data_frame = pd.DataFrame(np.random.randn(3, 4))
data_frame

Unnamed: 0,0,1,2,3
0,1.633898,-0.509105,-2.338665,1.207224
1,0.719524,-0.258754,0.118051,0.208204
2,-1.124069,0.281987,-0.647221,1.894388


In [351]:
# Numpy에서 제공하는 함수를 바로 적용할 수 있다
new_data_frame = np.log(data_frame)
new_data_frame

  result = func(self.values, **kwargs)


Unnamed: 0,0,1,2,3
0,0.490968,,,0.188324
1,-0.329165,,-2.136642,-1.569237
2,,-1.265894,,0.638896


In [352]:
# ndarray로 간편한 변환도 제공
nparray = np.array(data_frame)
nparray

array([[ 1.6338977 , -0.5091052 , -2.33866488,  1.20722412],
       [ 0.71952449, -0.25875425,  0.11805057,  0.20820391],
       [-1.1240686 ,  0.28198696, -0.64722112,  1.89438804]])

In [353]:
# NaN이 존재하는 모든 행 삭제
data_frame.dropna()

Unnamed: 0,0,1,2,3
0,1.633898,-0.509105,-2.338665,1.207224
1,0.719524,-0.258754,0.118051,0.208204
2,-1.124069,0.281987,-0.647221,1.894388


In [354]:
# NaN이 존재하는 모든 열 삭제
data_frame.dropna(axis = 1)

Unnamed: 0,0,1,2,3
0,1.633898,-0.509105,-2.338665,1.207224
1,0.719524,-0.258754,0.118051,0.208204
2,-1.124069,0.281987,-0.647221,1.894388


In [355]:
data_frame.columns = ['col1', 'col2', 'col3', 'col4']


# col1, col2 열에 NaN이 포함된 행 삭제
data_frame.dropna(subset = ['col1', 'col2'])

Unnamed: 0,col1,col2,col3,col4
0,1.633898,-0.509105,-2.338665,1.207224
1,0.719524,-0.258754,0.118051,0.208204
2,-1.124069,0.281987,-0.647221,1.894388


In [356]:
# 0행과 1행에 NaN이 포함된 열 삭제
data_frame.dropna(axis = 1, subset = [0, 1])

Unnamed: 0,col1,col2,col3,col4
0,1.633898,-0.509105,-2.338665,1.207224
1,0.719524,-0.258754,0.118051,0.208204
2,-1.124069,0.281987,-0.647221,1.894388


In [357]:
# NaN을 다른 값으로 모두 대체
data_frame.fillna(0)

Unnamed: 0,col1,col2,col3,col4
0,1.633898,-0.509105,-2.338665,1.207224
1,0.719524,-0.258754,0.118051,0.208204
2,-1.124069,0.281987,-0.647221,1.894388


In [358]:
# NaN 값을 True로 다른 값을 False로 바꾸도록 하거나
data_frame.isnull()

Unnamed: 0,col1,col2,col3,col4
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False


In [359]:
# NaN을 같은 열의 바로 위의 행 값으로 대체
data_frame.fillna(method = 'ffill')

# NaN을 같은 열의 바로 아래의 행 값으로 대체
data_frame.fillna(method = 'bfill')

Unnamed: 0,col1,col2,col3,col4
0,1.633898,-0.509105,-2.338665,1.207224
1,0.719524,-0.258754,0.118051,0.208204
2,-1.124069,0.281987,-0.647221,1.894388


# 연습문제 1

In [360]:
# DataFrame을 활용해 남성과 여성 각각 Age가 30이상인 사람들의 평균 Height을 구해보자.
data_frame = pd.DataFrame({
    "Age": [37, 20, 30, 45, 25, 31, 41],
    'Height': [156, 180, 170, 160, 150, 140, 181],
    'Sex': ["f", "m", "m", "f", "f", "f", "m"]
})

print(data_frame[(data_frame['Age']>= 30)])
data_frame[(data_frame['Age']>= 30)].groupby('Sex').mean()

   Age  Height Sex
0   37     156   f
2   30     170   m
3   45     160   f
5   31     140   f
6   41     181   m


Unnamed: 0_level_0,Age,Height
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
f,37.666667,152.0
m,35.5,175.5
