In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

# 자료형 (type) 변형, 범주형 (category) 데이터 처리

범주형 (category) :

수치로 측정이 불가능한 자료 / 종류를 표시하는 데이터

순위형 자료 (Ordinal Data)
: 범주 사이의 순서에 의미가 있는 자료
Ex) 학점(A+, A, A-, ...)

명목형 자료 (Nominal Data)
: 범주 사이의 순서가 의미가 없는 자료
Ex) 혈액형(A, B, O, AB), 성별(남성, 여성), ...

# 1. 데이터 표준화

## 1-1. 단위 환산 (소수점 반올림)

In [3]:
# Load seaborn's 'titanic' example dataset and preview the first rows.
t = sns.load_dataset('titanic')
t.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [4]:
# Series.round(decimals) rounds every value; `decimals` is the number of digits
# kept after the decimal point. The fare column is overwritten with the result.
# Note: the third row's fare, 7.925, shows up as 7.92 (not 7.93) in the preview,
# so do not expect schoolbook half-up rounding for values ending in 5.
t['fare'] = t['fare'].round(decimals=2)
t.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.28,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.92,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
# Rounding can be written either as dataframe.round(ndigits) or as the built-in
# round(dataframe, ndigits).
# When ndigits is omitted, the built-in rounds at the first decimal place,
# i.e. to the nearest integer.
print(round(1.66667), round(1.66667, 3), sep='\n')

2
1.667


## 1-2. 자료형 변환 : astype()

In [6]:
# Inspect the dtype of every column
t.dtypes

Unnamed: 0,0
survived,int64
pclass,int64
sex,object
age,float64
sibsp,int64
parch,int64
fare,float64
embarked,object
class,category
who,object


In [7]:
# Summary statistics for the 'sex' column (count / unique / top / freq)
t['sex'].describe()

Unnamed: 0,sex
count,891
unique,2
top,male
freq,577


In [9]:
# Convert the 'sex' column to the pandas category dtype
t['sex'] = t['sex'].astype('category')

In [10]:
# Check the column's dtype after the conversion
t['sex'].dtype

CategoricalDtype(categories=['female', 'male'], ordered=False, categories_dtype=object)

In [12]:
# Read the student score sheet. The path is relative, so myStudent.xlsx must be
# in the notebook's working directory.
a = pd.read_excel('myStudent.xlsx')
a.head()

Unnamed: 0,gender,group,education,kor,eng,math
0,f,a,highscool,100,80,50
1,f,b,college,56,15,89
2,m,c,university,12,68,52
3,f,d,highscool,85,65,15
4,m,e,college,80,100,50


In [35]:
# Add an 'avg' column: the per-row mean of the Korean, English and math scores,
# rounded to two decimal places.
subject_scores = a[['kor', 'eng', 'math']]
a['avg'] = subject_scores.mean(axis='columns').round(2)
a.head()

Unnamed: 0,gender,group,education,kor,eng,math,avg
0,female,a,highscool,100,80,50,76.67
1,female,b,college,56,15,89,53.33
2,male,c,university,12,68,52,44.0
3,female,d,highscool,85,65,15,55.0
4,male,e,college,80,100,50,76.67


In [30]:
# Inspect the dtype of every column in the student frame
a.dtypes

Unnamed: 0,0
gender,object
group,object
education,object
kor,int64
eng,int64
math,int64
avg,float64


In [31]:
# Cast each text label column to the category dtype, then re-check the dtypes.
for column in ('gender', 'group', 'education'):
    a[column] = a[column].astype('category')
a.dtypes

Unnamed: 0,0
gender,category
group,category
education,category
kor,int64
eng,int64
math,int64
avg,float64


In [32]:
# Summarize the three categorical columns (count / unique / top / freq)
a[['gender', 'group', 'education']].describe()

Unnamed: 0,gender,group,education
count,110,110,110
unique,2,5,3
top,m,a,highscool
freq,61,22,41


In [36]:
# Relabel the gender codes in one call with an {old: new} mapping:
# 'f' -> 'female', 'm' -> 'male'.
#
# For ordinary (non-categorical) columns the same mapping is applied with
#     s = s.replace({old: new, ...})
# The 'gender' column is already a category dtype at this point, so its labels
# are changed with .cat.rename_categories instead; using Series.replace to
# rename categories is deprecated in recent pandas.
#
# The result is assigned back to the column rather than calling
# a['gender'].replace(..., inplace=True): that in-place call operates on the
# temporary Series returned by a['gender'], which pandas warns about and which
# leaves `a` unchanged once copy-on-write is in effect.
# Labels that are not keys of the mapping are kept as-is, so re-running this
# cell after the rename is harmless.
a['gender'] = a['gender'].cat.rename_categories({'f': 'female', 'm': 'male'})
a['gender'].unique()

None


['female', 'male']
Categories (2, object): ['female', 'male']

# 2. 범주형(카테고리) 데이터 처리

## 구간 분할 : np.histogram(), pd.cut()

In [38]:
# Work on an independent copy so that the binning column added later does not
# touch `a`. pd.DataFrame(a) does not copy the data of an existing DataFrame by
# default, whereas DataFrame.copy() makes a deep copy.
aa = a.copy()
aa.head()

Unnamed: 0,gender,group,education,kor,eng,math,avg
0,female,a,highscool,100,80,50,76.67
1,female,b,college,56,15,89,53.33
2,male,c,university,12,68,52,44.0
3,female,d,highscool,85,65,15,55.0
4,male,e,college,80,100,50,76.67


In [40]:
# np.histogram(values, bins=k) splits the value range into k bins and returns
# two arrays:
#   count         : number of values that fall into each bin
#   bins_dividers : the bin edges (one more entry than there are bins)
count, bins_dividers = np.histogram(aa['avg'], bins=3)
print(f"{count} \n {bins_dividers}")

[16 40 54] 
 [26.67       49.44666667 72.22333333 95.        ]


In [41]:
# pd.cut builds the categorical column directly: each average is assigned to
# the interval it falls in and labelled with the matching name.
bin_names = ['low', 'middle', 'high']
aa['aa_bin'] = pd.cut(
    aa['avg'],            # values to bin
    bins_dividers,        # bin edges
    labels=bin_names,     # one label per interval
    include_lowest=True,  # the first interval includes its lower edge
)
aa.head()

Unnamed: 0,gender,group,education,kor,eng,math,avg,aa_bin
0,female,a,highscool,100,80,50,76.67,high
1,female,b,college,56,15,89,53.33,middle
2,male,c,university,12,68,52,44.0,low
3,female,d,highscool,85,65,15,55.0,middle
4,male,e,college,80,100,50,76.67,high
