In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import platform
import seaborn

# Choose a Korean-capable font family from the host OS name
# ('Darwin' is macOS); any other OS gets None and keeps matplotlib's default.
font_name = {
    'Darwin': 'AppleGothic',
    'Windows': 'NanumGothic',
}.get(platform.system())

# Echo the choice on macOS only, as the original cell did.
if font_name == 'AppleGothic':
    print('apple gothic')

# Apply the font only when one was selected for this platform.
if font_name is not None:
    plt.rcParams['font.family'] = font_name

# Turn off the Unicode minus glyph for axis labels.
plt.rcParams['axes.unicode_minus'] = False

# Silence FutureWarning messages for the rest of the session.
import warnings
warnings.simplefilter('ignore', FutureWarning)

apple gothic


In [17]:
# Load the welfare dataset; the path is relative to the notebook's working directory.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which looks like a
# saved index — confirm whether it should be read with index_col=0.
df = pd.read_csv('../data/welfare_python.csv')
# Show the loaded frame as the cell output.
df

Unnamed: 0,gender,birth,marriage,religion,code_job,income,code_religion
0,2,1936,2,2,,,1
1,2,1945,2,2,,,1
2,1,1948,2,2,942.0,120.0,1
3,1,1942,3,1,762.0,200.0,1
4,2,1923,2,1,,,1
...,...,...,...,...,...,...,...
16659,2,1967,1,1,,,5
16660,2,1992,5,1,314.0,302.5,5
16661,1,1995,5,1,,,5
16662,2,1998,0,1,,,5


## 전처리

In [18]:
# Recode the numeric gender codes as text labels (1 -> '남성', 2 -> '여성').
#
# Series.replace builds a new Series instead of writing strings into the
# existing integer column through df.loc, which pandas reports as a
# deprecated incompatible-dtype assignment. Values other than 1 and 2 are
# left untouched, and re-running the cell is a no-op, as before.
df['gender'] = df['gender'].replace({1: '남성', 2: '여성'})
df

  df.loc[df['gender'] == 1, 'gender'] = '남성'


Unnamed: 0,gender,birth,marriage,religion,code_job,income,code_religion
0,여성,1936,2,2,,,1
1,여성,1945,2,2,,,1
2,남성,1948,2,2,942.0,120.0,1
3,남성,1942,3,1,762.0,200.0,1
4,여성,1923,2,1,,,1
...,...,...,...,...,...,...,...
16659,여성,1967,1,1,,,5
16660,여성,1992,5,1,314.0,302.5,5
16661,남성,1995,5,1,,,5
16662,여성,1998,0,1,,,5


In [19]:
# Count how many rows carry each value of the marriage column.
df['marriage'].value_counts()

marriage
1    8431
0    2861
5    2433
2    2117
3     712
4      84
6      26
Name: count, dtype: int64

In [20]:
# Convert the marriage code to a text label.

def Enco_marriage(x):
    """Return the text label for a marriage code.

    Code 1 maps to '결혼' and code 3 maps to '이혼'; every other value
    (including NaN) is labelled '결측치'.
    """
    for code, label in ((1, '결혼'), (3, '이혼')):
        if x == code:
            return label
    return '결측치'

# Label every row's marriage code, then show the resulting label counts.
df['marriage'] = df['marriage'].apply(Enco_marriage)
df['marriage'].value_counts()


marriage
결혼     8431
결측치    7521
이혼      712
Name: count, dtype: int64

In [32]:
# Prepare code_job as a merge key: treat +/-inf as missing, fill missing
# codes with 0, then cast the column to int.
df['code_job'] = (
    df['code_job']
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
    .astype(int)
)
df

Unnamed: 0,gender,birth,marriage,religion,code_job,income,code_religion
0,여성,1936,결측치,2,0,,1
1,여성,1945,결측치,2,0,,1
2,남성,1948,결측치,2,942,120.0,1
3,남성,1942,이혼,1,762,200.0,1
4,여성,1923,결측치,1,0,,1
...,...,...,...,...,...,...,...
16659,여성,1967,결혼,1,0,,5
16660,여성,1992,결측치,1,314,302.5,5
16661,남성,1995,결측치,1,0,,5
16662,여성,1998,결측치,1,0,,5


In [34]:
# Read the job lookup table (cp949-encoded) and left-join it on the shared
# code_job column so each row keeps its place and gains a job column.
job = pd.read_csv('../data/welfare_job.csv', encoding='cp949')

df = df.merge(job, how='left', on='code_job')
df

Unnamed: 0,gender,birth,marriage,religion,code_job,income,code_religion,job
0,여성,1936,결측치,2,0,,1,
1,여성,1945,결측치,2,0,,1,
2,남성,1948,결측치,2,942,120.0,1,경비원 및 검표원
3,남성,1942,이혼,1,762,200.0,1,전기공
4,여성,1923,결측치,1,0,,1,
...,...,...,...,...,...,...,...,...
16659,여성,1967,결혼,1,0,,5,
16660,여성,1992,결측치,1,314,302.5,5,비서 및 사무 보조원
16661,남성,1995,결측치,1,0,,5,
16662,여성,1998,결측치,1,0,,5,


In [36]:
# Count how many rows carry each value of the code_religion column.
df['code_religion'].value_counts()

code_religion
2    3711
7    2922
3    2785
1    2486
4    2036
5    1467
6    1257
Name: count, dtype: int64

In [41]:
# Map the 7 regional-group codes stored in code_religion to text labels.
def religion_str(x):
    """Return the region label for a regional-group code.

    Parameters
    ----------
    x
        Regional-group code. Codes 1 through 6 each have their own label.

    Returns
    -------
    str
        The region label. Any value other than 1-6 (code 7 included)
        falls back to '광주/전남/전북/제주도', matching the original
        if/elif/else chain.
    """
    labels = {
        1: '서울',
        2: '수도권(인천/경기)',
        3: '부산/경남/울산',
        4: '대구/경북',  # fixed typo: was '대구/경국'
        5: '대전/충남',
        6: '강원/충북',
    }
    return labels.get(x, '광주/전남/전북/제주도')
    
# Replace each regional-group code with its text label.
df['code_religion'] = df['code_religion'].apply(religion_str)

In [44]:
# Create the age column: one plus the difference between 2024 and the birth year.
df['age'] = 1 + (2024 - df['birth'])
df

Unnamed: 0,gender,birth,marriage,religion,code_job,income,code_religion,job,age
0,여성,1936,결측치,2,0,,서울,,89
1,여성,1945,결측치,2,0,,서울,,80
2,남성,1948,결측치,2,942,120.0,서울,경비원 및 검표원,77
3,남성,1942,이혼,1,762,200.0,서울,전기공,83
4,여성,1923,결측치,1,0,,서울,,102
...,...,...,...,...,...,...,...,...,...
16659,여성,1967,결혼,1,0,,대전/충남,,58
16660,여성,1992,결측치,1,314,302.5,대전/충남,비서 및 사무 보조원,33
16661,남성,1995,결측치,1,0,,대전/충남,,30
16662,여성,1998,결측치,1,0,,대전/충남,,27


In [48]:
# Recode the numeric religion codes as text labels (1 -> '있음', 2 -> '없음').
#
# Series.replace builds a new Series instead of writing strings into the
# existing integer column through df.loc, which pandas reports as a
# deprecated incompatible-dtype assignment. Values other than 1 and 2 are
# left untouched, and re-running the cell is a no-op, as before.
df['religion'] = df['religion'].replace({1: '있음', 2: '없음'})
df

Unnamed: 0,gender,birth,marriage,religion,code_job,income,code_religion,job,age
0,여성,1936,결측치,없음,0,,서울,,89
1,여성,1945,결측치,없음,0,,서울,,80
2,남성,1948,결측치,없음,942,120.0,서울,경비원 및 검표원,77
3,남성,1942,이혼,있음,762,200.0,서울,전기공,83
4,여성,1923,결측치,있음,0,,서울,,102
...,...,...,...,...,...,...,...,...,...
16659,여성,1967,결혼,있음,0,,대전/충남,,58
16660,여성,1992,결측치,있음,314,302.5,대전/충남,비서 및 사무 보조원,33
16661,남성,1995,결측치,있음,0,,대전/충남,,30
16662,여성,1998,결측치,있음,0,,대전/충남,,27


In [54]:
# Impute missing income values with the mean of the observed incomes.
income_mean = df['income'].mean()
print(income_mean)

# Fill the gaps, then round the whole column to two decimal places.
df['income'] = df['income'].fillna(income_mean).round(2)
df

241.61972635621697


Unnamed: 0,gender,birth,marriage,religion,code_job,income,code_religion,job,age
0,여성,1936,결측치,없음,0,241.62,서울,,89
1,여성,1945,결측치,없음,0,241.62,서울,,80
2,남성,1948,결측치,없음,942,120.00,서울,경비원 및 검표원,77
3,남성,1942,이혼,있음,762,200.00,서울,전기공,83
4,여성,1923,결측치,있음,0,241.62,서울,,102
...,...,...,...,...,...,...,...,...,...
16659,여성,1967,결혼,있음,0,241.62,대전/충남,,58
16660,여성,1992,결측치,있음,314,302.50,대전/충남,비서 및 사무 보조원,33
16661,남성,1995,결측치,있음,0,241.62,대전/충남,,30
16662,여성,1998,결측치,있음,0,241.62,대전/충남,,27


In [62]:
# Create the ageg (age-group) column from age:
#   age >= 60        -> '노년'
#   30 <= age < 60   -> '중년'
#   anything else    -> '청년'
#
# np.select evaluates the conditions in order and takes the first match,
# which mirrors the original if/elif/else chain. It labels the whole column
# in one vectorised pass, so it needs no per-row .loc writes, does not
# depend on the index being 0..n-1, and never stores strings into an
# integer placeholder column.
df['ageg'] = np.select(
    [df['age'] >= 60, df['age'] >= 30],
    ['노년', '중년'],
    default='청년',
)

df['ageg'].value_counts()

ageg
노년    8185
중년    5429
청년    3050
Name: count, dtype: int64