In [14]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import pandas as pd
import plotly 
import plotly.graph_objs as go
from plotly.offline import iplot
import plotly.express as px
import warnings

%matplotlib inline
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation notices from pandas/plotly.
warnings.filterwarnings("ignore")
# Disabled plotly credential setup kept as a placeholder; never put a real username or API key here.
#  plotly.tools.set_credentials_file(username='ID', api_key='APIKEY')
# File name of the training CSV, relative to the notebook's working directory.
data_path = 'train.csv'

In [15]:
# Load the training set from the path configured in the setup cell, so the
# file location is defined in exactly one place.
train = pd.read_csv(data_path)

In [16]:
# Print each column's dtype and non-null count to spot columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43957 entries, 0 to 43956
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              43957 non-null  int64 
 1   workclass        41459 non-null  object
 2   education        43957 non-null  object
 3   educational-num  43957 non-null  int64 
 4   marital-status   43957 non-null  object
 5   occupation       41451 non-null  object
 6   relationship     43957 non-null  object
 7   race             43957 non-null  object
 8   gender           43957 non-null  object
 9   hours-per-week   43957 non-null  int64 
 10  native-country   43194 non-null  object
 11  income_>50K      43957 non-null  int64 
dtypes: int64(4), object(8)
memory usage: 4.0+ MB


In [17]:
# Preview the first rows of the raw data.
train.head()

Unnamed: 0,age,workclass,education,educational-num,marital-status,occupation,relationship,race,gender,hours-per-week,native-country,income_>50K
0,67,Private,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,60,United-States,1
1,17,Private,12th,8,Never-married,Other-service,Own-child,White,Male,15,United-States,0
2,31,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,40,United-States,1
3,58,State-gov,7th-8th,4,Married-civ-spouse,Transport-moving,Husband,White,Male,40,United-States,0
4,25,State-gov,Some-college,10,Never-married,Other-service,Not-in-family,Black,Male,40,United-States,0


In [18]:
# Count the missing values in each column.
train.isna().sum()

age                   0
workclass          2498
education             0
educational-num       0
marital-status        0
occupation         2506
relationship          0
race                  0
gender                0
hours-per-week        0
native-country      763
income_>50K           0
dtype: int64

### 결측치 채워넣기


In [19]:
# Fill the three columns that contain missing values with explicit
# placeholder categories.
train['workclass'] = train['workclass'].fillna('None')
train['occupation'] = train['occupation'].fillna('None')
train['native-country'] = train['native-country'].fillna('Others')

# Expand the abbreviated country names 'South' and 'Hong'.
# Whole-value matching (Series.replace) is used instead of substring matching
# (Series.str.replace) so that re-running this cell does not produce
# 'South Korea Korea' / 'Hong Kong Kong', and so that no other value that
# merely contains 'South' or 'Hong' is altered.
train['native-country'] = train['native-country'].replace(
    {'South': 'South Korea', 'Hong': 'Hong Kong'}
)

# Confirm that no missing values remain.
train.isna().sum()

age                0
workclass          0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
hours-per-week     0
native-country     0
income_>50K        0
dtype: int64

In [20]:
def _count_by(frame, column):
    """Return a frame with columns (`column`, 'count'): number of rows per value of `column`."""
    return frame.groupby(column)[column].count().rename('count').reset_index()


def _mean_education_by(frame, column):
    """Return a frame with columns (`column`, 'mean'): average 'educational-num' per value of `column`."""
    return frame.groupby(column)['educational-num'].mean().rename('mean').reset_index()


#----------------------age group--------------------------
# Left-closed brackets: [17, 25), [25, 35), ... The last edge is open-ended so
# the '65+' label covers every age from 65 upwards instead of silently turning
# ages of 91 or more into NaN.
bins = [17, 25, 35, 45, 55, 65, np.inf]
labels = ['17-24', '25-34', '35-44', '45-54', '55-64', '65+']
age = train['age']
train['age-group'] = pd.cut(age, bins, right=False, labels=labels)  # build the age brackets

# 'age-group' is categorical; observed=False keeps empty brackets in the
# result regardless of the pandas version's default.
age_groups = train.groupby('age-group', observed=False)
# Both frames stay indexed by age bracket because the plotting cells read `.index`.
age_group_count = age_groups['age-group'].count().to_frame()  # rows per age bracket
age_group_education = age_groups['educational-num'].mean().to_frame()

#------------------------gender--------------------------------------
gender = _count_by(train, 'gender')
# Kept indexed by gender with the original column name; the bar chart cell
# reads `.index` and ['educational-num'].
gender_education = train.groupby('gender')['educational-num'].mean().to_frame()

#----------------------workclass-------------------------------------
workclass = _count_by(train, 'workclass')
workclass_education = _mean_education_by(train, 'workclass')

#-----------------------country-------------------------------
native_country = _count_by(train, 'native-country')
native_country_education = _mean_education_by(train, 'native-country')

#-----------------------marital_status------------------------
marital_status = _count_by(train, 'marital-status')
marital_status_education = _mean_education_by(train, 'marital-status')

# Show one of the summary tables as a sanity check.
marital_status_education

Unnamed: 0,marital-status,mean
0,Divorced,10.061799
1,Married-AF-spouse,10.096774
2,Married-civ-spouse,10.302824
3,Married-spouse-absent,9.450877
4,Never-married,9.958038
5,Separated,9.2241
6,Widowed,9.093109


### 나이대별 분포

In [21]:
# Bar chart of how many rows fall into each age bracket.
age_trace = go.Bar(
    x=age_group_count.index,
    y=age_group_count['age-group'],
    marker={'color': 'rgba(171, 50, 96, 0.6)'},
)

layout = go.Layout(title='나이대별 분포')
data = [age_trace]

fig = go.Figure(data=data, layout=layout)
iplot(fig)

### 나이대별 평균 교육 수준

In [22]:
# Bar chart of the mean education level per age bracket; each bar is
# annotated with its value rounded to two decimals.
age_trace2 = go.Bar(
    x=age_group_education.index,
    y=age_group_education['educational-num'],
    text=age_group_education['educational-num'].round(2),
    marker={'color': 'rgba(171, 50, 96, 0.6)'},
)

layout = go.Layout(title='나이대별 평균 교육 수준')
data = [age_trace2]

fig = go.Figure(data=data, layout=layout)
iplot(fig)

### 데이터 성별 분포

In [23]:
# Pie chart of the share of rows per gender.
fig = px.pie(data_frame=gender, names='gender', values='count')
fig.show()

In [24]:
# Bar chart of the mean education level per gender; each bar is annotated
# with its value rounded to two decimals.
gender_trace = go.Bar(
    x=gender_education.index,
    y=gender_education['educational-num'],
    text=gender_education['educational-num'].round(2),
    marker={'color': 'rgba(171, 50, 96, 0.6)'},
)

layout = go.Layout(title='성별별 평균 교육 수준')
data = [gender_trace]

fig = go.Figure(data=data, layout=layout)
iplot(fig)

### 고용 형태별 교육 수준

In [25]:
# Horizontal bar chart of the mean education level per employment type,
# one colour per employment type, with the bars ordered by their value.
fig = px.bar(workclass_education, x='mean', y='workclass', color='workclass')
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()

In [26]:
# Box plot of the education level distribution within each employment type.
fig = px.box(data_frame=train, x='workclass', y='educational-num')
fig.show()

### 국가별 평균 교육 수준 분포

In [27]:
# Bar chart of the mean education level per native country, coloured by the
# mean and ordered from the highest bar to the lowest.
fig = px.bar(
    data_frame=native_country_education,
    x='native-country',
    y='mean',
    color='mean',
)
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()

### 혼인 상태별 평균 교육 수준 분포

In [28]:
# Bar chart of the mean education level per marital status, coloured by the
# mean, labelled with the value rounded to two decimals, highest bar first.
fig = px.bar(
    data_frame=marital_status_education,
    x='marital-status',
    y='mean',
    color='mean',
    text=marital_status_education['mean'].round(2),
)
fig.update_layout(xaxis=dict(categoryorder='total descending'))
fig.show()

In [29]:
# Box plot of the education level distribution within each marital status.
fig = px.box(data_frame=train, x='marital-status', y='educational-num')
fig.show()