In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings 

warnings.filterwarnings('ignore')

In [5]:
train =  pd.read_csv("./data/train.csv")
test =  pd.read_csv("./data/test.csv")
sub =  pd.read_csv("./data/sample_submission.csv")

In [39]:
y=train['income']
test['income']='blank'

In [40]:
### 데이터 합치기
all_dat=pd.concat([train, test], axis=0)
print(all_dat.shape)

(32561, 17)


In [41]:
all_dat.income.value_counts()

<=50K    19744
blank     6512
>50K      6305
Name: income, dtype: int64

In [43]:
all_dat.loc[ all_dat['income']=='>50K' , 'target'] = 1
all_dat.loc[ all_dat['income']=='<=50K' , 'target'] = 0
all_dat.loc[ all_dat['income']=='blank' , 'target'] = 999
all_dat['target'] = all_dat.target.astype("int")

In [15]:
all_dat.head()

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,target
0,0,40,Private,168538,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,60,United-States,>50K,1.0
1,1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,20,United-States,<=50K,0.0
2,2,18,Private,353358,Some-college,10,Never-married,Other-service,Own-child,White,Male,0,0,16,United-States,<=50K,0.0
3,3,21,Private,151158,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,25,United-States,<=50K,0.0
4,4,24,Private,122234,Some-college,10,Never-married,Adm-clerical,Not-in-family,Black,Female,0,0,20,?,<=50K,0.0


In [16]:
all_dat.columns

Index(['id', 'age', 'workclass', 'fnlwgt', 'education', 'education_num',
       'marital_status', 'occupation', 'relationship', 'race', 'sex',
       'capital_gain', 'capital_loss', 'hours_per_week', 'native_country',
       'income', 'target'],
      dtype='object')

### 원핫 인코딩
1. 범주형 변수 선택
2. 입력, 출력 분할
3. pd.get_dummies(범주형 변수)를 이용해 원핫 인코딩 수행
4. 원핫이 적용된 데이터를 원본 데이터에 추가
5. 모델 적용

In [25]:
sel_cat = ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country' ]

X_cat = all_dat[sel_cat]
y = all_dat['target']

In [26]:
X_dummy = pd.get_dummies(X_cat)
X_dummy[:5]

Unnamed: 0,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,education_10th,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
all_dat_n = pd.concat([all_dat, X_dummy], axis=1)
all_dat_n

Unnamed: 0,id,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,...,native_country_Portugal,native_country_Puerto-Rico,native_country_Scotland,native_country_South,native_country_Taiwan,native_country_Thailand,native_country_Trinadad&Tobago,native_country_United-States,native_country_Vietnam,native_country_Yugoslavia
0,0,40,Private,168538,HS-grad,9,Married-civ-spouse,Sales,Husband,White,...,0,0,0,0,0,0,0,1,0,0
1,1,17,Private,101626,9th,5,Never-married,Machine-op-inspct,Own-child,White,...,0,0,0,0,0,0,0,1,0,0
2,2,18,Private,353358,Some-college,10,Never-married,Other-service,Own-child,White,...,0,0,0,0,0,0,0,1,0,0
3,3,21,Private,151158,Some-college,10,Never-married,Prof-specialty,Own-child,White,...,0,0,0,0,0,0,0,1,0,0
4,4,24,Private,122234,Some-college,10,Never-married,Adm-clerical,Not-in-family,Black,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6507,6507,35,Private,61343,Bachelors,13,Married-civ-spouse,Sales,Husband,White,...,0,0,0,0,0,0,0,1,0,0
6508,6508,41,Self-emp-inc,32185,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,...,0,0,0,0,0,0,0,1,0,0
6509,6509,39,Private,409189,5th-6th,3,Married-civ-spouse,Other-service,Husband,White,...,0,0,0,0,0,0,0,0,0,0
6510,6510,35,Private,180342,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,...,0,0,0,0,0,0,0,1,0,0


In [23]:
train_n = all_dat_n.loc[ (all_dat_n['target']==0) | (all_dat_n['target']==1)  , : ]
test_n = all_dat_n.loc[ all_dat_n['target']==999  ,  : ]

In [24]:
print(train_n.shape, test_n.shape)

(26049, 119) (0, 119)


In [44]:
sel_cat = ['workclass', 'education', 'marital_status',
           'occupation', 'relationship', 'race', 
           'sex', 'native_country', 'income']
all_dat=all_dat_n.drop(sel_cat, axis=1)

# 데이터 나누기
train_n=all_dat_n.loc[(all_dat_n['target']==0) | (all_dat_n['target']==1),:]
test_n=all_dat_n.loc[all_dat_n['target']==999,:]

In [49]:
### 최종 모델에 적용하기 위해 데이터를 x,y로 분할
X=train_n.drop(['target'], axis=1)
y=train_n['target']

test_X=test_n.drop(['target'],axis=1)
print(X.shape,y.shape,test_X.shape)

(26049, 118) (26049,) (0, 118)


### 로지스틱 모델 선택, 학습 및 예측

In [46]:
from sklearn.linear_model import LogisticRegression

In [47]:
model = LogisticRegression()
model.fit(X, y)
pred = model.predict(test_X)

ValueError: could not convert string to float: 'Private'

In [33]:
sub.columns

Index(['id', 'prediction'], dtype='object')

In [48]:
print( sub.shape )
print( pred.shape )

(6512, 2)


NameError: name 'pred' is not defined

In [35]:
sub['prediction'] = pred
sub.to_csv("secondSub4th.csv", index=False)

NameError: name 'pred' is not defined