In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import random
import glob
import missingno as msno
from sklearn.svm import SVC
import warnings

from tensorflow.keras.utils import to_categorical

In [3]:
pd.set_option('max_columns', 100)
warnings.simplefilter('ignore')

In [4]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
submit=pd.read_csv('sample_submission.csv')
df_train=train.copy()
df_test=test.copy()

In [5]:
print(train.shape,test.shape)

(26457, 20) (10000, 19)


In [6]:
non_occyp=df_train.loc[df_train['occyp_type'].isnull()]#직업유형이 없는 사람
occyp=df_train.dropna()#직업유형이 있는 사람(결측치 제거)

non_occyp.loc[non_occyp['DAYS_EMPLOYED']>=0, 'occyp_type'] = 'Inoccupation'
df_test.loc[df_test['DAYS_EMPLOYED']>=0, 'occyp_type'] = 'Inoccupation'


In [7]:
train_data = pd.concat([occyp,non_occyp],ignore_index = True)
test_data = df_test
print(train_data.shape,test_data.shape)

(26457, 20) (10000, 19)


In [8]:
#명목형 변수 라벨화
label_encoder=preprocessing.LabelEncoder()
train_data['gender'] = train_data['gender'].replace(['F','M'],[0,1])
test_data['gender'] = test_data['gender'].replace(['F','M'],[0,1])
train_data['car'] = train_data['car'].replace(['N','Y'],[0,1])
test_data['car'] = test_data['car'].replace(['N','Y'],[0,1])
train_data['reality'] = train_data['reality'].replace(['N','Y'],[0,1])
test_data['reality'] = test_data['reality'].replace(['N','Y'],[0,1])
train_data['income_type']=label_encoder.fit_transform(train_data['income_type'])
test_data['income_type']=label_encoder.fit_transform(test_data['income_type'])
#################################################################
train_data['edu_type']=label_encoder.fit_transform(train_data['edu_type'])
test_data['edu_type']=label_encoder.fit_transform(test_data['edu_type'])
########################################################################
train_data['family_type']=label_encoder.fit_transform(train_data['family_type'])
test_data['family_type']=label_encoder.fit_transform(test_data['family_type'])
########################################################################
train_data['house_type']=label_encoder.fit_transform(train_data['house_type'])
test_data['house_type']=label_encoder.fit_transform(test_data['house_type'])
########################################################################

In [9]:
train_data.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,1,0,0,1,1,247500.0,0,4,0,1,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
1,2,1,1,1,0,450000.0,4,1,1,1,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
2,3,0,0,1,0,202500.0,0,4,1,1,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
3,4,0,1,1,0,157500.0,2,1,1,1,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0
4,5,0,0,1,2,270000.0,4,4,1,1,-13413,-4996,1,0,0,1,High skill tech staff,4.0,-18.0,1.0


In [10]:
test_data.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month
0,26457,1,1,0,0,112500.0,1,4,0,1,-21990,365243,1,0,1,0,Inoccupation,2.0,-60.0
1,26458,0,0,1,0,135000.0,2,1,1,1,-18964,-8671,1,0,1,0,Core staff,2.0,-36.0
2,26459,0,0,1,0,69372.0,4,4,1,1,-15887,-217,1,1,1,0,Laborers,2.0,-40.0
3,26460,1,1,0,0,112500.0,0,4,1,1,-19270,-2531,1,1,0,0,Drivers,2.0,-41.0
4,26461,0,1,1,0,225000.0,2,1,1,1,-17822,-9385,1,1,0,0,Managers,2.0,-8.0


In [11]:
null_data=train_data.loc[train_data['occyp_type'].isnull()]
fill_data=train_data.dropna() #결측치 제거한 데이터
print(fill_data.shape,null_data.shape)

fill_data['occyp_type_l']=label_encoder.fit_transform(fill_data['occyp_type'])
#결측치 없는 데이터 중 occyp_type를 라벨링한 변수를 추가(라벨링과 원래값 비교를 위해)

(22724, 20) (3733, 20)


In [12]:
from sklearn import linear_model 
lin_reg = linear_model.LinearRegression()
x=fill_data[fill_data.columns.drop(['index','occyp_type','credit',
                                  'occyp_type_l'])]
#employeed(0,1)을 넣지 않고 days_employed에서 양수값을 0으로 처리한 상태에서 재시도
#라벨링한 occyp_type_l는 종속변수로 사용
y=fill_data['occyp_type_l']#라벨링 상태
lin_reg_model = lin_reg.fit(x, y)

print("x.shape :", x.shape , "y.shape :" , y.shape)

x_test=null_data[null_data.columns.drop(['index','occyp_type','credit'])]
y_test=null_data['occyp_type']
y_test=lin_reg.predict(x_test)#결측치가 있는 데이터에 대입

print("x.test :", x_test.shape , "y.test :" , y_test.shape)

null_data['occyp_type']=y_test
null_data['occyp_type']=np.around(null_data['occyp_type']) #회귀 결과의 소수점 반올림

x.shape : (22724, 17) y.shape : (22724,)
x.test : (3733, 17) y.test : (3733,)


In [13]:
#라벨링->원래값
null_data.loc[null_data['occyp_type']==2.0,'occyp_type']= 'Cooking staff'
null_data.loc[null_data['occyp_type']==3.0,'occyp_type']='Core staff'
null_data.loc[null_data['occyp_type']==4.0,'occyp_type']='Drivers'
null_data.loc[null_data['occyp_type']==5.0,'occyp_type']='HR staff'
null_data.loc[null_data['occyp_type']==6.0,'occyp_type']='High skill tech staff'
null_data.loc[null_data['occyp_type']==7.0,'occyp_type']='IT staff'
null_data.loc[null_data['occyp_type']==8.0,'occyp_type']='Laborers'
null_data.loc[null_data['occyp_type']==9.0,'occyp_type']='Low-skill Laborers'
null_data.loc[null_data['occyp_type']==10.0,'occyp_type']='Managers'
null_data.loc[null_data['occyp_type']==11.0,'occyp_type']='Medicine staff'
null_data.loc[null_data['occyp_type']==12.0,'occyp_type']='Private service staff'
null_data.loc[null_data['occyp_type']==13.0,'occyp_type']='Realty agents'
null_data.loc[null_data['occyp_type']==14.0,'occyp_type']='Sales staff'
null_data.loc[null_data['occyp_type']==15.0,'occyp_type']='Secretaries'

In [14]:
fill_data=fill_data.drop('occyp_type_l',axis=1)
train=pd.concat([null_data,fill_data],ignore_index=True).sort_values(by='index')

In [15]:
train['occyp_type'] = label_encoder.fit_transform(train['occyp_type'])
# print(label_encoder.classes_)
# train['occyp_type'].value_counts()   #확인

In [16]:
# test 데이터 변환
null_data=test_data.loc[test_data['occyp_type'].isnull()]
fill_data=test_data.dropna() #결측치 제거한 데이터
print(fill_data.shape,null_data.shape)

fill_data['occyp_type_l']=label_encoder.fit_transform(fill_data['occyp_type'])
#결측치 없는 데이터 중 occyp_type를 라벨링한 변수를 추가(라벨링과 원래값 비교를 위해)

(8545, 19) (1455, 19)


In [17]:
from sklearn import linear_model 
lin_reg = linear_model.LinearRegression()
x=fill_data[fill_data.columns.drop(['index','occyp_type',
                                  'occyp_type_l'])]
#employeed(0,1)을 넣지 않고 days_employed에서 양수값을 0으로 처리한 상태에서 재시도
#라벨링한 occyp_type_l는 종속변수로 사용
y=fill_data['occyp_type_l']#라벨링 상태
lin_reg_model = lin_reg.fit(x, y)

print("x.shape :", x.shape , "y.shape :" , y.shape)

x_test=null_data[null_data.columns.drop(['index','occyp_type'])]
y_test=null_data['occyp_type']
y_test=lin_reg.predict(x_test)#결측치가 있는 데이터에 대입

print("x.test :", x_test.shape , "y.test :" , y_test.shape)

null_data['occyp_type']=y_test
null_data['occyp_type']=np.around(null_data['occyp_type']) #회귀 결과의 소수점 반올림

x.shape : (8545, 17) y.shape : (8545,)
x.test : (1455, 17) y.test : (1455,)


In [18]:
#라벨링->원래값
null_data.loc[null_data['occyp_type']==2.0,'occyp_type']= 'Cooking staff'
null_data.loc[null_data['occyp_type']==3.0,'occyp_type']='Core staff'
null_data.loc[null_data['occyp_type']==4.0,'occyp_type']='Drivers'
null_data.loc[null_data['occyp_type']==5.0,'occyp_type']='HR staff'
null_data.loc[null_data['occyp_type']==6.0,'occyp_type']='High skill tech staff'
null_data.loc[null_data['occyp_type']==7.0,'occyp_type']='IT staff'
null_data.loc[null_data['occyp_type']==8.0,'occyp_type']='Laborers'
null_data.loc[null_data['occyp_type']==9.0,'occyp_type']='Low-skill Laborers'
null_data.loc[null_data['occyp_type']==10.0,'occyp_type']='Managers'
null_data.loc[null_data['occyp_type']==11.0,'occyp_type']='Medicine staff'
null_data.loc[null_data['occyp_type']==12.0,'occyp_type']='Private service staff'
null_data.loc[null_data['occyp_type']==13.0,'occyp_type']='Realty agents'
null_data.loc[null_data['occyp_type']==14.0,'occyp_type']='Sales staff'
null_data.loc[null_data['occyp_type']==15.0,'occyp_type']='Secretaries'

In [19]:
fill_data=fill_data.drop('occyp_type_l',axis=1)
test=pd.concat([null_data,fill_data],ignore_index=True).sort_values(by='index')

In [20]:
test['occyp_type'] = label_encoder.fit_transform(test['occyp_type'])
# print(label_encoder.classes_)
# train['occyp_type'].value_counts()   #확인

In [21]:
train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,0,0,0,0,202500.0,0,1,1,2,-13899,-4709,1,0,0,0,9,2.0,-6.0,1.0
3733,1,0,0,1,1,247500.0,0,4,0,1,-11380,-1540,1,0,0,1,9,3.0,-5.0,1.0
3734,2,1,1,1,0,450000.0,4,1,1,1,-19087,-4434,1,0,1,0,11,2.0,-22.0,2.0
3735,3,0,0,1,0,202500.0,0,4,1,1,-15088,-2092,1,0,1,0,15,2.0,-37.0,0.0
3736,4,0,1,1,0,157500.0,2,1,1,1,-15037,-2105,1,0,0,0,11,2.0,-26.0,2.0


In [22]:
test.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month
1455,26457,1,1,0,0,112500.0,1,4,0,1,-21990,365243,1,0,1,0,8,2.0,-60.0
1456,26458,0,0,1,0,135000.0,2,1,1,1,-18964,-8671,1,0,1,0,3,2.0,-36.0
1457,26459,0,0,1,0,69372.0,4,4,1,1,-15887,-217,1,1,1,0,9,2.0,-40.0
1458,26460,1,1,0,0,112500.0,0,4,1,1,-19270,-2531,1,1,0,0,4,2.0,-41.0
1459,26461,0,1,1,0,225000.0,2,1,1,1,-17822,-9385,1,1,0,0,11,2.0,-8.0


# model 
(label_encoding되어있음 -> one-hot encoding은 코드 수정)

In [30]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds=[]
for train_idx, valid_idx in skf.split(train, train['credit']):
    folds.append((train_idx, valid_idx))
X_test = test

In [None]:
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

random.seed(42)
outcomes=[]
sub=np.zeros((test_x.shape[0], 3))  
for fold in range(5):
    train_idx, valid_idx = folds[fold]
    X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
                                         train['credit'][train_idx].values, train['credit'][valid_idx].values 
    svm = OneVsRestClassifier(SVC(probability=True)).fit(X_train,y_train)
    y_pred = svm.predict_proba(X_valid)
    logloss = log_loss(y_valid,y_pred)
    outcomes.append(logloss)
    print(f"FOLD {fold+1} : logloss:{logloss}")
    
    sub += svm.predict_proba(X_test)

mean_outcome=np.mean(outcomes)
print("Mean_outcome:{}".format(mean_outcome))

my_submission = sub/skf.n_splits
submission.loc[:,1:]=my_submission
submission.to_csv('baseline_submission.csv', index=False)

FOLD 0 : logloss:0.8775724646996279
FOLD 1 : logloss:0.8936698226468275
FOLD 2 : logloss:0.8771893135222251


In [None]:
# random.seed(42)
# lgb_models={}
# for fold in range(5):
#     print(f'===================================={fold+1}============================================')
#     train_idx, valid_idx = folds[fold]
#     X_train, X_valid, y_train, y_valid = train.drop(['credit'],axis=1).iloc[train_idx].values, train.drop(['credit'],axis=1).iloc[valid_idx].values,\
#                                          train['credit'][train_idx].values, train['credit'][valid_idx].values 
#     lgb = LGBMClassifier(n_estimators=1000)
#     lgb.fit(X_train, y_train, 
#             eval_set=[(X_train, y_train), (X_valid, y_valid)], 
#             early_stopping_rounds=30,
#            verbose=100)
#     lgb_models[fold]=lgb
#     print(f'================================================================================\n\n')