Import

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

warnings.filterwarnings("ignore")
%matplotlib inline

In [2]:
vote_df = pd.read_csv('../train.csv')

#투표 여부 비율 확인
print(vote_df.voted.value_counts())

0    19918
1    16507
Name: voted, dtype: int64


In [3]:
#머신러닝 알고리즘에 불필요한 속성 제거
def drop_features(df):
    df.drop(['gender','hand','wr_01','wr_02','wr_03','wr_04','wr_05','wr_06','wr_07','wr_08','wr_09',
            'wr_10','wr_11','wr_12','wr_13',
            'wf_01', 'wf_02', 'wf_03', 'index'], axis = 1, inplace=True) 
    return df

In [4]:
#Q_E 함수
def replace_feature_Q(df):
    def outlier_iqr(df=None, column= None, weight=1.5):
        data = df[column]
        q25, q75 = np.percentile(data.values, 25), np.percentile(data.values, 75)
        iqr = q75 - q25
        iqr_weight = iqr * weight
        lower = q25 - iqr_weight
        upper = q75 + iqr_weight
        outlier_index = data[(data < lower) | (data > upper)].index
        return outlier_index
    col_name = ['QaE', 'QbE', 'QcE', 'QdE','QeE','QfE','QgE','QhE','QiE','QjE','QkE','QlE','QmE','QnE','QoE','QpE','QqE','QrE','QsE','QtE']
    for col in col_name:
        outlier_index = outlier_iqr(df, column = col)
        for i in outlier_index:
            df[col].replace(df.loc[i, col], df[col].mean())
    
    scaler = MinMaxScaler()
    df_QE = df[col_name]
    scaler.fit(df_QE)
    df[col_name] = scaler.transform(df_QE)
    return df

In [5]:
# age_group 함수
def replace_feature_age(df):
    df['age_group'].replace('+70s', '60s',inplace=True)
    #age_group 레이블 인코딩 수행:
    feature = ['age_group']
    le = LabelEncoder()
    le = le.fit(df[feature])
    df[feature] = le.transform(df[feature])
    return df

In [6]:
#familysize 함수
def fs_incodidng(df):
    def fs_labeling(df):
        label_list=[]
        for i in range(len(df)):
            if df.iloc[i]['familysize']==[2]:
                 label_list.append(7)
            elif df.iloc[i]['familysize']==[3]:
                 label_list.append(2)
            elif df.iloc[i]['familysize']==[1]:
                 label_list.append(5)
            elif df.iloc[i]['familysize']==[4]:
                 label_list.append(4)
            elif df.iloc[i]['familysize']==[5]:
                 label_list.append(3)
            elif df.iloc[i]['familysize']==[0]:
                 label_list.append(2)
            elif df.iloc[i]['familysize']==[6]:
                 label_list.append(1)
            else:
                label_list.append(0)
        return label_list
    df['familysize'] = fs_labeling(df)
    # 라벨링 후 0인 컬럼을 7로 변환(7이 가장 많은 비율을 차지하는 컬럼)
    df['familysize'].replace(0,7,inplace=True)
    return df

In [7]:
#race 함수
def race_incoding(df):
    def race_labeling(df):
        label_list=[]
        for i in range(len(df)):
            if df.iloc[i]['race']=='White':
                label_list.append(2)
            elif df.iloc[i]['race']=='Asian' or df.iloc[i]['race']=='Arab':
                label_list.append(0)
            else:
                label_list.append(1)
        return label_list

    df['race'] = race_labeling(df)
    return df

In [8]:
# 0값 대체함수
def replace_feature(df):
    df['education'].replace(0,2,inplace=True)
    df['married'].replace(0,1,inplace=True) 
    df['urban'].replace(0,2,inplace=True)
    df['engnat'].replace(0,1,inplace=True)
    return df

In [9]:
# 원핫인코딩 함수
def OH_incording(df):
    df = pd.get_dummies(df, columns = ['religion', 'urban', 'engnat'])
    return df

In [10]:
def transform_features(df):
    df = replace_feature_Q(df)
    df = replace_feature_age(df)
    df = fs_incodidng(df)
    df = race_incoding(df)
    df = drop_features(df)
    df = replace_feature(df)
    df = OH_incording(df)
    return df

In [11]:
# 원본데이터를 재로딩
vote_df = pd.read_csv('../train.csv')
# X와 y로 나눔
y_vote_df = vote_df['voted']
X_vote_df = vote_df.drop('voted',axis=1)

# X에만 함수 적용
# X_vote_df = transform_features(X_vote_df)

In [12]:
X_vote_df.head(5)

Unnamed: 0,QaA,QaE,QbA,QbE,QcA,QcE,QdA,QdE,QeA,QeE,...,religion_Hindu,religion_Jewish,religion_Muslim,religion_Other,religion_Sikh,urban_1,urban_2,urban_3,engnat_1,engnat_2
0,1.0,0.000124,5.0,0.000322,5.0,0.000383,1.0,0.00044,1.0,0.0004,...,0,0,0,0,0,0,1,0,1,0
1,1.0,0.000211,5.0,0.000473,5.0,0.002294,1.0,0.00069,1.0,0.001705,...,1,0,0,0,0,0,0,1,0,1
2,5.0,0.000242,1.0,0.000224,2.0,0.000687,1.0,0.001164,3.0,0.000809,...,0,0,0,0,0,0,1,0,1,0
3,4.0,6.5e-05,1.0,0.000911,1.0,0.002214,4.0,0.002965,1.0,0.000724,...,1,0,0,0,0,1,0,0,0,1
4,4.0,0.000217,5.0,0.000512,5.0,0.000523,3.0,0.000366,1.0,0.000657,...,0,0,0,1,0,0,1,0,1,0


In [14]:
# # 원본데이터 전체에 데이터 전처리
# vote_df2 = transform_features(vote_df)
# vote_df2.to_csv('../vote_df2.csv', index=False)

In [15]:
# # 원본 테스트데이터 불러오기
# vote_df_test = pd.read_csv('../test.csv')

# # 원본 테스트데이터 전처리하고 csv로 저장
# vote_df_test = transform_features(vote_df_test)
# vote_df_test.to_csv('../vote_df_test.csv', index=False)