In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')

from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold

from lightgbm import LGBMClassifier

In [11]:
train = pd.read_csv('./Data/소득예측경진대회/train.csv')
test = pd.read_csv('./Data/소득예측경진대회/test.csv')
submission = pd.read_csv('./Data/소득예측경진대회/sample_submission.csv')

train = train.drop(['id'], axis = 1)
test = test.drop(['id'], axis = 1)

train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,32,Private,309513,Assoc-acdm,12,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
1,33,Private,205469,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,46,Private,149949,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
3,23,Private,193090,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,0
4,55,Private,60193,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,0


In [12]:
train['workclass'] = train['workclass'].fillna('Private')
train['occupation'] = train['occupation'].fillna('Exec-managerial')
train['native.country'] = train['native.country'].fillna('United-States')

In [13]:
X = train.drop(['target'], axis = 1)
y = train['target']

In [14]:
columns = ('workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country')

for cols in columns:
    le = LabelEncoder()
    le.fit(list(X[cols]))
    X[cols] = le.transform(list(X[cols]))
    

for cols in columns:
    le = LabelEncoder()
    le.fit(list(test[cols]))
    test[cols] = le.transform(list(test[cols]))

In [15]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123)

print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(13984, 14) (3496, 14)
(13984,) (3496,)


In [21]:
lgb = LGBMClassifier()
lgb.fit(x_train, y_train)

LGBMClassifier()

In [38]:
test_pred = lgb.predict(x_test)
acc = accuracy_score(test_pred, y_test)
print(acc)

## 스케일링 전, 'x_test' 결과 : 0.875

0.875


In [22]:
pred = lgb.predict(test)

In [23]:
pred

array([1, 0, 1, ..., 0, 0, 0], dtype=int64)

In [26]:
lgbc_submission = pd.DataFrame({'id':submission.id, 'target':pred})
lgbc_submission.to_csv("lgbc_submission.csv", index = False)

## 정규화진행후 테스트

In [59]:
# 1. StandardScaler

std_scaler = StandardScaler()
X = std_scaler.fit_transform(X)
y = train['target']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123)

print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

lgb = LGBMClassifier()
lgb.fit(x_train, y_train)

test = std_scaler.fit_transform(test)

(13984, 14) (3496, 14)
(13984,) (3496,)


In [61]:
test_pred = lgb.predict(x_test)
acc = accuracy_score(test_pred, y_test)
print(acc)

0.8727116704805492


In [60]:
pred = lgb.predict(test)
lgbc_std_submission = pd.DataFrame({'id':submission.id, 'target':pred})
lgbc_std_submission.to_csv("lgbc_std_submission.csv", index = False)

In [62]:
# 2. MinMaxScaler

mm_scaler = MinMaxScaler()
X = std_scaler.fit_transform(X)
y = train['target']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123)

print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

lgb = LGBMClassifier()
lgb.fit(x_train, y_train)

test = std_scaler.fit_transform(test)

test_pred = lgb.predict(x_test)
acc = accuracy_score(test_pred, y_test)
print(acc)

pred = lgb.predict(test)
lgbc_mm_submission = pd.DataFrame({'id':submission.id, 'target':pred})
lgbc_mm_submission.to_csv("lgbc_mm_submission.csv", index = False)

(13984, 14) (3496, 14)
(13984,) (3496,)
0.8727116704805492


In [63]:
# 3. RobustScaler

R_scaler = RobustScaler()
X = R_scaler.fit_transform(X)
y = train['target']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123)

print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

lgb = LGBMClassifier()
lgb.fit(x_train, y_train)

test = std_scaler.fit_transform(test)

test_pred = lgb.predict(x_test)
acc = accuracy_score(test_pred, y_test)
print(acc)

pred = lgb.predict(test)
lgbc_rob_submission = pd.DataFrame({'id':submission.id, 'target':pred})
lgbc_rob_submission.to_csv("lgbc_rob_submission.csv", index = False)

(13984, 14) (3496, 14)
(13984,) (3496,)
0.8778604118993135


### 결과
- 모든 scaling방법동원했으나 점수가 오히려 떨어짐
- Feature Engineering 필요해보임