In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer
import lightgbm as lgb

In [3]:
# Base name (no extension) of the statistics CSV read from the stat/ folder.
file_name = 'stat_0602'

# Statistics table; the later merges expect it to carry a bidder_id column.
stat = pd.read_csv('stat/{}.csv'.format(file_name))

# Only the columns used downstream are read: the key for both files, plus the
# outcome label for the training file.
train = pd.read_csv('data/train.csv', usecols=['bidder_id', 'outcome'])
test = pd.read_csv('data/test.csv', usecols=['bidder_id'])

In [5]:
# preprocessing
# Turn the raw statistics table into a purely numeric feature frame
# (clr_stat) keyed by bidder_id: numeric columns are standardised, text
# columns are expanded into 0/1 indicator columns.

# Attach the training labels to the statistics table. The guard keeps this
# cell safe to re-run: merging a second time would create outcome_x/outcome_y
# columns and make the drop below raise a KeyError.
if 'outcome' not in stat.columns:
    stat = pd.merge(stat, train, on='bidder_id', how='left')
bidder = stat['bidder_id']

# Keep only feature columns and replace every missing value with 0.
# NOTE(review): fillna(0) also writes the integer 0 into object (text)
# columns; if any text column has missing values, MultiLabelBinarizer will
# see mixed int/str labels -- confirm the text columns are complete.
clr_stat = stat.drop(['bidder_id', 'outcome'], axis=1).fillna(0)

numbers = clr_stat.select_dtypes(exclude=['object'])
num_col_name = numbers.columns
# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# ndarray and is available in every pandas version.
numbers = numbers.values

texts = clr_stat.select_dtypes(include=['object']).values

# Standardise the numeric features; the scaler is fitted on every row of the
# statistics table.
scaler = StandardScaler()
numbers = pd.DataFrame(data=scaler.fit_transform(numbers), columns=num_col_name)

# Each row's text values are treated as a set of labels and expanded into one
# indicator column per distinct value.
mlb = MultiLabelBinarizer()
mlb.fit(texts)
text_col_name = mlb.classes_
texts = pd.DataFrame(data=mlb.transform(texts), columns=text_col_name)

# Both frames carry a fresh 0..n-1 index, so the column-wise concat and the
# bidder_id assignment line up row by row.
clr_stat = pd.concat([numbers, texts], axis=1)
clr_stat['bidder_id'] = bidder

In [6]:
# Split the processed features into labelled and unlabelled rows by inner-
# joining on bidder_id with the train and test bidder lists respectively.
train_sheet = clr_stat.merge(train, on='bidder_id')
test_sheet = clr_stat.merge(test, on='bidder_id')

# Model inputs: everything except the key and the label; target: the label.
X = train_sheet.drop(['bidder_id', 'outcome'], axis='columns')
y = train_sheet['outcome']

In [7]:
# predict
# Fit a LightGBM classifier on the labelled rows and score the test bidders.
df = lgb.LGBMClassifier(n_estimators=300, num_leaves=4, min_child_weight=3)
df.fit(X, y)

# Feature matrix for the test bidders, plus a frame holding their ids.
test_X = test_sheet.drop(['bidder_id'], axis='columns')
result_sheet = test_sheet[['bidder_id']].copy()

# Column 1 of predict_proba is the probability of the second class; rows are
# in the same order as test_sheet, so a positional assignment lines up.
result_sheet['prediction'] = np.asarray(df.predict_proba(test_X))[:, 1]

# Left-join back onto the full test list so every test bidder gets a row;
# bidders with no prediction end up with 0.
output = test.merge(result_sheet, on='bidder_id', how='left').fillna(0)

In [11]:
# Base name (no extension) of the submission file written to result/.
file_name = 'output_0605_lgb_overfit'

# Write the predictions without the DataFrame index column.
output_path = ''.join(['result/', file_name, '.csv'])
output.to_csv(output_path, index=False)