# Load Data

In [1]:
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
with open('input/Preprocessing/train.pickle', 'rb') as f:
    train = pickle.load(f)
    f.close()
target = pd.read_csv("input/train.csv")['target'].as_matrix()

In [3]:
print(train.shape)
print(target.shape)

(595212, 58)
(595212,)


In [4]:
x_train, x_validate, y_train, y_validate = train_test_split(train, target, test_size=0.2, random_state=12345)

# Model

In [5]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import FloatTensor
import torch.nn.functional as F

In [6]:
from torch.optim import Adam

In [7]:
def gini(actual, pred):
    assert (len(actual) == len(pred))
    all = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=np.float)
    all = all[np.lexsort((all[:, 2], -1 * all[:, 1]))]
    totalLosses = all[:, 0].sum()
    giniSum = all[:, 0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)


def gini_normalized(actual, pred):
    return gini(actual, pred) / gini(actual, actual)

In [8]:
from torch.utils.data import Dataset, DataLoader
class OurDataset(Dataset):
    
    def __init__(self, datas, labels):
        self.datas = datas
        self.labels = labels
    
    def __len__(self):
        return self.datas.size()[0]
    
    def __getitem__(self, idx):
        return {'data': self.datas[idx], 'label': self.labels[idx]}

In [22]:
class WeightedModel(nn.Module):
    
    def __init__(self, hidden_node, input_shape):
        super(WeightedModel, self).__init__()
        self.dense = []
        self.dropout = []
        prev_layer = input_shape
        for layer in hidden_node:
            self.dense.append(nn.Linear(prev_layer, layer))
            self.dropout.append(nn.Dropout(0.3))
            prev_layer = layer
        self.logit = nn.Linear(prev_layer, 1)
        self.optimizer = Adam(self.parameters(), lr=0.01)
        
    def forward(self, x):
        for d, drop in zip(self.dense, self.dropout):
            x = F.relu(d(x))
            x = drop(x)
        x = F.sigmoid(self.logit(x))
        return x
    
    def train_model(self, x, y, x_validate, y_validate, epochs=100, batch_size=512):
        dataset = OurDataset(x, y)
        dataloader = DataLoader(dataset, batch_size=batch_size,
                        shuffle=True, num_workers=4)
        
        for i in range(epochs):
            sum_loss = 0
            c = 0
            for i_batch, sample_batch in enumerate(dataloader):
                data = Variable(sample_batch['data'])
                label = Variable(sample_batch['label'].float())
                self.train()
                output = self(data)
                self.optimizer.zero_grad()
                loss_function = nn.BCELoss(weight=FloatTensor([33]))
                loss = loss_function(output, label)
                sum_loss = sum_loss + loss.data.numpy()[0]
                loss.backward()
                self.optimizer.step()
                c = c + 1
            print(sum_loss / c)
            
            self.eval()
            output_validate = self.predict(x_validate)
            actual = y_validate.data.numpy()
            pred = output_validate.data.numpy()
            score = gini_normalized(actual, pred)
            print(score)
            
    def predict(self, x, threshold=0.5) :
        return self(x)
    
#     @static_method
#     def class_weight(y):
        

In [23]:
x_train_torch = FloatTensor(x_train)
x_validate_torch = Variable(FloatTensor(x_validate))
y_train_torch = FloatTensor(y_train.astype(float))
y_validate_torch = Variable(FloatTensor(y_validate.astype(float)))

In [24]:
torch.manual_seed(12345)
model = WeightedModel([70], x_train.shape[1])

In [25]:
model.train_model(x_train_torch, y_train_torch, x_validate_torch, y_validate_torch, epochs=30)

  "Please ensure they have the same size.".format(target.size(), input.size()))
  "Please ensure they have the same size.".format(target.size(), input.size()))


5.95106322681
0.0111180476459
5.27209991722
0.0853381308602
5.18881504159
0.128526663227
5.15891350148
0.115015402552
5.15965202031
0.115439799573
5.14437878503
0.120313303622
5.16981314768
0.117702711084
5.141806551
0.118386368367
5.14377049089
0.118692645891
5.14243130538
0.124766904086
5.14565875758
0.116442773834
5.15654669131
0.118767072701
5.14267240303
0.129392302593
5.15562756105
0.127949610761
5.14571286029
0.125257820927
5.1433692707
0.109451037972
5.14377961225
0.108632533159
5.14343139872
0.125354896913
5.14231515814
0.114713610373
5.15713743441
0.116525698129
5.15786590279
0.119607494637
5.17272284899
0.120642096854
5.14472978443
0.109177290327
5.1440327567
0.134060463552
5.14221365713
0.13543746873
5.16963498088
0.123675901376
5.15614884317
0.111327214807
5.15787639731
0.12718008479
5.14469551784
0.123370469151
5.15676122987
0.118821305344


In [26]:
model.train_model(x_train_torch, y_train_torch, x_validate_torch, y_validate_torch, epochs=10, batch_size=10000)

  "Please ensure they have the same size.".format(target.size(), input.size()))
  "Please ensure they have the same size.".format(target.size(), input.size()))


5.1479121844
0.124404958123
5.14720972379
0.117916033964
5.14444720745
0.127140238419
5.14042995373
0.124004310381
5.14420353373
0.126242768871
5.14034452041
0.12038662224
5.13948462407
0.125906121229
5.14062483112
0.12824703084
5.14788039525
0.124379004594
5.14169085026
0.126965038955


In [27]:
model.train_model(x_train_torch, y_train_torch, x_validate_torch, y_validate_torch, epochs=10, batch_size=400000)

  "Please ensure they have the same size.".format(target.size(), input.size()))
  "Please ensure they have the same size.".format(target.size(), input.size()))


5.17882966995
0.123739444448
5.12894558907
0.129611093631
5.16650247574
0.123783371715
5.1676633358
0.131906951604
5.11590456963
0.128806615129
5.14883542061
0.128270056154
5.10571813583
0.125129205965
5.1561229229
0.113605810873
5.14143013954
0.122088056639
5.14145493507
0.115360389944
