In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

data : https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data?select=train.csv

In [56]:
# ind, reg, car, calc
# ind是与司机个人相关的特征/reg是地区相关的特征/car是汽车相关的特征/calc则是其他通过计算或估计得到的特征
# bin : binary features 
# cat : indicate categorical features
df = pd.read_csv('train.csv')

In [57]:
len(df)

595212

In [58]:
df.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [59]:
df['target'][10]

0

In [60]:
len(df)

595212

In [62]:
for col in df.columns: 
    print(col) 
    df = df[df[col].notnull()]
len(df)

id
target
ps_ind_01
ps_ind_02_cat
ps_ind_03
ps_ind_04_cat
ps_ind_05_cat
ps_ind_06_bin
ps_ind_07_bin
ps_ind_08_bin
ps_ind_09_bin
ps_ind_10_bin
ps_ind_11_bin
ps_ind_12_bin
ps_ind_13_bin
ps_ind_14
ps_ind_15
ps_ind_16_bin
ps_ind_17_bin
ps_ind_18_bin
ps_reg_01
ps_reg_02
ps_reg_03
ps_car_01_cat
ps_car_02_cat
ps_car_03_cat
ps_car_04_cat
ps_car_05_cat
ps_car_06_cat
ps_car_07_cat
ps_car_08_cat
ps_car_09_cat
ps_car_10_cat
ps_car_11_cat
ps_car_11
ps_car_12
ps_car_13
ps_car_14
ps_car_15
ps_calc_01
ps_calc_02
ps_calc_03
ps_calc_04
ps_calc_05
ps_calc_06
ps_calc_07
ps_calc_08
ps_calc_09
ps_calc_10
ps_calc_11
ps_calc_12
ps_calc_13
ps_calc_14
ps_calc_15_bin
ps_calc_16_bin
ps_calc_17_bin
ps_calc_18_bin
ps_calc_19_bin
ps_calc_20_bin


595212

In [266]:
from torch.utils.data.dataset import Dataset
from torch.utils.data import DataLoader

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')#CUDA error: device-side assert triggered
device = torch.device('cpu')

In [122]:
class insuranceDataset(Dataset):
    def __init__(self, csv_data):
        
        self.dataset = csv_data
        self.length  = len(csv_data)

    def __getitem__(self, index):
        user = 0
        target = 0
        ps_ind = []
        ps_reg = []
        ps_car = []
        ps_calc =[]
        
        for col in df.columns: 
            if 'id' in col:
                user = int(self.dataset[col][index])
                
            if 'target' in col:
                target = int(self.dataset[col][index])
                
            if '_bin' in col:
                if 'ps_ind' in col:
                    ps_ind.append(self.dataset[col][index])
                elif 'ps_reg' in col:
                    ps_reg.append(self.dataset[col][index])
                elif 'ps_car' in col:
                    ps_car.append(self.dataset[col][index])
                elif 'ps_calc' in col:
                    ps_calc.append(self.dataset[col][index])
                else:
                    print("error")

        ps_ind = torch.Tensor(ps_ind).to(device)
        ps_car = torch.Tensor(ps_car).to(device)
        ps_reg = torch.Tensor(ps_reg).to(device)
        ps_calc = torch.Tensor(ps_calc).to(device)
        
        return target, ps_ind, ps_car, ps_reg, ps_calc

    def __len__(self):

        return self.length

In [123]:
#595212
train_set = insuranceDataset(df[500000:].reset_index(drop=True))

In [124]:
train_set.__getitem__(0)

(0,
 tensor([1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.]),
 tensor([]),
 tensor([]),
 tensor([0., 1., 0., 0., 0., 1.]))

In [125]:
t,w,x,y,z = train_set.__getitem__(0)
test_input = torch.cat([w, x, y, z],0)
print(test_input)

tensor([1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 1.])


In [126]:
test_input.size()

torch.Size([17])

In [418]:
class DeepFM(nn.Module):
    def __init__(self, field_dic, emb_dim, num_factors, mlp_dims, drop_rate=0.1):
        super(DeepFM, self).__init__()
        
        self.ind_embedding = nn.Embedding(field_dic, emb_dim)
        self.car_embedding = nn.Embedding(field_dic, emb_dim)
        self.reg_embedding = nn.Embedding(field_dic, emb_dim)
        self.calc_embedding = nn.Embedding(field_dic, emb_dim)
        
        self.fc = nn.Embedding(field_dic, 1)
        self.linear_layer = nn.Linear(1,1)
        
        input_dim = self.embed_output_dim = num_factors*emb_dim
        self.modules = []
        for dim in mlp_dims:      
            self.modules.append(nn.Linear(input_dim, dim))
            self.modules.append(nn.Sigmoid())
            self.modules.append(nn.Dropout(drop_rate))
            input_dim = dim
        self.modules.append(nn.Linear(dim,1))
        self.mlp = nn.Sequential(*self.modules)
        
        self.classify_layer = nn.Linear(1,2)
        
    def forward(self, ind, car, reg, calc):
        x = torch.cat([ind, car, reg, calc],1).to(device)
        
        embed_ind = self.ind_embedding(ind)
        embed_car = self.car_embedding(car)
        embed_reg = self.reg_embedding(reg)
        embed_calc = self.calc_embedding(calc)
        embed_x = torch.cat([embed_ind, embed_car, embed_reg, embed_calc],1).to(device)

        square_of_sum = torch.sum(embed_x, 1) ** 2
        sum_of_square = torch.sum(embed_x ** 2, 1)
        
        inputs = embed_x.view(list(x.size())[0],-1)

        x = self.linear_layer(self.fc(x).sum(1)) + 0.5 * (square_of_sum - sum_of_square).sum(1, keepdims=True) + self.mlp(inputs)
        
        x = self.classify_layer(x)
        x = torch.sigmoid(x)

        return x
    

In [424]:
dic = [0,1,-1]
factor = [11,0,0,6]

net = DeepFM(field_dic=3, emb_dim=3, num_factors=sum(factor), mlp_dims=[30, 20, 10]).to(device)

In [425]:
net #target, ps_ind 18, ps_car 16, ps_reg 3, ps_calc 20

DeepFM(
  (ind_embedding): Embedding(3, 3)
  (car_embedding): Embedding(3, 3)
  (reg_embedding): Embedding(3, 3)
  (calc_embedding): Embedding(3, 3)
  (fc): Embedding(3, 1)
  (linear_layer): Linear(in_features=1, out_features=1, bias=True)
  (mlp): Sequential(
    (0): Linear(in_features=51, out_features=30, bias=True)
    (1): Sigmoid()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=30, out_features=20, bias=True)
    (4): Sigmoid()
    (5): Dropout(p=0.1, inplace=False)
    (6): Linear(in_features=20, out_features=10, bias=True)
    (7): Sigmoid()
    (8): Dropout(p=0.1, inplace=False)
    (9): Linear(in_features=10, out_features=1, bias=True)
  )
  (classify_layer): Linear(in_features=1, out_features=2, bias=True)
)

In [426]:
optimizer = optim.Adam(net.parameters())
loss_fn = torch.nn.CrossEntropyLoss()

In [427]:
BATCH_SIZE = 100
train_data = DataLoader(train_set, shuffle=True, batch_size=BATCH_SIZE)
iter = len(train_data)

In [428]:
# [batch, ind_dim] [batch, car_dim] [batch, reg_dim] [batch, calc_dim]
for e in range(10): #lucky number
    
    mean_loss = 0
    count = 0
    
    for t, ind, car, reg, calc  in train_data:
        
        try:
            optimizer.zero_grad()

            output =  net(ind.type(torch.LongTensor), car.type(torch.LongTensor), reg.type(torch.LongTensor), calc.type(torch.LongTensor))
            # output[batch, 2]
            loss = loss_fn(output, t.type(torch.LongTensor))
            loss.backward()
            optimizer.step()

            count += 1
            mean_loss+=float(loss)
            
        except RuntimeError:
            print('error.....1')
        except AttributeError: 
            print('error.....2')
#         else:
#             print("e : " + str(e) + ' (' + str(count) + '/' + str(iter) + ') loss : ' + str(float(loss)) )

    print("*****e : "+str(e)+" L : "+str(mean_loss/count)+"*****")
        
    torch.save(net, "./DeepFM_"+str(e)+".pkl")

*****e : 0 L : 0.37850112826851706*****
*****e : 1 L : 0.3500160703441404*****
*****e : 2 L : 0.34994018227956475*****
*****e : 3 L : 0.3500011481055433*****
*****e : 4 L : 0.34991887571428903*****
*****e : 5 L : 0.34991656223474243*****
*****e : 6 L : 0.3499923803123322*****
*****e : 7 L : 0.35006878508600836*****
*****e : 8 L : 0.34991460041009115*****
*****e : 9 L : 0.34991445759035733*****


# test

In [433]:
test_set = insuranceDataset(df[-90000:].reset_index(drop=True))

In [434]:
len(test_set)

90000

In [435]:
test_data = DataLoader(train_set, shuffle=False, batch_size=1)

In [441]:
hit = 0
count = 0

for t, ind, car, reg, calc  in test_data:

    with torch.no_grad():


        output =  net(ind.type(torch.LongTensor), car.type(torch.LongTensor), reg.type(torch.LongTensor), calc.type(torch.LongTensor))
        
        predict = torch.argmax(output)
        
#         print(output, predict, t)

        count += 1
        
        if int(predict) == int(t):
            hit += 1
        
        
print( str(hit) + " / " + str(count) + "  acc : "  +str(hit/count))

91719 / 95212  acc : 0.9633134478847204
