In [None]:
!mkdir ~/.kaggle

In [None]:
# Copy kaggle.json from the working directory into ~/.kaggle
# (presumably the Kaggle API credentials file; it must already be present here).
!cp kaggle.json ~/.kaggle

In [None]:
# Restrict ~/.kaggle/kaggle.json to owner read/write only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [None]:

# Download the files of the "amex-default-prediction" competition with the kaggle CLI.
!kaggle competitions download -c amex-default-prediction

In [None]:

!unzip amex-default-prediction.zip

In [None]:
import cairo
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# ---- Load the raw training table ------------------------------------------
print('Reading data...')
# Read the CSV in 100000-row pieces and stitch the pieces into one DataFrame.
chunk=pd.read_csv('train.csv',chunksize=100000)
train_data=pd.concat(chunk)
del chunk

# ---- Column selection and imputation --------------------------------------
# A column whose NaN fraction is above NAN_THRESHOLD is reported and left out.
NAN_THRESHOLD=0.9
num_columns,cat_columns=[],[]
# "S_2" is always kept and is neither inspected nor imputed in this cell.
selected_columns=["S_2"]
print("Shape of train data:",train_data.shape)

samples=len(train_data)

for column in train_data.columns:
    if column=="S_2":
        continue
    nans=train_data[column].isnull().sum()/samples
    if nans>NAN_THRESHOLD:
        print("Column:",column,"has",nans*100,"% of NANs")
        continue

    is_object=train_data[column].dtype=='object'
    if is_object:
        # Object columns: fill gaps with the most frequent value.
        imputed=train_data[column].fillna(train_data[column].mode()[0])
    else:
        # Every other dtype: fill gaps with the column mean, then keep two decimals.
        imputed=train_data[column].fillna(train_data[column].mean()).round(decimals=2)
    train_data[column]=imputed

    (cat_columns if is_object else num_columns).append(column)
    selected_columns.append(column)

train_data=train_data[selected_columns]
print("Shape of train data:",train_data.shape)

# ---- Labels ----------------------------------------------------------------
print("loading labels...")
train_labels=pd.read_csv('train_labels.csv')




In [None]:
print("Convert the Dates in S_2 column from string to Datetime")
train_data["S_2"]=pd.to_datetime(train_data["S_2"])

# Group rows per customer; `indices` is later used with `.iloc` to pull one
# customer's rows out of train_data.
gb=train_data.groupby("customer_ID")
indices=gb.indices

# Object columns are converted to pandas categoricals and replaced by their
# integer codes (stored as float). All other columns are rounded to 2 decimals.
# The columns listed in needed_cat_column are left as they are when they are
# object-typed.
needed_cat_column=["customer_ID","S_2","target"]
for column in train_data.columns:
  nan=train_data[column].isna().sum()
  if nan/samples:
    # Report any column that still contains missing values.
    print(column,train_data[column].dtype,(nan/samples)*100)
  if train_data[column].dtype=="object":
    if column not in needed_cat_column:
      train_data[column]=train_data[column].astype("category").cat.codes.astype(float)
  else:
    try:
      train_data[column]=train_data[column].round(decimals=2)
    except Exception:
      # Columns that cannot be rounded are reported and left unchanged.
      # `Exception` (not a bare except) so Ctrl-C / kernel interrupts still work.
      print(column)


# Fill remaining gaps in D_64 with its most frequent value.
# Series.mode() returns a Series; passing that Series to fillna would align on
# the index and fill at most the rows whose label matches, so take the scalar.
if "D_64" in train_data.columns:
  d64_mode=train_data["D_64"].mode()
  if not d64_mode.empty:
    train_data["D_64"]=train_data["D_64"].fillna(d64_mode.iloc[0])
  print(train_data["D_64"].isna().sum())

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datetime import timedelta
class TabnetDataset(Dataset):
    """Per-customer sequence dataset.

    Each item is the time-ordered block of rows belonging to one customer,
    returned as a float tensor, together with that customer's target.

    Parameters
    ----------
    train_labels : DataFrame with at least the columns "customer_ID" and "target".
    map : mapping from customer_ID to the positional row indices of that
        customer inside the data frame (used with ``.iloc``).
    data : optional DataFrame holding the feature rows (must contain
        "customer_ID" and a datetime "S_2" column). When omitted, the
        notebook-level ``train_data`` is looked up at access time.
    numeric_columns : optional iterable of column names that receive an extra
        ``<name>_wavg`` feature. When omitted, the notebook-level
        ``num_columns`` is looked up at access time.
    """

    def __init__(self, train_labels,map,data=None,numeric_columns=None):
        self.map=map
        self.labels=train_labels
        self.len=len(train_labels)
        self.device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        # Optional explicit inputs; None means "fall back to the notebook globals".
        self.data=data
        self.numeric_columns=numeric_columns

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        """Return ``(features, target)`` for the idx-th row of the labels frame.

        ``features`` has one row per statement of the customer and is moved to
        ``self.device``; ``target`` is a 0-d int tensor on the same device.
        """
        row=self.labels.iloc[idx]
        cust_id,target=row["customer_ID"],row["target"]

        source=train_data if self.data is None else self.data
        numeric=set(num_columns if self.numeric_columns is None else self.numeric_columns)

        df=source.iloc[self.map[cust_id]].sort_values(by="S_2").drop(columns=["customer_ID"])

        # Replace the statement date by a running clock: gap to the previous
        # statement in minutes (first row counts as 1), accumulated over time.
        gaps=(df["S_2"].diff()/timedelta(minutes=1)).astype(float)
        df["S_2"]=gaps.fillna(1).cumsum(axis=0)

        # For every numeric column add cumsum(value * clock) / clock.
        for column in list(df.columns):
            if column in numeric:
                df[f'{column}_wavg']=df[column].mul(df["S_2"]).cumsum(axis=0)/df["S_2"]

        features=torch.from_numpy(df.values).float().to(self.device)
        label=torch.from_numpy(np.array(target)).int().to(self.device)
        return features,label


    
        


In [None]:
from torch.nn.modules import dropout
from torch.nn.modules.dropout import Dropout

import torch
import torch.nn as nn
from torch.autograd import Variable

class Predict_class_dense(nn.Module):
  """LSTM encoder followed by a four-layer dense head with one output unit.

  ``forward`` returns the raw logit (no sigmoid): the notebook trains this
  model with ``nn.BCEWithLogitsLoss``, which applies the sigmoid itself.
  Apply ``torch.sigmoid`` to the output to obtain a probability.

  Parameters (defaults reproduce the original hard-coded architecture)
  ----------
  input_size : number of features per time step.
  hidden_size : LSTM hidden width.
  num_layers : number of stacked LSTM layers.
  dense_sizes : output widths of dense1, dense2 and dense3.
  """
  def __init__(self,input_size=176,hidden_size=1500,num_layers=2,dense_sizes=(1800,2000,780)):
    super(Predict_class_dense,self).__init__()
    self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
    width1,width2,width3=dense_sizes
    self.LSTM=nn.LSTM(input_size=input_size,hidden_size=hidden_size,num_layers=num_layers,batch_first=True)
    self.dense1=nn.Linear(in_features=hidden_size,out_features=width1)
    self.dense2=nn.Linear(in_features=width1,out_features=width2)
    self.dense3=nn.Linear(in_features=width2,out_features=width3)
    self.dense4=nn.Linear(in_features=width3,out_features=1)
    self.relu=nn.ReLU()
    # Not used by forward(); kept so existing attribute access keeps working.
    self.softmax=nn.Softmax()

    self.dropout1=nn.Dropout(p=0.2)
    self.dropout2=nn.Dropout(p=0.3)
    self.dropout3=nn.Dropout(p=0.2)

  def forward(self,x):
    """Map a batch of sequences ``(batch, seq_len, input_size)`` to logits ``(batch, 1)``."""
    # No explicit initial state: the LSTM then starts from zeros for both the
    # hidden and the cell state, for any batch size and on the module's device.
    x,_=self.LSTM(x)
    # Keep only the output of the last time step -> (batch, hidden_size).
    x=x[:,-1,:]
    x=self.relu(x)
    x=self.dropout1(self.relu(self.dense1(x)))
    x=self.dropout2(self.relu(self.dense2(x)))
    x=self.dropout3(self.relu(self.dense3(x)))
    return self.dense4(x)



    


In [None]:
from torch.cuda import is_available
from torch.utils.data import DataLoader,SubsetRandomSampler
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import confusion_matrix

def _evaluate(model,loader,criterion,device):
    """Run `model` over `loader` without gradients.

    Returns (mean batch loss, accuracy in percent rounded to 4 decimals,
    confusion matrix). The model output is treated as a logit, as required by
    BCEWithLogitsLoss, so the predicted class is 1 when the logit is > 0
    (probability > 0.5). Leaves the model in eval mode.
    """
    model.eval()
    batch_losses=[]
    true_labels=[]
    final_output=[]
    with torch.no_grad():
        for images,labels in loader:
            images=images.to(device)
            # BCEWithLogitsLoss needs float targets shaped like the output (N,1).
            labels=labels.to(device).float().view(-1,1)
            outputs=model(images)
            batch_losses.append(criterion(outputs,labels).item())
            true_labels.extend(labels.view(-1).long().tolist())
            final_output.extend((outputs.view(-1)>0).long().tolist())
    n_samples=len(true_labels)
    n_correct=sum(int(t==p) for t,p in zip(true_labels,final_output))
    acc=round(100.0*n_correct/n_samples,4)
    return float(np.mean(batch_losses)),acc,confusion_matrix(true_labels,final_output)


# 80/20 split of the customers; seeded so the split is the same on every run.
Train_labels=train_labels.sample(frac=0.8,random_state=42)
Test_labels=train_labels.drop(Train_labels.index).reset_index()
Train_labels=Train_labels.reset_index()
device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
train_dataset=TabnetDataset(Train_labels,indices)
test_dataset=TabnetDataset(Test_labels,indices)

# Batch size 1: every customer yields a sequence of its own length.
train_loader=DataLoader(train_dataset,batch_size=1,shuffle=True)
test_loader=DataLoader(test_dataset,batch_size=1,shuffle=False)

epochs=6
model=Predict_class_dense().to(device)
optimizer=torch.optim.Adam(model.parameters(),lr=0.00001)
criterion=nn.BCEWithLogitsLoss()
# One entry per epoch in each of these lists.
test_loss=[]
train_loss=[]
test_accuracy=[]
train_accuracy=[]
# Best accuracies seen so far; tracked separately so the test and train
# checkpoints do not overwrite each other's threshold.
max_acc=float("-inf")        # best test accuracy
max_train_acc=float("-inf")  # best train accuracy
for epoch in range(epochs):
    # Re-enable dropout: the evaluation below switches the model to eval mode.
    model.train()
    for images,labels in train_loader:
        images=images.to(device)
        labels=labels.to(device).float().view(-1,1)

        outputs=model(images)
        loss=criterion(outputs,labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Loss of the last training batch of this epoch.
    print(f"Epoch[{epoch+1}/{epochs}], loss: {loss.item()} ")

    print("Testing on test data")
    epoch_loss,acc,matrix=_evaluate(model,test_loader,criterion,device)
    test_loss.append(epoch_loss)
    test_accuracy.append(acc)
    print(f"Testing accuracy for epoch {epoch+1}: {acc}")
    print("confusion matrix:")
    print(matrix)
    if acc>max_acc:
        torch.save(model.state_dict(),f'test_model_{epoch+1}.h5')
        max_acc=acc

    epoch_loss,acc,matrix=_evaluate(model,train_loader,criterion,device)
    train_loss.append(epoch_loss)
    train_accuracy.append(acc)
    print(f"Training accuracy for epoch {epoch+1}: {acc}")
    print("confusion matrix:")
    print(matrix)
    if acc>max_train_acc:
        torch.save(model.state_dict(),f'train_model_{epoch+1}.h5')
        max_train_acc=acc
        



