In [None]:
mkdir ~/.kaggle

In [None]:
cp kaggle.json ~/.kaggle

In [None]:
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c amex-default-prediction

Downloading amex-default-prediction.zip to /content
100% 20.5G/20.5G [04:03<00:00, 96.0MB/s]
100% 20.5G/20.5G [04:03<00:00, 90.4MB/s]


In [None]:
!unzip amex-default-prediction.zip

Archive:  amex-default-prediction.zip
  inflating: sample_submission.csv   
  inflating: test_data.csv           
  inflating: train_data.csv          
  inflating: train_labels.csv        


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import json
chunk=pd.read_csv('train_data.csv',chunksize=100000)
train_data=pd.concat(chunk)
del chunk
print('Data Loaded')
samples=len(train_data)
print('Looking for Nan Values')
nans={}
for column in train_data.columns:
  if column=="S_2":
    pass
  else:
    nan=train_data[column].isna().sum()
    if nan/samples>=0.95:
      print(column,train_data[column].dtype,(nan/samples)*100)

    
    if train_data[column].dtype=="object":
        train_data[column]=train_data[column].fillna(train_data[column].mode())
        #print(column,"obj")
    else:
        train_data[column]=train_data[column].fillna(train_data[column].mean())
        #print(column,"num")
  nans[column]=nan/samples


#print('merging the training labels')
train_labels=pd.read_csv('train_labels.csv')
#training_data=pd.merge(train_data,train_labels,on='customer_ID')

nan_threshold=0.95
print(f'The nan theshold is set at:{nan_threshold}')
selected_features=[]
for key in nans.keys():
  if nans[key]>nan_threshold:
    print(key)
    continue
  else:
    selected_features.append(key)

#selected_features.append("S_2")
train_data=train_data[selected_features]
features={"f":[]}
for feature in selected_features:
  features["f"].append(feature)

with open("Features.json","w") as f:
  json.dump(features,f)
print(f'The original feature length is:{len(train_data.columns)}')
print(f'The selected feature lenght is:{len(selected_features)}')


Data Loaded
Looking for Nan Values
D_73 float64 98.99021070601547
D_87 float64 99.93012683290515
D_88 float64 99.891457051685
D_108 float64 99.4768461295237
D_110 float64 99.43353018945662
D_111 float64 99.43353018945662
B_39 float64 99.39198593642065
B_42 float64 98.70778933050298
D_134 float64 96.48014598701137
D_135 float64 96.48014598701137
D_136 float64 96.48014598701137
D_137 float64 96.48014598701137
D_138 float64 96.48014598701137
The nan theshold is set at:0.95
D_73
D_87
D_88
D_108
D_110
D_111
B_39
B_42
D_134
D_135
D_136
D_137
D_138
The original feature length is:177
The selected feature lenght is:177


In [None]:
print("Convert the Dates in S_2 column from string to Datetime")
train_data["S_2"]=pd.to_datetime(train_data["S_2"])

gb=train_data.groupby("customer_ID")
indices=gb.indices

"""
We need to encode the object values into categorical first and then ordinally encode it
We also need to round up the values to 2 decimals
"""
needed_cat_column=["customer_ID","S_2","target"]
for column in train_data.columns:
  nan=train_data[column].isna().sum()
  if nan/samples:
    print(column,train_data[column].dtype,(nan/samples)*100)
  if train_data[column].dtype=="object":
    if column not in needed_cat_column:
      train_data[column]=train_data[column].astype("category").cat.codes.astype(float)
  else:
    try:
      train_data[column]=train_data[column].round(decimals=2)
    except:
      print(column)




Convert the Dates in S_2 column from string to Datetime
S_2
D_64 object 3.9310119532831442


In [None]:
train_data.D_64=train_data.D_64.fillna(train_data.D_64.mode())
print(train_data.D_64.isna().sum())

0


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from datetime import timedelta


class Amex_dataset(Dataset):
  def __init__(self,train_labels,map):
    self.map=map
    self.labels=train_labels
    self.length=len(train_labels)
    self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
  def __len__(self):
    return self.length

  def __getitem__(self,index):
    
    cust_id,target=self.labels.iloc[index]["customer_ID"],self.labels.iloc[index]["target"]
    index=self.map[cust_id]
    #print(index)
    df=train_data.iloc[index].sort_values("S_2").drop(columns=["customer_ID"])
    df["S_2"]=df["S_2"].diff()/timedelta(minutes=1)
    df["S_2"]=df["S_2"].astype(float)
    df["S_2"]=df["S_2"].fillna(0).cumsum(axis=0)
    if len(df)>1:
      df["S_2"]=df["S_2"]/df["S_2"].max()
    #print("here")
    #print(self.labels.iloc[index]["target"].to_numpy())
    return torch.from_numpy(df.to_numpy()).float().to(self.device),torch.from_numpy(np.array(target)).int().to(self.device)
    




    








In [None]:
from torch.nn.modules import dropout
from torch.nn.modules.dropout import Dropout

import torch
import torch.nn as nn
from torch.autograd import Variable

class Predict_class_dense(nn.Module):
  def __init__(self):
    super(Predict_class_dense,self).__init__()
    self.device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.LSTM=nn.LSTM(input_size=176,hidden_size=1500,num_layers=2,batch_first=True)
    self.dense1=nn.Linear(in_features=1500,out_features=1800)
    self.dense2=nn.Linear(in_features=1800,out_features=2000)
    self.dense3=nn.Linear(in_features=2000,out_features=780)
    self.dense4=nn.Linear(in_features=780,out_features=1)
    self.relu=nn.ReLU()
    self.softmax=nn.Softmax()
    self.h_0=Variable(torch.zeros(2,1,1500)).to(self.device)

    self.dropout1=nn.Dropout(p=0.2)
    self.dropout2=nn.Dropout(p=0.3)
    self.dropout3=nn.Dropout(p=0.2)
  def forward(self,x):
    x,_=self.LSTM(x,(self.h_0,self.h_0))
    x=x[:,-1,:].reshape(-1,x.shape[2])
    x=self.relu(x)
    x=self.dense1(x)
    x=self.relu(x)
    x=self.dropout1(x)
    x=self.dense2(x)
    x=self.relu(x)
    x=self.dropout2(x)
    x=self.dense3(x)
    x=self.relu(x)
    x=self.dropout3(x)
    x=self.dense4(x)
    x=nn.Sigmoid(x)
    return x



    


In [None]:
from typing_extensions import ParamSpecKwargs
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import json
with open("Features.json","r") as f:
  dic=json.load(f)

selected_features=dic["f"]
sub_data_chunk=pd.read_csv("test_data.csv",chunksize=100000)
sub_data=pd.concat(sub_data_chunk)
samples=len(sub_data)
sub_data=sub_data[selected_features]

for column in sub_data.columns:
  if column=="S_2":
    sub_data["S_2"]=pd.to_datetime(sub_data["S_2"])
  elif column=="customer_ID":
    pass
  else:
    if sub_data[column].dtype=="object":
      sub_data[column]=sub_data[column].fillna(sub_data[column].mode()).astype("category").cat.codes.astype(float)
    else:
      sub_data[column]=sub_data[column].fillna(sub_data[column].mean()).round(decimals=2)
      
sub_gb=sub_data.groupby("customer_ID")
map=sub_gb.indices


for column in sub_data.columns:
  nans=sub_data[column].isna().sum()
  if nans:
    sub_data[column]=sub_data[column].fillna(sub_data[column].mean())
    print(column,nans/samples)
    
  




In [None]:
for column in sub_data.columns:
  if column=="customer_ID":
    print(column)

customer_ID


In [None]:
from torch._C import DeviceObjType
from datetime import timedelta
from torch.types import Device
import time
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import json

model=Predict_class_dense()
model.load_state_dict(torch.load("model_3.h5"))
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)
saved_ans={"customer_ID":[],"prediction":[]}
custom_n=len(map.keys())
prev_time=time.time()
with torch.no_grad():
  model.eval()
  for i,key in enumerate(map.keys()):
    if time.time()-prev_time>100:
      print(f"percent complete:{(i/custom_n)*100}%")
      prev_time=time.time()
    inx=map[key]
    df=sub_data.iloc[inx].sort_values("S_2").drop("customer_ID",axis=1)
    df["S_2"]=df["S_2"].diff()/timedelta(minutes=1)
    df["S_2"]=df["S_2"].astype(float)
    df["S_2"]=df["S_2"].fillna(0).cumsum(axis=0)
    if len(df)>1:
      df["S_2"]=df["S_2"]/df["S_2"].max()
    images=torch.unsqueeze(torch.from_numpy(df.to_numpy()).float(),0).to(device)
    outputs=model(images)
    _,pred=torch.max(outputs.data,1)
    saved_ans["customer_ID"].append(key)
    pred=pred.cpu().detach()
    saved_ans["prediction"].append(pred)

with open("submission_1.json","w") as f:
  json.dump(saved_ans,f)




cuda
percent complete:1.3706156360281672%
percent complete:3.1307962938328244%
percent complete:4.893140000064891%
percent complete:6.636232575293012%
percent complete:8.405065426807308%
percent complete:10.16178520712811%
percent complete:11.921208797983173%
percent complete:13.671655737864489%
percent complete:15.425347250386915%
percent complete:17.18304040250005%
percent complete:18.93943572555674%
percent complete:20.687719617010643%
percent complete:22.447900274815304%
percent complete:24.20808093261996%
percent complete:25.96901865737421%
percent complete:27.732984649926834%
percent complete:29.599046528253197%
percent complete:31.389077254356106%
percent complete:33.15228617995914%
percent complete:34.928040786441144%
percent complete:36.69817146701189%
percent complete:38.46667986126208%
percent complete:40.233998578877184%
percent complete:42.00066838196407%
percent complete:43.76268763093202%
percent complete:45.53476505508743%
percent complete:47.301002248488835%
percent co

TypeError: ignored

In [None]:
def reset_weights(m):
  '''
    Try resetting model weights to avoid
    weight leakage.
  '''
  for layer in m.children():
   if hasattr(layer, 'reset_parameters'):
    print(f'Reset trainable parameters of layer = {layer}')
    layer.reset_parameters()



In [None]:
from torch.cuda import is_available
from torch.utils.data import DataLoader,SubsetRandomSampler
from sklearn.model_selection import train_test_split,KFold


Train_labels=train_labels.sample(frac=0.8)
Test_labels=train_labels.drop(Train_labels.index).reset_index()
Train_labels=Train_labels.reset_index()

train_dataset=Amex_dataset(train_labels,indices)
test_dataset=Amex_dataset(Test_labels,indices)
torch.manual_seed(42)
kfold=KFold(n_splits=5,shuffle=True)


for fold,(train_index,test_index) in enumerate(kfold.split(train_dataset)):
  train_subsampler=SubsetRandomSampler(train_index)
  test_subsampler=SubsetRandomSampler(test_index)

  train_loader=DataLoader(dataset=train_dataset,batch_size=1,shuffle=True,num_workers=0,sampler=train_subsampler)
  test_loader=DataLoader(dataset=train_dataset,batch_size=1,shuffle=True,num_workers=0,sampler=test_subsampler)




  test_loss=[]
  train_loss=[]
  test_accuracy=[]
  train_accuracy=[]
  epochs=15
  device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
  print(device)

  model=Predict_class_dense().to(device)
  criterion=nn.BCEWithLogitsLoss()
  optimizer=torch.optim.Adam(model.parameters(),lr=0.0001)


  for epoch in range(epochs):
    model.train()
    for i,(images,labels) in enumerate(train_loader):

      images=images.to(device)
      labels=labels.type(torch.LongTensor)
      labels=labels.to(device)

      outputs=model(images)
      #print(outputs,labels)
      loss=criterion(outputs,labels)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    print(f"Epoch[{epoch+1}/{epochs}], loss: {loss.item()} ")

    with torch.no_grad():
    #   model.eval()
    #   saved_ans={"customer_ID":[],"prediction":[]}
    #   for key in map.keys:
    #     inx=map[key]
    #     df=sub_data.iloc[inx].sort_values("S_2").drop("customer_ID")
    #     df["S_2"]=df["S_2"].diff()/timedelta(minutes=1)
    #     df["S_2"]=df["S_2"].astype(float)
    #     df["S_2"]=df["S_2"].fillna(0).cumsum(axis=0)
    #     if len(df)>1:
    #       df["S_2"]=df["S_2"]/df["S_2"].max()
    #     images=torch.from_numpy(df.to_numpy()).float().to(device)
    #     outputs=model(images)
    #     _,pred=torch.max(outputs.data,1)
    #     saved_ans["customer_ID"].append(key)
    #     pred=pred.cpu().detach()
    #     saved_ans["prediction"].append(pred)

    #   with open(f"Submission_file_{epoch}.csv","wb") as f:
    #     writer=csv.DictWriter(f,fieldnames=["customer_ID","prediction"])
    #     writer.writeheader()
    #     writer.writerows(saved_ans)

    #   print(f"Submission_file_{epoch} saved")

      torch.save(model.state_dict(),f"model_{epoch}.h5")

      train_loss=[]
      test_loss=[]
      true_labels=[]
      final_output=[]
      n_correct=0
      n_samples=0
      for images,labels in test_loader:
        images=images.to(device)
        labels=labels.type(torch.LongTensor)
        labels=labels.to('cpu')
        true_labels.append(labels)
        labels=labels.to(device)
        outputs=model(images)
        
        loss=criterion(outputs,labels)

        _,predicted=torch.max(outputs.data,1)
        n_samples+=labels.size(0)
        n_correct+=(predicted==labels).sum().item()
        predicted=predicted.cpu()
        predicted=predicted.detach().numpy()
        final_output.append(predicted)
        test_loss.append(loss.item())

      acc=100.0*n_correct/n_samples
      acc=round(acc,4)
      test_accuracy.append(acc)
      print(f"Testing accuracy for epoch {epoch+1}: {acc}")







cuda
Epoch[1/15], loss: 0.0023148665204644203 
Testing accuracy for epoch 1: 89.5612
Epoch[2/15], loss: 0.011056939139962196 
Testing accuracy for epoch 2: 89.7574
Epoch[3/15], loss: 1.0306141376495361 
Testing accuracy for epoch 3: 89.6637
Epoch[4/15], loss: 0.001956217223778367 
Testing accuracy for epoch 4: 89.8445
Epoch[5/15], loss: 0.5829675793647766 
Testing accuracy for epoch 5: 89.6811
Epoch[6/15], loss: 0.080205038189888 
Testing accuracy for epoch 6: 89.3444
Epoch[7/15], loss: 0.20785275101661682 
Testing accuracy for epoch 7: 89.8227
Epoch[8/15], loss: 0.0007753941463306546 
Testing accuracy for epoch 8: 89.6342
Epoch[9/15], loss: 0.4168844521045685 
Testing accuracy for epoch 9: 89.7105
Epoch[10/15], loss: 7.152531907195225e-06 
Testing accuracy for epoch 10: 89.4991
Epoch[11/15], loss: 0.7171515226364136 
Testing accuracy for epoch 11: 89.7672
Epoch[12/15], loss: 0.0023771857377141714 
Testing accuracy for epoch 12: 89.7323


KeyboardInterrupt: ignored

In [None]:
test_data_chunk=pd.read_csv("test_data.csv",chunksize=100000)

NameError: ignored

In [None]:
test_data=pd.concat(test_data_chunk)

In [None]:
print(len(test_data))

11363762


In [None]:
test_data=test_data[selected_features]

NameError: ignored

In [None]:
test_data.head()

NameError: ignored