# Module list

In [None]:
%matplotlib inline
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

import time
import datetime

!pip install torch-summary
from torchsummary import summary

# CUDA check

In [None]:
# Detect whether a CUDA device is usable. Later cells read `use_cuda` to
# decide whether the model and the tensors are moved to the GPU.
use_cuda = torch.cuda.is_available()
print(use_cuda)

# Inputs & Variables


In [None]:
# Show every row when a DataFrame is displayed (no truncation)
pd.set_option('display.max_rows', None)

# Directory holding the competition files; every path below derives from it
data_dir = "../input/house-prices-advanced-regression-techniques/"

# Train CSV dataset - length 1460
train_csv = pd.read_csv(data_dir + "train.csv")

# Test CSV dataset - length 1459
test_csv = pd.read_csv(data_dir + "test.csv")

# Submission template, filled with the predictions at the end of the notebook
submission = pd.read_csv(data_dir + "sample_submission.csv")

# Date and time of this run
date_res = datetime.datetime.now()

# File name under which the best model weights are saved.
# NOTE: a date-based name must not use "%m/%d/%Y": the slashes would be
# interpreted as directory separators.
VERSION = 'CNN_v1.pt'

# Train set description

In [None]:
# Summary statistics of the training set; kept in a variable because a later
# cell reads its 'mean' row. The bare name on the last line displays the table.
dtf_description = train_csv.describe()
dtf_description

In [None]:
# info() has to be called to print the column / dtype / non-null summary;
# without the parentheses the cell only shows the bound method object.
train_csv.info()

In [None]:
# Per-column mean, read from the 'mean' row of the describe() summary
dtf_description.loc['mean']

# Number of entries that are null/void

In [None]:
# Count the missing values of every column of the training set
train_csv.isna().sum()

# Function to split training and validation data from the training set

In [None]:
def split_train_valid_data(data, perc=0.75):
    """Return the leading ``perc`` share of ``data`` (the training part).

    Only the first ``int(len(data) * perc)`` rows are returned; the caller
    takes the remaining rows as the validation part.
    """
    n_keep = int(len(data) * perc)
    return data.iloc[:n_keep]
    


# Creation of the Dataloader

In [None]:
class CustomDataset(Dataset):
    """Expose one split of the house-price data as float tensors.

    The train and test CSV files are pre-processed together (numeric columns
    standardised, categorical columns one-hot encoded, missing values set to
    0) so that both end up with the same feature columns. The training rows
    are then cut into a leading 'train' part and a trailing 'valid' part.

    ``__getitem__`` ignores its index and returns the tensors of the *whole*
    selected split; ``__len__`` reports the number of rows of that split so
    that a loop over the dataset visits one row position per item.

    Args:
        csv_file_data: path of the labelled training CSV.
        csv_file_test: path of the unlabelled test CSV.
        id_col: name of the identifier column (removed from the features).
        target_col: name, or list of names, of the label column(s).
        data: split served by this instance: 'train', 'valid' or 'test'.
        train_frac: share of the training rows kept for the 'train' split
            (first ``int(n * train_frac)`` rows, the same cut-off rule as
            ``split_train_valid_data``); the remaining rows form 'valid'.
    """

    _SPLITS = ('train', 'valid', 'test')

    def __init__(self, csv_file_data, csv_file_test, id_col, target_col,
                 data=None, train_frac=0.75):
        self.data_train = pd.read_csv(csv_file_data)
        self.data_test = pd.read_csv(csv_file_test)
        self.id = id_col
        self.target = target_col
        self.data = data
        self.train_frac = train_frac
        # Filled on first use: {split name: (features, labels)}
        self._tensors = None

    def __len__(self):
        """Return the number of rows of the selected split.

        For an unrecognised split name the length of the test CSV is
        returned.
        """
        if self.data in self._SPLITS:
            return len(self._prepare()[self.data][0])
        return len(self.data_test)

    def __getitem__(self, idx):
        """Return ``(features, labels)`` for the whole selected split.

        ``idx`` is accepted for protocol compatibility but not used. The
        returned tensors are cached and shared between calls, so they must
        not be modified in place.

        Raises:
            ValueError: if ``data`` is not 'train', 'valid' or 'test'.
        """
        if self.data not in self._SPLITS:
            raise ValueError(
                "data must be one of {}, got {!r}".format(self._SPLITS, self.data)
            )
        return self._prepare()[self.data]

    def _prepare(self):
        """Build the tensors of all three splits once and cache them."""
        if self._tensors is not None:
            return self._tensors

        if isinstance(self.target, str):
            target_cols = [self.target]
        else:
            target_cols = list(self.target)

        # Stack train (without labels) and test so that the normalisation and
        # the one-hot encoding produce identical columns for both.
        features_train = self.data_train.drop(columns=target_cols)
        all_features = pd.concat(
            [features_train, self.data_test], keys=['train', 'test']
        )

        # Standardise the numerical features
        numeric_cols = all_features.select_dtypes(include='number').columns
        all_features[numeric_cols] = all_features[numeric_cols].apply(
            lambda col: (col - col.mean()) / col.std()
        )

        # One-hot encode the categorical features; remaining NaNs become 0
        encoded = pd.get_dummies(all_features).fillna(0)
        # The identifier carries no information for the model
        encoded = encoded.drop(columns=[self.id])

        train_rows = encoded.loc['train']
        test_rows = encoded.loc['test']
        labels = self.data_train[target_cols]
        n_train = int(len(train_rows) * self.train_frac)

        def to_tensor(frame):
            # copy=True guarantees a writable float32 array for torch
            return torch.from_numpy(frame.to_numpy(dtype=np.float32, copy=True))

        self._tensors = {
            'train': (to_tensor(train_rows.iloc[:n_train]),
                      to_tensor(labels.iloc[:n_train])),
            'valid': (to_tensor(train_rows.iloc[n_train:]),
                      to_tensor(labels.iloc[n_train:])),
            # The test set has no labels: serve a zero placeholder instead
            'test': (to_tensor(test_rows),
                     torch.zeros(len(test_rows), 1, dtype=torch.float64)),
        }
        return self._tensors

# Column roles handed to CustomDataset as keyword arguments
params = dict(
    id_col='Id',
    target_col=['SalePrice'],
)

In [None]:
# One dataset object per split; the split name doubles as the `data` selector
data_dataset = {
    split: CustomDataset(
        csv_file_data=data_dir + 'train.csv',
        csv_file_test=data_dir + 'test.csv',
        data=split,
        **params,
    )
    for split in ('train', 'valid', 'test')
}


In [None]:
# One loader per split. Only the training loader is shuffled: validation and
# test results must not depend on a random ordering.
data_loader = {
    split: DataLoader(data_dataset[split], batch_size=1, shuffle=(split == 'train'))
    for split in ('train', 'valid', 'test')
}

In [None]:
# Sanity check: look at what the dataset and its loader hand out
print('TRAINING')

data, lab_target = data_dataset['train'][0]

print('DATASET')
print('Data shape: ', data.shape)
print('Data type: ', type(data))
print('Data size: {}'.format(data.size()))
#print('Example of the feature for the 1st entry {}'.format(data[0]))
print('\nTarget at the first row: {}'.format(lab_target.size()))
print('Example of the label for the 1st entry: {}'.format(lab_target[0]))


print()
print('Train Loader type')
train_iter = iter(data_loader['train'])
print(type(train_iter))

# DataLoader iterators follow the iterator protocol: advance them with the
# built-in next(); they do not provide a .next() method.
datas, labels_target = next(train_iter)

print('DATALOADER')
print('images shape on batch size = ', datas.size())
print('Example of datas for the 1st entry {}'.format(datas[0].size()))
#print('\nTarget type on batch size = {}'.format(labels_target))
print('Target type on batch size = {}'.format(type(labels_target)))
print('Target shape on batch size = ', labels_target.shape)
print(len(train_iter))

In [None]:
# Disabled debugging snippet kept as a string literal: it is never executed,
# the cell merely evaluates to this text.
'''for idx, (data, target) in enumerate(data_loader['train']):
    print(data[0][idx])'''

# Model creation

In [None]:
class Net(nn.Module):
    """Fully connected regressor: in_features -> hidden1 -> hidden2 -> 1.

    The two hidden layers use tanh followed by dropout; the single output
    goes through a ReLU, so predictions are never negative.

    Args:
        in_features: number of input features per sample.
        hidden1: width of the first hidden layer.
        hidden2: width of the second hidden layer.
        dropout_p: dropout probability applied after each hidden layer.
    """

    def __init__(self, in_features=288, hidden1=100, hidden2=50, dropout_p=0.1):
        super().__init__()

        # linear layer (in_features -> hidden1)
        self.fc1 = nn.Linear(in_features, hidden1)
        # linear layer (hidden1 -> hidden2)
        self.fc2 = nn.Linear(hidden1, hidden2)
        # linear layer (hidden2 -> 1), the regression output
        self.fc3 = nn.Linear(hidden2, 1)

        self.dropout = nn.Dropout(dropout_p)

        # Output activation
        self.RELU = nn.ReLU()

    def forward(self, x):
        """Map a tensor whose last dimension is ``in_features`` to ``(..., 1)``."""
        # first hidden layer
        x = self.dropout(torch.tanh(self.fc1(x)))
        # second hidden layer
        x = self.dropout(torch.tanh(self.fc2(x)))
        # output layer
        return self.RELU(self.fc3(x))

#-#-# You do NOT have to modify the code below this line. #-#-#

# Build the network (a fully connected model, despite the "CNN" file name)
model_HR = Net()

# Move the parameters to the GPU when CUDA is available
if use_cuda:
    model_HR = model_HR.cuda()

In [None]:
# Display the layer structure of the model
model_HR

In [None]:
# True when the first parameter tensor of the model lives on the GPU
next(model_HR.parameters()).is_cuda

# Torch Summary

In [None]:
# Layer-by-layer summary of a fresh Net for an input of size (1, 288)
summary(Net(),input_size=(1, 288))

In [None]:
# Optimizer hyper-parameters: step size and SGD momentum
lr, momentum = 0.001, 0.8

# Training objective: mean squared error
criterion = nn.MSELoss()

# other Loss function
# 1-MSE

# 2-Mae
def mae(true, pred):
    """Return the summed absolute error between ``true`` and ``pred``.

    The errors are added up, not averaged.
    """
    absolute_errors = np.abs(true - pred)
    return np.sum(absolute_errors)

# 3-huber loss
def huber(true, pred, delta):
    """Return the summed Huber loss between ``true`` and ``pred``.

    Residuals smaller than ``delta`` in absolute value are penalised
    quadratically, all others linearly.
    """
    residual = true - pred
    magnitude = np.abs(residual)
    quadratic_part = 0.5 * (residual ** 2)
    linear_part = delta * magnitude - 0.5 * (delta ** 2)
    per_element = np.where(magnitude < delta, quadratic_part, linear_part)
    return np.sum(per_element)

# 4-log cosh loss
def logcosh(true, pred):
    """Return the summed log-cosh loss between ``true`` and ``pred``.

    log(cosh(x)) is evaluated as logaddexp(x, -x) - log(2). This is the same
    quantity, but it stays finite for large residuals, whereas cosh(x)
    itself overflows once |x| exceeds roughly 710.
    """
    residual = np.asarray(pred) - np.asarray(true)
    loss = np.logaddexp(residual, -residual) - np.log(2.0)
    return np.sum(loss)

# 5-Quantile Loss / Note: the quantile loss is not defined yet

### select optimizer


# Stochastic gradient descent with momentum over all model parameters
optimizer = optim.SGD(model_HR.parameters(), lr=lr, momentum = momentum)

# Other options:
# SGD
# optim.SGD(model_HR.parameters(), lr=lr, momentum = momentum)
# Sparse Adam - in this variant, only moments that show up in the gradient get updated, and only those portions of the gradient get applied to the parameters.
# optim.SparseAdam(model_HR.parameters(), lr=lr)
# Averaged stochastic gradient descent
# optim.ASGD(model_HR.parameters(), lr=lr)
# RMSprop
# optim.RMSprop(model_HR.parameters(), lr=lr, momentum = momentum)



In [None]:


# Every loader item is a (features, labels) pair: print its length
for data in data_loader['train']:
    print(len(data))

In [None]:
def train(n_epochs, loaders, model, optimizer, criterion):
    """Train ``model`` and save its weights whenever validation improves.

    Every item produced by ``loaders['train']`` / ``loaders['valid']`` is
    expected to hold the whole split with a leading batch dimension of 1
    (assumes the loaders were built with batch_size=1 - TODO confirm if the
    dataset changes); the position of the item in the loader selects the row
    that is used for that step.

    Relies on the module-level ``use_cuda`` flag and ``VERSION`` file name.

    Args:
        n_epochs: number of passes over the training rows.
        loaders: mapping with at least the keys 'train' and 'valid'.
        model: network to optimise; it is modified in place.
        optimizer: optimizer bound to ``model``'s parameters.
        criterion: loss called as ``criterion(output, target)``.

    Returns:
        The trained model (the same object as ``model``).
    """
    # initialize tracker for minimum validation loss
    valid_loss_min = float('inf')
    time_start = time.time()
    train_class = []
    valid_class = []
    epoch_class = []

    for epoch in range(1, n_epochs + 1):
        ###################
        # train the model #
        ###################
        model.train()
        train_total, train_steps = 0.0, 0
        for idx, (data, target) in enumerate(loaders['train']):
            # Stop once every row of the split has been visited
            if idx >= data.shape[1]:
                break
            data = data[0][idx]
            target = target[0][idx]
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
            # .item() yields a plain float, detached from the autograd graph
            train_total += loss.item()
            train_steps += 1
        # average loss per training row for this epoch
        train_loss = train_total / max(train_steps, 1)

        ######################
        # validate the model #
        ######################
        model.eval()
        valid_total, valid_steps = 0.0, 0
        # no gradients are needed to score the validation rows
        with torch.no_grad():
            for idx, (data, target) in enumerate(loaders['valid']):
                if idx >= data.shape[1]:
                    break
                data = data[0][idx]
                target = target[0][idx]
                # move to GPU
                if use_cuda:
                    data, target = data.cuda(), target.cuda()
                output = model(data)
                valid_total += criterion(output, target).item()
                valid_steps += 1
        # average loss per validation row for this epoch
        valid_loss = valid_total / max(valid_steps, 1)

        # minutes elapsed since training started
        time_epoch = (time.time() - time_start) / 60

        # print training/validation statistics
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f} \tTime since the beginning {:.1f} min \tLearning rate: {:.6f} '.format(
            epoch,
            train_loss,
            valid_loss,
            time_epoch,
            # report the rate the optimizer is actually using
            optimizer.param_groups[0]['lr']
            ))

        ## save the model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                valid_loss_min,
                valid_loss))
            torch.save(model.state_dict(), VERSION)
            valid_loss_min = valid_loss

        # store the history used for the plot below
        train_class.append(train_loss)
        valid_class.append(valid_loss)
        epoch_class.append(epoch)

    plt.plot(epoch_class, train_class, 'g', label='Training loss')
    plt.plot(epoch_class, valid_class, 'b', label='validation loss')
    plt.title('Training and Validation loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    # return trained model
    return model

In [None]:
# Run five epochs; the returned object is the same model, trained in place
model_res = train(
    n_epochs=5,
    loaders=data_loader,
    model=model_HR,
    optimizer=optimizer,
    criterion=criterion,
)

# Result and submission file

In [None]:

print(len(data_loader['test']))
dataiter_test = iter(data_loader['test'])
print('data_iter\n',dataiter_test)
# DataLoader iterators are advanced with the built-in next()
datas, labels = next(dataiter_test)
print('images\n',datas.shape)
print('labels\n',labels.shape)

# Restore the best weights from the file train() wrote (same relative path)
model_test = model_HR
model_test.load_state_dict(torch.load(VERSION))
model_test = model_test.eval()
print(model_test)

if use_cuda:
    datas = datas.cuda()
# Inference only: no autograd graph is needed for the predictions
with torch.no_grad():
    out_fwd = model_test(datas)
print(out_fwd)

In [None]:
# Flatten the model output to one prediction per test row
predictions = out_fwd.detach().cpu().numpy().reshape(-1)
if len(predictions) != len(submission):
    raise ValueError(
        'expected {} predictions, got {}'.format(len(submission), len(predictions))
    )

# Assign the whole second column at once: chained indexing such as
# submission.iloc[i][1] = ... writes to a temporary copy and changes nothing.
submission[submission.columns[1]] = predictions.astype(np.float64)

pd.set_option("display.max_rows", 10, "display.max_columns", None)

submission.to_csv(path_or_buf='sample_submission_'+VERSION+'.csv', index=False)

# Score

result: - Ranking: 