# Neural Network & Diversity

## Prepocessing

Ce travail fait suite à la recherche de modèle avec Random Forest et XGBoost.

In [592]:
# Libraries import
import pandas as pd
import numpy as np
import torch
from torch import nn, cuda, optim
from torch.nn import functional as F
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, Dataset, random_split, sampler
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [593]:
# Checks if GPU is available.
isGPUAvailable = cuda.is_available()
device = "cpu"

if isGPUAvailable:
    device = "cuda"
    print("Training on GPU")
else:
    device = "cpu"
    print("Training on CPU")

Training on CPU


## Réseau Convolutionnel

In [594]:
# Définir le réseau de neurones convolutif
class CNN(nn.Module):
    def __init__(self, in_classes, dropout=0.3):
        super(CNN, self).__init__()
        self.lin1 = nn.Linear(in_classes, int(in_classes))
        self.lin2 = nn.Linear(int(in_classes), int(in_classes/2))
        self.lin3 = nn.Linear(int(in_classes/2), 1)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Sequential( self.lin1,self.dropout,nn.ReLU()
                                        ,self.lin2,self.dropout, nn.ReLU()
                                        ,self.lin3)

    def forward(self, x):
        # print(x.size())
        # x = self.features(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x



In [595]:
from tqdm import tqdm

In [596]:
# Train and validation
def train (clf, train_loader, valid_loader, epochs=50, min_valid_loss=np.Inf, lr=1e-3):
    best_clf = clf
    # optimizer = torch.optim.SGD(clf.parameters(), lr = lr, momentum=0.5, nesterov=True)
    optimizer = torch.optim.Adam(clf.parameters(), lr = lr, weight_decay=0.0001)
    #optimizer = torch.optim.RMSprop(clf.parameters(), lr = lr, weight_decay=0.0001, momentum=0.1)
    criterion = nn.MSELoss()
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, verbose=True,
                                                     patience=20, min_lr=0.00001)
    for epoch in tqdm(range(epochs)):
        train_loss = 0.0
        validation_loss = 0.0

        # train
        clf.train()
        for inputs, labels in train_loader:
            optimizer.zero_grad() # Reset optimizer for every iteration.
            inputs, labels = inputs.to(device), labels.to(device) # Move tensors to GPU
            output = clf(inputs)# Forward pass
            # loss = criterion(output.squeeze(), labels)
            # print(output.squeeze(-1).size(),labels.view(labels.size(0), -1).size())
            loss = criterion(output, labels.view(labels.size(0), -1))
            loss.backward()  # Backward pass (Backpropagation)  # Loss
            optimizer.step() # Update weights
            train_loss += loss.item() # Update the loss.
            
        # Validation
        clf.eval()
        for inputs, labels in valid_loader:
            # Move tensors to GPU
            inputs, labels = inputs.to(device), labels.to(device)
            # Forward pass
            output = clf(inputs)
            # Loss
            loss = criterion(output, labels.view(labels.size(0), -1))
            # Update the validation loss.
            validation_loss += loss.item()
            
        
        # Calculate the losses.
        train_loss = train_loss/len(train_loader)
        validation_loss = validation_loss/len(valid_loader)
        
        #Update lr
        scheduler.step(validation_loss)
        
        # Print the losses
        # print("Epoch {0}".format(epoch + 1))
        # #print('LR:', scheduler.get_lr())
        # print("Train loss = {0}".format(train_loss))
        # print("Validation loss = {0}".format(validation_loss))
        
        # Check if validation loss has reduced, and therefore the model predicts better
        if validation_loss < min_valid_loss:
            min_valid_loss = validation_loss
            # print("Validation loss has decreased. Saving the model...")
            best_clf = clf
        # print("------------------------------------")
    return best_clf

In [597]:
from sklearn.preprocessing import MinMaxScaler

def scaled_data(df):
    scaler = MinMaxScaler()
    features = [feature for feature in df.columns]
    
    df[features] = scaler.fit_transform(df[features])
    return df

In [598]:
from sklearn.model_selection import KFold

# Split k-fold validation
n_splits = 15
random_state = np.random.seed(654658)

kf = KFold(n_splits=n_splits, random_state=random_state, shuffle=False)

In [601]:
# K-fold cross validation
lr = 0.001
epochs = 40
batch_size = 60

df = pd.read_csv('./data/diversity.csv', delimiter=',')
df = df.astype('float32')
df['budget'] = np.log(df['budget'] + 1e-5)


pourcentage_80 = int(0.8 * len(df))

# Train data
data_train = df[:pourcentage_80]
data_test = df[pourcentage_80:]

target = np.log(data_train['revenue'] + 1e-5)
data_train.pop('revenue')
data_train.pop('adult')


# Test data
target_test = np.log(data_test['revenue'] + 1e-5)
data_test.pop('revenue')
data_test.pop('adult')


# Convert test data to tensor
x_test = np.array(scaled_data(data_test))
x_test_tensor = torch.tensor(x_test, dtype=torch.float).to(device)

test_preds_nn = np.zeros((len(data_test)))
oof_nn = np.zeros(len(data_train))

cnn = CNN(data_train.shape[1])

for fold_i, (train_index, val_index) in enumerate(kf.split(data_train.values)):
    
    print("\n")
    print("Fold {0}".format(fold_i + 1))
    
    training_data = data_train.copy()
    
    x_train_raw, x_val_raw = training_data.loc[train_index], training_data.loc[val_index]
    y_train_raw, y_val_raw = target.loc[train_index].values, target.loc[val_index].values
    
    x_train_fold = torch.tensor(x_train_raw.to_numpy(), dtype=torch.float)
    y_train_fold = torch.tensor(y_train_raw, dtype=torch.float32)
    
    x_val_fold = torch.tensor(x_val_raw.to_numpy(), dtype=torch.float)
    y_val_fold = torch.tensor(y_val_raw, dtype=torch.float32)
    
    train_dataset = TensorDataset(x_train_fold, y_train_fold)
    test_dataset = TensorDataset(x_val_fold, y_val_fold)
    
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    
    model = train(cnn, train_loader, valid_loader, epochs=epochs, lr = lr)
    
    # Out of fold data to check accuracy at the end
    oof_nn[val_index] = model(x_val_fold.to(device)).squeeze().to('cpu').detach().numpy()   
    
    test_preds_nn += model(x_test_tensor).squeeze().to('cpu').detach().numpy() / kf.n_splits
    rmae = mean_absolute_error(target_test, test_preds_nn)
    print("The log mae score for test is: {}".format(rmae))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[features] = scaler.fit_transform(df[features])




Fold 1


100%|██████████| 40/40 [00:35<00:00,  1.12it/s]


The log mae score for test is: 14.20834073253545


Fold 2


 55%|█████▌    | 22/40 [00:19<00:16,  1.08it/s]

Epoch 00022: reducing learning rate of group 0 to 1.0000e-04.


100%|██████████| 40/40 [00:37<00:00,  1.07it/s]


The log mae score for test is: 13.206992342387903


Fold 3


 55%|█████▌    | 22/40 [00:19<00:16,  1.11it/s]

Epoch 00022: reducing learning rate of group 0 to 1.0000e-04.


100%|██████████| 40/40 [00:35<00:00,  1.12it/s]


The log mae score for test is: 12.199842809181835


Fold 4


100%|██████████| 40/40 [00:35<00:00,  1.12it/s]


The log mae score for test is: 11.105593622517588


Fold 5


 60%|██████    | 24/40 [00:21<00:13,  1.14it/s]

Epoch 00024: reducing learning rate of group 0 to 1.0000e-04.


100%|██████████| 40/40 [00:35<00:00,  1.12it/s]


The log mae score for test is: 10.077676114640274


Fold 6


 62%|██████▎   | 25/40 [00:21<00:12,  1.17it/s]

Epoch 00025: reducing learning rate of group 0 to 1.0000e-04.


100%|██████████| 40/40 [00:35<00:00,  1.13it/s]


The log mae score for test is: 9.082566064984984


Fold 7


 57%|█████▊    | 23/40 [00:21<00:15,  1.07it/s]

Epoch 00023: reducing learning rate of group 0 to 1.0000e-04.


100%|██████████| 40/40 [00:37<00:00,  1.05it/s]


The log mae score for test is: 8.094916968105164


Fold 8


 78%|███████▊  | 31/40 [00:29<00:08,  1.10it/s]

Epoch 00031: reducing learning rate of group 0 to 1.0000e-04.


100%|██████████| 40/40 [00:37<00:00,  1.06it/s]


The log mae score for test is: 7.089657135980655


Fold 9


 62%|██████▎   | 25/40 [00:23<00:14,  1.03it/s]

Epoch 00025: reducing learning rate of group 0 to 1.0000e-04.


100%|██████████| 40/40 [00:39<00:00,  1.02it/s]


The log mae score for test is: 6.112607648166526


Fold 10


100%|██████████| 40/40 [00:51<00:00,  1.28s/it]


The log mae score for test is: 5.169170097895113


Fold 11


 82%|████████▎ | 33/40 [00:36<00:06,  1.05it/s]

Epoch 00033: reducing learning rate of group 0 to 1.0000e-04.


100%|██████████| 40/40 [00:43<00:00,  1.09s/it]


The log mae score for test is: 4.297852480389249


Fold 12


 75%|███████▌  | 30/40 [00:28<00:09,  1.03it/s]

Epoch 00030: reducing learning rate of group 0 to 1.0000e-04.


100%|██████████| 40/40 [00:40<00:00,  1.02s/it]


The log mae score for test is: 3.540416622598536


Fold 13


100%|██████████| 40/40 [00:39<00:00,  1.03it/s]


The log mae score for test is: 2.8936602614943867


Fold 14


 82%|████████▎ | 33/40 [00:32<00:07,  1.01s/it]

Epoch 00033: reducing learning rate of group 0 to 1.0000e-04.


100%|██████████| 40/40 [00:39<00:00,  1.02it/s]


The log mae score for test is: 2.451588608640068


Fold 15


100%|██████████| 40/40 [00:36<00:00,  1.09it/s]

The log mae score for test is: 2.2833597152989324





Le réseau de neurone ne nous donne pas une meilleure performance que nos algorithmes de random forest ou XGBoost. Il est peu sans doute être largement améliorer encore.