# Score alla data with all model

In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader

from deep_nlp.cnncharclassifier import CNNCharClassifier, charToTensor
from src.deep_nlp.embed_cnn.embcnnmodel_gradcam import classifier3F
from deep_nlp.bilstm_cnn.bilstmcnn_gradcam import BilstmCnn


import pickle

from deep_nlp.grad_cam.utils.letter import rebuild_text, prepare_heatmap, LetterToToken
from deep_nlp.grad_cam.plot import plot_bar_heatmap, plot_text_and_heatmap

In [2]:
from pathlib import Path

current_dir = Path.cwd()  # this points to 'notebooks/' folder
proj_path = current_dir.parent.parent 
print(proj_path)

C:\Users\wenceslas\Documents\cours\ENSAE\2A\Normal\statapp\nlp_understanding


### CNN character level

In [3]:
# params
cnn_sequence_len= 1014
cnn_feature_num= 87
cnn_feature_size= 256
cnn_kernel_one= 7
cnn_kernel_two= 3
cnn_stride_one= 1
cnn_stride_two= 3
cnn_output_linear= 1024
cnn_num_class= 2
cnn_dropout= 0.5
cnn_cuda_allow= True

In [4]:
# Load model
model_path_saved= "data/06_models/cnn_char_classifier/cnn_char_model/cnn_char_model.pt"

with open(str(proj_path)+ "\\" + model_path_saved, 'rb') as f:
    model_saved= pickle.load(f)

#### Engineering process

In [5]:
# Load test data
data_df= pd.read_csv(str(proj_path)+ "\\" + "data/01_raw/allocine_test.csv")

test_data= charToTensor(data_df= data_df, sentence_max_size= cnn_sequence_len)

test_load = DataLoader(test_data, batch_size= 1
                       , num_workers=4)

#### Load trained model

In [6]:
# Initialisation
parameters = {"sequence_len": cnn_sequence_len, "feature_num": cnn_feature_num
    , "feature_size": cnn_feature_size, "kernel_one": cnn_kernel_one
    , "kernel_two": cnn_kernel_two, "stride_one": cnn_stride_one
    , "stride_two": cnn_stride_two, "output_linear": cnn_output_linear
    , "num_class": cnn_num_class, "dropout": cnn_dropout}

In [7]:
model = CNNCharClassifier(**parameters)

if cnn_cuda_allow:
    model = torch.nn.DataParallel(model).cuda()
else:
    model = torch.nn.DataParallel(model)

model.load_state_dict(model_saved)

# state_dict= model.module.state_dict() # delete module to allow cpu loading

# cpu_model= CNNCharClassifier(**parameters).cpu()
# cpu_model.load_state_dict(state_dict)

# cpu_model.eval()

<All keys matched successfully>

#### Score

In [8]:
pred_test = []
lab = []
reviews= []
alphabet= test_data.get_alphabet()+" "

with torch.no_grad():
    for review, label in test_load:
        pred_test.append(torch.exp(model(review)))
        lab.append(label.float())
        
pred_test = torch.cat(pred_test)
lab = torch.cat(lab)

In [9]:
text_review_all= data_df["review"].values

In [10]:
results= pd.DataFrame({
    "review": text_review_all
    , "label": lab
    , "cnn_char_proba_1": pred_test.cpu()[:,1]
})

In [11]:
results

Unnamed: 0,review,label,cnn_char_proba_1
0,"Magnifique épopée, une belle histoire, touchan...",1.0,0.998912
1,Je n'ai pas aimé mais pourtant je lui mets 2 é...,0.0,0.046238
2,Un dessin animé qui brille par sa féerie et se...,1.0,0.934723
3,"Si c'est là le renouveau du cinéma français, c...",0.0,0.020743
4,Et pourtant on s’en Doutait !Second volet très...,0.0,0.001339
...,...,...,...
19995,"je suis éventreur, arracheur, tailladeur, goug...",1.0,0.983251
19996,Trémors 3 essouffle la série des trémors par u...,0.0,0.860303
19997,"0/20 : Tout d’abord, la mise en scène est tout...",0.0,0.000185
19998,Un scénario très original mené par des personn...,1.0,0.996394


### Embedding CNN (5 filters)

#### Load Test loader

In [12]:
test_iterator_cnn_embed_path= "data/02_intermediate/test_iterator_cnn_embed.pkl"

with open(str(proj_path)+ "\\" + test_iterator_cnn_embed_path, 'rb') as f:
    test_iterator_cnn_embed= pickle.load(f)
    
print(type(test_iterator_cnn_embed))

<class 'torch.utils.data.dataloader.DataLoader'>


#### Load Embedding

In [13]:
# Embedding model
embed_for_torch_path= "data/04_feature/w2v_torch.pkl"

with open(str(proj_path)+ "\\" + embed_for_torch_path, 'rb') as f:
    embed_for_torch= pickle.load(f)
    
print(type(embed_for_torch))

<class 'torch.Tensor'>


#### Load vocabulary

In [14]:
word_ind_dict_path= "data/04_feature/voc.pkl"

with open(str(proj_path)+ "\\" + word_ind_dict_path, 'rb') as f:
    word_ind_dict= pickle.load(f)
    
print(type(word_ind_dict))

<class 'dict'>


#### Load model

In [15]:
# Params
params_models = {"wv": embed_for_torch,"no_words": 67,"embedding_dim":200, "nb_filter":200
                 , "height_filter":tuple([1, 2, 3, 4, 5]), "output_dim":2, "dropout":0.8, "padded":True}

In [16]:
# Model dict
embed_cnn_model_for_save_path= "data/06_models/embed_cnn/embed_cnn_classifier/embed_cnn.pt"

with open(str(proj_path)+ "\\" + embed_cnn_model_for_save_path, 'rb') as f:
    embed_cnn_model_for_save= pickle.load(f)
    
print(type(embed_cnn_model_for_save))

<class 'collections.OrderedDict'>


In [17]:
model = classifier3F(**params_models)

model.load_state_dict(embed_cnn_model_for_save)
model.eval()

classifier3F(
  (before_conv): Sequential(
    (conv1_conv_1): Conv2d(1, 200, kernel_size=(1, 200), stride=(1, 1))
    (conv1_conv_2): Conv2d(1, 200, kernel_size=(2, 200), stride=(1, 1))
    (conv1_conv_3): Conv2d(1, 200, kernel_size=(3, 200), stride=(1, 1))
    (conv1_conv_4): Conv2d(1, 200, kernel_size=(4, 200), stride=(1, 1))
    (conv1_conv_5): Conv2d(1, 200, kernel_size=(5, 200), stride=(1, 1))
    (conv1_relu): ReLU()
  )
  (pool): Sequential(
    (conv1_maxpool): Sequential(
      (0): MaxPool1d(kernel_size=67, stride=1, padding=0, dilation=1, ceil_mode=False)
    )
  )
  (after_conv): Sequential(
    (dp): Dropout(p=0.8, inplace=False)
    (fc): Linear(in_features=1000, out_features=2, bias=True)
    (sm): Softmax(dim=1)
  )
  (embedding): Embedding(155564, 200)
  (conv1_conv): ModuleList(
    (0): Conv2d(1, 200, kernel_size=(1, 200), stride=(1, 1))
    (1): Conv2d(1, 200, kernel_size=(2, 200), stride=(1, 1))
    (2): Conv2d(1, 200, kernel_size=(3, 200), stride=(1, 1))
    (3):

In [18]:
pred_test = []
lab = []
reviews= []

with torch.no_grad():
    for review, label in test_iterator_cnn_embed:
        pred_test.append(model(review))
        lab.append(label.float())
        
pred_test = torch.cat(pred_test)
lab = torch.cat(lab)

In [19]:
# Append prediction to df
results["embed_cnn_proba_1"]= pred_test.cpu()[:,1]

### BiLSTM CNN 

#### Load embedding

In [20]:
#
embed_matrix_path= "data/02_intermediate/bilstm_cnn/embed_matrix.pkl"

with open(str(proj_path)+ "\\" + embed_matrix_path, 'rb') as f:
    embed_matrix= pickle.load(f)
    
print(type(embed_matrix))

<class 'numpy.ndarray'>


#### Load Test Loader

In [21]:
#
test_batch_bilstm_path= "data/02_intermediate/test_iterator_cnn_bilstm.pkl"

with open(str(proj_path)+ "\\" + test_batch_bilstm_path, 'rb') as f:
    test_batch_bilstm= pickle.load(f)
    
print(type(test_batch_bilstm))

<class 'torch.utils.data.dataloader.DataLoader'>


#### Load Model

In [22]:
#BiLSTM + CNN
num_epochs=  50
batch_size= 32
patience= 5
lr= 0.001
input_dim= 200
hidden_dim= 128
layer_dim= 2
feature_size= 256
output_dim= 2
kernel_size= 3
dropout_rate= 0.5
sentence_size= 67
padded= True

In [23]:
#
bilstm_cnn_model_for_save_path= "data/06_models/bilstm_cnn/bilstm_cnn_classifier/bilstm_cnn.pt"

with open(str(proj_path)+ "\\" + bilstm_cnn_model_for_save_path, 'rb') as f:
    bilstm_cnn_model_for_save= pickle.load(f)
    
print(type(bilstm_cnn_model_for_save))

<class 'collections.OrderedDict'>


In [24]:
model = BilstmCnn(embed_matrix, sentence_size, input_dim, hidden_dim
                  , layer_dim, output_dim, feature_size, kernel_size, dropout_rate, padded)

if cnn_cuda_allow:
    model = torch.nn.DataParallel(model).cuda()
else:
    model = torch.nn.DataParallel(model)

model.load_state_dict(bilstm_cnn_model_for_save)

<All keys matched successfully>

In [29]:
pred_test = []
lab = []
reviews= []

with torch.no_grad():
    for review, label in test_batch_bilstm:
        test_reviews = review.to(torch.int64)
#         test_labels = label.to(torch.int64)
        
        pred_test.append(model(test_reviews))
        lab.append(label.float())
        
pred_test = torch.cat(pred_test)
lab = torch.cat(lab)

In [30]:
# Append prediction to df
results["bilstm_cnn_proba_1"]= pred_test.cpu()[:,1]

In [32]:
results.to_csv("all_model_prediction.csv", index= False)