# Train the model locally  

## Import librairies (local test)

In [1]:
# Update sys.path
ROOT_DIR = r"D:\Data\Google Drive\Openclassrooms\P7\Projet"
SRC_DIR = ROOT_DIR + r"\src"
import sys
sys.path.append(SRC_DIR)
import config
import pickle
import importlib
importlib.reload(config)
pass

In [2]:
import load
import preprocess
import models
importlib.reload(models)
import simu
import simu_framework
import embeddings

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cdiet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cdiet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Define files' paths (local test)

In [3]:
# Paths
DATA_DIR = ROOT_DIR + r"\data"
EMBEDDING_DIR = ROOT_DIR + r"\embeddings"
DATA_FILE = ROOT_DIR + r"\data\data.csv"
CHECKPOINT_FILE = ROOT_DIR + r"\checkpoints\weights.best.hdf5"
model_name = config.model_name
MODEL_FILE = f"./models/model_{model_name}.p"
MODEL_PARAMS_FILE = f"./models/model_params_{model_name}.p"

## Define the parameters of the model (local test)

In [4]:
# Setup model_params
importlib.reload(config)

model_params = dict()

model_params['train_test_split'] = dict()
model_params['train_test_split']['n_samples'] = config.n_samples
model_params['train_test_split']['test_size'] = config.test_size
model_params['train_test_split']['val_test_ratio'] = config.val_test_ratio

model_params['preprocessing'] = dict()
model_params['preprocessing']['stemming'] = config.stemming
model_params['preprocessing']['lemmatization'] = config.lemmatization
model_params['preprocessing']['pos_tag'] = config.pos_tag
model_params['preprocessing']['max_df'] = config.max_df
model_params['preprocessing']['min_df'] = config.min_df

model_params['embedding'] = dict()
model_params['embedding']['type'] = config.embedding_type
model_params['embedding']['output_dim'] = config.embedding_output_dim
model_params['embedding']['trainable'] = config.embedding_trainable

model_params['model'] = dict()
model_params['model']['type'] = config.model_type

model_params['model_selection'] = dict()
model_params['model_selection']['n_folds'] = config.n_folds
model_params['model_selection']['epochs'] = config.epochs
model_params['model_selection']['batch_size'] = config.batch_size

model_params['model_selection']['learning_rate'] = config.learning_rate

model_params['model_selection']['dropout_rate'] = config.dropout_rate
model_params['model_selection']['l2_reg'] = config.l2_reg

model_params['model_selection']['es_min_delta'] = config.early_stopping_min_delta
model_params['model_selection']['es_patience'] = config.early_stopping_patience
model_params['model_selection']['lr_reduce_factor'] = config.lr_reduce_factor
model_params['model_selection']['lr_reduce_min_delta'] = config.lr_reduce_min_delta
model_params['model_selection']['lr_reduce_patience'] = config.lr_reduce_patience
model_params['model_selection']['lr_reduce_min_lr'] = config.lr_reduce_min_lr

model_params['paths'] = dict()
model_params['paths']['checkpoint_file'] = CHECKPOINT_FILE
model_params['paths']['model_file'] = MODEL_FILE
model_params['paths']['model_params_file'] = MODEL_PARAMS_FILE

## Load the model (local test)

In [5]:
importlib.reload(load)
corpus_train, corpus_test, y_train, y_test = load.load_and_split_docs(
    DATA_FILE,
    n_tweets=model_params['train_test_split']['n_samples'],
    test_size = model_params['train_test_split']['test_size']
)


****************************************
Load the data
****************************************
Data loaded
Original dataset size: (1600000, 2)

****************************************
Sample the data
****************************************
Data sampled
Sampled dataset size: ((20000,), (20000,))

****************************************
Train and test split
****************************************
Train and test split done
Train set size: ((19000,), (19000,))
Test set size: ((1000,), (1000,))


## Preprocess the data (local test)

In [6]:
importlib.reload(preprocess)

# Preprocess the train set
preprocessing_inputs= (corpus_train, y_train, model_params)
tk_train_corpus, padded_corpus_train = preprocess.preprocess_train(*preprocessing_inputs)

# Preprocessing the test set
preprocessing_inputs = (corpus_test, y_test, model_params)
padded_corpus_val, padded_corpus_test, y_val, y_test = preprocess.preprocess_test(*preprocessing_inputs)

vocab_train = model_params['preprocessing']['vocab_train']
tk_doc_max_len = model_params['preprocessing']['tk_doc_max_len']


****************************************
Preprocess train set
****************************************
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cdiet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cdiet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Tokenization of the train set done
Vocabulary size after tokenization: 18516
Vocabulary reduction done
Vocabulary size after reduction: 1917
Encoding of the train set done
Padding of the train set done
Train size: (19000, 19000)

****************************************
Preprocess test set
****************************************
Tokenization of the test set done
Encoding of the test set done
Padding of the test set done
Test size: (1000, 1000)

****************************************
Split test set further into validation and test sets
************************************

## Compute the embeddings (local test)

In [7]:
# importlib.reload(embeddings)
# embeddings.compute_dummy_embedding(EMBEDDING_DIR, vocab_train)

In [8]:
# importlib.reload(embeddings)
# embeddings.load_glove_embedding(DATA_DIR, EMBEDDING_DIR, vocab_train)

In [9]:
# importlib.reload(embeddings)
# embeddings.compute_w2v_embedding(EMBEDDING_DIR, tk_train_corpus, vocab_train)

In [10]:
# importlib.reload(embeddings)
# embeddings.load_w2v_embedding(DATA_DIR, EMBEDDING_DIR, vocab_train)

## Load the selected embedding (local test)

In [11]:
importlib.reload(embeddings)

weights = embeddings.select_embedding(
        EMBEDDING_DIR,
        vocab=vocab_train,
        embedding_type=model_params['embedding']['type']
    )

# Record model parameter
model_params['embedding']['weights'] = weights
model_params['embedding']['output_dim'] = weights.shape[1]


****************************************
Select and load the embedding
****************************************
Selected embedding: pretrained_glove
Original embedding loaded
Embedding reduced to the words in the vocabulary
Shape of the weights: (1917, 200)


## Train the model (local test)

In [12]:
importlib.reload(config)
importlib.reload(models)
importlib.reload(simu)
importlib.reload(simu_framework)

# Run the simulation
simu.simu(None,
        padded_corpus_train,
        padded_corpus_val,
        padded_corpus_test,
        y_train,
        y_val,
        y_test,
        model_params,
        )

# Save the model
print("\nDump the model parameters in a pickle file")
with open(MODEL_PARAMS_FILE, 'wb') as f:
        pickle.dump(model_params, f)


****************************************
Train model
****************************************

Configuration 1 / 12: {'epochs': 10, 'batch_size': 256, 'learning_rate': 0.01, 'dropout_rate': 0.0, 'l2_reg': 0.0}
Fold 1 / 1

Epoch 00001: val_loss improved from inf to 0.56370, saving model to D:\Data\Google Drive\Openclassrooms\P7\Projet\checkpoints\weights.best.hdf5

Epoch 00002: val_loss improved from 0.56370 to 0.54145, saving model to D:\Data\Google Drive\Openclassrooms\P7\Projet\checkpoints\weights.best.hdf5

Epoch 00003: val_loss did not improve from 0.54145

Epoch 00004: val_loss did not improve from 0.54145

Epoch 00005: val_loss did not improve from 0.54145

 Currently best mean val_loss: 0.54145348072052

 Currently best mean val_accuracy: 0.7379999756813049

Configuration 2 / 12: {'epochs': 10, 'batch_size': 256, 'learning_rate': 0.01, 'dropout_rate': 0.0, 'l2_reg': 0.0001}
Fold 1 / 1

Epoch 00001: val_loss improved from inf to 0.57323, saving model to D:\Data\Google Drive\Open

AttributeError: 'NoneType' object has no attribute 'log'

# Train the model on Azure

In [None]:
import config
importlib.reload(config)
from azureml.core import Workspace, Experiment, Environment, Dataset, ScriptRunConfig

# Name the model
model_name = config.model_name

# Load the workspace
ws = Workspace.from_config()
print("\nWorkspace loaded")

# Get the default Azure Machine Learning datastore
datastore = ws.get_default_datastore()
print("\nDatastore created")

# Instantiate an Azure Machine Learning dataset
dataset = Dataset.File.from_files(path=[(datastore, 'datasets')])
print("\nDataset instantiated")

# Register the dataset
dataset = dataset.register(workspace=ws,
                           name='data',
                           description='Données P7',
                           create_new_version=True)
print("\nDataset registered")

# Define what compute target to use
compute_target = ws.compute_targets['gpu-cluster']
print("\nCompute target defined")

# Configure an environment
env = Environment.from_conda_specification(name='P7-env-gpu',
                                           file_path='./.azureml/P7-env-gpu.yml')
print("\nEnvironment configured")

# Specify a GPU base image
env.docker.enabled = True
env.docker.base_image = 'mcr.microsoft.com/azureml/openmpi3.1.2-cuda10.1-cudnn7-ubuntu18.04'
print("\nGPU image specified")

# Instantiate a script
# Define the source directory, the main script, the compute target, the arguments
src = ScriptRunConfig(source_directory='./src',
                      script='train.py',
                      compute_target=compute_target,
                      arguments=[
                                 '--data-folder', dataset.as_named_input('input').as_mount(),
                                 '--model-name', model_name,
                                ],
                     )
print("\nScript configured")

# Update the script with the environment
src.run_config.environment = env
print("\nConfig updated with the GPU image")

# Instantiate an experiment
experiment = Experiment(workspace=ws, name='xp-1')
print("\nExperiment instantiated")

# Run an experiment
run = experiment.submit(src)

# Display the link to the experiment
aml_url = run.get_portal_url()
print("\nSubmitted to compute cluster. Click link below")
print("")
print(aml_url)


# Deploy the model on Azure

In [None]:
ROOT_DIR = r"D:\Data\Google Drive\Openclassrooms\P7\Projet"
SRC_DIR = ROOT_DIR + r"\src"
sys.path.append(SRC_DIR)

import config
import pickle
import importlib
from azureml.core import Workspace, Environment, Model

In [None]:
ROOT_DIR = r"D:\Data\Google Drive\Openclassrooms\P7\Projet"
SRC_DIR = ROOT_DIR + r"\src"
sys.path.append(SRC_DIR)
import config

import importlib
importlib.reload(config)

from azureml.core import Workspace, Environment
from azureml.core.model import InferenceConfig, Model
from azureml.core.webservice import AciWebservice

#Retrieve the model name
model_name = config.model_name

# Load the workspace
ws = Workspace.from_config()
print("\nWorkspace loaded")

# Clean-up
# Delete services with the same name
if config.model_name in ws.webservices:
    ws.webservices[config.model_name].delete()
print('\nClean-up done')

# Configure an environment
env = Environment.from_conda_specification(name='P7-env',
                                           file_path='./.azureml/P7-env.yml')
print("\nEnvironment configured")

# Combine the script and environment in an InferenceConfig
inference_config = InferenceConfig(source_directory='./src',
                                   entry_script='entry_script.py',
                                   environment=env)
print("\nInference configured") 

# Define what compute target to use
compute_target = ws.compute_targets['cpu-cluster']
print("\nCompute target defined")

# Define the deployment configuration which sets the target 
# specific compute specification for the containerized deployement
deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1)
print("\nDeployment configured\n")

# Deploy the model
model = ws.models[model_name]
service = Model.deploy(workspace=ws,
                       name = model_name,
                       models = [model],
                       inference_config = inference_config,
                       deployment_config = deployment_config,
                       deployment_target = None)
service.wait_for_deployment(show_output = True)
print('\nDeployment done')

# Consume the web service on Azure

## Test the run(data) script locally

In [None]:
ROOT_DIR = r"D:\Data\Google Drive\Openclassrooms\P7\Projet"
SRC_DIR = ROOT_DIR + r"\src"
import sys
import json
sys.path.append(SRC_DIR)

import pandas as pd
import numpy as np
import load
import preprocess

from tensorflow.keras.models import load_model

print("\n****************************************")
print("SEND REQUEST")
print("****************************************")

# Set-up Data
data = {"data":
        [
           "I hate this company !",
           "The flight was great"
        ]
        }
print("\nData set-up")

# Convert to JSON string
input_data = json.dumps(data)
print('\nConvert to JSON')


print("\n****************************************")
print("TREAT REQUEST")
print("****************************************")

# Set up  the parameters of the model
model_params['train_test_split']['n_samples'] = 10
model_params['train_test_split']['test_size'] = 1.0
model_params['train_test_split']['val_test_ratio'] = 0.0
print('\nParameters of the model set-up')

# Load the data
corpus_test = json.loads(input_data)['data']
y_test = np.zeros(len(corpus_test))
print('\nData loaded')

# Load the model
model_name = config.model_name
model_params_file_name = f"./models/model_params_{model_name}.p"
model_file_name = f"./models/model_{model_name}.p"
model = load_model(model_file_name)
print('\nModel loaded')

# Preprocess the data
preprocessing_inputs = (corpus_test, y_test, model_params)
padded_corpus_val, padded_corpus_test, y_val, y_test = preprocess.preprocess_test(*preprocessing_inputs)
print('\nData preprocessed')

# Predict the data
result = model.predict(padded_corpus_test)
result = ["positive" if r > 0.5 else "negative" for r in result]
result = zip(result, corpus_test)
print(f'\nRequest result: {list(result)}')

## Test the call to the webservice (on Azure)

In [None]:
import argparse
from azureml.core import Workspace
from azureml.core.webservice import Webservice

import json
import requests

# parser = argparse.ArgumentParser()
# parser.add_argument('--data', type=list, dest='data', help='List of tweets')
# args = parser.parse_args()
# data = args.data

# Format the data
data = {"data":
        [
           "I hate this company !",
           "The flight was great"
        ]
        }

input_data = json.dumps(data)
print("\nData formatted")

# Connect to workspace
ws = Workspace.from_config()
print("\nWorkspace connected")

# Connect to webservice
webservice = Webservice(workspace=ws, name='model-1')
scoring_uri = webservice.scoring_uri
print("\nWebservice connected")

# Set the content type
headers = {'Content-Type': 'application/json'}
# If authentication is enabled, set the authorization header
# headers['Authorization'] = f'Bearer {key}'

# Make the request and display the response
resp = requests.post(scoring_uri, input_data, headers=headers)
print('\nRequest done')

print('\nResponse:')
print(resp.text)
