In [1]:
import sagemaker
from sagemaker import get_execution_role
import boto3
import json

# --- S3 destination for the model artifact ---------------------------------
bucket_name = 'machine-learning-exam'       # Change to your bucket
prefix = 'language_identification/fasttext' # and prefix

# --- AWS handles used by the cells below -----------------------------------
# SageMaker session: used later to upload the artifact and to host the model.
my_session = sagemaker.Session()

# IAM role that the hosted model will be created with.
role = get_execution_role()

# Generic boto3 S3 resource handle.
s3 = boto3.resource('s3')

In [2]:
# Region configured for the default boto3 session; it selects the container
# image registry in the next cell.
my_region_name = boto3.session.Session().region_name

In [3]:
# Look up the "latest" BlazingText container image for the current region.
blazing_text_container = sagemaker.amazon.amazon_estimator.get_image_uri(
    my_region_name,
    "blazingtext",
    "latest",
)

# Echo the resolved image and region so the choice is visible in the output.
print(
    'Using SageMaker BlazingText container: {} ({})'.format(
        blazing_text_container,
        my_region_name,
    )
)

Using SageMaker BlazingText container: 811284229777.dkr.ecr.us-east-1.amazonaws.com/blazingtext:latest (us-east-1)


In [4]:
# Download the fastText supervised model file lid.176.bin and save it locally
# as model.bin (-O sets the output file name). The following cell packages this
# file into the archive that is uploaded to S3.
!wget -O model.bin https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin

--2020-02-26 00:59:24--  https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.20.6.166, 104.20.22.166, 2606:4700:10::6814:16a6, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.20.6.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 131266198 (125M) [application/octet-stream]
Saving to: ‘model.bin’


2020-02-26 00:59:30 (22.6 MB/s) - ‘model.bin’ saved [131266198/131266198]



In [5]:
# Package the downloaded model as a gzip-compressed tarball (langid.tar.gz).
!tar -czvf langid.tar.gz model.bin
# Upload the tarball to the configured bucket/prefix; the returned location is
# what the next cell passes to sagemaker.Model as model_data.
blazing_text_model_location = my_session.upload_data("langid.tar.gz", bucket=bucket_name, key_prefix=prefix)
# Remove the local copies now that the upload call has returned.
!rm langid.tar.gz model.bin

model.bin


In [6]:
# Pair the uploaded model archive with the BlazingText serving image.
language_identifier = sagemaker.Model(
    model_data=blazing_text_model_location,
    image=blazing_text_container,
    role=role,
    sagemaker_session=my_session,
)

# Host the model on a single ml.m4.xlarge instance.
language_identifier.deploy(
    initial_instance_count=1,
    instance_type='ml.m4.xlarge',
)

# Client for the new endpoint: requests are serialized with json.dumps and
# responses are decoded with the SDK's JSON deserializer.
language_identifier_predictor = sagemaker.RealTimePredictor(
    endpoint=language_identifier.endpoint_name,
    sagemaker_session=my_session,
    serializer=json.dumps,
    deserializer=sagemaker.predictor.json_deserializer,
)

-------------------!

In [7]:
# One sample sentence per language to send to the endpoint.
some_language_examples = [
    "À quoi sert l'intelligence artificielle",
    "Was ist der Zweck der künstlichen Intelligenz?",
    "Wat is die doel van kunsmatige intelligensie",
    "ما هو الغرض من الذكاء الاصطناعي",
    "Süni intellektin məqsədi nədir",
    "Hvad er formålet med kunstig intelligens",
]

# Request payload: all sentences go under the single "instances" key.
prediction_input = dict(instances=some_language_examples)

In [9]:
# Send the whole batch of sentences to the endpoint in one request and show
# the decoded response as returned.
language_predictions = language_identifier_predictor.predict(
    data=prediction_input,
)
print(language_predictions)

[{'prob': [0.8571586608886719], 'label': ['__label__fr']}, {'prob': [0.9994584321975708], 'label': ['__label__de']}, {'prob': [0.465190052986145], 'label': ['__label__af']}, {'prob': [0.9983780980110168], 'label': ['__label__ar']}, {'prob': [0.9949907064437866], 'label': ['__label__az']}, {'prob': [0.864094614982605], 'label': ['__label__da']}]


In [10]:
# Rewrite each raw prediction in place into a more readable form:
#   {'prob': [p], 'label': ['__label__fr']}  ->  {'language': 'FR', 'probability': [p]}
#
# Entries that no longer have a 'label' key were already converted by an
# earlier run of this cell and are skipped, so re-running the cell is a no-op
# instead of raising KeyError.
_LABEL_PREFIX = '__label__'

for output in language_predictions:
    if 'label' not in output:
        continue  # already converted on a previous run

    top_label = output.pop('label')[0]
    # Strip the prefix only when it is actually present, rather than blindly
    # dropping the first nine characters of whatever the label happens to be.
    if top_label.startswith(_LABEL_PREFIX):
        top_label = top_label[len(_LABEL_PREFIX):]

    output['language'] = top_label.upper()       # e.g. 'fr' -> 'FR'
    output['probability'] = output.pop('prob')   # same value, clearer key

print(language_predictions)

[{'language': 'FR', 'probability': [0.8571586608886719]}, {'language': 'DE', 'probability': [0.9994584321975708]}, {'language': 'AF', 'probability': [0.465190052986145]}, {'language': 'AR', 'probability': [0.9983780980110168]}, {'language': 'AZ', 'probability': [0.9949907064437866]}, {'language': 'DA', 'probability': [0.864094614982605]}]
