# <center> **Requesting an external API** </center>

À partir d'une liste de titres de films, nous allons requêter l'API publique [https://www.omdbapi.com](https://www.omdbapi.com).

Nous enregistrerons les données non-structurées (résumé et affiche du film) dans une base NoSQL (MongoDB)

In [83]:
%reset

## **Imports**

In [84]:
import copy
import json
import math
import os
import pprint
import re

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm
from unidecode import unidecode

# MongoDB / Pymongo
import pymongo
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

# Render at most 10 rows when a DataFrame is displayed.
pd.set_option('display.max_rows', 10)
# Register tqdm's progress_apply helpers on pandas objects.
tqdm.pandas()

# OMDb API key: read from the OMDB_API_KEY environment variable when it is set.
# FIXME(security): the literal fallback only keeps the notebook runnable as before;
# a key committed in a notebook should be rotated and this default removed.
api_key = os.environ.get("OMDB_API_KEY", "b8dd5759")

## **Reading the data**


In [138]:
# Load the 1995-2000 movie list, restricted to the four columns read by the later cells.
# Inputs used by earlier (commented-out) versions of this cell:
#   'csv/movies_year_1982.csv', 'csv/movies_decade_80.csv'
df_movies = pd.read_csv(
    'csv/movies_year_1995_to_2000.csv',
    sep=',',
    usecols=['title', 'original_title', 'summary', 'url_thumbnail'],
)
print("Nb movies :", len(df_movies))
df_movies

Nb movies : 807


Unnamed: 0,title,original_title,summary,url_thumbnail
0,Une histoire vraie,The Straight Story,"Alvin Straight, vétéran de 73 ans, vit avec sa...",https://fr.web.img6.acsta.net/c_310_420/pictur...
1,La Ligne verte,The Green Mile,"Paul Edgecomb, pensionnaire centenaire d'une m...",https://fr.web.img2.acsta.net/c_310_420/medias...
2,Matrix,The Matrix,Programmeur anonyme dans un service administra...,https://fr.web.img4.acsta.net/c_310_420/medias...
3,Eyes Wide Shut,Eyes Wide Shut,"William Harford, médecin, mène une paisible ex...",https://fr.web.img6.acsta.net/c_310_420/medias...
4,Fight Club,Fight Club,"Le narrateur, sans identité précise, vit seul,...",https://fr.web.img6.acsta.net/c_310_420/pictur...
...,...,...,...,...
802,Un vampire à Brooklyn,Vampire in Brooklyn,Maximillian appartient à une longue lignée de ...,https://fr.web.img6.acsta.net/c_310_420/medias...
803,Fair Game,Fair Game,Pour avoir menacé son ex-mari de faire saisir ...,https://fr.web.img2.acsta.net/c_310_420/medias...
804,Les Maitres du monde,The Puppet Masters,"Sam Nivens, jeune agent du gouvernement, est e...",https://fr.web.img2.acsta.net/c_310_420/medias...
805,Halloween 6 : La Malédiction de Michael Myers,Halloween: The Curse of Michael Myers,Dix ans après avoir terrorisé la petite ville ...,https://fr.web.img4.acsta.net/c_310_420/medias...


In [131]:
def format_string(st):
    ''' Turn a movie title into a '+'-separated query string,
        e.g. "title of the movie" -> "title+of+the+movie".

        Letters, digits and whitespace are passed through unidecode; any other
        character is replaced by a space. The text is then split on whitespace,
        words of a single character are dropped and the rest is joined with '+'.

        Arg: st string to be converted.
    '''
    pieces = [
        unidecode(ch) if (ch.isalpha() or ch.isdigit() or ch.isspace()) else ' '
        for ch in st
    ]
    tokens = ''.join(pieces).split()
    return '+'.join(tok for tok in tokens if len(tok) >= 2)

def request_omdb_from_title(title, timeout=10):
    ''' Request the omdb API

        return a json dictionary with the information about the movie.
        When the request fails (network error, timeout, non-200 status code or
        a body that is not valid JSON) the problem is printed and
        {'Response': 'False'} is returned instead of raising.

        Arg:
         - title:   string with title of the movie we want the infos about.
         - timeout: number of seconds to wait for the server (default 10).
    '''
    url = f"https://www.omdbapi.com/?apikey={api_key}&t={format_string(title)}"
    try:
        # Without a timeout, requests can wait forever on a stalled connection.
        r = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        print(f"ERROR {title}, Request failed: {err}")
        print("Request:", url)
        return {'Response': 'False'}
    if r.status_code != 200:
        print(f"ERROR {title}, Response Code: {r.status_code}")
        print("Request:", url)
        return {'Response': 'False'}
    try:
        return json.loads(r.text)
    except ValueError as err:
        # json.JSONDecodeError is a ValueError: the body was not valid JSON.
        print(f"ERROR {title}, Invalid JSON: {err}")
        print("Request:", url)
        return {'Response': 'False'}

def get_plot_and_thumbail_from_omdb(title):
    ''' return movie plot and thumbail through an API request.

        return: one string built as plot + "AND" + thumbnail, where
          - plot:      string containing the plot of the movie,
          - thumbnail: string containing the url of the thumbnail.
        Each part is '' when the API answer has Response != 'True', or when the
        corresponding field is missing, empty or 'N/A'.

        Arg: title: string with the title of the movie.
    '''
    def _usable(value):
        # A missing key, an empty value and the 'N/A' placeholder all mean "no data".
        return '' if value in (None, '', 'N/A') else value

    plot, thumbnail = '', ''
    res_dict = request_omdb_from_title(title)
    # Use .get() rather than assert: an unexpected answer for one title should
    # yield empty fields, not abort the whole DataFrame.apply run.
    if res_dict.get('Response') == 'True':
        plot = _usable(res_dict.get('Plot'))
        thumbnail = _usable(res_dict.get('Poster'))
    return plot + "AND" + thumbnail

## **Get the plot and the thumbnail from the omdb API**

In [None]:
def _split_plot_and_thumbnail(packed):
    ''' Split the plot + "AND" + thumbnail string returned by
        get_plot_and_thumbail_from_omdb into a (plot, thumbnail) tuple.

        A plain split('AND') breaks when the plot or the poster url itself
        contains "AND". The separator is therefore taken as the first "AND"
        that is followed, up to the end of the string, by either nothing or a
        single http(s) url.
    '''
    match = re.match(r'(.*?)AND(https?://\S*)?\Z', packed, flags=re.DOTALL)
    if match is None:
        # Thumbnail is not an http(s) url: fall back to the first separator.
        plot, _, thumbnail = packed.partition('AND')
        return plot, thumbnail
    return match.group(1), match.group(2) or ''

# One OMDb request per movie; this is the slow step of the notebook.
omdb_packed = df_movies['original_title'].apply(get_plot_and_thumbail_from_omdb)
omdb_parts = omdb_packed.apply(_split_plot_and_thumbnail)
omdb_plot = omdb_parts.apply(lambda part: part[0])
omdb_thumbnail = omdb_parts.apply(lambda part: part[1])

# Prefer the OMDb values; keep the values read from the CSV when OMDb returned nothing.
df_movies['plot']          = np.where(omdb_plot != '', omdb_plot, df_movies['summary'])
df_movies['url_thumbnail'] = np.where(omdb_thumbnail != '', omdb_thumbnail, df_movies['url_thumbnail'])
df_movies = df_movies[['title', 'original_title', 'plot', 'url_thumbnail']]

## **Store the data in a NoSQL database**

In [134]:
# Open a connection to the MongoDB server listening on localhost:27017
client = MongoClient("mongodb://localhost:27017/")

# Handle on the "allocine" database
mydb = client.get_database("allocine")
# client.drop_database("movies")

# Handle on the "movies" collection (the counterpart of a table in SQL)
col_movies = mydb.get_collection("movies")
# col_movies.drop()

In [None]:
# Store one MongoDB document per row of df_movies.
# Run this cell ONLY ONCE: running it again inserts the same rows a second time.
movie_records = df_movies.to_dict(orient='records')
col_movies.insert_many(movie_records)
# col_movies.drop()

print(client.list_database_names())
print(mydb.list_collection_names())

# To inspect a few stored documents:
# for doc in list(col_movies.find().limit(5)):
#     pprint.pprint(doc)

['Rennes', 'Rennes2', 'admin', 'allocine', 'config', 'local', 'mydatabase']
['movies']
{'_id': ObjectId('67abbeaf1e1d959e84245680'),
 'original_title': "L'Armée des Ombres",
 'plot': 'France 1942. Gerbier, ingénieur des Ponts et Chaussées est également '
         "l'un des chefs de la Résistance. Dénoncé et capturé, il est "
         "incarcéré dans un camp de prisonniers. Alors qu'il ",
 'title': "L'Armée des Ombres",
 'url_thumbnail': 'https://fr.web.img4.acsta.net/c_310_420/img/23/c1/23c1acd5c06be11bc9a64f448dae49f4.jpg'}
{'_id': ObjectId('67abbeaf1e1d959e84245681'),
 'original_title': 'Easy Rider',
 'plot': 'Two bikers head from L.A. to New Orleans through the open country '
         'and desert lands, and along the way they meet a man who bridges a '
         'counter-culture gap of which they had been unaware.',
 'title': 'Easy Rider',
 'url_thumbnail': 'https://m.media-amazon.com/images/M/MV5BMTc2MjI2NDc4Ml5BMl5BanBnXkFtZTgwODI4NzU0MTI@._V1_SX300.jpg'}
{'_id': ObjectId('67abbeaf

In [136]:
# An empty filter matches every document of the collection.
nb_documents = col_movies.count_documents({})
print("Nb documents:", nb_documents)

Nb documents: 2650


## **Checking what is in the Mongo DB**

In [None]:
# Load the five CSV extracts (1960 to 2000) with the same four columns.
csv_columns = ['title', 'original_title', 'summary', 'url_thumbnail']
df1, df2, df3, df4, df5 = (
    pd.read_csv(f'csv/movies_year_{period}.csv', delimiter=',', usecols=csv_columns)
    for period in ('1960_to_1970', '1970_to_1980', '1980_to_1990', '1990_to_1995', '1995_to_2000')
)

df_csv = pd.concat([df1, df2, df3, df4, df5])
# print(df_csv.shape[0])
for df in [df1, df2, df3, df4, df5]:
    print(df.shape[0])

# Reading data from Mongo DB
client = pymongo.MongoClient("mongodb://localhost:27017/")
mydb = client["allocine"]
col_movies = mydb["movies"]
print("Nb documents:", col_movies.count_documents({}))

lst = list(col_movies.find())
df_mongo = pd.DataFrame(lst, columns = ['_id', 'title', 'original_title', 'plot', 'url_thumbnail'])
print(df_mongo.shape[0])

# Saving Mongo DB Data into CSV
df_mongo.to_csv('csv/mongoDB.csv', sep=',', index = False)
# Only the last expression of a cell is rendered, so the tail must be displayed explicitly.
display(df_mongo.tail(3))

# Sanity check, 1960 to 1995: the row counts of the first four files must add up to the
# number of documents read from MongoDB. This compares the loaded data instead of literals.
# NOTE(review): df5 (1995 to 2000) is left out, as in the original hand-written sum;
# presumably that batch was not inserted yet. Include it once it is.
sum(len(frame) for frame in (df1, df2, df3, df4)) == len(df_mongo)

518
678
869
585
807
Nb documents: 2650
2650


True

## **Query MongoDB Using PyMongo**

### **We create another DB**

In [157]:
# Separate "test_queries" database, used for the query examples on a "fruits" collection.
client2 = pymongo.MongoClient("mongodb://localhost:27017/")
mydb2 = client2["test_queries"]
# client.drop_database("test_queries")

# Create a collection "fruits" (table in SQL)
col_fruits = mydb2["fruits"]
# col_fruits.drop()

fruits = [
    { "_id": 1, "name": "apples", "qty": 5, "rating": 3, "color": "red", "type": ["fuji", "honeycrisp"] },
    { "_id": 2, "name": "bananas", "qty": 7, "rating": 4, "color": "yellow", "type": ["cavendish"] },
    { "_id": 3, "name": "oranges", "qty": 6, "rating": 2, "type": ["naval", "mandarin"] },
    { "_id": 4, "name": "pineapple", "qty": 3, "rating": 5, "color": "yellow" },
    { "_id": 5, "name": "test", "qty": 10, "rating": 3, "color": "green", "type": ["fuji", "typeXXX"] },
]
# Upsert by _id instead of insert_many: with fixed _id values, insert_many raises a
# duplicate-key error as soon as the cell is run a second time. Upserting keeps the cell
# re-runnable and guarantees that the five documents queried below exist.
col_fruits.bulk_write([
    pymongo.ReplaceOne({"_id": fruit["_id"]}, fruit, upsert=True)
    for fruit in fruits
])
# col_fruits.drop()

print(client2.list_database_names())
print(mydb2.list_collection_names())

['Rennes', 'Rennes2', 'admin', 'allocine', 'config', 'local', 'mydatabase', 'test_queries']
['fruits']


**Sources**<br>
https://www.mongodb.com/docs/languages/python/pymongo-driver/current/read/specify-a-query/<br>
https://www.w3resource.com/mongodb/introduction-mongodb.php<br>
https://www.mongodbtutorial.org/mongodb-crud/mongodb-findone/<br>
https://geekflare.com/fr/mongodb-queries-examples/<br>


**Exact Match**

In [142]:
# Equality filter: movies whose "title" field is exactly "Médée".
results = col_movies.find({"title": "Médée"})
list(results)

[{'_id': ObjectId('67abbeaf1e1d959e84245693'),
  'title': 'Médée',
  'original_title': 'Medea',
  'plot': "After his quest to retrieve the fabled Golden Fleece, Jason returns to Greece with powerful sorceress Medea. However, when the king banishes her, it's only human that Medea plots her furious revenge. Can they escape her wrath?",
  'url_thumbnail': 'https://m.media-amazon.com/images/M/MV5BODliMTQzNGQtNTNlNi00OGIyLTkzMWYtNzRkNThlYjJlNjhiXkEyXkFqcGc@._V1_SX300.jpg'}]

In [144]:
# Equality filter: fruits whose "color" field is exactly "yellow".
results = col_fruits.find({"color": "yellow"})
list(results)

[{'_id': 2,
  'name': 'bananas',
  'qty': 7,
  'rating': 4,
  'color': 'yellow',
  'type': ['cavendish']},
 {'_id': 4, 'name': 'pineapple', 'qty': 3, 'rating': 5, 'color': 'yellow'}]

**Comparison operators**

list of operators: https://www.mongodb.com/docs/manual/reference/operator/query-comparison/

In [143]:
# $gt: fruits whose rating is strictly greater than 2.
results = col_fruits.find(
    {"rating": {"$gt": 2}}
)
for f in results:
    print(f)

{'_id': 1, 'name': 'apples', 'qty': 5, 'rating': 3, 'color': 'red', 'type': ['fuji', 'honeycrisp']}
{'_id': 2, 'name': 'bananas', 'qty': 7, 'rating': 4, 'color': 'yellow', 'type': ['cavendish']}
{'_id': 4, 'name': 'pineapple', 'qty': 3, 'rating': 5, 'color': 'yellow'}


**Logical operators**

https://www.mongodb.com/docs/manual/reference/operator/query-logical/

In [147]:
# $or: fruits with a quantity above 5, or whose color is yellow.
results = col_fruits.find(
    {"$or": [{"qty": {"$gt": 5}}, {"color": "yellow"}]}
)
for f in results:
    print(f)

{'_id': 2, 'name': 'bananas', 'qty': 7, 'rating': 4, 'color': 'yellow', 'type': ['cavendish']}
{'_id': 3, 'name': 'oranges', 'qty': 6, 'rating': 2, 'type': ['naval', 'mandarin']}
{'_id': 4, 'name': 'pineapple', 'qty': 3, 'rating': 5, 'color': 'yellow'}


**Array operators**

https://www.mongodb.com/docs/manual/reference/operator/query-array/

In [158]:
# First list the whole collection (empty filter) ...
for f in col_fruits.find({}):
    print(f)

print('----')

# ... then only the fruits whose "type" array holds exactly 2 elements.
results = col_fruits.find({"type": {"$size": 2}})
for f in results:
    print(f)

{'_id': 1, 'name': 'apples', 'qty': 5, 'rating': 3, 'color': 'red', 'type': ['fuji', 'honeycrisp']}
{'_id': 2, 'name': 'bananas', 'qty': 7, 'rating': 4, 'color': 'yellow', 'type': ['cavendish']}
{'_id': 3, 'name': 'oranges', 'qty': 6, 'rating': 2, 'type': ['naval', 'mandarin']}
{'_id': 4, 'name': 'pineapple', 'qty': 3, 'rating': 5, 'color': 'yellow'}
{'_id': 5, 'name': 'test', 'qty': 10, 'rating': 3, 'color': 'green', 'type': ['fuji', 'typeXXX']}
----
{'_id': 1, 'name': 'apples', 'qty': 5, 'rating': 3, 'color': 'red', 'type': ['fuji', 'honeycrisp']}
{'_id': 3, 'name': 'oranges', 'qty': 6, 'rating': 2, 'type': ['naval', 'mandarin']}
{'_id': 5, 'name': 'test', 'qty': 10, 'rating': 3, 'color': 'green', 'type': ['fuji', 'typeXXX']}


In [159]:
# $all: fruits whose "type" array contains every listed value (here only 'fuji').
results = col_fruits.find({"type": {"$all": ['fuji']}})
for f in results:
    print(f)

{'_id': 1, 'name': 'apples', 'qty': 5, 'rating': 3, 'color': 'red', 'type': ['fuji', 'honeycrisp']}
{'_id': 5, 'name': 'test', 'qty': 10, 'rating': 3, 'color': 'green', 'type': ['fuji', 'typeXXX']}


**Element Operator**

to check if a field exists

In [160]:
# $exists expects a boolean. The string "true" only worked because any non-empty
# string is truthy, so the string "false" would have matched the same documents.
results = col_fruits.find({"color": {"$exists": True}})
for f in results:
    print(f)

{'_id': 1, 'name': 'apples', 'qty': 5, 'rating': 3, 'color': 'red', 'type': ['fuji', 'honeycrisp']}
{'_id': 2, 'name': 'bananas', 'qty': 7, 'rating': 4, 'color': 'yellow', 'type': ['cavendish']}
{'_id': 4, 'name': 'pineapple', 'qty': 3, 'rating': 5, 'color': 'yellow'}
{'_id': 5, 'name': 'test', 'qty': 10, 'rating': 3, 'color': 'green', 'type': ['fuji', 'typeXXX']}


**Evaluation operators**

https://www.mongodb.com/docs/manual/reference/operator/query-evaluation/

In [None]:
# $regex: names containing at least 2 consecutive "p".
results = col_fruits.find({"name": {"$regex": "p{2,}"}})
for f in results:
    print(f)

{'_id': 1, 'name': 'apples', 'qty': 5, 'rating': 3, 'color': 'red', 'type': ['fuji', 'honeycrisp']}
{'_id': 4, 'name': 'pineapple', 'qty': 3, 'rating': 5, 'color': 'yellow'}
