### Création d'un schema

In [2]:
import hashlib

In [3]:
import whyqd as _w

In [4]:
schema = _w.Schema()

In [5]:
details = {
        "name": "child_and_demography",
        "title": "Inequalities in maternal and child health",
        "description": """This table presents data for developing countries based on data from DHS conducted since 1990.
       Quintiles are defined by socioeconomic status in terms of assets or wealth, rather than in terms of income or consumption.
       For details, see Macro International. 2007b"""
}
schema = _w.Schema()
schema.set_details(**details)

In [6]:
details

{'name': 'child_and_demography',
 'title': 'Inequalities in maternal and child health',
 'description': 'This table presents data for developing countries based on data from DHS conducted since 1990.\n       Quintiles are defined by socioeconomic status in terms of assets or wealth, rather than in terms of income or consumption.\n       For details, see Macro International. 2007b'}

In [7]:
fields = [
    { 
        "name": "Rang_IDH",
        "title": "Rang_IDH",
        "type": "integer",
        "description": "Classement des pays selon leur niveau de l'IDH."
    },
  
    {
        "name": "Country",
        "title": "Country",
        "type": "string",
        "description": "Nom de chaque pays."
    },
        
    {
        "name": "Survey_year",
        "title": "Survey_year",
        "type": "year",
        "description": "year of survey for every country." 
        
    },
       
    {
        "name": "HDI Category",
        "title": "HDI Category",
        "type": "string",
        "description": "Human Development Index Category derived from the HDI Rank."
    },
    {
        "name": "Indicator Name",
        "title": "Indicator Name",
        "type": "string",
        "description": "Indicator described in the data series."
    },
        
    {
        "name": "Reference",
        "title": "Reference",
        "type": "string",
        "description": "Reference to data source."
    },
   
    {
        "name": "Values",
        "title": "Values",
        "type": "number",
        "description": "Value for the Year and Indicator Name."
    }
]


In [8]:
for field in fields:
    schema.set_field(**field)

In [9]:
fields

[{'name': 'Rang_IDH',
  'title': 'Rang_IDH',
  'type': 'integer',
  'description': "Classement des pays selon leur niveau de l'IDH."},
 {'name': 'Country',
  'title': 'Country',
  'type': 'string',
  'description': 'Nom de chaque pays.'},
 {'name': 'Survey_year',
  'title': 'Survey_year',
  'type': 'year',
  'description': 'year of survey for every country.'},
 {'name': 'HDI Category',
  'title': 'HDI Category',
  'type': 'string',
  'description': 'Human Development Index Category derived from the HDI Rank.'},
 {'name': 'Indicator Name',
  'title': 'Indicator Name',
  'type': 'string',
  'description': 'Indicator described in the data series.'},
 {'name': 'Reference',
  'title': 'Reference',
  'type': 'string',
  'description': 'Reference to data source.'},
 {'name': 'Values',
  'title': 'Values',
  'type': 'number',
  'description': 'Value for the Year and Indicator Name.'}]

In [10]:
schema.field("country")

{'name': 'country',
 'type': 'string',
 'title': 'Country',
 'description': 'Nom de chaque pays.'}

In [11]:
schema.field("survey_year")

{'name': 'survey_year',
 'type': 'year',
 'title': 'Survey_year',
 'description': 'year of survey for every country.'}

In [12]:
directory = "C:/Users/AUDREY/"
# vous pouvez également spécifier un nom de fichier facultatif
# si vous l'omettez, le nom du fichier sera par défaut le nom du schéma
filename = "child-and-demographie"
# si le fichier existe déjà, vous devrez spécifier "overwrite=True" sinon vous obtiendrez une erreur
schema.save(directory, filename=filename, overwrite=True)

True

### Création d'une méthode

In [13]:
### Les importations et paramètres suivants vous permettent d'obtenir un large éventail de résultats pour vos tableaux
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

import numpy as np
import whyqd as _w

SCHEMA_SOURCE = "C:/Users/AUDREY/child-and-demographie.json"
DIRECTORY = "C:/Users/AUDREY/"
INPUT_DATA = [
    "C:/Users/AUDREY/sbc4d/HDR 2007-2008 Table 08.xlsx"
]
method = _w.Method(SCHEMA_SOURCE, directory=DIRECTORY, input_data=INPUT_DATA)

In [14]:
print(method.print_input_data())



Data id: 81a21b07-7f31-4fd6-b57a-a0c928041362
Original source: C:/Users/AUDREY/sbc4d/HDR 2007-2008 Table 08.xlsx

  ..    Unnamed: 0  Unnamed: 1                           Unnamed: 2    Unnamed: 3    Unnamed: 4    Unnamed: 5    Unnamed: 6    Unnamed: 7    Unnamed: 8    Unnamed: 9    Unnamed: 10    Unnamed: 11    Unnamed: 12    Unnamed: 13    Unnamed: 14    Unnamed: 15    Unnamed: 16    Unnamed: 17    Unnamed: 18    Unnamed: 19    Unnamed: 20    Unnamed: 21    Unnamed: 22    Unnamed: 23    Unnamed: 24
   0           nan  nan                                         nan           nan           nan           nan           nan           nan           nan           nan            nan            nan            nan            nan            nan            nan            nan            nan            nan            nan            nan            nan            nan            nan            nan
   1           nan  …to lead a long and healthy life…           nan           nan           nan       

In [15]:
method.default_morph_types

['CATEGORISE', 'DEBLANK', 'DEDUPE', 'DELETE', 'MELT', 'REBASE', 'RENAME']

In [16]:
# A titre d'exemple :
method.default_morph_settings("CATEGORISE")

{'name': 'CATEGORISE',
 'title': 'Categorise',
 'type': 'morph',
 'description': 'Convert row-level categories into column categorisations.',
 'structure': ['rows', 'column_names']}

In [17]:
# Utilisez _id, ou une autre variable, puisque `id` est un terme protégé par Python
_id = method.input_data[0]["id"]
df = method.input_dataframe(_id)
df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24
0,,,,,,,,,,,...,,,,,,,,,,
1,,…to lead a long and healthy life…,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [18]:
method.reset_input_data_morph(_id)

In [19]:
method.add_input_data_morph(_id, ["REBASE", 11])

In [None]:
df

In [20]:
# On obtient la valeur du dernier élément de l'index, puis on ajoute 1 pour créer l'intervalle
rows = [int(i) for i in np.arange(86, df.index[-1]+1)]
method.add_input_data_morph(_id, ["DELETE", rows])

In [21]:
columns = [
    "Rang_IDH",
    "Country",
    "Reference 1",
    "Survey_year",
    "Birth_attended_Poor",
    "Birth_attended_Rich",
    "One-year-olds  immunized_Poor",
    "Reference 2",
    "One-year-olds  immunized_Rich",
    "Reference 3",
    "Child_underheight_ Poor",
    "Child_underheight_ Rich",
    "IMR_Poor",
    "Reference 4",
    "IMR_Rich",
    "Reference 5",
    "Under5_MR_Poor",
    "Reference 6",
    "Under5_MR_Rich",
    "Reference 7",
]
method.add_input_data_morph(_id, ["RENAME", columns])

In [22]:
df = method.input_dataframe(_id)
df.head()

Unnamed: 0,Rang_IDH,Country,Reference 1,Survey_year,Birth_attended_Poor,Birth_attended_Rich,One-year-olds immunized_Poor,Reference 2,One-year-olds immunized_Rich,Reference 3,Child_underheight_ Poor,Child_underheight_ Rich,IMR_Poor,Reference 4,IMR_Rich,Reference 5,Under5_MR_Poor,Reference 6,Under5_MR_Rich,Reference 7
12,HIGH HUMAN DEVELOPMENT,,,,,,,,,,,,,,,,,,,
13,70,Brazil,,1996.0,72.0,99.0,57.0,,74.0,,23.0,2.0,83.0,,29.0,,99.0,,33.0,
14,MEDIUM HUMAN DEVELOPMENT,,,,,,,,,,,,,,,,,,,
15,73,Kazakhstan,,1999.0,99.0,99.0,69.0,,62.0,c,15.0,8.0,68.0,,42.0,,82.0,,45.0,
16,75,Colombia,,2005.0,72.0,99.0,47.0,,72.0,,20.0,3.0,32.0,,14.0,,39.0,,16.0,


In [23]:
# Obtenir les indices des lignes de données catégorielles
hdi_categories = ["HIGH HUMAN DEVELOPMENT", "MEDIUM HUMAN DEVELOPMENT", "LOW HUMAN DEVELOPMENT"]
rows = df[df["Rang_IDH"].isin(hdi_categories)].index
method.add_input_data_morph(_id, ["CATEGORISE", rows, "HDI category"])

ValueError: Task morph `CATEGORISE` has invalid structure `['rows', 'column_names']`.

In [24]:
type(rows)

pandas.core.indexes.numeric.Int64Index

In [25]:
method.add_input_data_morph(_id, ["CATEGORISE", list(rows), "HDI category"])

In [26]:
df = method.input_dataframe(_id)
df.head()

Unnamed: 0,Rang_IDH,Country,Reference 1,Survey_year,Birth_attended_Poor,Birth_attended_Rich,One-year-olds immunized_Poor,Reference 2,One-year-olds immunized_Rich,Reference 3,...,Child_underheight_ Rich,IMR_Poor,Reference 4,IMR_Rich,Reference 5,Under5_MR_Poor,Reference 6,Under5_MR_Rich,Reference 7,HDI category
13,70,Brazil,,1996,72,99,57,,74,,...,2,83,,29,,99,,33,,HIGH HUMAN DEVELOPMENT
15,73,Kazakhstan,,1999,99,99,69,,62,c,...,8,68,,42,,82,,45,,MEDIUM HUMAN DEVELOPMENT
16,75,Colombia,,2005,72,99,47,,72,,...,3,32,,14,,39,,16,,MEDIUM HUMAN DEVELOPMENT
17,78,Thailand,d,2005-06,93,100,92,e,86,e,...,7,..,,..,,..,,..,,MEDIUM HUMAN DEVELOPMENT
18,79,Dominican Republic,,1996,89,98,34,,47,,...,2,67,,23,,90,,27,,MEDIUM HUMAN DEVELOPMENT


In [27]:
# Sélectionnez toutes les colonnes à "melt"
columns = [
    "Rang_IDH",
    "Country",
    "Survey_year",
    "Birth_attended_Poor",
    "Birth_attended_Rich",
    "One-year-olds  immunized_Poor",
    "One-year-olds  immunized_Rich",
    "Child_underheight_ Poor",
    "Child_underheight_ Rich",
    "IMR_Poor",
    "IMR_Rich",
    "Under5_MR_Poor",
    "Under5_MR_Rich",
]
method.add_input_data_morph(_id, ["MELT", columns, ["Indicator Name", "Indicator Value"]])

In [28]:
columns = [
    "Reference 1",
    "Reference 2",
    "Reference 3",
    "Reference 4",
    "Reference 5",
    "Reference 6",
    "Reference 7"
]
method.add_input_data_morph(_id, ["MELT", columns, ["Reference Name", "Reference"]])

In [29]:
method.add_input_data_morph(_id, ["DEBLANK"])

In [30]:
df = method.input_dataframe(_id)
df.head()

Unnamed: 0,Indicator Value,Indicator Name,HDI category,Reference Name,Reference
0,70,Rang_IDH,HIGH HUMAN DEVELOPMENT,Reference 1,
1,73,Rang_IDH,MEDIUM HUMAN DEVELOPMENT,Reference 1,
2,75,Rang_IDH,MEDIUM HUMAN DEVELOPMENT,Reference 1,
3,78,Rang_IDH,MEDIUM HUMAN DEVELOPMENT,Reference 1,d
4,79,Rang_IDH,MEDIUM HUMAN DEVELOPMENT,Reference 1,


In [31]:
print(method.help("merge"))


`merge` will join, in order from right to left, your input data on a common column.

To add input data, where `input_data` is a filename, or list of filenames:

	>>> method.add_input_data(input_data)

To remove input data, where `id` is the unique id for that input data:

	>>> method.remove_input_data(id)

Prepare an `order_and_key` list, where each dict in the list has:

	{{id: input_data id, key: column_name for merge}}

Run the merge by calling (and, optionally - if you need to overwrite an existing merge - setting
`overwrite_working=True`):

	>>> method.merge(order_and_key, overwrite_working=True)

To view your existing `input_data`:

	>>> method.input_data


Data id: 81a21b07-7f31-4fd6-b57a-a0c928041362
Original source: C:/Users/AUDREY/sbc4d/HDR 2007-2008 Table 08.xlsx

  ..    Unnamed: 0  Unnamed: 1                           Unnamed: 2    Unnamed: 3    Unnamed: 4    Unnamed: 5    Unnamed: 6    Unnamed: 7    Unnamed: 8    Unnamed: 9    Unnamed: 10    Unnamed: 11    Unnamed: 12   

In [32]:
%time method.merge(overwrite_working=True)

Wall time: 8.12 s


### Structure

In [33]:
print(method.help("structure"))


`structure` is the core of the wrangling process and is the process where you define the actions
which must be performed to restructure your working data.

Create a list of methods of the form:

	{
		"schema_field1": ["action", "column_name1", ["action", "column_name2"]],
		"schema_field2": ["action", "column_name1", "modifier", ["action", "column_name2"]],
	}

The format for defining a `structure` is as follows::

	[action, column_name, [action, column_name]]

e.g.::

	["CATEGORISE", "+", ["ORDER", "column_1", "column_2"]]

This permits the creation of quite expressive wrangling structures from simple building
blocks.

The schema for this method consists of the following terms:

['rang_idh', 'country', 'survey_year', 'hdi_category', 'indicator_name', 'reference', 'values']

The actions:

['CALCULATE', 'CATEGORISE', 'JOIN', 'NEW', 'ORDER', 'ORDER_NEW', 'ORDER_OLD', 'RENAME']

The columns from your working data:

['Indicator Value', 'Indicator Name', 'HDI category', 'Reference Name', '

In [34]:
structure = {
    "country_name": ["RENAME", "Country"],
    "year":["RENAME", "Survey_year"],
    "hdi_category": ["RENAME", "HDI category"],
    "indicator_name": ["RENAME", "Indicator Name"],
    "reference": ["RENAME", "Reference"],
    "values": ["RENAME", "Indicator Value"]
}    
# Notez le "**" au début du nom du paramètre
# Ceci "dépaquette"  le dictionnaire de sorte que tous les termes soient visibles pour la fonction
#method.set_structure(**structure)

In [35]:
method.set_structure(**structure)

ValueError: Term `country_name` not a valid field for this schema.

In [36]:
structure

{'country_name': ['RENAME', 'Country'],
 'year': ['RENAME', 'Survey_year'],
 'hdi_category': ['RENAME', 'HDI category'],
 'indicator_name': ['RENAME', 'Indicator Name'],
 'reference': ['RENAME', 'Reference'],
 'values': ['RENAME', 'Indicator Value']}

In [None]:
method.transform(overwrite_output=True)
FILENAME = "hdi_report_exercise"
method.save(directory, filename=FILENAME, overwrite=True)

In [None]:
method.input_data_morphs(_id)

### Validation des données et sa signification 

In [None]:
%time method.validates

### Validation et manipulation des données

In [None]:
import pandas as pd
import numpy as np

source = "C:/Users/AUDREY/output_9d69188e-e3a3-4005-bbf4-acfceb28d883.csv" 

df = pd.read_csv(source)
df.head()

In [None]:
df.info()

In [None]:
from pandas_schema import Column, Schema
from pandas_schema.validation import LeadingWhitespaceValidation, TrailingWhitespaceValidation, IsDtypeValidation, InListValidation

# Nous ne testerons que ces colonnes
columns = ["country_name", "hdi_category", "values"]
# Et ces catégories
hdi_categories = ["HIGH HUMAN DEVELOPMENT", "MEDIUM HUMAN DEVELOPMENT", "LOW HUMAN DEVELOPMENT"]

schema = Schema([
    Column("country_name", [LeadingWhitespaceValidation(), TrailingWhitespaceValidation()]),
    Column("hdi_category", [InListValidation(hdi_categories)]),
    Column("values", [IsDtypeValidation(np.dtype(float)), IsDtypeValidation(np.dtype(int))])
])

errors = schema.validate(df[columns])

print(F"Nombre d'erreurs :  {len(errors)}")
# Juste les 10 premiers
for error in errors[:10]:
    print(error)

In [None]:
for l in method.citation.split(","):
    print(l)