## <div style="text-align:center"><span style="font-size:1em;"> <code>Analyse des données de systèmes éducatifs</code></span> </div>

## 1. Préparation de l'analyse des données de systèmes éducatifs

### 1.1 Importer les librairies nécessaires pour charger les données

In [28]:
# Importation des librairies Pandas et Numpy

import pandas as pd # Manipulation et l'analyse de données

import numpy as np # Calcul scientifique et manipulation des tableaux


### 1.2 Optimiser l'affichage de l'output des codes

In [29]:

# Notebook-wide pandas display settings, written through the options namespace.
pd.options.display.max_columns = 70     # columns shown before pandas elides the middle
pd.options.display.max_rows = 200       # rows shown before pandas elides the middle
pd.options.display.precision = 5        # decimal places used when rendering floats
pd.options.display.width = 1000         # character width of the text representation
pd.options.display.max_colwidth = 1000  # characters shown per cell before truncation
# Silence the chained-assignment warning (the pandas default for this option is 'warn').
pd.set_option('mode.chained_assignment', None)


 ### 1.3 Chargement des données
Fichiers CSV : 'EdStatsCountry.csv', 'EdStatsCountry-Series.csv', 'EdStatsSeries.csv', 'EdStatsFootNote.csv', 'EdStatsData.csv'

In [30]:
# Load the five EdStats CSV exports, one DataFrame per file.
# The paths are relative, so the files are looked up in the current working directory.
(
    data_EdStatsCountry,
    data_EdStatsCountriesSeries,
    data_EdStatsSeries,
    data_EdStatsFootNote,
    data_EdStatsData,
) = [
    pd.read_csv(csv_name)
    for csv_name in (
        'EdStatsCountry.csv',
        'EdStatsCountry-Series.csv',
        'EdStatsSeries.csv',
        'EdStatsFootNote.csv',
        'EdStatsData.csv',
    )
]


## Analyse du fichier

### 2. data_EdStatsCountry

### Dimension du fichier

In [7]:
# Dimensions of the country table as a (rows, columns) tuple.
data_EdStatsCountry.shape

(241, 32)

Le fichier data_EdStatsCountry contient 241 lignes et 32 colonnes.

### Liste des colonnes du fichier 

In [8]:
# Column labels of the country table as a plain Python list, in DataFrame order.
list(data_EdStatsCountry.columns)

['Country Code',
 'Short Name',
 'Table Name',
 'Long Name',
 '2-alpha code',
 'Currency Unit',
 'Special Notes',
 'Region',
 'Income Group',
 'WB-2 code',
 'National accounts base year',
 'National accounts reference year',
 'SNA price valuation',
 'Lending category',
 'Other groups',
 'System of National Accounts',
 'Alternative conversion factor',
 'PPP survey year',
 'Balance of Payments Manual in use',
 'External debt Reporting status',
 'System of trade',
 'Government Accounting concept',
 'IMF data dissemination standard',
 'Latest population census',
 'Latest household survey',
 'Source of most recent Income and expenditure data',
 'Vital registration complete',
 'Latest agricultural census',
 'Latest industrial data',
 'Latest trade data',
 'Latest water withdrawal data',
 'Unnamed: 31']

### Typologie des données de chaque colonne du fichier

In [31]:
# List the dtype pandas inferred for every column of the country table.
# The row limit is lifted only inside this `with` block, so the session-wide
# 'display.max_rows' value is restored as soon as the cell finishes instead of
# staying unbounded for every later display.
with pd.option_context('display.max_rows', None):
    # Expressions inside a `with` block are not auto-rendered, hence display().
    display(data_EdStatsCountry.dtypes)

Country Code                                          object
Short Name                                            object
Table Name                                            object
Long Name                                             object
2-alpha code                                          object
Currency Unit                                         object
Special Notes                                         object
Region                                                object
Income Group                                          object
WB-2 code                                             object
National accounts base year                           object
National accounts reference year                     float64
SNA price valuation                                   object
Lending category                                      object
Other groups                                          object
System of National Accounts                           object
Alternative conversion f

### Afficher les informations générales sur le dataframe (nombre de valeurs non nulles + type de données de chaque colonne)

In [10]:
# Print the structural summary: index range, per-column non-null counts and dtypes.
data_EdStatsCountry.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 32 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   Country Code                                       241 non-null    object 
 1   Short Name                                         241 non-null    object 
 2   Table Name                                         241 non-null    object 
 3   Long Name                                          241 non-null    object 
 4   2-alpha code                                       238 non-null    object 
 5   Currency Unit                                      215 non-null    object 
 6   Special Notes                                      145 non-null    object 
 7   Region                                             214 non-null    object 
 8   Income Group                                       214 non-null    object 
 9   WB-2 code 

### Afficher le nombre de valeurs uniques pour chaque colonne

In [11]:
# Distinct-value count per column; missing values are left out of the count.
print(data_EdStatsCountry.nunique(axis=0, dropna=True))

Country Code                                         241
Short Name                                           241
Table Name                                           241
Long Name                                            241
2-alpha code                                         238
Currency Unit                                        152
Special Notes                                        131
Region                                                 7
Income Group                                           5
WB-2 code                                            240
National accounts base year                           43
National accounts reference year                      11
SNA price valuation                                    2
Lending category                                       3
Other groups                                           2
System of National Accounts                            3
Alternative conversion factor                         32
PPP survey year                

### Afficher les doublons

In [32]:
# Keep only the rows that repeat an earlier row on every column.
doublons = data_EdStatsCountry.loc[data_EdStatsCountry.duplicated(keep='first')]
# Header line, then the matching rows (an empty frame when nothing repeats).
print("Les lignes dupliquées sont :", doublons, sep="\n")

Les lignes dupliquées sont :
Empty DataFrame
Columns: [Country Code, Short Name, Table Name, Long Name, 2-alpha code, Currency Unit, Special Notes, Region, Income Group, WB-2 code, National accounts base year, National accounts reference year, SNA price valuation, Lending category, Other groups, System of National Accounts, Alternative conversion factor, PPP survey year, Balance of Payments Manual in use, External debt Reporting status, System of trade, Government Accounting concept, IMF data dissemination standard, Latest population census, Latest household survey, Source of most recent Income and expenditure data, Vital registration complete, Latest agricultural census, Latest industrial data, Latest trade data, Latest water withdrawal data, Unnamed: 31]
Index: []


### Afficher une section du fichier (les 10 premières lignes, toutes les colonnes)

In [33]:
# Preview the first 10 rows of the country table (every column is included).
data_EdStatsCountry.head(10)

Unnamed: 0,Country Code,Short Name,Table Name,Long Name,2-alpha code,Currency Unit,Special Notes,Region,Income Group,WB-2 code,National accounts base year,National accounts reference year,SNA price valuation,Lending category,Other groups,System of National Accounts,Alternative conversion factor,PPP survey year,Balance of Payments Manual in use,External debt Reporting status,System of trade,Government Accounting concept,IMF data dissemination standard,Latest population census,Latest household survey,Source of most recent Income and expenditure data,Vital registration complete,Latest agricultural census,Latest industrial data,Latest trade data,Latest water withdrawal data,Unnamed: 31
0,ABW,Aruba,Aruba,Aruba,AW,Aruban florin,SNA data for 2000-2011 are updated from official government statistics; 1994-1999 from UN databases. Base year has changed from 1995 to 2000.,Latin America & Caribbean,High income: nonOECD,AW,2000,,Value added at basic prices (VAB),,,Country uses the 1993 System of National Accounts methodology.,,,"IMF Balance of Payments Manual, 6th edition.",,Special trade system,,,2010,,,Yes,,,2012.0,,
1,AFG,Afghanistan,Afghanistan,Islamic State of Afghanistan,AF,Afghan afghani,Fiscal year end: March 20; reporting period for national accounts data: FY (from 2013 are CY). National accounts data are sourced from the IMF and differ from the Central Statistics Organization numbers due to exclusion of the opium economy.,South Asia,Low income,AF,2002/03,,Value added at basic prices (VAB),IDA,HIPC,Country uses the 1993 System of National Accounts methodology.,,,,Actual,General trade system,Consolidated central government,General Data Dissemination System (GDDS),1979,"Multiple Indicator Cluster Survey (MICS), 2010/11","Integrated household survey (IHS), 2008",,2013/14,,2012.0,2000.0,
2,AGO,Angola,Angola,People's Republic of Angola,AO,Angolan kwanza,"April 2013 database update: Based on IMF data, national accounts data were revised for 2000 onward; the base year changed to 2002.",Sub-Saharan Africa,Upper middle income,AO,2002,,Value added at producer prices (VAP),IBRD,,Country uses the 1993 System of National Accounts methodology.,1991–96,2005,"IMF Balance of Payments Manual, 6th edition.",Actual,Special trade system,Budgetary central government,General Data Dissemination System (GDDS),1970,"Malaria Indicator Survey (MIS), 2011","Integrated household survey (IHS), 2008",,2015,,,2005.0,
3,ALB,Albania,Albania,Republic of Albania,AL,Albanian lek,,Europe & Central Asia,Upper middle income,AL,Original chained constant price data are rescaled.,1996.0,Value added at basic prices (VAB),IBRD,,Country uses the 1993 System of National Accounts methodology.,,Rolling,"IMF Balance of Payments Manual, 6th edition.",Actual,General trade system,Budgetary central government,General Data Dissemination System (GDDS),2011,"Demographic and Health Survey (DHS), 2008/09","Living Standards Measurement Study Survey (LSMS), 2012",Yes,2012,2010.0,2012.0,2006.0,
4,AND,Andorra,Andorra,Principality of Andorra,AD,Euro,,Europe & Central Asia,High income: nonOECD,AD,1990,,,,,Country uses the 1968 System of National Accounts methodology.,,,,,Special trade system,,,2011. Population figures compiled from administrative registers.,,,Yes,,,2006.0,,
5,ARB,Arab World,Arab World,Arab World,1A,,Arab World aggregate. Arab World is composed of members of the League of Arab States.,,,1A,,,,,,,,,,,,,,,,,,,,,,
6,ARE,United Arab Emirates,United Arab Emirates,United Arab Emirates,AE,U.A.E. dirham,"April 2013 database update: Based on data from the National Bureau of Statistics, national accounts data were revised for 2001 onward; the base year changed to 2007.",Middle East & North Africa,High income: nonOECD,AE,2007,,Value added at producer prices (VAP),,,Country uses the 1993 System of National Accounts methodology.,,,"IMF Balance of Payments Manual, 6th edition.",,General trade system,Consolidated central government,General Data Dissemination System (GDDS),2010,,,,2012,,2011.0,2005.0,
7,ARG,Argentina,Argentina,Argentine Republic,AR,Argentine peso,,Latin America & Caribbean,Upper middle income,AR,2004,,Value added at basic prices (VAB),IBRD,,Country uses the 1993 System of National Accounts methodology.,1971–84,2005,"IMF Balance of Payments Manual, 6th edition.",Actual,Special trade system,Consolidated central government,Special Data Dissemination Standard (SDDS),2010,"Multiple Indicator Cluster Survey (MICS), 2011/12","Integrated household survey (IHS), 2012",Yes,2013,2002.0,2012.0,2000.0,
8,ARM,Armenia,Armenia,Republic of Armenia,AM,Armenian dram,,Europe & Central Asia,Lower middle income,AM,Original chained constant price data are rescaled.,1996.0,Value added at basic prices (VAB),IBRD,,Country uses the 1993 System of National Accounts methodology.,1990–95,2005,"IMF Balance of Payments Manual, 6th edition.",Actual,Special trade system,Consolidated central government,Special Data Dissemination Standard (SDDS),2011,"Demographic and Health Survey (DHS), 2010","Integrated household survey (IHS), 2012",Yes,2013/14,,2012.0,2007.0,
9,ASM,American Samoa,American Samoa,American Samoa,AS,U.S. dollar,,East Asia & Pacific,Upper middle income,AS,,,,,,Country uses the 1968 System of National Accounts methodology.,,,,,Special trade system,,,2010,,,Yes,2007,,,,


## Analyse du fichier

### 3. data_EdStatsCountriesSeries

### Dimension du fichier

In [15]:
# Dimensions of the country-series table as a (rows, columns) tuple.
data_EdStatsCountriesSeries.shape

(613, 4)

Le fichier data_EdStatsCountriesSeries contient 613 lignes et 4 colonnes.

### Liste des colonnes du fichier 

In [21]:
# Column labels of the country-series table as a plain Python list, in DataFrame order.
list(data_EdStatsCountriesSeries.columns)

['CountryCode', 'SeriesCode', 'DESCRIPTION', 'Unnamed: 3']

### Typologie des données de chaque colonne du fichier

In [22]:
# List the dtype pandas inferred for every column of the country-series table.
# The row limit is lifted only inside this `with` block, so the session-wide
# 'display.max_rows' value is restored as soon as the cell finishes instead of
# staying unbounded for every later display.
with pd.option_context('display.max_rows', None):
    # Expressions inside a `with` block are not auto-rendered, hence display().
    display(data_EdStatsCountriesSeries.dtypes)

CountryCode     object
SeriesCode      object
DESCRIPTION     object
Unnamed: 3     float64
dtype: object

### Afficher les informations générales sur le dataframe (nombre de valeurs non nulles + type de données de chaque colonne)

In [23]:
# Print the structural summary: index range, per-column non-null counts, dtypes, memory usage.
data_EdStatsCountriesSeries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 613 entries, 0 to 612
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   CountryCode  613 non-null    object 
 1   SeriesCode   613 non-null    object 
 2   DESCRIPTION  613 non-null    object 
 3   Unnamed: 3   0 non-null      float64
dtypes: float64(1), object(3)
memory usage: 19.3+ KB


### Afficher le nombre de valeurs uniques pour chaque colonne

In [24]:
# Distinct-value count per column; missing values are left out of the count.
print(data_EdStatsCountriesSeries.nunique(axis=0, dropna=True))

CountryCode    211
SeriesCode      21
DESCRIPTION     97
Unnamed: 3       0
dtype: int64


### Afficher les doublons

In [26]:
# Keep only the rows that repeat an earlier row on every column.
doublons = data_EdStatsCountriesSeries.loc[data_EdStatsCountriesSeries.duplicated(keep='first')]
# Header line, then the matching rows (an empty frame when nothing repeats).
print("Les lignes dupliquées sont :", doublons, sep="\n")

Les lignes dupliquées sont :
Empty DataFrame
Columns: [CountryCode, SeriesCode, DESCRIPTION, Unnamed: 3]
Index: []


### Afficher une section du fichier (les 10 premières lignes, toutes les colonnes)

In [27]:
# Preview the first 10 rows of the country-series table (every column is included).
data_EdStatsCountriesSeries.head(10)

Unnamed: 0,CountryCode,SeriesCode,DESCRIPTION,Unnamed: 3
0,ABW,SP.POP.TOTL,Data sources : United Nations World Population Prospects,
1,ABW,SP.POP.GROW,Data sources: United Nations World Population Prospects,
2,AFG,SP.POP.GROW,Data sources: United Nations World Population Prospects,
3,AFG,NY.GDP.PCAP.PP.CD,Estimates are based on regression.,
4,AFG,SP.POP.TOTL,Data sources : United Nations World Population Prospects,
5,AFG,NY.GDP.MKTP.PP.KD,Estimates are based on regression.,
6,AFG,NY.GNP.MKTP.PP.CD,Estimates are based on regression.,
7,AFG,NY.GDP.MKTP.PP.CD,Estimates are based on regression.,
8,AFG,NY.GDP.PCAP.PP.KD,Estimates are based on regression.,
9,AFG,NY.GNP.PCAP.PP.CD,Estimates are based on regression.,


## Analyse du fichier

### 4. data_EdStatsSeries

### Dimension du fichier

In [123]:
# Dimensions of the series (indicator metadata) table as a (rows, columns) tuple.
data_EdStatsSeries.shape

(3665, 21)

Le fichier data_EdStatsSeries contient 3665 lignes et 21 colonnes.

### Liste des colonnes du fichier 

In [124]:
# Column labels of the series table as a plain Python list, in DataFrame order.
list(data_EdStatsSeries.columns)

['Series Code',
 'Topic',
 'Indicator Name',
 'Short definition',
 'Long definition',
 'Unit of measure',
 'Periodicity',
 'Base Period',
 'Other notes',
 'Aggregation method',
 'Limitations and exceptions',
 'Notes from original source',
 'General comments',
 'Source',
 'Statistical concept and methodology',
 'Development relevance',
 'Related source links',
 'Other web links',
 'Related indicators',
 'License Type',
 'Unnamed: 20']

### Typologie des données de chaque colonne du fichier

In [125]:

# List the dtype pandas inferred for every column of the series table.
# The row limit is lifted only inside this `with` block, so the session-wide
# 'display.max_rows' value is restored as soon as the cell finishes instead of
# staying unbounded for every later display.
with pd.option_context('display.max_rows', None):
    # Expressions inside a `with` block are not auto-rendered, hence display().
    display(data_EdStatsSeries.dtypes)

Series Code                             object
Topic                                   object
Indicator Name                          object
Short definition                        object
Long definition                         object
Unit of measure                        float64
Periodicity                             object
Base Period                             object
Other notes                             object
Aggregation method                      object
Limitations and exceptions              object
Notes from original source             float64
General comments                        object
Source                                  object
Statistical concept and methodology     object
Development relevance                   object
Related source links                    object
Other web links                        float64
Related indicators                     float64
License Type                           float64
Unnamed: 20                            float64
dtype: object

### Afficher les informations générales sur le dataframe (nombre de valeurs non nulles + type de données de chaque colonne)

In [126]:

# Print the structural summary: index range, per-column non-null counts and dtypes.
data_EdStatsSeries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3665 entries, 0 to 3664
Data columns (total 21 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Series Code                          3665 non-null   object 
 1   Topic                                3665 non-null   object 
 2   Indicator Name                       3665 non-null   object 
 3   Short definition                     2156 non-null   object 
 4   Long definition                      3665 non-null   object 
 5   Unit of measure                      0 non-null      float64
 6   Periodicity                          99 non-null     object 
 7   Base Period                          314 non-null    object 
 8   Other notes                          552 non-null    object 
 9   Aggregation method                   47 non-null     object 
 10  Limitations and exceptions           14 non-null     object 
 11  Notes from original source    

### Afficher le nombre de valeurs uniques pour chaque colonne

In [127]:
# Distinct-value count per column; missing values are left out of the count.
print(data_EdStatsSeries.nunique(axis=0, dropna=True))

Series Code                            3665
Topic                                    37
Indicator Name                         3665
Short definition                       1169
Long definition                        2060
Unit of measure                           0
Periodicity                               1
Base Period                               4
Other notes                              14
Aggregation method                        3
Limitations and exceptions                9
Notes from original source                0
General comments                          8
Source                                   31
Statistical concept and methodology       2
Development relevance                     1
Related source links                      1
Other web links                           0
Related indicators                        0
License Type                              0
Unnamed: 20                               0
dtype: int64


### Afficher les doublons

In [128]:
# Keep only the rows that repeat an earlier row on every column.
doublons = data_EdStatsSeries.loc[data_EdStatsSeries.duplicated(keep='first')]
# Header line, then the matching rows (an empty frame when nothing repeats).
print("Les lignes dupliquées sont :", doublons, sep="\n")

Les lignes dupliquées sont :
Empty DataFrame
Columns: [Series Code, Topic, Indicator Name, Short definition, Long definition, Unit of measure, Periodicity, Base Period, Other notes, Aggregation method, Limitations and exceptions, Notes from original source, General comments, Source, Statistical concept and methodology, Development relevance, Related source links, Other web links, Related indicators, License Type, Unnamed: 20]
Index: []


### Afficher une section du fichier (les 10 premières lignes, toutes les colonnes)

In [129]:
# Preview the first 10 rows of the series table (every column is included).
data_EdStatsSeries.head(10)

Unnamed: 0,Series Code,Topic,Indicator Name,Short definition,Long definition,Unit of measure,Periodicity,Base Period,Other notes,Aggregation method,Limitations and exceptions,Notes from original source,General comments,Source,Statistical concept and methodology,Development relevance,Related source links,Other web links,Related indicators,License Type,Unnamed: 20
0,BAR.NOED.1519.FE.ZS,Attainment,Barro-Lee: Percentage of female population age 15-19 with no education,Percentage of female population age 15-19 with no education,Percentage of female population age 15-19 with no education,,,,,,,,,Robert J. Barro and Jong-Wha Lee: http://www.barrolee.com/,,,,,,,
1,BAR.NOED.1519.ZS,Attainment,Barro-Lee: Percentage of population age 15-19 with no education,Percentage of population age 15-19 with no education,Percentage of population age 15-19 with no education,,,,,,,,,Robert J. Barro and Jong-Wha Lee: http://www.barrolee.com/,,,,,,,
2,BAR.NOED.15UP.FE.ZS,Attainment,Barro-Lee: Percentage of female population age 15+ with no education,Percentage of female population age 15+ with no education,Percentage of female population age 15+ with no education,,,,,,,,,Robert J. Barro and Jong-Wha Lee: http://www.barrolee.com/,,,,,,,
3,BAR.NOED.15UP.ZS,Attainment,Barro-Lee: Percentage of population age 15+ with no education,Percentage of population age 15+ with no education,Percentage of population age 15+ with no education,,,,,,,,,Robert J. Barro and Jong-Wha Lee: http://www.barrolee.com/,,,,,,,
4,BAR.NOED.2024.FE.ZS,Attainment,Barro-Lee: Percentage of female population age 20-24 with no education,Percentage of female population age 20-24 with no education,Percentage of female population age 20-24 with no education,,,,,,,,,Robert J. Barro and Jong-Wha Lee: http://www.barrolee.com/,,,,,,,
5,BAR.NOED.2024.ZS,Attainment,Barro-Lee: Percentage of population age 20-24 with no education,Percentage of population age 20-24 with no education,Percentage of population age 20-24 with no education,,,,,,,,,Robert J. Barro and Jong-Wha Lee: http://www.barrolee.com/,,,,,,,
6,BAR.NOED.2529.FE.ZS,Attainment,Barro-Lee: Percentage of female population age 25-29 with no education,Percentage of female population age 25-29 with no education,Percentage of female population age 25-29 with no education,,,,,,,,,Robert J. Barro and Jong-Wha Lee: http://www.barrolee.com/,,,,,,,
7,BAR.NOED.2529.ZS,Attainment,Barro-Lee: Percentage of population age 25-29 with no education,Percentage of population age 25-29 with no education,Percentage of population age 25-29 with no education,,,,,,,,,Robert J. Barro and Jong-Wha Lee: http://www.barrolee.com/,,,,,,,
8,BAR.NOED.25UP.FE.ZS,Attainment,Barro-Lee: Percentage of female population age 25+ with no education,Percentage of female population age 25+ with no education,Percentage of female population age 25+ with no education,,,,,,,,,Robert J. Barro and Jong-Wha Lee: http://www.barrolee.com/,,,,,,,
9,BAR.NOED.25UP.ZS,Attainment,Barro-Lee: Percentage of population age 25+ with no education,Percentage of population age 25+ with no education,Percentage of population age 25+ with no education,,,,,,,,,Robert J. Barro and Jong-Wha Lee: http://www.barrolee.com/,,,,,,,


## Analyse du fichier

### 5. data_EdStatsFootNote

### Dimension du fichier

In [130]:
# Dimensions of the footnote table as a (rows, columns) tuple.
data_EdStatsFootNote.shape

(643638, 5)

Le fichier data_EdStatsFootNote contient 643 638 lignes et 5 colonnes.

### Liste des colonnes du fichier 

In [131]:
# Column labels of the footnote table as a plain Python list, in DataFrame order.
list(data_EdStatsFootNote.columns)

['CountryCode', 'SeriesCode', 'Year', 'DESCRIPTION', 'Unnamed: 4']

### Typologie des données de chaque colonne du fichier

In [132]:
# List the dtype pandas inferred for every column of the footnote table.
# The row limit is lifted only inside this `with` block, so the session-wide
# 'display.max_rows' value is restored as soon as the cell finishes instead of
# staying unbounded for every later display.
with pd.option_context('display.max_rows', None):
    # Expressions inside a `with` block are not auto-rendered, hence display().
    display(data_EdStatsFootNote.dtypes)

CountryCode     object
SeriesCode      object
Year            object
DESCRIPTION     object
Unnamed: 4     float64
dtype: object

### Afficher les informations générales sur le dataframe (nombre de valeurs non nulles + type de données de chaque colonne)

In [133]:
# Print the structural summary: index range, per-column non-null counts, dtypes, memory usage.
data_EdStatsFootNote.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 643638 entries, 0 to 643637
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   CountryCode  643638 non-null  object 
 1   SeriesCode   643638 non-null  object 
 2   Year         643638 non-null  object 
 3   DESCRIPTION  643638 non-null  object 
 4   Unnamed: 4   0 non-null       float64
dtypes: float64(1), object(4)
memory usage: 24.6+ MB


### Afficher le nombre de valeurs uniques pour chaque colonne

In [134]:
# Distinct-value count per column; missing values are left out of the count.
print(data_EdStatsFootNote.nunique(axis=0, dropna=True))

CountryCode     239
SeriesCode     1558
Year             56
DESCRIPTION    9102
Unnamed: 4        0
dtype: int64


### Afficher les doublons

In [135]:
# Keep only the rows that repeat an earlier row on every column.
doublons = data_EdStatsFootNote.loc[data_EdStatsFootNote.duplicated(keep='first')]
# Header line, then the matching rows (an empty frame when nothing repeats).
print("Les lignes dupliquées sont :", doublons, sep="\n")

Les lignes dupliquées sont :
Empty DataFrame
Columns: [CountryCode, SeriesCode, Year, DESCRIPTION, Unnamed: 4]
Index: []


### Afficher une section du fichier (les 10 premières lignes, toutes les colonnes)

In [136]:
# Preview the first 10 rows of the footnote table (every column is included).
data_EdStatsFootNote.head(10)

Unnamed: 0,CountryCode,SeriesCode,Year,DESCRIPTION,Unnamed: 4
0,ABW,SE.PRE.ENRL.FE,YR2001,Country estimation.,
1,ABW,SE.TER.TCHR.FE,YR2005,Country estimation.,
2,ABW,SE.PRE.TCHR.FE,YR2000,Country estimation.,
3,ABW,SE.SEC.ENRL.GC,YR2004,Country estimation.,
4,ABW,SE.PRE.TCHR,YR2006,Country estimation.,
5,ABW,SE.PRE.NENR,YR2000,Country estimation.,
6,ABW,SE.SEC.ENRL.VO.FE,YR2005,Country estimation.,
7,ABW,SE.SEC.ENRL.GC,YR2003,Country estimation.,
8,ABW,SE.PRM.TCHR.FE,YR1999,Country estimation.,
9,ABW,SE.PRE.TCHR.FE.ZS,YR2008,Country estimation.,


## Analyse du fichier

### 6. data_EdStatsData

### Dimension du fichier

In [137]:
# Dimensions of the main indicator-values table as a (rows, columns) tuple.
data_EdStatsData.shape

(886930, 70)

Le fichier data_EdStatsData contient 886 930 lignes et 70 colonnes.

### Liste des colonnes du fichier 

In [138]:
# Column labels of the indicator-values table as a plain Python list, in DataFrame order.
list(data_EdStatsData.columns)

['Country Name',
 'Country Code',
 'Indicator Name',
 'Indicator Code',
 '1970',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2020',
 '2025',
 '2030',
 '2035',
 '2040',
 '2045',
 '2050',
 '2055',
 '2060',
 '2065',
 '2070',
 '2075',
 '2080',
 '2085',
 '2090',
 '2095',
 '2100',
 'Unnamed: 69']

### Typologie des données de chaque colonne du fichier

In [34]:

# List the dtype pandas inferred for every column of the indicator-values table.
# The row limit is lifted only inside this `with` block, so the session-wide
# 'display.max_rows' value is restored as soon as the cell finishes instead of
# staying unbounded for every later display.
with pd.option_context('display.max_rows', None):
    # Expressions inside a `with` block are not auto-rendered, hence display().
    display(data_EdStatsData.dtypes)


Country Name       object
Country Code       object
Indicator Name     object
Indicator Code     object
1970              float64
1971              float64
1972              float64
1973              float64
1974              float64
1975              float64
1976              float64
1977              float64
1978              float64
1979              float64
1980              float64
1981              float64
1982              float64
1983              float64
1984              float64
1985              float64
1986              float64
1987              float64
1988              float64
1989              float64
1990              float64
1991              float64
1992              float64
1993              float64
1994              float64
1995              float64
1996              float64
1997              float64
1998              float64
1999              float64
2000              float64
2001              float64
2002              float64
2003              float64
2004        

### Afficher les informations générales sur le dataframe (nombre de valeurs non nulles + type de données de chaque colonne)

In [35]:

# Print the structural summary: index range, per-column non-null counts and dtypes.
data_EdStatsData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 886930 entries, 0 to 886929
Data columns (total 70 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Country Name    886930 non-null  object 
 1   Country Code    886930 non-null  object 
 2   Indicator Name  886930 non-null  object 
 3   Indicator Code  886930 non-null  object 
 4   1970            72288 non-null   float64
 5   1971            35537 non-null   float64
 6   1972            35619 non-null   float64
 7   1973            35545 non-null   float64
 8   1974            35730 non-null   float64
 9   1975            87306 non-null   float64
 10  1976            37483 non-null   float64
 11  1977            37574 non-null   float64
 12  1978            37576 non-null   float64
 13  1979            36809 non-null   float64
 14  1980            89122 non-null   float64
 15  1981            38777 non-null   float64
 16  1982            37511 non-null   float64
 17  1983      

### Afficher le nombre de valeurs uniques pour chaque colonne

In [141]:
# Distinct-value count per column; missing values are left out of the count.
print(data_EdStatsData.nunique(axis=0, dropna=True))

Country Name         242
Country Code         242
Indicator Name      3665
Indicator Code      3665
1970               24595
1971               30892
1972               30982
1973               30988
1974               31139
1975               37838
1976               32679
1977               32808
1978               32887
1979               32359
1980               39456
1981               34090
1982               32969
1983               33683
1984               33903
1985               41133
1986               34774
1987               34021
1988               34013
1989               33181
1990               72800
1991               67172
1992               68026
1993               68440
1994               69974
1995               79225
1996               69787
1997               66665
1998               77746
1999              104452
2000              115971
2001              108633
2002              108974
2003              114344
2004              113431
2005              123821


### Afficher les doublons

In [36]:
# Keep only the rows that repeat an earlier row on every column.
doublons = data_EdStatsData.loc[data_EdStatsData.duplicated(keep='first')]
# Header line, then the matching rows (an empty frame when nothing repeats).
print("Les lignes dupliquées sont :", doublons, sep="\n")

Les lignes dupliquées sont :
Empty DataFrame
Columns: [Country Name, Country Code, Indicator Name, Indicator Code, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2020, 2025, 2030, 2035, 2040, 2045, 2050, 2055, 2060, 2065, 2070, 2075, 2080, 2085, 2090, 2095, 2100, Unnamed: 69]
Index: []


### Afficher une section du fichier (les 10 premières lignes, toutes les colonnes)

In [37]:
# Preview the first 10 rows of the indicator-values table (every column is included).
data_EdStatsData.head(10)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1970,1971,1972,1973,1974,1975,1976,1977,1978,1979,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2020,2025,2030,2035,2040,2045,2050,2055,2060,2065,2070,2075,2080,2085,2090,2095,2100,Unnamed: 69
0,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, both sexes (%)",UIS.NERA.2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, female (%)",UIS.NERA.2.F,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, gender parity index (GPI)",UIS.NERA.2.GPI,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Arab World,ARB,"Adjusted net enrolment rate, lower secondary, male (%)",UIS.NERA.2.M,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,Arab World,ARB,"Adjusted net enrolment rate, primary, both sexes (%)",SE.PRM.TENR,54.82212,54.89414,56.20944,57.26711,57.99114,59.36554,60.99996,61.92268,62.69342,64.38319,65.61777,66.08515,66.60814,67.29045,68.51009,69.03321,69.94491,71.04187,71.69378,71.6991,71.99582,72.60284,70.03272,70.46482,72.64568,71.81176,73.90351,74.4252,75.11082,76.25432,77.24568,78.80052,80.0514,80.80539,81.60706,82.48949,82.68551,83.28034,84.01187,84.19596,85.212,85.24514,86.10167,85.51194,85.32015,,,,,,,,,,,,,,,,,,,,,
5,Arab World,ARB,"Adjusted net enrolment rate, primary, female (%)",SE.PRM.TENR.FE,43.3511,43.31815,44.6407,45.84572,46.4495,48.36389,50.04619,51.24528,52.24232,54.75437,56.48679,57.31466,58.22605,59.28923,60.74818,61.52087,62.73495,64.11588,65.09966,65.1292,65.82749,66.79703,63.26043,63.97211,67.03304,65.76156,68.7808,69.26705,70.43539,72.04729,73.27562,75.13298,76.64102,77.65358,78.4854,79.47577,79.60833,80.58242,81.4738,81.69569,82.87165,82.86139,84.40141,83.91403,83.82083,,,,,,,,,,,,,,,,,,,,,
6,Arab World,ARB,"Adjusted net enrolment rate, primary, gender parity index (GPI)",UIS.NERA.1.GPI,0.65857,0.6564,0.66329,0.67204,0.67261,0.69176,0.69995,0.71014,0.71843,0.74374,0.75944,0.76925,0.77986,0.79071,0.79961,0.80677,0.81606,0.82513,0.83419,0.83476,0.84466,0.85432,0.82649,0.83408,0.85909,0.84731,0.87266,0.87269,0.88494,0.89737,0.90406,0.91275,0.91979,0.9263,0.92768,0.93084,0.92962,0.93846,0.9425,0.94347,0.94762,0.9467,0.96208,0.96409,0.9662,,,,,,,,,,,,,,,,,,,,,
7,Arab World,ARB,"Adjusted net enrolment rate, primary, male (%)",SE.PRM.TENR.MA,65.82623,65.99358,67.30186,68.21908,69.05901,69.91455,71.49951,72.16206,72.71769,73.61997,74.37998,74.50713,74.66263,74.98183,75.97254,76.25549,76.87505,77.70386,78.03937,78.02189,77.93385,78.18749,76.5411,76.69742,78.02813,77.6119,78.81749,79.37204,79.59354,80.28753,81.05137,82.31505,83.32406,83.83223,84.60439,85.38029,85.63508,85.86669,86.44414,86.59069,87.45258,87.52652,87.72817,87.03988,86.75339,,,,,,,,,,,,,,,,,,,,,
8,Arab World,ARB,"Adjusted net enrolment rate, upper secondary, both sexes (%)",UIS.NERA.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9,Arab World,ARB,"Adjusted net enrolment rate, upper secondary, female (%)",UIS.NERA.3.F,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
