# Get and clean data

In [1]:
import pandas as pd

# load education data
data = pd.read_csv('Projet_Python_Dataset_Edstats_csv/EdStatsData.csv')

In [2]:
# Get last value and value year for each row using .apply and add it to dataframe
columns_num = pd.to_numeric(data.columns.to_numpy(), errors='coerce')

max_year = 2020
min_year = 2000
max_year_index = list(data.columns).index(str(max_year))

def get_last_element_value(row):
    row_numeric = pd.to_numeric(row.head(max_year_index).to_numpy(), errors='coerce')
    argmax = (~np.isnan(row_numeric)).cumsum(0).argmax(0)
    if argmax == 0 or columns_num[argmax] < min_year:
        return np.nan, np.nan
    else:
        return columns_num[argmax], row_numeric[argmax]

data['last_element_year'], data['last_element_value'] = zip(*data.apply( \
                            lambda row : get_last_element_value(row), axis = 1))

NameError: name 'np' is not defined

In [None]:
# Delete useless year-columns
data_light = data[\
        ["Country Code",\
         "Country Name",\
         "Indicator Code",\
         "Indicator Name",\
         "last_element_year",\
         "last_element_value"]\
        ].dropna(subset=['last_element_value'])

data_light

 # Analyze educational indicators

In [None]:
# List of education rate chosen indicators
educational_indicator = ['SE.SEC.NENR',
                           'SE.TER.ENRR']

# Filter data and keep data for selected indicators
data_edu = data_light[data_light["Indicator Code"].isin(educational_indicator)]

# Reshaped data to put selected indicator into columns
data_edu = data_edu.pivot(index='Country Name',\
                          columns='Indicator Code',\
                          values='last_element_value')

 # Analyze equipement indicators


In [None]:
# List of equipement rate chosen indicators
expenditure_indicator = ['IT.CMP.PCMP.P2',
                        'IT.NET.USER.P2']

# Filter data and keep data for selected indicators
data_exp = data_light[data_light["Indicator Code"].isin(expenditure_indicator)]

# Reshaped data to put selected indicator into columns
data_exp = data_exp.pivot(index='Country Name',
                          columns='Indicator Code',
                          values='last_element_value')

# Merge data education with language data
main_data = pd.merge(data_edu, data_exp, left_index=True, right_index=True)

# Get other indicators (GDP, market access difficulty)

## GDP

In available data, we can find this data : 
 - GDP per capita (current US$) : NY.GDP.PCAP.CD

In [None]:
# Get Population, GDP data
gdp_pop_data = data_light[\
                    data_light["Indicator Code"].isin(["NY.GDP.PCAP.CD"])]

# Delete useless year-columns
gdp_pop_data = gdp_pop_data[\
        ["Country Name",\
         "Indicator Code",\
         "last_element_value"]\
        ].dropna(subset=['last_element_value'])

# Reshaped data to put selected indicator into columns
gdp_pop_data = gdp_pop_data.pivot(index='Country Name',\
                                              columns='Indicator Code',\
                                              values='last_element_value')

# Merge with main data
main_data = pd.merge(main_data, gdp_pop_data, left_index=True, right_index=True)

## Market access difficulty
We look for an indicator correlated to the difficulty to access to country market
 - Checking the gouvernement system -> not correlated
 - if country language is one of this (English, French, German, Spanish, Portuguese, Italian), it will be easier to access to the market

In [None]:
import re

# Get table containing languages for each country 
# (source : https://en.wikipedia.org/wiki/List_of_official_languages_by_country_and_territory)
langague_data = pd.read_csv('countryLanguages.csv', sep=';', encoding="utf-8")

# Create list of preferred languages 
langagues_ranking = pd.read_csv('languagesRanking.csv', sep=';', encoding="utf-8")

language_list = langagues_ranking["Language"].str.cat(sep=" ")
language_list = ' '.join(language_list.split())

# Create column saying if country langages contain at list one of the preferred languages
langague_data["Language premium"] = langague_data["Languages"].apply(\
            lambda case: any(x in language_list for x in list(re.split(' |\(|\)', case))))

langague_data = langague_data.set_index('Country')
# Drop useless columns
langague_data.drop(columns=['Languages', 'Unnamed: 0'], inplace = True)

# Merge data education with language data
main_data = pd.merge(main_data, langague_data, left_index=True, right_index=True)

# Rename column for better understanding
main_data.columns = \
    pd.Index(["Enrolment rate, secondary",\
           "Enrolment rate, tertiary",\
           "Personal computers (per 100 people)",\
           "Internet users (per 100 people)",\
           "GDP per capita (current US$)",\
           "Language premium"
          ])

main_data

# Attractivity score

The indicators that influence attractivity score are :
- Has the country high Gross enrolement ratio in secondary?
- Has the country high Gross enrolement ratio in tertiary?
- Has the country high GDP?
- Has the country high Internet Internet users (per 100 people) ratio?
- Has the country high Personal computers (per 100 people) ratio
- Has a country premium language?

In [None]:
# Define function to normalize data
def normalize_data(col):
    col_numeric = pd.to_numeric(col.to_numpy(), errors='coerce')
    col_numeric_NoNan = col_numeric[~np.isnan(col_numeric)]
    maxV = np.max(col_numeric_NoNan.astype(float))
    minV = np.min(col_numeric_NoNan.astype(float))
    return (col_numeric.astype(float) - minV) / (maxV - minV)

# Normalize data
main_data_norm = main_data.apply(lambda col : normalize_data(col), 
                                 axis = 0)

# Get indicators average
indicator_mean = main_data_norm.mean(0)

# Replace NaN value by the indicator average to not desavantadge country with missing data
main_data_norm = main_data_norm.fillna(0) + np.isnan(main_data_norm) * indicator_mean

# Multiply country data by weighted matrix
attractivity_score = (main_data_norm) * np.array([1,1,1,1,1,1])

# Process attractivity score
attractivity_score["attractivity_score"] = attractivity_score.apply(lambda col : col.sum(), 
                                 axis = 1)

# Sort table according attractivity score
attractivity_score.sort_values("attractivity_score", ascending=False, inplace=True)

attractivity_score

# Vizualize France position in the selected data

In [None]:
import matplotlib.pyplot as plt
sns.set(style="darkgrid")
def plotBarWithFrance(dataframe, indicator):
    df = dataframe[~np.isnan(dataframe[indicator])]
    france_value = df.loc["France"][indicator]
    plt.figure(figsize=(16, 8))
    sns.histplot(data=df, x=indicator, kde=True, bins=30)
    
    plt.plot([france_value, france_value], [0, 20])
    
    plt.annotate("France",
                xy=(france_value, 20), 
                size=15, 
                xytext=(0, 8),
                textcoords='offset points')
    plt.show()



In [None]:
plotBarWithFrance(main_data, 'Enrolment rate, secondary')
plotBarWithFrance(main_data, "Enrolment rate, tertiary")
plotBarWithFrance(main_data, 'Personal computers (per 100 people)')
plotBarWithFrance(main_data, 'Internet users (per 100 people)')
plotBarWithFrance(main_data, 'GDP per capita (current US$)')
plotBarWithFrance(attractivity_score, 'attractivity_score')