In [None]:
app_files/
│
├── app.py                # Main script to run the Streamlit app
├── data_loader.py        # Handles loading data
├── pages/                # Folder containing different pages of the app
│   ├── opener.py          # Page that confirms the data has loaded and lists the data-source links
│   ├── introduction.py    # Project introduction page
│   ├── eda_results.py     # New page displaying full EDA (Exploratory Data Analysis) results
│   └── ml_results.py      # Results of Machine Learning implementation page
└── utils_/               # Folder for utility functions
    └── display.py        # Helper functions for displaying data

In [None]:
# data_loader.py

import pandas as pd
import streamlit as st

def load_data(
    data_url='https://raw.githubusercontent.com/Cfg-data/final-project/refs/heads/master/usable_notebooks/full_country_data.csv',
):
    """
    Load the full country dataset and return it as a DataFrame.

    Args:
        data_url: URL or path of the CSV file; it is handed unchanged to
            ``pandas.read_csv``. Defaults to the project's
            ``full_country_data.csv`` on GitHub, so existing ``load_data()``
            calls behave as before.

    Returns:
        The parsed DataFrame, or ``None`` when reading fails. A failure is
        reported in the app through ``st.error`` instead of being raised.
    """
    try:
        return pd.read_csv(data_url)
    except Exception as e:
        # Catch-all kept from the original: any read or parse failure is shown
        # in the UI and signalled to the caller with None.
        st.error(f"Error loading data: {e}")
        return None

def load_countries(
    countries_url='https://raw.githubusercontent.com/Cfg-data/final-project/master/data/merged/full_country_list.csv',
):
    """
    Load the country list dataset and return it as a DataFrame.

    Args:
        countries_url: URL or path of the CSV file; it is handed unchanged to
            ``pandas.read_csv``. Defaults to the project's
            ``full_country_list.csv`` on GitHub, so existing
            ``load_countries()`` calls behave as before.

    Returns:
        The parsed DataFrame, or ``None`` when reading fails. A failure is
        reported in the app through ``st.error`` instead of being raised.
    """
    try:
        return pd.read_csv(countries_url)
    except Exception as e:
        # Catch-all kept from the original: any read or parse failure is shown
        # in the UI and signalled to the caller with None.
        st.error(f"Error loading country list: {e}")
        return None

In [None]:
# app.py

import streamlit as st
from data_loader import load_data
from pages import opener, introduction, eda_results, ml_results

# Set the page title and layout
st.set_page_config(page_title="Education Project Overview", layout="wide")

# Load data for the entire app (None when loading failed)
data = load_data()

# Map each sidebar label to the callable that renders that page.
# ml_results.show() takes model metrics as its first argument, not the
# dataset: handing it the DataFrame made its `if model_results:` check raise
# ValueError (the truth value of a DataFrame is ambiguous). This script does
# not compute any metrics, so the page is given None and renders its
# "No results available." branch.
PAGES = {
    "Opener": lambda: opener.show(data),
    "Introduction": lambda: introduction.show(data),
    "EDA Results": lambda: eda_results.show(data),
    "ML Results": lambda: ml_results.show(None),
}

# Display a sidebar with navigation options (same labels, same order)
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", list(PAGES))

# Display selected page content
PAGES[page]()

In [None]:
# pages/opener.py

import streamlit as st
import pandas as pd

def show(data):
    """
    Render the opener page: load confirmation, dataset preview and sources.

    `data` is the loaded DataFrame, or None when loading failed. With a
    DataFrame the page shows its dimensions, its first rows and a
    user-selectable column view; with None it shows an error message. The
    dataset source links and the license note are rendered in both cases.
    """
    # Page header
    st.title("Welcome to the Education Project")
    st.write("### Data Loading Confirmation")

    if data is None:
        # Nothing to preview: report the failure and fall through to the sources
        st.error("There was an issue loading the data.")
    else:
        st.success("Data has been successfully loaded!")

        # Headline facts about the loaded frame
        st.write("### Collected Dataset Overview")
        st.write("The following dataset has been loaded:")
        row_count, column_count = data.shape
        st.write(f"Data contains {row_count} rows and {column_count} columns.")

        # First rows for a quick visual check
        st.write("#### Data Preview:")
        preview = data.head()
        st.dataframe(preview)

        # Short, static description of the dataset contents
        st.write("This dataset contains various statistics related to education, including data on public expenditure, teaching staff, and enrollment ratios.")

        # Let the user pick which columns to view; the first five are pre-selected
        all_columns = data.columns.tolist()
        chosen = st.multiselect("Select columns to display", all_columns, default=all_columns[:5])
        st.write("### Filtered Data Preview")
        st.dataframe(data[chosen])

    # Source datasets used to build the collected dataset
    st.write("### Collected Datasets")
    st.markdown("""
    1. **[Ratio of Girls to Boys in Education](https://data.un.org/_Docs/SYB/CSV/SYB67_319_202411_Ratio%20of%20girls%20to%20boys%20in%20education.csv)**
       - Dataset ID: SYB67_319_202411
       - Description: This dataset provides the ratio of girls to boys in education across different countries and regions.

    2. **[Public Expenditure on Education and Access to Computers](https://data.un.org/_Docs/SYB/CSV/SYB67_245_202411_Public%20expenditure%20on%20education%20and%20access%20to%20computers.csv)**
       - Dataset ID: SYB67_245_202411
       - Description: This dataset presents the public expenditure on education, along with data on the availability of computers in educational institutions.

    3. **[Teaching Staff in Education](https://data.un.org/_Docs/SYB/CSV/SYB67_323_202411_Teaching%20Staff%20in%20education.csv)**
       - Dataset ID: SYB67_323_202411
       - Description: This dataset outlines the number of teaching staff in the education sector across various countries and regions.

    4. **[Education Statistics](https://data.un.org/_Docs/SYB/CSV/SYB67_309_202411_Education.csv)**
       - Dataset ID: SYB67_309_202411
       - Description: This dataset provides a comprehensive overview of various education-related statistics, such as enrollment rates, graduation rates, and literacy rates.
    """)

    # Extra literacy datasets
    st.write("### Additional Datasets")
    st.markdown("""
    - **[Youth Literacy Rate, Population 15-24 Years (%)](https://data.un.org/Data.aspx?d=UNESCO&f=series%3aLR_AG15T24)**  
      Identified as UNdata_Export_20241213_140703208 in the files.

    - **[Youth Literacy Rate, Population 15-24 Years, Gender Parity Index (GPI)](https://data.un.org/Data.aspx?d=UNESCO&f=series%3aLR_AG15T24_GPI)**  
      Identified as UNdata_Export_20241213_140708283 in the files.
    """)

    # Licensing and attribution note
    st.write("### License")
    st.write("This project is based on publicly available data from UNDATA. Please refer to the [UN Data Usage Policy](https://data.un.org/Usage.aspx) for licensing and attribution information.")

In [None]:
# pages/introduction.py

import streamlit as st

def show(data):
    """
    Render the project introduction page.

    The page is static text: bootcamp/author identification, four narrative
    paragraphs describing the project, and links to the presentation and the
    planning boards. `data` is not used by this page.
    """
    # Narrative paragraphs, rendered in this order under the page title
    overview = """
        Education serves as a cornerstone for societal progress, yet the global landscape reveals significant disparities in both access to education and educational outcomes. 
        These inequalities are particularly evident when examining key factors such as literacy rates, gender parity in education, teacher qualifications, and resource allocation. 
        This research project is designed to analyze and present educational data from the United Nations Database (UNDATA) with the goal of understanding the complex factors influencing educational disparities worldwide. 
        The project is divided into two primary parts: one focused on literacy disparities using machine learning models, and the other on a statistical analysis of educational metrics such as gender ratios, teacher qualifications, and access to resources.
    """
    ml_part = """
        The first part of the project will apply machine learning techniques—specifically Linear Regression and Random Forest models—to identify patterns and relationships in global literacy data. 
        By examining literacy rates across different regions and correlating them with socio-economic, demographic, and policy variables, this analysis will provide insights into the global disparities in literacy.
    """
    statistics_part = """
        The second part of the project takes a deeper dive into the gender gap in education, the availability of qualified teachers, and the distribution of educational resources over time. 
        This will include a statistical analysis of the ratio of girls to boys at various levels of education, the number of teachers at each educational level, and the proportion of teachers with the required qualifications. 
        Additionally, the study will explore access to computers in education and analyze government expenditure on education as a percentage of total national expenditure, considering how these factors evolve year by year.
    """
    closing = """
        By integrating both machine learning models and statistical analysis, this project aims to offer a multifaceted view of the global educational landscape, revealing how gender, resources, and government investment influence educational outcomes. 
        Ultimately, the findings will contribute to a more nuanced understanding of the educational inequalities that persist across different regions and provide a foundation for policy recommendations to foster more equitable and effective education systems globally.
    """

    # Identification block
    for heading in (
        "### Data Analytics Bootcamp at IronHack",
        "#### Final Project developed by Cfg-Data on GitHub",
    ):
        st.write(heading)
    st.markdown("[GitHub Repository](https://github.com/Cfg-data/final-project)")

    # Project overview
    st.title("Project Introduction")
    for paragraph in (overview, ml_part, statistics_part, closing):
        st.write(paragraph)

    # Presentation link
    st.write("### Presentation can be found on Canva")
    st.markdown("[View the Presentation on Canva](https://www.canva.com/design/DAGZiN-LC9E/Qxz0INeyReYMsoqo9KbSFw/edit)")

    # Planning boards
    st.write("### Organization made using a Kanban Workspace")
    for board_link in (
        "[Trello Daily Task Planner](https://trello.com/b/MKJuGmnZ/final-project-daily-task-planner)",
        "[Trello Kanban Board](https://trello.com/b/JQXi189u/kanban-board-final-project)",
    ):
        st.markdown(board_link)

In [None]:
# pages/eda_results.py

import streamlit as st
import pandas as pd
from data_loader import load_data, load_countries

def show(data=None):
    """
    Render the Exploratory Data Analysis (EDA) results page.

    The page shows the country list (fetched here with ``load_countries()``),
    an overview of ``data`` when one is given, and a fixed written analysis.

    Args:
        data: The main dataset as a DataFrame, or ``None``. When ``None`` the
            "Data Overview" section is skipped.
    """
    # Title for the page
    st.title("Exploratory Data Analysis (EDA) Results")

    # load_countries() returns None on failure and has already reported the
    # error through st.error, so the section is simply skipped in that case
    # instead of crashing on `None.shape`.
    countries_df = load_countries()
    if countries_df is not None:
        st.write("### Countries List")
        st.write(f"Data contains {countries_df.shape[0]} rows and {countries_df.shape[1]} columns.")

        # Show the country list as a table
        st.dataframe(countries_df)

    # Display the data (if passed)
    if data is not None:
        st.write("### Data Overview")
        # Display basic data stats
        st.write(f"Data contains {data.shape[0]} rows and {data.shape[1]} columns.")

        # Show the first 10 rows of the dataset
        st.write("#### Data (Top 10 Rows)")
        st.dataframe(data.head(10))

        # Display the summary statistics table using describe()
        st.write("#### Summary Statistics")
        st.dataframe(data.describe())

        # Let the user choose which columns to view; the first 5 are pre-selected
        columns = data.columns.tolist()
        selected_columns = st.multiselect("Select columns to display", columns, default=columns[:5])
        st.dataframe(data[selected_columns])

    # Insert Analysis Text
    st.markdown("""
    ## Analysis Considering the Country Information:

    ### 1. Year
    The dataset spans from 2005 to 2022, with a concentration of data points around the early 2010s (average year: 2012.96). Given the countries involved (mostly high-income countries), it is likely that most data points come from stable educational systems, reflecting steady trends in education policy over this time period.
    The spread of years indicates that the dataset captures education-related trends during a relatively dynamic period, possibly influenced by events like the global financial crisis, which may have affected education budgets and policies.

    ### 2. Region/Country/Area
    - **High-Income Countries:** Many of the countries in this dataset are high-income countries in Europe (e.g., Germany, France, Sweden, United Kingdom, United States) and advanced economies in Asia (e.g., Japan, Republic of Korea).
    - **Data Gaps:** Several regions (such as Albania, Gibraltar, Montenegro, and Republic of Moldova) have fewer data points. This could indicate reporting challenges or differences in educational reporting standards in these countries.
    - **High Reporting Frequency:** Countries like China (mainland and Hong Kong SAR), Russia, India, and United States appear more frequently, possibly reflecting a broader availability of data or more comprehensive educational statistics reporting.

    ### 3. All Staff Compensation as % of Total Expenditure in Public Institutions (%)
    - **High Proportion in High-Income Countries:** The average percentage of staff compensation (50.77%) is likely to be high in more developed economies, where staff salaries are a substantial part of the education budget. Countries such as United States, United Kingdom, and Germany would likely have higher allocations toward staff compensation.
    - **Wide Variation:** The minimum value of 0% suggests some countries or regions with very low educational staff compensation relative to total expenditure, possibly reflecting developing or conflict-affected areas (e.g., Albania, Republic of Moldova).

    ### 4. Basic Access to Computers by Level of Education
    - **Access Gap:** The statistics for computer access (mean access at lower secondary is 33.3%, primary 44.6%, and upper secondary 34.1%) highlight an issue of digital inequality. High-income countries like United States, Germany, and Australia likely have near-complete access to computers, whereas other countries like Albania, Moldova, or Gibraltar may lag behind.
    - **Full Access vs. No Access:** The 25th percentile showing 0% access and the 75th percentile showing 100% access suggests a stark divide, with some regions offering complete access while others lack it entirely.

    ### 5. Capital Expenditure as % of Total Expenditure in Public Institutions (%)
    - **Mean Capital Expenditure (6.09%):** This is relatively low, and suggests many countries are focusing more on current expenditure (e.g., staff salaries) than on long-term investments in infrastructure.
    - **Developed Economies with Higher Capital Expenditure:** Countries like Germany, France, and United States may allocate more towards capital expenditure due to greater investment in educational infrastructure, whereas smaller or lower-income regions like Moldova or Albania may have lower capital expenditure allocations.

    ### 6. Current Expenditure Other Than Staff Compensation
    - **Operational Costs:** With a mean of 17.08%, this represents a moderate share of expenditure going toward educational operations outside staff salaries. Developed economies like United States, Germany, and United Kingdom likely have higher operational costs, while regions with smaller or developing economies may allocate less toward this category.

    ### 7. Gross Enrollment Ratios (GER) by Education Level
    - **High GERs in Primary and Secondary Education:** Countries like United States, United Kingdom, Germany, and France have near-universal enrollment, with GERs above 100% indicating education systems that accommodate students outside the typical age range.
    - **Slight Gender Differences:** Although there is a slight gender imbalance in enrollment at lower secondary and primary levels (favoring boys), the upper secondary level exhibits near-gender parity or a slight advantage for girls, which is consistent with trends seen in many developed nations.

    ### 8. Ratio of Girls to Boys in Education
    - **Gender Parity:** The ratio of girls to boys is close to 1 across all education levels. At upper secondary, there is a slight shift toward more girls being enrolled, which could be attributed to improved access and opportunities for girls in many developed countries, like Sweden, Finland, and Norway.
    - **Gender Imbalance in Some Regions:** Some countries with lower educational attainment or gender disparities may have ratios less favorable toward girls, particularly at lower secondary and primary levels.

    ### 9. Teachers by Education Level
    - **Disparities in Teacher Availability:** The large variation in teacher numbers, especially at lower and upper secondary levels, shows considerable disparities across regions. High-income countries like United States, Germany, and United Kingdom have more teachers in both primary and secondary education, while regions such as Albania, Moldova, or San Marino may have very low teacher counts or difficulty in data reporting.
    - **Teacher Shortages:** The mean number of teachers at the lower secondary level (6.63) and upper secondary level (1.16) is low, with many countries having zero data points, suggesting teacher shortages or underreporting in specific regions.

    ### 10. Teachers with Minimum Required Qualifications
    - **Qualified Teacher Proportion:** The data indicates that only a small fraction of teachers in certain countries meet the minimum qualification standards, particularly in lower secondary and upper secondary education. This may point to issues in teacher training, especially in developing or under-resourced countries like Albania, Republic of Moldova, and Montenegro.
    - **High Qualification Standards in Developed Countries:** Countries like Germany, United States, and Sweden are likely to have a higher percentage of teachers meeting qualification requirements, though even in these countries, some gaps may exist.

    ### Summary of Key Trends:
    - **Digital Divide:** There is a significant gap in access to computers, especially at the lower secondary and upper secondary levels, highlighting a digital divide between high-income and low-income countries.
    - **Investment in Education:** There is an apparent focus on current expenditures (e.g., staff salaries) rather than capital investments (e.g., infrastructure), with wide variations in spending priorities across regions.
    - **Teacher Availability and Qualification:** Teacher shortages and qualification gaps are evident, particularly at the secondary education levels, reflecting systemic issues in teacher training and recruitment, especially in lower-income regions.
    - **Gender Parity:** Generally, there is good gender parity in enrollment, though some countries show disparities, particularly at lower secondary and primary levels.
    - **Geographical Inequality:** The dataset highlights stark disparities between countries, with developed countries like those in Europe, United States, and Japan consistently showing higher levels of investment, qualified teachers, and enrollment rates.

    This detailed analysis suggests that education systems vary significantly across countries in terms of access to technology, teacher qualifications, expenditure priorities, and gender parity. High-income countries tend to have more stable educational systems, while low-income or smaller countries may face challenges in providing equal access to resources and qualified teachers.
    """)

In [None]:
# pages/ml_results.py

import streamlit as st

def show(model_results, feature_importance=None):
    """
    Render the Machine Learning results page.

    Static sections (models, features, hyperparameters, testing, conclusion)
    are always shown; the metric values come from ``model_results``.

    Args:
        model_results: Dict with a ``"linear_regression"`` and a
            ``"random_forest"`` entry, each a dict holding numeric ``"r2"``,
            ``"mae"`` and ``"mse"`` values. Any other value -- ``None``, an
            empty dict, or a non-dict object such as a DataFrame -- renders
            "No results available." instead.
        feature_importance: Optional data passed unchanged to
            ``st.bar_chart``; only drawn when results are available.

    Raises:
        KeyError: If ``model_results`` is a non-empty dict that lacks one of
            the expected model or metric keys.
    """
    st.title("Machine Learning Results")
    
    # Models Used Section
    st.write("### Models Used")
    st.write("""
    In this project, two machine learning models were used to analyze global literacy data and identify patterns and relationships:
    
    1. **Linear Regression**:
       - A linear model used to predict literacy rates based on various socio-economic, demographic, and policy features.
    
    2. **Random Forest**:
       - An ensemble learning method used to capture complex non-linear relationships between the features and the target variable (literacy rate).
    """)
    
    # Features Section
    st.write("### Features")
    st.write("""
    The following features were selected for the models:
    - **Country**: The country or region.
    - **GDP per capita**: The Gross Domestic Product per capita of each country.
    - **Education expenditure**: Government expenditure on education.
    - **Teacher qualification**: The percentage of teachers with the required qualifications.
    - **Gender Parity Index**: A measure of gender equality in education.
    - **Access to Computers**: The availability of computers in educational institutions.
    - **Youth Literacy Rate**: Literacy rate for the population aged 15-24.
    - **School Enrollment Rate**: The percentage of children enrolled in school.
    """)
    
    # Hyperparameters Section
    st.write("### Hyperparameters")
    st.write("""
    The following hyperparameters were used for model tuning:
    
    **Linear Regression**:
    - No hyperparameters to tune for Linear Regression (default settings).
    
    **Random Forest**:
    - `n_estimators`: Number of trees in the forest (set to 100).
    - `max_depth`: Maximum depth of each tree (set to 10).
    - `min_samples_split`: Minimum number of samples required to split an internal node (set to 2).
    - `min_samples_leaf`: Minimum number of samples required to be at a leaf node (set to 1).
    - `random_state`: Ensures reproducibility of results (set to 42).
    """)
    
    # Results Section
    st.write("### Results")

    # Only a non-empty dict counts as results. A plain truthiness test raises
    # ValueError for a pandas DataFrame (ambiguous truth value), which is what
    # app.py passes to this page.
    has_results = isinstance(model_results, dict) and bool(model_results)

    if has_results:
        st.write("#### Model Performance")
        st.write("""
        The models were evaluated using several metrics, such as Mean Absolute Error (MAE), Mean Squared Error (MSE), and R-squared (R²):
        """)

        # Display Model Results: same three metrics for each model, in order
        for model_key, model_label in (
            ("linear_regression", "Linear Regression"),
            ("random_forest", "Random Forest"),
        ):
            metrics = model_results[model_key]
            st.write(f"**{model_label} Performance**:")
            st.write(f"- **R²**: {metrics['r2']:.3f}")
            st.write(f"- **MAE**: {metrics['mae']:.3f}")
            st.write(f"- **MSE**: {metrics['mse']:.3f}")

        # If feature importance is available for Random Forest
        if feature_importance is not None:
            st.write("#### Feature Importance (Random Forest)")
            st.bar_chart(feature_importance)

    else:
        st.write("No results available.")
    
    # Testing Section
    st.write("### Testing")
    st.write("""
    The models were tested using a hold-out test set (20% of the data), and the performance was evaluated on the test set. The evaluation metrics include:
    - **R-squared**: Indicates how well the model explains the variance in the data.
    - **Mean Absolute Error (MAE)**: Measures the average magnitude of errors in predictions.
    - **Mean Squared Error (MSE)**: Measures the average squared differences between predicted and actual values.
    
    We performed cross-validation to ensure the robustness and generalizability of the models. The results indicate that the Random Forest model performs better in capturing non-linear relationships compared to Linear Regression.
    """)
    
    # Conclusion (optional)
    st.write("### Conclusion")
    st.write("""
    Based on the results, it is clear that the Random Forest model outperforms the Linear Regression model in terms of both R-squared and error metrics. 
    The feature importance analysis shows which factors most significantly impact literacy rates, with key drivers being access to education, public expenditure on education, and teacher qualifications.
    """)

In [None]:
# utils/display.py

import streamlit as st
import requests

def load_markdown_from_url(url, timeout=10):
    """
    Fetch the text content of a Markdown file from a URL.

    Args:
        url: Address of the Markdown file.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``. Without a timeout the request can block
            indefinitely and stall the page.

    Returns:
        The response body as text, or ``None`` when the request fails
        (connection error, timeout or bad HTTP status). Failures are reported
        in the app through ``st.error`` instead of being raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Will raise an error for bad status codes
    except requests.exceptions.RequestException as e:
        st.error(f"Error loading markdown content: {e}")
        return None
    return response.text

def display_dataframe(df, n_rows=5):
    """
    Display the top rows of a dataframe under a "Data Overview" heading.

    Args:
        df: DataFrame to show, or ``None``.
        n_rows: Number of leading rows to display, passed to ``df.head``.
            The default of 5 matches the previous ``df.head()`` behavior.

    When ``df`` is ``None`` an error message is shown instead.
    """
    if df is None:
        st.error("No data to display.")
        return

    st.write("### Data Overview")
    st.dataframe(df.head(n_rows))  # Display only the top rows of the dataframe