In [None]:
app_files/
│
├── app.py                # Main script to run the Streamlit app
├── data_loader.py        # Handles loading data
├── pages/                # Folder containing different pages of the app
│   ├── opener.py          # Page confirming that the data loaded; also lists the data-source links
│   ├── introduction.py          # Project introduction page
│   └── ml_results.py        # Page presenting the machine-learning results
└── utils/                # Folder for utility functions
    └── display.py        # Helper functions for displaying data

In [None]:
# data_loader.py

import pandas as pd
import streamlit as st

@st.cache_data(show_spinner=False)
def _fetch_datasets():
    """
    Download the three filtered UN Data CSV files and return them by name.

    Wrapped in ``st.cache_data`` so the three downloads are not repeated on
    every Streamlit script rerun. Errors from ``pd.read_csv`` are deliberately
    not handled here: they propagate to the caller, so a failed download
    produces no return value for the cache to keep.
    """
    dataset_urls = {
        'education_stats': 'https://raw.githubusercontent.com/Cfg-data/final-project/refs/heads/master/data/filtered/filtered_areas_totals_SYB67_309_202411.csv',
        'girls_boys_ratio': 'https://raw.githubusercontent.com/Cfg-data/final-project/refs/heads/master/data/filtered/filtered_areas_totals_SYB67_319_202411.csv',
        'teaching_staff': 'https://raw.githubusercontent.com/Cfg-data/final-project/refs/heads/master/data/filtered/filtered_areas_totals_SYB67_323_202411.csv',
    }
    return {name: pd.read_csv(url) for name, url in dataset_urls.items()}


def load_data():
    """
    Load and return the data from the three CSV files:
    - Education Statistics
    - Ratio of Girls to Boys in Education
    - Teaching Staff in Education

    The files are returned exactly as read by ``pd.read_csv``; no derived
    columns are computed here.

    Returns:
        A dict with the keys 'education_stats', 'girls_boys_ratio' and
        'teaching_staff', each mapped to the corresponding DataFrame, or
        ``None`` if any of the files could not be loaded (in which case the
        error is reported on the page with ``st.error``).
    """
    try:
        return _fetch_datasets()
    except Exception as e:
        # Page-level boundary: report the failure to the user instead of
        # crashing the app; callers check for None.
        st.error(f"Error loading data: {str(e)}")
        return None

In [None]:
#app.py

import streamlit as st
from data_loader import load_data
from pages import opener, introduction, ml_results

# Configure the browser-tab title and use the full page width.
st.set_page_config(page_title="Education Project Overview", layout="wide")

# Load the datasets; the same object is handed to whichever page is selected.
data = load_data()

# Sidebar label -> function that renders the corresponding page.
page_renderers = {
    "Opener": opener.show,
    "Introduction": introduction.show,
    "ML Results": ml_results.show,
}

# Sidebar navigation: one radio option per page, in the order declared above.
st.sidebar.title("Navigation")
page = st.sidebar.radio("Go to", list(page_renderers))

# Render the selected page.
# NOTE(review): ml_results.show is declared as show(model_results, ...), yet it
# receives the datasets dict like the other pages -- confirm what should be
# passed once model results exist.
render_page = page_renderers.get(page)
if render_page is not None:
    render_page(data)

In [None]:
# pages/opener.py

import streamlit as st

def show(data):
    """
    Render the opener page: load status, dataset source links, per-dataset
    sizes and the license note.

    Args:
        data: ``None`` when loading failed; otherwise a dict-like object whose
            ``items()`` yields ``(dataset_name, table)`` pairs, where each
            table supports ``len()`` and has a ``columns`` attribute.
    """
    st.title("Welcome to the Education Project")
    st.write("### Data Loading Confirmation")

    if data is None:
        st.error("There was an issue loading the data.")
    else:
        st.success("Data has been successfully loaded!")

        # Links to the original UNDATA CSV files.
        st.write("### Collected Datasets")
        st.write("The following datasets have been collected from UNDATA:")

        collected_md = """
        1. **[Ratio of Girls to Boys in Education](https://data.un.org/_Docs/SYB/CSV/SYB67_319_202411_Ratio%20of%20girls%20to%20boys%20in%20education.csv)**
           - Dataset ID: SYB67_319_202411
           - Description: This dataset provides the ratio of girls to boys in education across different countries and regions.

        2. **[Public Expenditure on Education and Access to Computers](https://data.un.org/_Docs/SYB/CSV/SYB67_245_202411_Public%20expenditure%20on%20education%20and%20access%20to%20computers.csv)**
           - Dataset ID: SYB67_245_202411
           - Description: This dataset presents the public expenditure on education, along with data on the availability of computers in educational institutions.

        3. **[Teaching Staff in Education](https://data.un.org/_Docs/SYB/CSV/SYB67_323_202411_Teaching%20Staff%20in%20education.csv)**
           - Dataset ID: SYB67_323_202411
           - Description: This dataset outlines the number of teaching staff in the education sector across various countries and regions.

        4. **[Education Statistics](https://data.un.org/_Docs/SYB/CSV/SYB67_309_202411_Education.csv)**
           - Dataset ID: SYB67_309_202411
           - Description: This dataset provides a comprehensive overview of various education-related statistics, such as enrollment rates, graduation rates, and literacy rates.
        """
        st.markdown(collected_md)

        st.write("### Additional Datasets")

        additional_md = """
        - **[Youth Literacy Rate, Population 15-24 Years (%)](https://data.un.org/Data.aspx?d=UNESCO&f=series%3aLR_AG15T24)**  
          Identified as UNdata_Export_20241213_140703208 in the files.

        - **[Youth Literacy Rate, Population 15-24 Years, Gender Parity Index (GPI)](https://data.un.org/Data.aspx?d=UNESCO&f=series%3aLR_AG15T24_GPI)**  
          Identified as UNdata_Export_20241213_140708283 in the files.
        """
        st.markdown(additional_md)

        # One heading plus a row/column count for every dataset that was passed in.
        for key, frame in data.items():
            heading = key.replace('_', ' ').title()
            st.write(f"### {heading}")
            row_count = len(frame)
            column_count = len(frame.columns)
            st.write(f"Data contains {row_count} rows and {column_count} columns.")

    # The license note is shown whether or not the data loaded.
    license_note = "This project is based on publicly available data from UNDATA. Please refer to the [UN Data Usage Policy](https://data.un.org/Usage.aspx) for licensing and attribution information."
    st.write("### License")
    st.write(license_note)

In [None]:
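# pages/ml_results.py (placeholder draft) -- NOTE(review): this cell was labelled pages/opener.py but renders the "Machine Learning Results" page; confirm the intended file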

import streamlit as st

def show(df):
    """
    Render the placeholder "Machine Learning Results" page.

    Until real model output is available, the page shows summary statistics
    (``describe()``) of the data it is given.

    Args:
        df: One of
            - ``None``: an error message is shown instead of statistics;
            - a single DataFrame: its ``describe()`` table is shown;
            - a dict mapping dataset names to DataFrames: one titled
              ``describe()`` table is shown per dataset. A dict has no
              ``describe()`` of its own, so it is handled explicitly.
    """
    st.title("Machine Learning Results")

    # Placeholder for displaying ML results (e.g., charts, metrics)
    st.write("### Results of the Machine Learning Analysis")

    if df is None:
        st.error("Data is unavailable for analysis.")
        return

    st.write("The machine learning model results will be displayed here.")
    # Example: Display a simple summary of the data
    st.write("Summary Statistics of the Data:")

    if isinstance(df, dict):
        for dataset_name, frame in df.items():
            st.write(f"#### {str(dataset_name).replace('_', ' ').title()}")
            st.dataframe(frame.describe())
    else:
        st.dataframe(df.describe())

In [None]:
# pages/introduction.py

import streamlit as st

def show(data):
    """
    Render the project introduction page: a title followed by four
    paragraphs of explanatory text.

    Args:
        data: Accepted so that every page shares the same ``show(data)``
            call shape; this page does not read it.
    """
    st.title("Project Introduction")

    # Paragraphs in display order: overview, machine-learning part,
    # statistical part, closing summary.
    paragraphs = (
        """
        Education serves as a cornerstone for societal progress, yet the global landscape reveals significant disparities in both access to education and educational outcomes. 
        These inequalities are particularly evident when examining key factors such as literacy rates, gender parity in education, teacher qualifications, and resource allocation. 
        This research project is designed to analyze and present educational data from the United Nations Database (UNDATA) with the goal of understanding the complex factors influencing educational disparities worldwide. 
        The project is divided into two primary parts: one focused on literacy disparities using machine learning models, and the other on a statistical analysis of educational metrics such as gender ratios, teacher qualifications, and access to resources.
    """,
        """
        The first part of the project will apply machine learning techniques—specifically Linear Regression and Random Forest models—to identify patterns and relationships in global literacy data. 
        By examining literacy rates across different regions and correlating them with socio-economic, demographic, and policy variables, this analysis will provide insights into the global disparities in literacy.
    """,
        """
        The second part of the project takes a deeper dive into the gender gap in education, the availability of qualified teachers, and the distribution of educational resources over time. 
        This will include a statistical analysis of the ratio of girls to boys at various levels of education, the number of teachers at each educational level, and the proportion of teachers with the required qualifications. 
        Additionally, the study will explore access to computers in education and analyze government expenditure on education as a percentage of total national expenditure, considering how these factors evolve year by year.
    """,
        """
        By integrating both machine learning models and statistical analysis, this project aims to offer a multifaceted view of the global educational landscape, revealing how gender, resources, and government investment influence educational outcomes. 
        Ultimately, the findings will contribute to a more nuanced understanding of the educational inequalities that persist across different regions and provide a foundation for policy recommendations to foster more equitable and effective education systems globally.
    """,
    )

    for paragraph in paragraphs:
        st.write(paragraph)

In [None]:
# pages/ml_results.py

import streamlit as st

# Models and metrics reported on this page, as (lookup key, display label)
# pairs in display order.
_MODELS = (
    ("linear_regression", "Linear Regression"),
    ("random_forest", "Random Forest"),
)
_METRICS = (
    ("r2", "R²"),
    ("mae", "MAE"),
    ("mse", "MSE"),
)


def _extract_metrics(model_results, model_key):
    """
    Return ``{metric_key: float}`` for one model, or ``None`` if unavailable.

    ``None`` is returned when ``model_results`` has no entry for
    ``model_key``, when the entry lacks one of the metric keys, or when a
    metric value cannot be converted to ``float``.
    """
    try:
        entry = model_results[model_key]
        return {metric_key: float(entry[metric_key]) for metric_key, _ in _METRICS}
    except (KeyError, IndexError, TypeError, ValueError):
        return None


def show(model_results, feature_importance=None):
    """
    Render the "Machine Learning Results" page.

    The page always shows the descriptive sections (models, features,
    hyperparameters, testing, conclusion). The performance numbers are shown
    only for models whose metrics can actually be read from
    ``model_results``; if none can, "No results available." is shown instead
    of raising ``KeyError`` on an unexpected input.

    Args:
        model_results: Expected shape is
            ``{'linear_regression': {'r2', 'mae', 'mse'},
               'random_forest': {'r2', 'mae', 'mse'}}`` with numeric values.
            ``None``, an empty value, or a mapping without those entries
            results in the "No results available." message.
        feature_importance: Optional data passed unchanged to
            ``st.bar_chart``; only drawn when performance numbers are shown.
    """
    st.title("Machine Learning Results")

    # Models Used Section
    st.write("### Models Used")
    st.write("""
    In this project, two machine learning models were used to analyze global literacy data and identify patterns and relationships:
    
    1. **Linear Regression**:
       - A linear model used to predict literacy rates based on various socio-economic, demographic, and policy features.
    
    2. **Random Forest**:
       - An ensemble learning method used to capture complex non-linear relationships between the features and the target variable (literacy rate).
    """)

    # Features Section
    st.write("### Features")
    st.write("""
    The following features were selected for the models:
    - **Country**: The country or region.
    - **GDP per capita**: The Gross Domestic Product per capita of each country.
    - **Education expenditure**: Government expenditure on education.
    - **Teacher qualification**: The percentage of teachers with the required qualifications.
    - **Gender Parity Index**: A measure of gender equality in education.
    - **Access to Computers**: The availability of computers in educational institutions.
    - **Youth Literacy Rate**: Literacy rate for the population aged 15-24.
    - **School Enrollment Rate**: The percentage of children enrolled in school.
    """)

    # Hyperparameters Section
    st.write("### Hyperparameters")
    st.write("""
    The following hyperparameters were used for model tuning:
    
    **Linear Regression**:
    - No hyperparameters to tune for Linear Regression (default settings).
    
    **Random Forest**:
    - `n_estimators`: Number of trees in the forest (set to 100).
    - `max_depth`: Maximum depth of each tree (set to 10).
    - `min_samples_split`: Minimum number of samples required to split an internal node (set to 2).
    - `min_samples_leaf`: Minimum number of samples required to be at a leaf node (set to 1).
    - `random_state`: Ensures reproducibility of results (set to 42).
    """)

    # Results Section
    st.write("### Results")

    # Collect the models whose metrics are complete, so a partial or
    # unrelated dict degrades to a message rather than an exception.
    reported = []
    if model_results:
        for model_key, model_label in _MODELS:
            metrics = _extract_metrics(model_results, model_key)
            if metrics is not None:
                reported.append((model_label, metrics))

    if reported:
        st.write("#### Model Performance")
        st.write("""
        The models were evaluated using several metrics, such as Mean Absolute Error (MAE), Mean Squared Error (MSE), and R-squared (R²):
        """)

        # Display Model Results
        for model_label, metrics in reported:
            st.write(f"**{model_label} Performance**:")
            for metric_key, metric_label in _METRICS:
                st.write(f"- **{metric_label}**: {metrics[metric_key]:.3f}")

        # If feature importance is available for Random Forest
        if feature_importance is not None:
            st.write("#### Feature Importance (Random Forest)")
            st.bar_chart(feature_importance)

    else:
        st.write("No results available.")

    # Testing Section
    st.write("### Testing")
    st.write("""
    The models were tested using a hold-out test set (20% of the data), and the performance was evaluated on the test set. The evaluation metrics include:
    - **R-squared**: Indicates how well the model explains the variance in the data.
    - **Mean Absolute Error (MAE)**: Measures the average magnitude of errors in predictions.
    - **Mean Squared Error (MSE)**: Measures the average squared differences between predicted and actual values.
    
    We performed cross-validation to ensure the robustness and generalizability of the models. The results indicate that the Random Forest model performs better in capturing non-linear relationships compared to Linear Regression.
    """)

    # Conclusion (optional)
    st.write("### Conclusion")
    st.write("""
    Based on the results, it is clear that the Random Forest model outperforms the Linear Regression model in terms of both R-squared and error metrics. 
    The feature importance analysis shows which factors most significantly impact literacy rates, with key drivers being access to education, public expenditure on education, and teacher qualifications.
    """)

In [None]:
# utils/display.py

import streamlit as st

def display_dataframe(df, max_rows=5):
    """
    Display the dataframe with the option to show only the top rows.

    Args:
        df: The DataFrame to display, or ``None``; for ``None`` an error
            message is shown instead of a table.
        max_rows: Number of leading rows to show. Defaults to 5, the same
            preview size as ``df.head()``. Pass ``None`` to display the
            whole dataframe.
    """
    if df is None:
        st.error("No data to display.")
        return

    st.write("### Data Overview")
    preview = df if max_rows is None else df.head(max_rows)
    st.dataframe(preview)