In [None]:
# Capstone Project: Quality Education Analytics

This notebook performs data cleaning, exploratory data analysis (EDA), and clustering on Sustainable Development Goal 4 data related to quality education.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

sns.set(style="whitegrid")


In [3]:
# Load cleaned dataset
df = pd.read_csv("C:/Users/Hp/Desktop/27193 Final Project/education_data.csv")
df.head()

Unnamed: 0,STRUCTURE,STRUCTURE_ID,STRUCTURE_NAME,ACTION,FREQ,Frequency,INDICATOR,Indicator,GEO_PICT,Pacific Island Countries and territories,...,REPORTING_TYPE,Reporting type,NATURE,Nature,DATA_SOURCE,Data source,OBS_STATUS,Observation Status,OBS_COMMENT,Comment
0,DATAFLOW,SPC:DF_SDG_04(3.0),Sustainable Development Goal 04 - Quality Educ...,I,A,Annual,SE_GC_TCAQ,4.c.1 Trained teachers,WS,Samoa,...,G,Global,C,Country Data,UNESCO Institute of Statistics,,,,,
1,DATAFLOW,SPC:DF_SDG_04(3.0),Sustainable Development Goal 04 - Quality Educ...,I,A,Annual,SE_GC_TCAQ,4.c.1 Trained teachers,WS,Samoa,...,G,Global,C,Country Data,UNESCO Institute of Statistics,,,,,
2,DATAFLOW,SPC:DF_SDG_04(3.0),Sustainable Development Goal 04 - Quality Educ...,I,A,Annual,SE_GC_TCAQ,4.c.1 Trained teachers,WS,Samoa,...,G,Global,C,Country Data,UNESCO Institute of Statistics,,,,,
3,DATAFLOW,SPC:DF_SDG_04(3.0),Sustainable Development Goal 04 - Quality Educ...,I,A,Annual,SE_PRE_PARTN,4.2.2 Participation rate in organised learning,PG,Papua New Guinea,...,G,Global,C,Country Data,UNESCO Institute of Statistics,,,,,
4,DATAFLOW,SPC:DF_SDG_04(3.0),Sustainable Development Goal 04 - Quality Educ...,I,A,Annual,SE_PRE_PARTN,4.2.2 Participation rate in organised learning,PG,Papua New Guinea,...,G,Global,C,Country Data,UNESCO Institute for Statistics,,,,,


In [4]:
# Overview of data
print(df.info())
print(df.describe(include='all'))
print("Missing values:", df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2095 entries, 0 to 2094
Data columns (total 42 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   STRUCTURE                                 2095 non-null   object 
 1   STRUCTURE_ID                              2095 non-null   object 
 2   STRUCTURE_NAME                            2095 non-null   object 
 3   ACTION                                    2095 non-null   object 
 4   FREQ                                      2095 non-null   object 
 5   Frequency                                 2095 non-null   object 
 6   INDICATOR                                 2095 non-null   object 
 7   Indicator                                 2095 non-null   object 
 8   GEO_PICT                                  2095 non-null   object 
 9   Pacific Island Countries and territories  2095 non-null   object 
 10  SEX                                 

In [5]:
# Top countries by record count
top_locations = df['Location'].value_counts().head(10)
top_locations.plot(kind='barh', title='Top 10 Countries by Record Count')
plt.xlabel("Number of Records")
plt.show()

KeyError: 'Location'

In [7]:
# Example trend: Out-of-school rate over time
subset = df[df['Indicator'].str.contains("out-of-school", case=False)]

# Check the actual column names in the DataFrame
# print(subset.columns)  # Uncomment this line to see all column names

# Assuming the year column might be named differently, like 'TIME' or 'Date'
# Replace 'TIME' with the actual column name for year in your DataFrame
sns.lineplot(data=subset, x='TIME', y='Value', hue='Location')
plt.title("Out-of-School Rate Over Time")
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

ValueError: Could not interpret value `TIME` for `x`. An entry with this name does not appear in `data`.

In [8]:
# Pivot for clustering
pivot_df = df.pivot_table(index='Location', columns='Indicator', values='Value', aggfunc='mean')
pivot_df.fillna(0, inplace=True)

# Standardize
scaler = StandardScaler()
scaled_data = scaler.fit_transform(pivot_df)

# PCA
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

# KMeans
kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(scaled_data)

# Add results
pivot_df['Cluster'] = clusters
pivot_df['PCA1'] = pca_result[:, 0]
pivot_df['PCA2'] = pca_result[:, 1]

# Plot
sns.scatterplot(data=pivot_df, x='PCA1', y='PCA2', hue='Cluster', palette='Set2')
plt.title("KMeans Clustering of Countries (Education Indicators)")
plt.show()

KeyError: 'Value'

In [None]:
# Silhouette Score
score = silhouette_score(scaled_data, clusters)
print(f"Silhouette Score: {score:.2f}")

In [None]:
from sklearn.ensemble import IsolationForest

iso = IsolationForest(contamination=0.05, random_state=42)
pivot_df['Anomaly'] = iso.fit_predict(scaled_data)

# Show anomalous countries
print("Anomalous Countries:")
print(pivot_df[pivot_df['Anomaly'] == -1].index.tolist())