In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

In [16]:
# Location of the CSV file (raw file hosted on GitHub)
url = (
    "https://raw.githubusercontent.com/Cfg-data/final-project/"
    "refs/heads/master/usable_notebooks/full_country_data.csv"
)

# Read the remote CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=url)

# Preview the top five rows (last expression, so the notebook renders it)
df.head(n=5)

Unnamed: 0,Year,Region/Country/Area,All staff compensation as % of total expenditure in public institutions (%),Basic access to computers by level of education: Lower secondary,Basic access to computers by level of education: Primary,Basic access to computers by level of education: Upper secondary,Capital expenditure as % of total expenditure in public institutions (%),Current expenditure other than staff compensation as % of total expenditure in public institutions (%),Gross enrollment ratio - Lower secondary level (female),Gross enrollment ratio - Lower secondary level (male),...,Students enrolled in upper secondary education (thousands),Ratio of girls to boys in lower secondary education,Ratio of girls to boys in primary education,Ratio of girls to boys in upper secondary education,Teachers at lower secondary level,Teachers at primary level,Teachers at upper secondary level,Teachers with minimum required qualifications at lower secondary level,Teachers with minimum required qualifications at primary level,Teachers with minimum required qualifications at upper secondary level
0,2005,40,72.1,0.0,0.0,0.0,4.7,23.2,101.5,102.7,...,382.0,0.99,0.99,0.92,0.0,0.0,0.0,0.0,0.0,0.0
1,2005,56,85.5,0.0,0.0,0.0,2.8,11.7,178.0,155.3,...,770.0,1.13,1.0,1.09,0.0,0.0,0.0,0.0,0.0,0.0
2,2005,100,63.2,0.0,0.0,0.0,10.1,26.7,122.5,130.6,...,373.0,0.94,0.99,0.98,0.0,0.0,0.0,0.0,0.0,0.0
3,2005,112,65.2,0.0,0.0,0.0,4.8,30.1,110.8,117.3,...,0.0,0.94,0.94,0.0,0.0,0.0,0.0,0.0,99.8,0.0
4,2005,196,76.9,0.0,0.0,0.0,13.4,9.8,95.1,94.4,...,32.0,1.01,1.0,1.05,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Build the feature matrix: remove the target ('Year') and the
# 'Region/Country/Area' identifier column.
X = df.drop(columns=['Region/Country/Area', 'Year'])

# 'Unnamed: 0' is the row index that was written into the CSV (0, 1, 2, ... in
# the preview above), not a measurement, so it must not be used as a feature.
# errors='ignore' keeps the cell working if the column is absent.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

# Fill missing values with each column's mean (SimpleImputer).
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test rows contribute to the means -- consider fitting on the training
# portion only.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Use 'Year' as the target for classification
y_class = df['Year']  # No encoding needed, already numeric

# Fraction of the rows used for training in each experiment
splits = [0.5, 0.6, 0.7, 0.8]

In [18]:
# For every training fraction in `splits`: split the data, scale the features
# with MinMaxScaler and with StandardScaler, fit a 5-nearest-neighbour
# classifier on each scaled version, and print the test accuracy.
for split in splits:
    # Round instead of truncating: (1 - 0.8) * 100 is 19.999999999999996 in
    # floating point, which int() turned into a misleading "80 / 19" label.
    train_pct = round(split * 100)
    test_pct = 100 - train_pct
    print(f"\nTrain-test split: {train_pct} / {test_pct}")

    # Train-test split for classification (target: Year, held in y_class);
    # random_state is fixed so the partition is reproducible.
    X_train, X_test, y_train_class, y_test_class = train_test_split(X_imputed, y_class, test_size=1-split, random_state=42)

    # Apply MinMaxScaler (fitted on the training rows only, then applied to the test rows)
    min_max_scaler = MinMaxScaler()
    X_train_minmax = min_max_scaler.fit_transform(X_train)
    X_test_minmax = min_max_scaler.transform(X_test)

    # Apply StandardScaler (fitted on the training rows only, then applied to the test rows)
    standard_scaler = StandardScaler()
    X_train_standard = standard_scaler.fit_transform(X_train)
    X_test_standard = standard_scaler.transform(X_test)

    # KNN Classifier (MinMaxScaler)
    knn_class_minmax = KNeighborsClassifier(n_neighbors=5, n_jobs=1)  # Ensure single-threaded execution
    knn_class_minmax.fit(X_train_minmax, y_train_class)
    y_pred_class_minmax = knn_class_minmax.predict(X_test_minmax)

    # Evaluate KNN Classifier (MinMaxScaler)
    accuracy_minmax = accuracy_score(y_test_class, y_pred_class_minmax)
    print(f"KNN Classifier Accuracy (MinMaxScaler): {accuracy_minmax * 100:.2f}%")

    # KNN Classifier (StandardScaler)
    knn_class_standard = KNeighborsClassifier(n_neighbors=5, n_jobs=1)  # Ensure single-threaded execution
    knn_class_standard.fit(X_train_standard, y_train_class)
    y_pred_class_standard = knn_class_standard.predict(X_test_standard)

    # Evaluate KNN Classifier (StandardScaler)
    accuracy_standard = accuracy_score(y_test_class, y_pred_class_standard)
    print(f"KNN Classifier Accuracy (StandardScaler): {accuracy_standard * 100:.2f}%")


Train-test split: 50 / 50
KNN Classifier Accuracy (MinMaxScaler): 35.71%
KNN Classifier Accuracy (StandardScaler): 28.57%

Train-test split: 60 / 40
KNN Classifier Accuracy (MinMaxScaler): 46.67%
KNN Classifier Accuracy (StandardScaler): 35.56%

Train-test split: 70 / 30
KNN Classifier Accuracy (MinMaxScaler): 52.94%
KNN Classifier Accuracy (StandardScaler): 38.24%

Train-test split: 80 / 19
KNN Classifier Accuracy (MinMaxScaler): 30.43%
KNN Classifier Accuracy (StandardScaler): 26.09%


In [19]:
# Build the feature matrix: remove the target ('Year') and the
# 'Region/Country/Area' identifier column.
X = df.drop(columns=['Region/Country/Area', 'Year'])

# 'Unnamed: 0' is the row index that was written into the CSV (0, 1, 2, ... in
# the preview above), not a measurement, so it must not be used as a feature.
# errors='ignore' keeps the cell working if the column is absent.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

# Fill missing values with each column's mean (SimpleImputer).
# NOTE(review): the imputer is fitted on all rows before the train/test split,
# so test rows contribute to the means -- consider fitting on the training
# portion only.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# 'Year' is the target. The regression cell that follows reads `y_regression`,
# which was not assigned in any cell, so a fresh-kernel run failed with a
# NameError; define it here. `y_class` is kept for the classification code.
y_class = df['Year']  # No encoding needed, already numeric
y_regression = df['Year']

# Fraction of the rows used for training in each experiment
splits = [0.5, 0.6, 0.7, 0.8]

In [20]:
# For every training fraction in `splits`: split the data, scale the features
# with MinMaxScaler and with StandardScaler, fit a 5-nearest-neighbour
# regressor on each scaled version, and print the test mean squared error.
# Requires `X_imputed`, `splits` and `y_regression` (the Year target) to have
# been defined by an earlier cell.
for split in splits:
    # Round instead of truncating: (1 - 0.8) * 100 is 19.999999999999996 in
    # floating point, which int() turned into a misleading "80 / 19" label.
    train_pct = round(split * 100)
    test_pct = 100 - train_pct
    print(f"\nTrain-test split: {train_pct} / {test_pct}")

    # Train-test split for regression (Year); random_state is fixed so the
    # partition is reproducible.
    X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_imputed, y_regression, test_size=1-split, random_state=42)

    # Apply MinMaxScaler (fitted on the training rows only, then applied to the test rows)
    min_max_scaler = MinMaxScaler()
    X_train_minmax = min_max_scaler.fit_transform(X_train_reg)
    X_test_minmax = min_max_scaler.transform(X_test_reg)

    # Apply StandardScaler (fitted on the training rows only, then applied to the test rows)
    standard_scaler = StandardScaler()
    X_train_standard = standard_scaler.fit_transform(X_train_reg)
    X_test_standard = standard_scaler.transform(X_test_reg)

    # KNN Regressor (MinMaxScaler)
    knn_reg_minmax = KNeighborsRegressor(n_neighbors=5, n_jobs=1)  # Ensure single-threaded execution
    knn_reg_minmax.fit(X_train_minmax, y_train_reg)
    y_pred_reg_minmax = knn_reg_minmax.predict(X_test_minmax)

    # Evaluate KNN Regressor (MinMaxScaler)
    mse_minmax = mean_squared_error(y_test_reg, y_pred_reg_minmax)
    print(f"KNN Regressor MSE (MinMaxScaler): {mse_minmax:.2f}")

    # KNN Regressor (StandardScaler)
    knn_reg_standard = KNeighborsRegressor(n_neighbors=5, n_jobs=1)  # Ensure single-threaded execution
    knn_reg_standard.fit(X_train_standard, y_train_reg)
    y_pred_reg_standard = knn_reg_standard.predict(X_test_standard)

    # Evaluate KNN Regressor (StandardScaler)
    mse_standard = mean_squared_error(y_test_reg, y_pred_reg_standard)
    print(f"KNN Regressor MSE (StandardScaler): {mse_standard:.2f}")


Train-test split: 50 / 50
KNN Regressor MSE (MinMaxScaler): 16.97
KNN Regressor MSE (StandardScaler): 17.07

Train-test split: 60 / 40
KNN Regressor MSE (MinMaxScaler): 17.93
KNN Regressor MSE (StandardScaler): 16.68

Train-test split: 70 / 30
KNN Regressor MSE (MinMaxScaler): 19.39
KNN Regressor MSE (StandardScaler): 19.42

Train-test split: 80 / 19
KNN Regressor MSE (MinMaxScaler): 24.90
KNN Regressor MSE (StandardScaler): 24.75
