# Preprocessing 

In [295]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
import joblib
import matplotlib.pyplot as plt

In [296]:
# Lets assume the final dataset is the merged school and students dataset and this is the target Student_Performance_Score column, just to test the pipelines.
students_df = pd.read_csv('students.csv')
schools_df = pd.read_csv('school.csv')
teachers_df = pd.read_csv('teachers.csv')
activities_df = pd.read_csv('activities.csv')
classes_df = pd.read_csv('class.csv') 

In [297]:
schools_df.head()

Unnamed: 0,Student_ID,Teacher_Student_Ratio,Average_Teacher_Experience_Years,Average_Class_Size,School_Funding_Per_Student,School_Extracurricular_Activities,Parental_Involvement_Score,School_Facilities_Rating,Internet_Access_In_School,School_Distance_From_Home_km,Student_Attendance_Rate,Disciplinary_Actions_Taken,Student_Performance_Score
0,93810,21.85508,5,24,12984,Arts,6.225391,7.827782,True,0.628645,85.473338,1,31.435237
1,24592,21.85508,13,24,23862,Arts,9.678251,8.911485,False,3.563695,99.721658,1,42.479681
2,13278,21.85508,14,24,10914,Sports,7.359483,6.441049,False,1.757442,96.880077,0,86.561526
3,46048,21.85508,15,24,14890,Both,8.206922,8.574389,False,4.828292,80.27329,3,52.178405
4,42098,21.85508,16,24,24956,,7.664513,7.282488,False,3.64256,97.720041,1,72.796161


In [298]:
schools_df.head()

Unnamed: 0,Student_ID,Teacher_Student_Ratio,Average_Teacher_Experience_Years,Average_Class_Size,School_Funding_Per_Student,School_Extracurricular_Activities,Parental_Involvement_Score,School_Facilities_Rating,Internet_Access_In_School,School_Distance_From_Home_km,Student_Attendance_Rate,Disciplinary_Actions_Taken,Student_Performance_Score
0,93810,21.85508,5,24,12984,Arts,6.225391,7.827782,True,0.628645,85.473338,1,31.435237
1,24592,21.85508,13,24,23862,Arts,9.678251,8.911485,False,3.563695,99.721658,1,42.479681
2,13278,21.85508,14,24,10914,Sports,7.359483,6.441049,False,1.757442,96.880077,0,86.561526
3,46048,21.85508,15,24,14890,Both,8.206922,8.574389,False,4.828292,80.27329,3,52.178405
4,42098,21.85508,16,24,24956,,7.664513,7.282488,False,3.64256,97.720041,1,72.796161


In [299]:
teachers_df.head()

Unnamed: 0,Teacher_ID,Student_ID,Marital_Status,Education_Level,Gender,Age,Subject_Taught,Degree,Parental_Status,Teacher_Training,Distance_From_Home_to_School_km,Disability,Health_Issue,Resumption_Time,Have_Lesson_Note,Salary_NGN,Teaching_Experience_Years
0,8950,51681,Single,PhD,Male,35,Commerce,B.A. Religious Studies,No,Trained,18.56,,,18:28,Yes,107433,8
1,7954,18834,Widowed,M.Ed,Male,47,Education,B.Sc. Economics,No,Trained,28.68,,,20:58,No,216557,23
2,3336,81511,Single,M.Ed,Male,33,CRS,B.Sc. Financial Accounting,Yes,Not Trained,24.66,,,15:55,Yes,57479,26
3,4454,30056,Single,M.Ed,Female,28,Fine Arts,B.Sc. Physics,Yes,Trained,17.17,,,19:21,Yes,217837,11
4,9593,49384,Single,NCE,Male,27,English,B.Sc. Environmental Science,No,Not Trained,48.17,,,19:29,No,114184,17


In [300]:
activities_df.head()

Unnamed: 0,Activity_ID,Student_ID,Activity_Type,Activity_Category,Frequency_of_Participation,Duration_per_Session (Hours),Impact_on_Performance,Teacher_Supervisor,Parental_Support
0,407269,93810,Traditional Dance,Cultural,Monthly,2.38,Neutral,Angel Rojas,No
1,156962,93810,Chess,Sports,Daily,1.38,Negative,Raymond Roy,No
2,647155,93810,Scrabble,Sports,Bi-weekly,2.11,Positive,Kevin Ramos,No
3,531106,24592,Mathematics Competitions,Academic,Bi-weekly,2.46,Negative,Amber Richardson,No
4,768470,24592,Quiz Competition,Extracurricular,Daily,1.35,Positive,Alan Simpson,No


In [301]:
classes_df.head()

Unnamed: 0,Class_ID,Student_ID,Class_Level,Class_Section,Teacher_ID
0,1728,93810,SS2,A,8950
1,1728,24592,SS2,A,8950
2,1728,13278,SS2,A,8950
3,1728,46048,SS2,A,8950
4,1728,42098,SS2,A,8950


In [302]:
teachers_df.isnull().sum() 


Teacher_ID                          0
Student_ID                          0
Marital_Status                      0
Education_Level                     0
Gender                              0
Age                                 0
Subject_Taught                      0
Degree                              0
Parental_Status                     0
Teacher_Training                    0
Distance_From_Home_to_School_km     0
Disability                         90
Health_Issue                       53
Resumption_Time                     0
Have_Lesson_Note                    0
Salary_NGN                          0
Teaching_Experience_Years           0
dtype: int64

In [303]:
schools_df.isnull().sum()

Student_ID                              0
Teacher_Student_Ratio                   0
Average_Teacher_Experience_Years        0
Average_Class_Size                      0
School_Funding_Per_Student              0
School_Extracurricular_Activities    1217
Parental_Involvement_Score              0
School_Facilities_Rating                0
Internet_Access_In_School               0
School_Distance_From_Home_km            0
Student_Attendance_Rate                 0
Disciplinary_Actions_Taken              0
Student_Performance_Score               0
dtype: int64

In [304]:
classes_df.isnull().sum()

Class_ID         0
Student_ID       0
Class_Level      0
Class_Section    0
Teacher_ID       0
dtype: int64

In [305]:
activities_df.isnull().sum()

Activity_ID                     0
Student_ID                      0
Activity_Type                   0
Activity_Category               0
Frequency_of_Participation      0
Duration_per_Session (Hours)    0
Impact_on_Performance           0
Teacher_Supervisor              0
Parental_Support                0
dtype: int64

In [306]:
schools_df.shape, students_df.shape, activities_df.shape, teachers_df.shape, classes_df.shape

((5000, 13), (5000, 17), (14954, 9), (100, 17), (5000, 5))

In [307]:
schools_df.duplicated().sum(), students_df.duplicated().sum(), activities_df.duplicated().sum(), teachers_df.duplicated().sum(), classes_df.duplicated().sum()

(0, 0, 0, 0, 0)

In [308]:
matching_ids = students_df[students_df['Student_ID'].isin(schools_df['Student_ID'])]
num_matching_ids = matching_ids['Student_ID'].nunique() 
print(f"Number of matching Student_IDs: {num_matching_ids}")


Number of matching Student_IDs: 5000


In [309]:
non_matching_ids_schools = schools_df[~schools_df['Student_ID'].isin(students_df['Student_ID'])]
num_non_matching_schools = non_matching_ids_schools['Student_ID'].nunique()
print(f"Number of Student_IDs in schools_df not in students_df: {num_non_matching_schools}")


Number of Student_IDs in schools_df not in students_df: 0


In [310]:

merged_df = pd.merge(students_df, schools_df, how='inner', on='Student_ID')

In [311]:
merged_df.columns


Index(['Student_ID', 'First_Name', 'Last_Name', 'Gender', 'Date_of_Birth',
       'Admission_Date', 'Class_Section', 'Class_Level', 'Religion', 'Tribe',
       'State_of_Origin', 'Parent_Name', 'Parent_Occupation', 'Parent_Income',
       'Academic_Performance', 'Attendance_Rate', 'Special_Needs',
       'Teacher_Student_Ratio', 'Average_Teacher_Experience_Years',
       'Average_Class_Size', 'School_Funding_Per_Student',
       'School_Extracurricular_Activities', 'Parental_Involvement_Score',
       'School_Facilities_Rating', 'Internet_Access_In_School',
       'School_Distance_From_Home_km', 'Student_Attendance_Rate',
       'Disciplinary_Actions_Taken', 'Student_Performance_Score'],
      dtype='object')

In [312]:
merged_df= pd.merge(merged_df, activities_df, how='inner', on='Student_ID')
merged_df.shape 

(14954, 37)

In [313]:
merged = pd.merge(merged_df, teachers_df, on='Student_ID', how='outer')

In [314]:
merged.isnull().sum()

Student_ID                               0
First_Name                               0
Last_Name                                0
Gender_x                                 0
Date_of_Birth                            0
Admission_Date                           0
Class_Section                            0
Class_Level                              0
Religion                                 0
Tribe                                    0
State_of_Origin                          0
Parent_Name                              0
Parent_Occupation                        0
Parent_Income                            0
Academic_Performance                     0
Attendance_Rate                          0
Special_Needs                            0
Teacher_Student_Ratio                    0
Average_Teacher_Experience_Years         0
Average_Class_Size                       0
School_Funding_Per_Student               0
School_Extracurricular_Activities     3615
Parental_Involvement_Score               0
School_Faci

In [315]:
merged.duplicated().sum()

0

In [316]:
merged.head()

Unnamed: 0,Student_ID,First_Name,Last_Name,Gender_x,Date_of_Birth,Admission_Date,Class_Section,Class_Level,Religion,Tribe,...,Degree,Parental_Status,Teacher_Training,Distance_From_Home_to_School_km,Disability,Health_Issue,Resumption_Time,Have_Lesson_Note,Salary_NGN,Teaching_Experience_Years
0,93810,Adebayo,Kehinde,Male,1984-04-15,2017-04-23,B,SS1,Christianity,Yoruba,...,,,,,,,,,,
1,93810,Adebayo,Kehinde,Male,1984-04-15,2017-04-23,B,SS1,Christianity,Yoruba,...,,,,,,,,,,
2,93810,Adebayo,Kehinde,Male,1984-04-15,2017-04-23,B,SS1,Christianity,Yoruba,...,,,,,,,,,,
3,24592,Abiola,Tunde,Male,1985-12-23,2017-01-09,C,SS2,Islam,Yoruba,...,,,,,,,,,,
4,24592,Abiola,Tunde,Male,1985-12-23,2017-01-09,C,SS2,Islam,Yoruba,...,,,,,,,,,,


### Data Preprocessing Pipeline 

In [320]:


# Function to prepare the data (drop unnecessary columns and create new features)
def prepare_data(df):
    # Drop irrelevant columns
    df = df.drop(columns=['First_Name', 'Last_Name', 'Parent_Name', 'Student_ID', 'Teacher_ID', 'Activity_ID'], axis=1)
    
    # Create new columns: Age and Years_Since_Admission
    current_year = pd.Timestamp.now().year
    df['Age'] = current_year - pd.to_datetime(df['Date_of_Birth']).dt.year
    df['Years_Since_Admission'] = current_year - pd.to_datetime(df['Admission_Date']).dt.year
    
    # Drop columns related to Date of Birth and Admission Date
    df = df.drop(columns=['Date_of_Birth', 'Admission_Date'])
    
    return df

# Preprocessing pipeline function
def preprocessing_pipeline(numerical_features, categorical_features):
    # Pipeline for numerical features: Imputation and scaling
    numerical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', RobustScaler())
    ])
    
    # Pipeline for categorical features: Imputation and one-hot encoding
    categorical_pipeline = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # Column transformer to combine numerical and categorical pipelines
    preprocessor = ColumnTransformer(transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])
    
    return preprocessor

# Main preprocessing function
def preprocess_data(df, target_column):
    # Prepare the data
    df = prepare_data(df)
    
    # Separate features and target
    X = df.drop(columns=[target_column])
    y = df[target_column]
    
    # Get updated numerical and categorical features after cleaning
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
    
    # Split the data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Build the preprocessing pipeline
    preprocessor = preprocessing_pipeline(numerical_features, categorical_features)
    
    # Fit the preprocessor on training data and transform both train and test sets
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)
    
    return X_train_preprocessed, X_test_preprocessed, y_train, y_test, preprocessor

# Sample usage
# Assuming 'merged' is your DataFrame
target_column = 'Academic_Performance'

# Preprocess the data
X_train, X_test, y_train, y_test, preprocessor = preprocess_data(merged, target_column)

# Output the shape of the preprocessed train and test data
print(X_train.shape, X_test.shape)


(11966, 12031) (2992, 12031)


In [321]:
#Function to reduce dimensionality, if necessary
def apply_pca(X_train, X_test, n_components=100):
    pca = PCA(n_components=n_components, svd_solver='arpack')
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca

X_train_pca, X_test_pca = apply_pca(X_train, X_test, n_components=100)

print(X_train_pca.shape, X_test_pca.shape)


(11966, 100) (2992, 100)


In [324]:
# Function to get the names of the features after preprocessing 
#def get_feature_names(preprocessor, numerical_features, categorical_features):
    #numerical_names = numerical_features
    #categorical_transformer = preprocessor.named_transformers_['cat']
    #categorical_names = categorical_transformer['onehot'].get_feature_names_out(categorical_features)
    #all_feature_names = list(numerical_names) + list(categorical_names)
    
    #return all_feature_names

#X_train_dense = X_train.toarray()
#all_feature_names = get_feature_names(preprocessor, numerical_features, categorical_features)
#X_train_df = pd.DataFrame(X_train_dense, columns=all_feature_names)

#print(X_train_df.head())


### Model Train, Prediction and Evaluation Pipeline 

In [325]:
def initialize_models():
    models = {
        'RandomForest': RandomForestRegressor(n_estimators=100, random_state=42),
        'GradientBoosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
        'AdaBoost': AdaBoostRegressor(n_estimators=100, random_state=42),
        'ExtraTrees': ExtraTreesRegressor(n_estimators=100, random_state=42)
    }

    # Stacking Regressor (combines all models with Ridge as meta-learner)
    models['Stacking'] = StackingRegressor(
        estimators=[
            ('rf', models['RandomForest']),
            ('gb', models['GradientBoosting']),
            ('et', models['ExtraTrees']),
            ('ada', models['AdaBoost'])
        ],
        final_estimator=Ridge()
    )
    
    return models

# Function to train the models
def train_models(models, X_train, y_train):
    for name, model in models.items():
        print(f"Training {name}...")
        model.fit(X_train, y_train)

# Function to make predictions with the models
def make_predictions(models, X_test):
    predictions = {}
    for name, model in models.items():
        predictions[name] = model.predict(X_test)
    return predictions

# Function to evaluate the models using Mean Squared Error
def evaluate_models(models, y_test, predictions):
    mse_scores = {}
    for name, preds in predictions.items():
        mse = mean_squared_error(y_test, preds)
        mse_scores[name] = mse
        print(f"{name} MSE: {mse}")
    return mse_scores

# Main function to run the entire workflow
def run_ensemble_models(X_train, X_test, y_train, y_test):
    models = initialize_models()
    train_models(models, X_train, y_train)
    predictions = make_predictions(models, X_test)
    mse_scores = evaluate_models(models, y_test, predictions)
    
    return mse_scores



# Run the ensemble models and get MSE scores
mse_scores = run_ensemble_models(X_train_pca, X_test_pca, y_train, y_test)


Training RandomForest...
Training GradientBoosting...
Training AdaBoost...
Training ExtraTrees...
Training Stacking...
RandomForest MSE: 629.9104280748663
GradientBoosting MSE: 789.496643224742
AdaBoost MSE: 827.0017006357756
ExtraTrees MSE: 549.7643188502674
Stacking MSE: 411.95772714128043


### Tuning the best model according to the evaluation

In [None]:
#feature_importances = best_rf.feature_importances_
#importances_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
#importances_df = importances_df.sort_values(by='Importance', ascending=False)
#print(importances_df.head(10))

In [327]:

# Initialize stacking regressor
def initialize_stacking_model(base_models):
    stacking_model = StackingRegressor(
        estimators=[
            ('rf', base_models['RandomForest']),
            ('gb', base_models['GradientBoosting']),
            ('et', base_models['ExtraTrees']),
            ('ada', base_models['AdaBoost'])
        ],
        final_estimator=Ridge()  # Meta-learner
    )
    return stacking_model

# Function to initialize base models with 50 estimators
def initialize_models():
    models = {
        'RandomForest': RandomForestRegressor(n_estimators=30, random_state=42),
        'GradientBoosting': GradientBoostingRegressor(n_estimators=30, random_state=42),
        'AdaBoost': AdaBoostRegressor(n_estimators=30, random_state=42),
        'ExtraTrees': ExtraTreesRegressor(n_estimators=30, random_state=42)
    }
    return models

# Function to tune the Stacking Regressor
def tune_stacking_model(X_train, y_train):
    base_models = initialize_models()
    stacking_model = initialize_stacking_model(base_models)

    # Define the parameter grid for stacking regressor
    param_grid = {
        'final_estimator__alpha': [0.1, 1.0],  # Regularization parameter for Ridge
        'cv': [3]  # Use fewer cross-validation folds
    }

    # GridSearchCV to tune the Stacking Regressor
    grid_search = GridSearchCV(stacking_model, param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print("Best Hyperparameters for Stacking Regressor:", grid_search.best_params_)
    return grid_search.best_estimator_

# Function to make predictions with the best stacking model
def make_predictions(model, X_test):
    return model.predict(X_test)

# Function to evaluate the model using Mean Squared Error
def evaluate_model(y_test, preds):
    mse = mean_squared_error(y_test, preds)
    print(f"Stacking Regressor MSE: {mse}")
    return mse

# Function to save the best model
def save_best_model(model, filename):
    joblib.dump(model, filename)
    print(f"Model saved as {filename}")

# Main function to run the tuning process and evaluate the stacking model
def run_stacking_tuning(X_train, X_test, y_train, y_test):
    best_stacking_model = tune_stacking_model(X_train, y_train)
    
    # Save the best stacking model
    save_best_model(best_stacking_model, 'best_stacking_model.pkl')
    
    predictions = make_predictions(best_stacking_model, X_test)
    evaluate_model(y_test, predictions)

# Run the stacking tuning and evaluation
run_stacking_tuning(X_train_pca, X_test_pca, y_train, y_test)


Best Hyperparameters for Stacking Regressor: {'cv': 3, 'final_estimator__alpha': 1.0}
Model saved as best_stacking_model.pkl
Stacking Regressor MSE: 470.0417653979496


- Althogh the best model went up a bit but we are confident that tuning the model with necessary regularization parameter has been able to deal with any issue of overfitting, hence validating the fact that our model has a high probability of performing well in real life