# Preprocessing 

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import GridSearchCV
import joblib
import matplotlib.pyplot as plt

In [3]:
# Load the five raw tables from CSV files in the working directory.
# NOTE(review): an earlier version of this comment named Student_Performance_Score
# as the target, but the pipeline below is run with 'Academic_Performance'.
students_df, schools_df, teachers_df, activities_df, classes_df = (
    pd.read_csv(csv_name)
    for csv_name in ('students.csv', 'school.csv', 'teachers.csv', 'activities.csv', 'class.csv')
)

In [4]:
# Preview the first rows of the school-level table.
schools_df.head()

Unnamed: 0,Student_ID,Teacher_Student_Ratio,Average_Teacher_Experience_Years,Average_Class_Size,School_Funding_Per_Student,School_Extracurricular_Activities,Parental_Involvement_Score,School_Facilities_Rating,Internet_Access_In_School,School_Distance_From_Home_km,Student_Attendance_Rate,Disciplinary_Actions_Taken,Student_Performance_Score
0,93810,21.85508,5,24,12984,Arts,6.225391,7.827782,True,0.628645,85.473338,1,31.435237
1,24592,21.85508,13,24,23862,Arts,9.678251,8.911485,False,3.563695,99.721658,1,42.479681
2,13278,21.85508,14,24,10914,Sports,7.359483,6.441049,False,1.757442,96.880077,0,86.561526
3,46048,21.85508,15,24,14890,Both,8.206922,8.574389,False,4.828292,80.27329,3,52.178405
4,42098,21.85508,16,24,24956,,7.664513,7.282488,False,3.64256,97.720041,1,72.796161


In [5]:
# Preview the first rows of the school-level table.
# NOTE(review): this repeats the previous cell verbatim, and students_df is never
# previewed anywhere in the notebook — possibly students_df.head() was intended.
schools_df.head()

Unnamed: 0,Student_ID,Teacher_Student_Ratio,Average_Teacher_Experience_Years,Average_Class_Size,School_Funding_Per_Student,School_Extracurricular_Activities,Parental_Involvement_Score,School_Facilities_Rating,Internet_Access_In_School,School_Distance_From_Home_km,Student_Attendance_Rate,Disciplinary_Actions_Taken,Student_Performance_Score
0,93810,21.85508,5,24,12984,Arts,6.225391,7.827782,True,0.628645,85.473338,1,31.435237
1,24592,21.85508,13,24,23862,Arts,9.678251,8.911485,False,3.563695,99.721658,1,42.479681
2,13278,21.85508,14,24,10914,Sports,7.359483,6.441049,False,1.757442,96.880077,0,86.561526
3,46048,21.85508,15,24,14890,Both,8.206922,8.574389,False,4.828292,80.27329,3,52.178405
4,42098,21.85508,16,24,24956,,7.664513,7.282488,False,3.64256,97.720041,1,72.796161


In [6]:
# Preview the first rows of the teacher table.
teachers_df.head()

Unnamed: 0,Teacher_ID,Student_ID,Marital_Status,Education_Level,Gender,Age,Subject_Taught,Degree,Parental_Status,Teacher_Training,Distance_From_Home_to_School_km,Disability,Health_Issue,Resumption_Time,Have_Lesson_Note,Salary_NGN,Teaching_Experience_Years
0,8950,51681,Single,PhD,Male,35,Commerce,B.A. Religious Studies,No,Trained,18.56,,,18:28,Yes,107433,8
1,7954,18834,Widowed,M.Ed,Male,47,Education,B.Sc. Economics,No,Trained,28.68,,,20:58,No,216557,23
2,3336,81511,Single,M.Ed,Male,33,CRS,B.Sc. Financial Accounting,Yes,Not Trained,24.66,,,15:55,Yes,57479,26
3,4454,30056,Single,M.Ed,Female,28,Fine Arts,B.Sc. Physics,Yes,Trained,17.17,,,19:21,Yes,217837,11
4,9593,49384,Single,NCE,Male,27,English,B.Sc. Environmental Science,No,Not Trained,48.17,,,19:29,No,114184,17


In [7]:
# Preview the first rows of the activity table (a student can appear on several rows).
activities_df.head()

Unnamed: 0,Activity_ID,Student_ID,Activity_Type,Activity_Category,Frequency_of_Participation,Duration_per_Session (Hours),Impact_on_Performance,Teacher_Supervisor,Parental_Support
0,407269,93810,Traditional Dance,Cultural,Monthly,2.38,Neutral,Angel Rojas,No
1,156962,93810,Chess,Sports,Daily,1.38,Negative,Raymond Roy,No
2,647155,93810,Scrabble,Sports,Bi-weekly,2.11,Positive,Kevin Ramos,No
3,531106,24592,Mathematics Competitions,Academic,Bi-weekly,2.46,Negative,Amber Richardson,No
4,768470,24592,Quiz Competition,Extracurricular,Daily,1.35,Positive,Alan Simpson,No


In [8]:
# Preview the first rows of the class table, which links Student_ID to Class_ID and Teacher_ID.
classes_df.head()

Unnamed: 0,Class_ID,Student_ID,Class_Level,Class_Section,Teacher_ID
0,1728,93810,SS2,A,8950
1,1728,24592,SS2,A,8950
2,1728,13278,SS2,A,8950
3,1728,46048,SS2,A,8950
4,1728,42098,SS2,A,8950


In [9]:
# Count missing values per column in the teacher table.
teachers_df.isnull().sum() 


Teacher_ID                          0
Student_ID                          0
Marital_Status                      0
Education_Level                     0
Gender                              0
Age                                 0
Subject_Taught                      0
Degree                              0
Parental_Status                     0
Teacher_Training                    0
Distance_From_Home_to_School_km     0
Disability                         90
Health_Issue                       53
Resumption_Time                     0
Have_Lesson_Note                    0
Salary_NGN                          0
Teaching_Experience_Years           0
dtype: int64

In [10]:
# Count missing values per column in the school-level table.
schools_df.isnull().sum()

Student_ID                              0
Teacher_Student_Ratio                   0
Average_Teacher_Experience_Years        0
Average_Class_Size                      0
School_Funding_Per_Student              0
School_Extracurricular_Activities    1217
Parental_Involvement_Score              0
School_Facilities_Rating                0
Internet_Access_In_School               0
School_Distance_From_Home_km            0
Student_Attendance_Rate                 0
Disciplinary_Actions_Taken              0
Student_Performance_Score               0
dtype: int64

In [11]:
# Count missing values per column in the class table.
classes_df.isnull().sum()

Class_ID         0
Student_ID       0
Class_Level      0
Class_Section    0
Teacher_ID       0
dtype: int64

In [12]:
# Count missing values per column in the activity table.
activities_df.isnull().sum()

Activity_ID                     0
Student_ID                      0
Activity_Type                   0
Activity_Category               0
Frequency_of_Participation      0
Duration_per_Session (Hours)    0
Impact_on_Performance           0
Teacher_Supervisor              0
Parental_Support                0
dtype: int64

In [13]:
# Compare (rows, columns) of the five tables to see which ones are one-row-per-student.
schools_df.shape, students_df.shape, activities_df.shape, teachers_df.shape, classes_df.shape

((5000, 13), (5000, 17), (14954, 9), (100, 17), (5000, 5))

In [14]:
# Count fully duplicated rows in each table.
schools_df.duplicated().sum(), students_df.duplicated().sum(), activities_df.duplicated().sum(), teachers_df.duplicated().sum(), classes_df.duplicated().sum()

(0, 0, 0, 0, 0)

In [15]:
# Keep the student rows whose Student_ID also occurs in the school-level table,
# then report how many distinct IDs the two tables share.
matching_ids = students_df.loc[students_df['Student_ID'].isin(schools_df['Student_ID'])]
num_matching_ids = matching_ids['Student_ID'].nunique()
print(f"Number of matching Student_IDs: {num_matching_ids}")


Number of matching Student_IDs: 5000


In [16]:
# Reverse check: school-level rows whose Student_ID has no counterpart in students_df.
non_matching_ids_schools = schools_df.loc[~schools_df['Student_ID'].isin(students_df['Student_ID'])]
num_non_matching_schools = non_matching_ids_schools['Student_ID'].nunique()
print(f"Number of Student_IDs in schools_df not in students_df: {num_non_matching_schools}")


Number of Student_IDs in schools_df not in students_df: 0


In [17]:

# Join student and school-level records on Student_ID; the two checks above
# found every Student_ID present in both tables.
merged_df = pd.merge(students_df, schools_df, how='inner', on='Student_ID')

In [18]:
# List the columns available after the student/school join.
merged_df.columns


Index(['Student_ID', 'First_Name', 'Last_Name', 'Gender', 'Date_of_Birth',
       'Admission_Date', 'Class_Section', 'Class_Level', 'Religion', 'Tribe',
       'State_of_Origin', 'Parent_Name', 'Parent_Occupation', 'Parent_Income',
       'Academic_Performance', 'Attendance_Rate', 'Special_Needs',
       'Teacher_Student_Ratio', 'Average_Teacher_Experience_Years',
       'Average_Class_Size', 'School_Funding_Per_Student',
       'School_Extracurricular_Activities', 'Parental_Involvement_Score',
       'School_Facilities_Rating', 'Internet_Access_In_School',
       'School_Distance_From_Home_km', 'Student_Attendance_Rate',
       'Disciplinary_Actions_Taken', 'Student_Performance_Score'],
      dtype='object')

In [19]:
# Attach activity records. activities_df holds several rows per student, so the
# result has one row per (student, activity) pair rather than one row per student.
merged_df= pd.merge(merged_df, activities_df, how='inner', on='Student_ID')
merged_df.shape 

(14954, 37)

In [20]:
# Attach teacher attributes by Student_ID, keeping unmatched rows from both sides
# (outer join). Columns present in both frames get pandas' default _x/_y suffixes.
# NOTE(review): classes_df maps Student_ID to Teacher_ID but is not used in any
# merge shown in this notebook — confirm that joining teachers directly on
# Student_ID is the intended link.
merged = pd.merge(merged_df, teachers_df, on='Student_ID', how='outer')

In [21]:
# Count missing values per column after all merges.
merged.isnull().sum()

Student_ID                               0
First_Name                               0
Last_Name                                0
Gender_x                                 0
Date_of_Birth                            0
Admission_Date                           0
Class_Section                            0
Class_Level                              0
Religion                                 0
Tribe                                    0
State_of_Origin                          0
Parent_Name                              0
Parent_Occupation                        0
Parent_Income                            0
Academic_Performance                     0
Attendance_Rate                          0
Special_Needs                            0
Teacher_Student_Ratio                    0
Average_Teacher_Experience_Years         0
Average_Class_Size                       0
School_Funding_Per_Student               0
School_Extracurricular_Activities     3615
Parental_Involvement_Score               0
School_Faci

In [22]:
# Count fully duplicated rows in the final merged frame.
merged.duplicated().sum()

0

In [23]:
# Preview the first rows of the final merged frame.
merged.head()

Unnamed: 0,Student_ID,First_Name,Last_Name,Gender_x,Date_of_Birth,Admission_Date,Class_Section,Class_Level,Religion,Tribe,...,Degree,Parental_Status,Teacher_Training,Distance_From_Home_to_School_km,Disability,Health_Issue,Resumption_Time,Have_Lesson_Note,Salary_NGN,Teaching_Experience_Years
0,93810,Adebayo,Kehinde,Male,1984-04-15,2017-04-23,B,SS1,Christianity,Yoruba,...,,,,,,,,,,
1,93810,Adebayo,Kehinde,Male,1984-04-15,2017-04-23,B,SS1,Christianity,Yoruba,...,,,,,,,,,,
2,93810,Adebayo,Kehinde,Male,1984-04-15,2017-04-23,B,SS1,Christianity,Yoruba,...,,,,,,,,,,
3,24592,Abiola,Tunde,Male,1985-12-23,2017-01-09,C,SS2,Islam,Yoruba,...,,,,,,,,,,
4,24592,Abiola,Tunde,Male,1985-12-23,2017-01-09,C,SS2,Islam,Yoruba,...,,,,,,,,,,


### Data Preprocessing Pipeline 

In [24]:


def prepare_data(df, reference_year=None):
    """
    Prepares the data by dropping identifier columns and deriving year-based features.

    The input frame is not modified; a new DataFrame is returned.

    Args:
        df (pandas.DataFrame): The input DataFrame. Must contain 'Date_of_Birth'
            and 'Admission_Date' columns that ``pd.to_datetime`` can parse.
        reference_year (int, optional): Year that 'Age' and
            'Years_Since_Admission' are measured against. Defaults to the
            current calendar year, which makes the result depend on the run
            date; pass a fixed year for reproducible features.

    Returns:
        pandas.DataFrame: The prepared DataFrame, without the identifier and
        date columns and with 'Age' and 'Years_Since_Admission' added.
    """

    # Drop name/identifier columns. Columns that are absent are skipped so the
    # function also works on frames that were never merged with every table.
    identifier_columns = ['First_Name', 'Last_Name', 'Parent_Name',
                          'Student_ID', 'Teacher_ID', 'Activity_ID']
    df = df.drop(columns=identifier_columns, errors='ignore')

    if reference_year is None:
        reference_year = pd.Timestamp.now().year

    # Create new features: Age and Years_Since_Admission (year difference only,
    # month and day are not taken into account).
    # NOTE(review): if df already has an 'Age' column (the teachers table in this
    # notebook has one), it is overwritten by the value derived from Date_of_Birth.
    df['Age'] = reference_year - pd.to_datetime(df['Date_of_Birth']).dt.year
    df['Years_Since_Admission'] = reference_year - pd.to_datetime(df['Admission_Date']).dt.year

    # The raw dates are no longer needed once the derived features exist.
    return df.drop(columns=['Date_of_Birth', 'Admission_Date'])


def preprocessing_pipeline(numerical_features, categorical_features):
    """
    Builds the (unfitted) column-wise preprocessing transformer.

    Numerical columns are mean-imputed and then scaled with RobustScaler;
    categorical columns are imputed with the most frequent value and then
    one-hot encoded, with categories unseen during fitting ignored.

    Args:
        numerical_features (list): Names of the numerical columns.
        categorical_features (list): Names of the categorical columns.

    Returns:
        sklearn.compose.ColumnTransformer: The combined preprocessing transformer.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', RobustScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    # One branch per column group, applied side by side.
    return ColumnTransformer(transformers=[
        ('num', Pipeline(steps=numeric_steps), numerical_features),
        ('cat', Pipeline(steps=categorical_steps), categorical_features),
    ])


def preprocess_data(df, target_column, n_components=45, random_state=42):
    """
    Prepares, splits, preprocesses and PCA-reduces the data, and saves the fitted transformers.

    Steps: ``prepare_data`` -> feature/target separation -> 80/20 train/test
    split -> preprocessing transformer fitted on the training split only ->
    PCA fitted on the training split only.

    Side effects: writes 'preprocessor.pkl' and 'pca_model.pkl' to the current
    working directory, overwriting existing files.

    Args:
        df (pandas.DataFrame): The input DataFrame (not modified).
        target_column (str): The name of the target column.
        n_components (int, optional): Number of principal components to keep.
            Defaults to 45.
        random_state (int, optional): Seed used for the train/test split and
            for the PCA solver, so repeated calls give the same result.
            Defaults to 42.

    Returns:
        tuple: ``(X_train_pca, X_test_pca, y_train, y_test, preprocessor, pca)``
        where the first two are the PCA-transformed feature matrices and the
        last two are the fitted preprocessing transformer and fitted PCA model.
    """

    # Prepare the data
    df_prepared = prepare_data(df.copy())  # Avoid modifying the original DataFrame

    # Separate features and target
    X = df_prepared.drop(columns=[target_column])
    y = df_prepared[target_column]

    # Get updated numerical and categorical features after cleaning.
    # NOTE(review): only int64/float64 columns count as numerical here. A column
    # of any other numeric dtype (e.g. int32) matches neither list and is dropped
    # by the ColumnTransformer's default remainder. Confirm that the features
    # engineered in prepare_data are not lost this way; widening the filter
    # changes which columns the saved preprocessor expects at inference time.
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

    # Split the data into train and test sets.
    # NOTE(review): the split is row-wise. If df holds several rows per student
    # (as the merged frame in this notebook does), rows of one student can land
    # in both splits — consider a group-aware split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # Build the preprocessing pipeline
    preprocessor = preprocessing_pipeline(numerical_features, categorical_features)

    # Fit the preprocessor on training data and transform both train and test sets
    X_train_preprocessed = preprocessor.fit_transform(X_train)
    X_test_preprocessed = preprocessor.transform(X_test)

    # Apply PCA. The ARPACK solver starts from a random vector, so it is seeded
    # to make the fitted components reproducible across calls.
    pca = PCA(n_components=n_components, svd_solver='arpack', random_state=random_state)
    X_train_pca = pca.fit_transform(X_train_preprocessed)
    X_test_pca = pca.transform(X_test_preprocessed)

    # Save the preprocessor
    with open('preprocessor.pkl', 'wb') as f:
        joblib.dump(preprocessor, f)

    # Save the PCA model
    with open('pca_model.pkl', 'wb') as f:
        joblib.dump(pca, f)

    return X_train_pca, X_test_pca, y_train, y_test, preprocessor, pca

# Sample usage: run the full preprocessing on the merged frame.
# (In a notebook __name__ is "__main__", so this guard always passes.)
if __name__ == "__main__":
    # Assuming 'merged' is your DataFrame
    target_column = 'Academic_Performance'

    # Preprocess the data (also writes preprocessor.pkl and pca_model.pkl)
    results = preprocess_data(merged, target_column)

    # Unpack the results into notebook-level names used by the modelling cells below
    X_train_pca, X_test_pca, y_train, y_test, preprocessor, pca = results

    # Print the shape of the transformed data
    print("Shape of X_train_pca:", X_train_pca.shape)
    print("Shape of X_test_pca:", X_test_pca.shape)

   


    

Shape of X_train_pca: (11966, 45)
Shape of X_test_pca: (2992, 45)


In [41]:
# Name the PCA components produced by the preprocessing cell above.
if __name__ == "__main__":
    # Reuse X_train_pca from the earlier preprocess_data call instead of calling
    # it again: a second call re-fits the preprocessor and PCA, overwrites
    # preprocessor.pkl / pca_model.pkl and rebinds X_train_pca / X_test_pca,
    # which can leave the saved transformers out of step with models that were
    # already trained on the first result.

    # Generate the names of the PCA components
    pca_feature_names = [f"PC{i+1}" for i in range(X_train_pca.shape[1])]

    # Get the count of the PCA components (which is the same as columns in X_train_pca)
    num_pca_components = len(pca_feature_names)

    # Print the PCA component names and their count
    print("PCA component names:")
    print(pca_feature_names)
    print(f"\nCount of PCA components (columns in X_train_pca and X_test_pca): {num_pca_components}")


PCA component names:
['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28', 'PC29', 'PC30', 'PC31', 'PC32', 'PC33', 'PC34', 'PC35', 'PC36', 'PC37', 'PC38', 'PC39', 'PC40', 'PC41', 'PC42', 'PC43', 'PC44', 'PC45']

Count of PCA components (columns in X_train_pca and X_test_pca): 45


### Model Train, Prediction and Evaluation Pipeline 

In [27]:
def initialize_models():
    """
    Creates the four tree-based regressors plus a stacking regressor built on them.

    Returns:
        dict: Maps model name to an unfitted estimator. The 'Stacking' entry is
        given the same four estimator objects as its base estimators and uses
        Ridge as the meta-learner.
    """
    shared_kwargs = {'n_estimators': 100, 'random_state': 42}
    forest = RandomForestRegressor(**shared_kwargs)
    boosting = GradientBoostingRegressor(**shared_kwargs)
    ada = AdaBoostRegressor(**shared_kwargs)
    extra_trees = ExtraTreesRegressor(**shared_kwargs)

    # Stacking Regressor (combines all models with Ridge as meta-learner)
    stack = StackingRegressor(
        estimators=[
            ('rf', forest),
            ('gb', boosting),
            ('et', extra_trees),
            ('ada', ada),
        ],
        final_estimator=Ridge(),
    )

    return {
        'RandomForest': forest,
        'GradientBoosting': boosting,
        'AdaBoost': ada,
        'ExtraTrees': extra_trees,
        'Stacking': stack,
    }

# Function to train the models
def train_models(models, X_train, y_train):
    """
    Fits every estimator in ``models`` in place, announcing each one first.

    Args:
        models (dict): Maps model name to an estimator exposing ``fit``.
        X_train: Training features.
        y_train: Training targets.
    """
    for name in models:
        print(f"Training {name}...")
        models[name].fit(X_train, y_train)

# Function to make predictions with the models
def make_predictions(models, X_test):
    """
    Collects the predictions of every fitted model on ``X_test``.

    Args:
        models (dict): Maps model name to a fitted estimator exposing ``predict``.
        X_test: Features to predict on.

    Returns:
        dict: Maps model name to that model's predictions, in the order of ``models``.
    """
    return {name: model.predict(X_test) for name, model in models.items()}

# Function to evaluate the models using Mean Squared Error
def evaluate_models(models, y_test, predictions):
    """
    Computes and prints the test MSE for each set of predictions.

    Args:
        models (dict): Not used by this function; kept in the signature for callers.
        y_test: True target values.
        predictions (dict): Maps model name to predicted values.

    Returns:
        dict: Maps model name to its mean squared error.
    """
    mse_scores = {}
    for name in predictions:
        mse_scores[name] = mean_squared_error(y_test, predictions[name])
        print(f"{name} MSE: {mse_scores[name]}")
    return mse_scores

# Main function to run the entire workflow
def run_ensemble_models(X_train, X_test, y_train, y_test):
    """
    Builds, fits and evaluates the ensemble models.

    Args:
        X_train, X_test: Training and test features.
        y_train, y_test: Training and test targets.

    Returns:
        dict: Maps model name to its test-set mean squared error.
    """
    fitted_models = initialize_models()
    train_models(fitted_models, X_train, y_train)
    return evaluate_models(
        fitted_models,
        y_test,
        make_predictions(fitted_models, X_test),
    )



# Run the ensemble models on the PCA-reduced features and get the test MSE per model
mse_scores = run_ensemble_models(X_train_pca, X_test_pca, y_train, y_test)


Training RandomForest...
Training GradientBoosting...
Training AdaBoost...
Training ExtraTrees...
Training Stacking...
RandomForest MSE: 673.2455480614973
GradientBoosting MSE: 801.6964960597873
AdaBoost MSE: 829.1000082746564
ExtraTrees MSE: 601.501670421123
Stacking MSE: 464.61414480455124


### Tuning the best model according to the evaluation

In [28]:
#feature_importances = best_rf.feature_importances_
#importances_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
#importances_df = importances_df.sort_values(by='Importance', ascending=False)
#print(importances_df.head(10))

In [30]:
from sklearn.linear_model import ElasticNet

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

# Initialize stacking regressor
def initialize_stacking_model(base_models):
    """
    Wraps four base regressors in a StackingRegressor with an ElasticNet meta-learner.

    Args:
        base_models (dict): Must contain the keys 'RandomForest',
            'GradientBoosting', 'ExtraTrees' and 'AdaBoost'.

    Returns:
        sklearn.ensemble.StackingRegressor: The unfitted stacking model.
    """
    label_to_key = (
        ('rf', 'RandomForest'),
        ('gb', 'GradientBoosting'),
        ('et', 'ExtraTrees'),
        ('ada', 'AdaBoost'),
    )
    return StackingRegressor(
        estimators=[(label, base_models[key]) for label, key in label_to_key],
        final_estimator=ElasticNet(),  # Meta-learner
    )

# Function to initialize base models
def initialize_models():
    """
    Creates the four base regressors (30 estimators each) used for stacking.

    Note: this redefines the ``initialize_models`` declared earlier in the
    notebook; unlike that version, it returns no 'Stacking' entry.

    Returns:
        dict: Maps model name to an unfitted estimator.
    """
    shared_kwargs = {'n_estimators': 30, 'random_state': 42}
    return {
        'RandomForest': RandomForestRegressor(**shared_kwargs),
        'GradientBoosting': GradientBoostingRegressor(**shared_kwargs),
        'AdaBoost': AdaBoostRegressor(**shared_kwargs),
        'ExtraTrees': ExtraTreesRegressor(**shared_kwargs),
    }

# Function to tune the Stacking Regressor
def tune_stacking_model(X_train, y_train):
    """
    Grid-searches the stacking regressor's meta-learner settings.

    The search uses 3-fold cross-validation scored by negative mean squared
    error, runs on all available cores, and prints the best parameters found.

    Args:
        X_train: Training features.
        y_train: Training targets.

    Returns:
        The best estimator found by the grid search (``best_estimator_``).
    """
    stacking_model = initialize_stacking_model(initialize_models())

    # Search space: ElasticNet regularisation strength and L1/L2 mix, plus the
    # StackingRegressor's own ``cv`` setting held at a single value.
    search_space = {
        'final_estimator__alpha': [0.1, 1.0],
        'final_estimator__l1_ratio': [0.2, 0.5],
        'cv': [3],
    }

    grid_search = GridSearchCV(
        estimator=stacking_model,
        param_grid=search_space,
        scoring='neg_mean_squared_error',
        cv=3,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    print("Best Hyperparameters for Stacking Regressor:", grid_search.best_params_)
    return grid_search.best_estimator_

# Function to make predictions with the best stacking model
def make_predictions(model, X_test):
    """
    Predicts on ``X_test`` with a single fitted model or a dict of fitted models.

    This definition replaces the earlier dict-based ``make_predictions`` once
    its cell has run, so it accepts both call styles to keep
    ``run_ensemble_models`` working afterwards.

    Args:
        model: A fitted estimator exposing ``predict``, or a dict mapping model
            name to such an estimator.
        X_test: Features to predict on.

    Returns:
        The estimator's predictions, or for a dict input, a dict mapping each
        model name to its predictions.
    """
    if isinstance(model, dict):
        return {name: estimator.predict(X_test) for name, estimator in model.items()}
    return model.predict(X_test)

# Function to evaluate the model using Mean Squared Error
def evaluate_model(y_test, preds):
    """
    Computes, prints and returns the mean squared error of ``preds``.

    Args:
        y_test: True target values.
        preds: Predicted values.

    Returns:
        The mean squared error as returned by ``mean_squared_error``.
    """
    mse = mean_squared_error(y_true=y_test, y_pred=preds)
    print(f"Stacking Regressor MSE: {mse}")
    return mse

# Function to save the best model
def save_best_model(model, filename):
    """
    Serialises ``model`` to ``filename`` with joblib and prints a confirmation.

    Args:
        model: The object to persist.
        filename: Destination path passed to ``joblib.dump``.
    """
    joblib.dump(value=model, filename=filename)
    print(f"Model saved as {filename}")

# Main function to run the tuning process and evaluate the stacking model
def run_stacking_tuning(X_train, X_test, y_train, y_test):
    """
    Tunes the stacking model, saves it, and prints its test-set MSE.

    The tuned model is written to 'best_stacking_model.pkl' before it is
    evaluated. Nothing is returned.

    Args:
        X_train, X_test: Training and test features.
        y_train, y_test: Training and test targets.
    """
    tuned_model = tune_stacking_model(X_train, y_train)
    save_best_model(tuned_model, 'best_stacking_model.pkl')
    evaluate_model(y_test, make_predictions(tuned_model, X_test))

# Example usage (Assuming you have preprocessed X_train_pca, X_test_pca, y_train, y_test)
# In a notebook __name__ is "__main__", so this guard always passes.
if __name__ == "__main__":
    # Relies on X_train_pca, X_test_pca, y_train, y_test from the preprocessing cell
    run_stacking_tuning(X_train_pca, X_test_pca, y_train, y_test)


Best Hyperparameters for Stacking Regressor: {'cv': 3, 'final_estimator__alpha': 0.1, 'final_estimator__l1_ratio': 0.2}
Model saved as best_stacking_model.pkl
Stacking Regressor MSE: 533.7666229890314


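- Although the test MSE of the tuned stacking model went up a bit (533.77 vs. 464.61 for the untuned stack), we believe that tuning the meta-learner with an appropriate regularization parameter helps guard against overfitting, which supports our expectation that the model will perform well on unseen, real-world data. Note that the tuned stack also uses 30 instead of 100 trees per base model, so the two scores are not directly comparable.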

In [31]:
# Display the PCA-transformed test matrix (NumPy abbreviates the printed array).
X_test_pca 

array([[ 3.82120153e-06, -2.39235181e-02,  1.39024016e-01, ...,
         7.96643927e-01,  8.01567382e-01,  8.26457375e-02],
       [-2.27022518e-06,  2.29470891e-02, -6.99578381e-02, ...,
        -1.38224856e-01,  2.51538177e-02, -1.94839709e-02],
       [ 2.63288901e-06, -4.46337112e-02,  7.56131906e-02, ...,
         3.78909106e-02, -3.92807316e-02, -4.73467014e-02],
       ...,
       [ 6.77094524e-06,  2.80818977e-02, -6.62844733e-02, ...,
        -2.64941388e-01, -6.79276650e-02, -5.63506168e-02],
       [ 6.63510501e-06,  3.81176293e-02, -5.81841606e-02, ...,
        -6.18692204e-03,  6.76912840e-02,  3.87983676e-02],
       [-3.48929254e-06,  8.16205463e-03,  3.30696569e-02, ...,
         2.04158934e-02, -3.62430353e-03,  3.13375133e-02]])

In [44]:
import warnings

# Suppress warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, not only those raised by the calls below.
warnings.filterwarnings('ignore')

# Load the saved model, preprocessor, and PCA model
# (file names match those written by save_best_model and preprocess_data above).
# NOTE(review): joblib.load unpickles arbitrary objects — only load files from a trusted source.
model = joblib.load('best_stacking_model.pkl') 
preprocessor = joblib.load('preprocessor.pkl')  
pca_model = joblib.load('pca_model.pkl')  # Load the PCA model

# Define the input features as a dictionary
# Every value is a one-element list, so the dictionary describes a single input row.
# NOTE(review): many keys in the first group (e.g. "Category", "Ethnicity",
# "Exam_Score", "Has_Car", "Study_Hours") do not match any column of the source
# tables shown earlier in this notebook, while most keys that do match are set
# to None in the second group. There is also no "Years_Since_Admission" entry,
# although prepare_data creates that column. Confirm this row against the
# columns the preprocessor was actually fitted on.
features_dict = {
    # --- Keys with explicit values ---
    "Gender": ["Female"],
    "Category": ["A"],
    "Education_Level": ["SS1"],
    "Religion": ["Christianity"],
    "Ethnicity": ["Igbo"],
    "State_of_Origin": ["Anambra"],
    "Occupation": ["Trader"],
    "Income": [50000],
    "Exam_Score": [85.5],
    "Has_Credit": ["No"],
    "Age": [15],
    "Dependents": [10],
    "Years_of_Work_Experience": [30],
    "Monthly_Expense": [2000],
    "Owns_House": ["Yes"],
    "Family_Size": [8],
    "Children": [7],
    "Has_Car": ["Yes"],
    "Rooms": [5],
    "Last_Exam_Score": [90.2],
    "Fail_Count": [1],
    "Study_Hours": [80.0],
    "Favorite_Sport": ["Sports"],
    "Learning_Type": ["Physical"],
    "Exercise_Frequency": ["Weekly"],
    "Days_Exercised": [5],
    "Mood": ["Positive"],
    "Has_Friends": ["Yes"],
    "Family_Strength": ["Strong"],
    "Marital_Status": ["Married"],
    "Degree": ["Bachelor's Degree"],
    "Father's_Gender": ["Male"],
    "Father's_Age": [16],
    "Subject": ["Mathematics"],
    "Parent_Degree": ["PhD"],
    "Parent_Paid_Tuition": ["Yes"],
    "Parent_Available": ["Yes"],
    "Household_Size": [10],
    "Household_Earnings": ["No"],
    "Has_Electricity": ["No"],
    "School_Start_Time": ["8:00 AM"],
    "Living_With_Parents": ["Yes"],
    "Income_Limit": [50000],
    "Homework_Hours": [12],
    "Test_Score": [5],
    # --- Keys left empty (None) ---
    "Disciplinary_Actions_Taken": [None],
    "Student_Attendance_Rate": [None],
    "Health_Issue": [None],
    "Distance_From_Home_to_School_km": [None],
    "Parental_Support": [None],
    "Have_Lesson_Note": [None],
    "Teacher_Training": [None],
    "Salary_NGN": [None],
    "Duration_per_Session (Hours)": [None],
    "Parental_Involvement_Score": [None],
    "Tribe": [None],
    "Average_Teacher_Experience_Years": [None],
    "School_Facilities_Rating": [None],
    "Teaching_Experience_Years": [None],
    "Subject_Taught": [None],
    "Resumption_Time": [None],
    "Special_Needs": [None],
    "Parent_Occupation": [None],
    "Attendance_Rate": [None],
    "School_Distance_From_Home_km": [None],
    "Teacher_Student_Ratio": [None],
    "Activity_Category": [None],
    "School_Funding_Per_Student": [None],
    "Student_Performance_Score": [None],
    "Class_Level": [None],
    "Parental_Status": [None],
    "Gender_y": [None],
    "Activity_Type": [None],
    "Internet_Access_In_School": [None],
    "Parent_Income": [None],
    "School_Extracurricular_Activities": [None],
    "Average_Class_Size": [None],
    "Impact_on_Performance": [None],
    "Frequency_of_Participation": [None],
    "Disability": [None],
    "Gender_x": [None],
    "Class_Section": [None],
    "Teacher_Supervisor": [None]
}

# Build the single-row input frame from the feature dictionary
features_df = pd.DataFrame(data=features_dict)

# Steps 1-3: run the row through the same chain used in training —
# preprocessing, then PCA projection, then the tuned stacking model.
input_preprocessed = preprocessor.transform(features_df)
input_pca_transformed = pca_model.transform(input_preprocessed)
prediction = model.predict(input_pca_transformed)

# Step 4: map the predicted score to a label (below 50, 50 up to 70, otherwise)
performance = (
    " Student Performance is bad" if prediction[0] < 50
    else " Student Performance is good" if prediction[0] < 70
    else " Student had an Excellent Performance"
)

# Report the raw prediction and its label
print("Prediction:", prediction[0])
print("Performance Classification:", performance)


Prediction: 48.734237906942994
Performance Classification:  Student Performance is bad
