!pip install --quiet pandas numpy scikit-learn networkx matplotlib pomegranate==0.14.7

In [17]:
# Cell 1: Import libraries and set logging
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error
from sklearn.preprocessing import LabelEncoder
import joblib
import logging
from itertools import product

# Emit INFO-level records so the progress messages below are visible.
logging.basicConfig(level=logging.INFO)

# Cell 2: Load dataset
# The CSV is resolved relative to the current working directory.
dataset_path = 'Cleaned_Student_performance_data.csv'
df = pd.read_csv(dataset_path)
logging.info(f"Dataset shape: {df.shape}")

# Cell 3: Discretize continuous features
# Bin edges and their labels for each continuous column. Every label list has
# exactly one entry per interval (len(edges) - 1).
gpa_bins = [0, 1.0, 2.0, 3.0, 4.0]
gpa_labels = ['VeryLow', 'Low', 'Medium', 'High']

study_bins = [0, 5, 10, 15, 20, np.inf]
study_labels = ['VeryLow', 'Low', 'Medium', 'High', 'VeryHigh']

abs_bins = [0, 5, 10, 15, 20, np.inf]
abs_labels = ['None', 'Few', 'Moderate', 'High', 'VeryHigh']

# (source column, new binned column, edges, labels) -- processed in this order
# so the new columns are appended to df as GPA_bin, Study_bin, Absences_bin.
# include_lowest=True makes the first interval include its left edge (0).
for raw_col, binned_col, edges, names in (
    ('GPA', 'GPA_bin', gpa_bins, gpa_labels),
    ('StudyTimeWeekly', 'Study_bin', study_bins, study_labels),
    ('Absences', 'Absences_bin', abs_bins, abs_labels),
):
    df[binned_col] = pd.cut(df[raw_col], bins=edges, labels=names, include_lowest=True)

# Cell 4: Encode categorical variables
# Columns replaced in place by integer codes. The three *_bin columns created
# during discretization are included, so they hold integer codes from here on.
categorical_cols = [
    'Gender',
    'Ethnicity',
    'ParentalEducation',
    'Tutoring',
    'ParentalSupport',
    'Extracurricular',
    'Study_bin',
    'Absences_bin',
    'GPA_bin',
    'GradeClass',
]

# One independent encoder per column, kept so codes can be mapped back later.
label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Cell 5: Feature selection and data split
# Observed input variables of the model.
features = [
    'Gender',
    'Ethnicity',
    'ParentalEducation',
    'Study_bin',
    'Absences_bin',
    'Tutoring',
    'ParentalSupport',
    'Extracurricular',
]

# The two variables the network is asked to predict.
target_gpa = 'GPA_bin'
target_class = 'GradeClass'

# 80/20 split with a fixed seed; stratifying on GradeClass keeps the class
# proportions the same in both partitions.
strata = df[target_class]
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=strata,
)

# Cell 6: Define Bayesian Network structure
# Nodes without incoming edges; each gets its own empty parent list.
root_nodes = [
    'Gender',
    'Ethnicity',
    'ParentalEducation',
    'Tutoring',
    'ParentalSupport',
    'Extracurricular',
    'Study_bin',
    'Absences_bin',
]

# Maps every node to the ordered list of its parents. The order of each list
# matters: it defines the order of values in the CPT lookup keys.
parents = {node_name: [] for node_name in root_nodes}
parents['GPA_bin'] = ['Study_bin', 'ParentalEducation', 'Absences_bin', 'Tutoring']
parents['GradeClass'] = ['GPA_bin', 'ParentalSupport', 'Extracurricular']

# Cell 7: Compute Conditional Probability Tables (CPTs)

# Hyperparameters
laplace_smoothing = 1   # Pseudo-count added to every class before normalising
min_data_threshold = 5  # Parent combinations backed by fewer rows are skipped

# node -> prior dict (root nodes) or {parent-value tuple -> class distribution}
CPT = {}

for node, pars in parents.items():
    if not pars:
        # Root node: the table is the empirical prior over its observed values.
        CPT[node] = train_df[node].value_counts(normalize=True).to_dict()
        continue

    # Child node: one smoothed distribution per combination of parent values,
    # enumerated as the Cartesian product of the values seen in train_df.
    classes = train_df[node].unique()
    table = {}

    for combo in product(*(train_df[p].unique() for p in pars)):
        # Boolean row mask: every parent equals its value in this combination.
        matches = np.logical_and.reduce(
            [(train_df[p] == v).to_numpy() for p, v in zip(pars, combo)]
        )
        subset = train_df[matches]

        # Too little evidence: leave the combination out of the table so that
        # lookups for it miss and the caller can fall back to something else.
        if len(subset) < min_data_threshold:
            logging.warning(f"Combination {combo} for node {node} has insufficient data ({len(subset)} rows).")
            continue

        # Laplace-smoothed relative frequencies over all classes of the node,
        # including classes that never occur in this subset.
        counts = subset[node].value_counts()
        total = counts.sum()
        denominator = total + laplace_smoothing * len(classes)
        table[tuple(int(x) for x in combo)] = {
            c: (counts.get(c, 0) + laplace_smoothing) / denominator
            for c in classes
        }

    CPT[node] = table

# Log the size of each CPT
for node, table in CPT.items():
    logging.info(f"CPT for node {node} contains {len(table)} entries.")

# Cell 8: Save model to .pkl
# Bundle the tables, the structure, the fitted encoders and the column names
# into a single artifact.
model_bundle = {
    'CPT': CPT,
    'parents': parents,
    'label_encoders': label_encoders,
    'features': features,
    'target_gpa': target_gpa,
    'target_class': target_class
}
joblib.dump(model_bundle, 'bayesian_model.pkl')

# Save mean GPA per GPA_bin (for numeric estimate)
# GPA_bin is among the label-encoded columns, so the dictionary keys are the
# encoder's integer codes rather than the original bin labels.
gpa_grouped_by_bin = train_df.groupby('GPA_bin')['GPA']
mean_gpa = gpa_grouped_by_bin.mean().to_dict()
joblib.dump(mean_gpa, 'mean_gpa.pkl')

logging.info("Model and mean GPA saved successfully to bayesian_model.pkl and mean_gpa.pkl.")

# Cell 9: Define prediction functions
def _most_probable_state(row, node):
    """Return the most probable value of ``node`` given the parent values in ``row``.

    The lookup key is built from ``row`` in the order listed in
    ``parents[node]``. If ``CPT[node]`` has no entry for that key, the
    marginal distribution of ``node`` in ``train_df`` is used instead.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) indexable by the
            names of the node's parents.
        node: Name of the node to predict; must be a key of ``parents``
            and ``CPT`` and a column of ``train_df``.

    Returns:
        The value of ``node`` with the highest probability.
    """
    key = tuple(row[p] for p in parents[node])
    probs = CPT[node].get(key)
    if probs is None:
        # Fallback: Use prior distribution
        probs = train_df[node].value_counts(normalize=True).to_dict()
    return max(probs, key=probs.get)


def predict_gpa_bin(row):
    """Predict the encoded GPA bin for ``row`` from its parent values."""
    return _most_probable_state(row, 'GPA_bin')


def predict_gradeclass(row):
    """Predict the encoded grade class for ``row`` from its parent values.

    'GPA_bin' is looked up in ``row`` like any other parent, so the caller
    decides whether that column holds an observed or a predicted bin.
    """
    return _most_probable_state(row, 'GradeClass')

# Cell 10: Evaluate model on test set
# Row-wise prediction; the GPA bin column is added before the grade class one.
# NOTE(review): parents['GradeClass'] contains 'GPA_bin', so predict_gradeclass
# reads the row's observed GPA_bin column, not 'Predicted_GPA_bin'. The accuracy
# below is therefore conditioned on the true GPA bin -- confirm this is intended.
for output_col, predictor in (
    ('Predicted_GPA_bin', predict_gpa_bin),
    ('Predicted_GradeClass', predict_gradeclass),
):
    test_df[output_col] = test_df.apply(predictor, axis=1)

# Evaluate GPA predictions
# Each predicted bin is turned into a numeric estimate: the mean training GPA
# of that bin.
true_gpa = test_df['GPA']
predicted_gpa = test_df['Predicted_GPA_bin'].map(mean_gpa)
mae = mean_absolute_error(true_gpa, predicted_gpa)
mse = mean_squared_error(true_gpa, predicted_gpa)
rmse = np.sqrt(mse)
logging.info(f"MAE (GPA): {mae}")
logging.info(f"RMSE (GPA): {rmse}")

# Evaluate GradeClass predictions
true_class = test_df['GradeClass']
predicted_class = test_df['Predicted_GradeClass']
accuracy = accuracy_score(true_class, predicted_class)
logging.info(f"Accuracy (GradeClass): {accuracy}")

# Cell 11: Save evaluation metrics
evaluation_metrics = {
    'MAE_GPA': mae,
    'RMSE_GPA': rmse,
    'Accuracy_GradeClass': accuracy
}
joblib.dump(evaluation_metrics, 'evaluation_metrics.pkl')
logging.info("Evaluation metrics saved successfully.")

INFO:root:Dataset shape: (2392, 15)
INFO:root:CPT for node Gender contains 2 entries.
INFO:root:CPT for node Ethnicity contains 4 entries.
INFO:root:CPT for node ParentalEducation contains 5 entries.
INFO:root:CPT for node Tutoring contains 2 entries.
INFO:root:CPT for node ParentalSupport contains 5 entries.
INFO:root:CPT for node Extracurricular contains 2 entries.
INFO:root:CPT for node Study_bin contains 4 entries.
INFO:root:CPT for node Absences_bin contains 5 entries.
INFO:root:CPT for node GPA_bin contains 125 entries.
INFO:root:CPT for node GradeClass contains 38 entries.
INFO:root:Model and mean GPA saved successfully to bayesian_model.pkl and mean_gpa.pkl.
INFO:root:MAE (GPA): 0.4418008535577054
INFO:root:RMSE (GPA): 0.5776360165065325
INFO:root:Accuracy (GradeClass): 0.7494780793319415
INFO:root:Evaluation metrics saved successfully.


In [6]:
import joblib

# Load the model file
# Sanity check that the saved bundle can be read back and contains the CPTs.
# Any failure while loading or reading the bundle is reported, not raised.
# NOTE: joblib.load unpickles the file; only load artifacts you created yourself.
try:
    model_data = joblib.load('bayesian_model.pkl')
    CPT = model_data.get('CPT', {})
    if CPT:
        print(f"CPT loaded successfully with {len(CPT)} nodes.")
    else:
        # Missing key and empty table are reported the same way.
        print("CPT is empty or not found in the model file.")
except Exception as e:
    print(f"Error loading model: {e}")

CPT loaded successfully with 10 nodes.
