In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import (confusion_matrix, classification_report, 
                           accuracy_score, precision_score, recall_score, 
                           f1_score, precision_recall_fscore_support)
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'xgboost'

In [1]:
def _create_sample_data(n_samples=300, random_state=42):
    """Build a reproducible synthetic stand-in for the grades dataset.

    Only the columns that the rest of this notebook reads are generated
    (``age``, ``studyTime``, ``absences``, ``EPSGrade``, ``DSGrade`` and
    ``finalGrade``); the value ranges are illustrative assumptions, not a
    description of the real ``Grades.csv``.

    Args:
        n_samples: Number of synthetic rows to generate.
        random_state: Seed for the random generator, so reruns match.

    Returns:
        pandas.DataFrame with ``n_samples`` rows and grades on a 0-20 scale.
    """
    rng = np.random.default_rng(random_state)

    # A latent "ability" drives all three grades so they are correlated and
    # every grade band (A-D) receives a usable number of rows.
    ability = rng.uniform(4, 20, n_samples)
    study_time = rng.integers(1, 5, n_samples)
    absences = rng.integers(0, 31, n_samples)

    def noisy_grade(center, spread):
        # Keep every generated grade inside the 0-20 scale.
        return np.clip(center + rng.normal(0, spread, n_samples), 0, 20).round(1)

    final_center = ability + 0.5 * (study_time - 2.5) - 0.05 * absences
    return pd.DataFrame({
        'age': rng.integers(15, 23, n_samples),
        'studyTime': study_time,
        'absences': absences,
        'EPSGrade': noisy_grade(ability, 2.0),
        'DSGrade': noisy_grade(ability, 2.0),
        'finalGrade': noisy_grade(final_center, 1.0),
    })


def load_and_prepare_data(path='Grades.csv'):
    """Load the student grades dataset, falling back to synthetic data.

    Args:
        path: CSV file to read. Defaults to ``'Grades.csv'`` in the working
            directory, which is what the notebook originally hard-coded.

    Returns:
        pandas.DataFrame read from ``path``; if the file does not exist, a
        reproducible synthetic sample from ``_create_sample_data`` instead.
    """
    try:
        df = pd.read_csv(path)
    except FileNotFoundError:
        # The original fallback printed and returned `df` without ever
        # assigning it, so a missing file ended in UnboundLocalError.
        print(f"{path} not found. Creating sample data based on the provided schema.")
        df = _create_sample_data()
        print(f"Dataset shape: {df.shape}")
        return df

    print("Dataset loaded successfully!")
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    return df


In [None]:
def create_grade_classes(df):
    """Attach a letter ``gradeClass`` column derived from ``finalGrade``.

    Bands: 17-20 (both inclusive) -> 'A', [14, 17) -> 'B', [10, 14) -> 'C'.
    Every other value (below 10, above 20, or NaN) -> 'D'.

    The frame is modified in place and the same object is returned.
    """
    # (label, inclusive lower bound, exclusive upper bound)
    half_open_bands = (('B', 14, 17), ('C', 10, 14))

    def grade_to_class(grade):
        # The top band is the only one closed on both ends.
        if 17 <= grade <= 20:
            return 'A'
        for label, lower, upper in half_open_bands:
            if lower <= grade < upper:
                return label
        return 'D'

    labels = df['finalGrade'].apply(grade_to_class)
    df['gradeClass'] = labels
    return df
    

In [None]:
def preprocess_data(df):
    """Turn the raw grades frame into model-ready features and target.

    Steps: add the ``gradeClass`` target via ``create_grade_classes``,
    label-encode every remaining object-dtype column, then split off the
    target. ``df`` itself is modified in place (new column, encoded values).

    Returns:
        (X, y, label_encoders): feature frame without ``finalGrade`` and
        ``gradeClass``, the ``gradeClass`` series, and a dict mapping each
        encoded column name to its fitted ``LabelEncoder``.
    """
    df = create_grade_classes(df)

    # Object-dtype columns are treated as categorical; the target stays as text.
    text_columns = df.select_dtypes(include=['object']).columns.drop('gradeClass')

    label_encoders = {}
    for column in text_columns:
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
        label_encoders[column] = encoder

    target = df['gradeClass']
    features = df.drop(columns=['finalGrade', 'gradeClass'])
    return features, target, label_encoders

In [None]:
def plot_data_distribution(df):
    """Draw a 2x3 overview figure of the dataset.

    Panels: grade-class counts, final-grade histogram, age histogram,
    study time per grade class, absences per grade class, and a scatter of
    EPS vs DS grades coloured by final grade. Expects ``df`` to already
    contain the ``gradeClass`` column.
    """
    grade_order = ['A', 'B', 'C', 'D']

    def histogram(column, bins, title, xlabel):
        # Histogram of one column on the currently active subplot.
        plt.hist(df[column], bins=bins, edgecolor='black')
        plt.title(title)
        plt.xlabel(xlabel)

    def boxplot_by_grade(column, title):
        # One box per grade class, in fixed A-D order.
        per_grade = [df.loc[df['gradeClass'] == grade, column] for grade in grade_order]
        plt.boxplot(per_grade)
        plt.title(title)
        plt.xticks([1, 2, 3, 4], grade_order)

    plt.figure(figsize=(15, 10))

    # Panel 1: how many students fall in each class
    plt.subplot(2, 3, 1)
    class_counts = df['gradeClass'].value_counts()
    class_counts.plot(kind='bar')
    plt.title('Grade Class Distribution')
    plt.xticks(rotation=0)

    # Panels 2-3: numeric distributions
    plt.subplot(2, 3, 2)
    histogram('finalGrade', 20, 'Final Grade Distribution', 'Final Grade')

    plt.subplot(2, 3, 3)
    histogram('age', 10, 'Age Distribution', 'Age')

    # Panels 4-5: per-class spreads
    plt.subplot(2, 3, 4)
    boxplot_by_grade('studyTime', 'Study Time by Grade Class')

    plt.subplot(2, 3, 5)
    boxplot_by_grade('absences', 'Absences by Grade Class')

    # Panel 6: the two earlier grades, coloured by the final grade
    plt.subplot(2, 3, 6)
    plt.scatter(df['EPSGrade'], df['DSGrade'], c=df['finalGrade'], alpha=0.6)
    plt.xlabel('EPS Grade')
    plt.ylabel('DS Grade')
    plt.title('Previous Grades Correlation')
    plt.colorbar(label='Final Grade')

    plt.tight_layout()
    plt.show()


In [None]:
def define_models():
    """Return the candidate classifiers together with their search grids.

    Returns:
        dict keyed by display name (in the order Naive Bayes, Decision Tree,
        Random Forest, XGBoost); each value holds an unfitted estimator under
        ``'model'`` and its hyperparameter grid under ``'params'``.
    """
    # GaussianNB exposes little to tune beyond the variance smoothing term.
    naive_bayes = {
        'model': GaussianNB(),
        'params': dict(var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6, 1e-5]),
    }

    decision_tree = {
        'model': DecisionTreeClassifier(random_state=42),
        'params': dict(
            max_depth=[3, 5, 7, 10, None],
            min_samples_split=[2, 5, 10, 20],
            min_samples_leaf=[1, 2, 5, 10],
            max_features=['sqrt', 'log2', None],
        ),
    }

    random_forest = {
        'model': RandomForestClassifier(random_state=42),
        'params': dict(
            n_estimators=[50, 100, 200, 300],
            max_depth=[3, 5, 7, 10, None],
            min_samples_split=[2, 5, 10],
            min_samples_leaf=[1, 2, 5],
            max_features=['sqrt', 'log2', None],
        ),
    }

    xgboost_model = {
        'model': xgb.XGBClassifier(random_state=42, eval_metric='mlogloss'),
        'params': dict(
            n_estimators=[50, 100, 200, 300],
            learning_rate=[0.01, 0.1, 0.2, 0.3],
            max_depth=[3, 4, 5, 6],
            min_child_weight=[1, 3, 5],
            subsample=[0.8, 0.9, 1.0],
        ),
    }

    return {
        'Naive Bayes': naive_bayes,
        'Decision Tree': decision_tree,
        'Random Forest': random_forest,
        'XGBoost': xgboost_model,
    }


In [None]:
def _needs_integer_labels(estimator):
    """Return True when ``estimator`` comes from the xgboost package."""
    return type(estimator).__module__.split('.')[0] == 'xgboost'


def train_and_evaluate_models(X_train, X_test, y_train, y_test, models):
    """Grid-search every candidate model and score it on the held-out set.

    Args:
        X_train, X_test: Feature matrices for fitting and evaluation.
        y_train, y_test: Matching class labels (e.g. 'A'..'D').
        models: Mapping of display name -> ``{'model': estimator,
            'params': grid}`` as produced by ``define_models``.

    Returns:
        (results, best_models). ``results[name]`` holds the best parameters,
        accuracy, macro/micro/weighted precision, recall and F1, the test
        predictions in the original label space (``'y_pred'``), the
        confusion matrix, and ``'label_encoder'`` (the encoder used for that
        model, or ``None``). ``best_models[name]`` is the refit best
        estimator.

    Note:
        Recent xgboost releases reject class labels that are not the
        integers 0..K-1, so xgboost estimators are trained on label-encoded
        targets. Their entry in ``best_models`` therefore predicts integer
        codes; decode them with ``results[name]['label_encoder']``. All
        other estimators are trained on the labels exactly as given.
    """
    results = {}
    best_models = {}

    # One fixed, sorted label set so every confusion matrix has the same
    # shape and row/column order, even if a model never predicts some class.
    class_labels = np.unique(
        np.concatenate([np.asarray(y_train), np.asarray(y_test)])
    )
    averages = ('macro', 'micro', 'weighted')

    for name, model_info in models.items():
        print(f"\nTraining {name}...")

        encoder = None
        fit_target = y_train
        if _needs_integer_labels(model_info['model']):
            encoder = LabelEncoder().fit(y_train)
            fit_target = encoder.transform(y_train)

        # Grid search for best hyperparameters
        grid_search = GridSearchCV(
            model_info['model'],
            model_info['params'],
            cv=5,
            scoring='accuracy',
            n_jobs=-1,
            verbose=0
        )
        grid_search.fit(X_train, fit_target)
        best_model = grid_search.best_estimator_
        best_models[name] = best_model

        # Predict, then map integer codes back to the original labels so all
        # metrics are computed against y_test in a single label space.
        y_pred = best_model.predict(X_test)
        if encoder is not None:
            y_pred = encoder.inverse_transform(y_pred)

        accuracy = accuracy_score(y_test, y_pred)
        scores = {'accuracy': accuracy}
        for average in averages:
            scores[f'precision_{average}'] = precision_score(y_test, y_pred, average=average)
            scores[f'recall_{average}'] = recall_score(y_test, y_pred, average=average)
            scores[f'f1_{average}'] = f1_score(y_test, y_pred, average=average)

        results[name] = {
            'best_params': grid_search.best_params_,
            **scores,
            'y_pred': y_pred,
            'confusion_matrix': confusion_matrix(y_test, y_pred, labels=class_labels),
            'label_encoder': encoder,
        }

        print(f"{name} - Best Parameters: {grid_search.best_params_}")
        print(f"{name} - Accuracy: {accuracy:.4f}")

    return results, best_models


In [None]:
def plot_confusion_matrices(results, y_test):
    """Plot one confusion-matrix heatmap per model in a two-column grid.

    Args:
        results: Mapping of model name -> dict containing a square
            ``'confusion_matrix'``.
        y_test: True test labels; their sorted unique values are used as
            tick labels when they match the matrix size.

    The grid grows with the number of models (the original fixed 2x2 grid
    raised IndexError for more than four models) and unused panels are
    hidden. Nothing is drawn when ``results`` is empty.
    """
    if not results:
        return

    n_cols = 2
    n_rows = -(-len(results) // n_cols)  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 6 * n_rows), squeeze=False)
    axes = axes.ravel()

    observed_names = sorted(set(y_test))
    default_names = ['A', 'B', 'C', 'D']

    def tick_labels_for(cm):
        # Prefer labels seen in y_test, then the notebook's standard A-D set;
        # if neither fits the matrix, let seaborn number the ticks rather
        # than attach wrong class names.
        for candidate in (observed_names, default_names):
            if len(candidate) == len(cm):
                return candidate
        return 'auto'

    for ax, (name, result) in zip(axes, results.items()):
        cm = result['confusion_matrix']
        labels = tick_labels_for(cm)

        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=labels, yticklabels=labels,
                    ax=ax)
        ax.set_title(f'{name} - Confusion Matrix')
        ax.set_xlabel('Predicted')
        ax.set_ylabel('Actual')

    # Hide panels left over when the model count is odd.
    for unused_ax in axes[len(results):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()


In [None]:
def create_results_summary(results):
    """Flatten per-model metrics into one comparison table.

    Args:
        results: Mapping of model name -> metrics dict with the keys
            ``accuracy`` and ``{precision,recall,f1}_{macro,micro,weighted}``.

    Returns:
        pandas.DataFrame with one row per model: a ``Model`` column followed
        by the ten metric columns.
    """
    # (table column header, key in the per-model metrics dict)
    metric_columns = (
        ('Accuracy', 'accuracy'),
        ('Precision (Macro)', 'precision_macro'),
        ('Precision (Micro)', 'precision_micro'),
        ('Precision (Weighted)', 'precision_weighted'),
        ('Recall (Macro)', 'recall_macro'),
        ('Recall (Micro)', 'recall_micro'),
        ('Recall (Weighted)', 'recall_weighted'),
        ('F1 (Macro)', 'f1_macro'),
        ('F1 (Micro)', 'f1_micro'),
        ('F1 (Weighted)', 'f1_weighted'),
    )

    rows = [
        {'Model': model_name,
         **{header: metrics[key] for header, key in metric_columns}}
        for model_name, metrics in results.items()
    ]
    return pd.DataFrame(rows)

In [None]:
def plot_model_comparison(summary_df):
    """Compare the models across accuracy, precision, recall and F1.

    Args:
        summary_df: Table from ``create_results_summary`` with a ``Model``
            column, ``Accuracy``, and ``<Metric> (Macro|Micro|Weighted)``
            columns for Precision, Recall and F1.

    Draws a 2x2 figure: an accuracy bar chart plus one line chart per
    averaged metric, each showing the three averaging schemes.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    model_names = summary_df['Model']

    def rotate_model_labels(ax):
        # Rotate the existing tick labels instead of calling set_xticklabels,
        # which is only safe after tick positions are fixed with set_xticks.
        ax.tick_params(axis='x', labelrotation=45)

    # Accuracy comparison
    accuracy_ax = axes[0, 0]
    accuracy_ax.bar(model_names, summary_df['Accuracy'])
    accuracy_ax.set_title('Model Accuracy Comparison')
    accuracy_ax.set_ylabel('Accuracy')
    rotate_model_labels(accuracy_ax)

    # (axes, column prefix in summary_df, name shown on the plot)
    averaged_panels = (
        (axes[0, 1], 'Precision', 'Precision'),
        (axes[1, 0], 'Recall', 'Recall'),
        (axes[1, 1], 'F1', 'F1-Score'),
    )
    # (averaging scheme, matplotlib format string)
    averaging_styles = (('Macro', 'o-'), ('Micro', 's-'), ('Weighted', '^-'))

    for ax, column_prefix, display_name in averaged_panels:
        for averaging, line_format in averaging_styles:
            ax.plot(model_names,
                    summary_df[f'{column_prefix} ({averaging})'],
                    line_format, label=averaging)
        ax.set_title(f'{display_name} Comparison')
        ax.set_ylabel(display_name)
        ax.legend()
        rotate_model_labels(ax)

    plt.tight_layout()
    plt.show()


In [None]:
def analyze_feature_importance(best_models, X):
    """Plot the top-10 feature importances of the tree-based models.

    Args:
        best_models: Mapping of model name -> fitted estimator. Only the
            entries 'Decision Tree', 'Random Forest' and 'XGBoost' are used.
        X: Feature frame whose ``columns`` name the features in the order
            the models were trained on.

    A model that is missing from ``best_models`` (for example XGBoost when
    the package is not installed) or that exposes no
    ``feature_importances_`` gets its panel hidden instead of raising
    KeyError.
    """
    feature_names = X.columns
    tree_model_names = ('Decision Tree', 'Random Forest', 'XGBoost')
    top_n = 10

    fig, axes = plt.subplots(1, len(tree_model_names), figsize=(18, 6))

    for ax, model_name in zip(axes, tree_model_names):
        model = best_models.get(model_name)
        importances = getattr(model, 'feature_importances_', None)
        if importances is None:
            ax.set_visible(False)
            continue

        importances = np.asarray(importances)
        # Indices of the most important features, largest first.
        indices = np.argsort(importances)[::-1][:top_n]
        positions = range(len(indices))

        ax.bar(positions, importances[indices])
        ax.set_title(f'{model_name} - Feature Importance')
        ax.set_xticks(positions)
        ax.set_xticklabels([feature_names[i] for i in indices], rotation=45)

    plt.tight_layout()
    plt.show()


In [None]:
def main():
    """Run the full grade-classification workflow end to end.

    Loads and preprocesses the data, plots its distribution, makes a
    stratified 80/20 split, tunes and evaluates every model, prints and
    plots the comparison, and reports the most accurate model.

    Returns:
        (results, best_models, summary_df) from training and summarising.
    """
    print("=== Student Grade Classification Analysis ===\n")

    # --- Data -------------------------------------------------------------
    df = load_and_prepare_data()
    X, y, _label_encoders = preprocess_data(df)

    n_samples, n_features = X.shape
    print("\nDataset Info:")
    print(f"Features: {n_features}")
    print(f"Samples: {n_samples}")
    print("Grade distribution:")
    print(df['gradeClass'].value_counts())

    plot_data_distribution(df)

    # --- Split ------------------------------------------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"\nTraining set size: {X_train.shape[0]}")
    print(f"Test set size: {X_test.shape[0]}")

    # --- Train and evaluate -----------------------------------------------
    results, best_models = train_and_evaluate_models(
        X_train, X_test, y_train, y_test, define_models()
    )

    summary_df = create_results_summary(results)
    print("\n=== MODEL PERFORMANCE SUMMARY ===")
    print(summary_df.round(4))

    # --- Visual comparison ------------------------------------------------
    plot_confusion_matrices(results, y_test)
    plot_model_comparison(summary_df)
    analyze_feature_importance(best_models, X)

    # --- Winner (first row with the highest accuracy) ---------------------
    top_row = summary_df.loc[summary_df['Accuracy'].idxmax()]
    best_model_name = top_row['Model']
    best_accuracy = top_row['Accuracy']
    best_result = results[best_model_name]

    print("\n=== BEST MODEL ===")
    print(f"Best performing model: {best_model_name}")
    print(f"Best accuracy: {best_accuracy:.4f}")
    print(f"Best parameters: {best_result['best_params']}")

    print(f"\n=== DETAILED CLASSIFICATION REPORT - {best_model_name} ===")
    print(classification_report(y_test, best_result['y_pred']))

    return results, best_models, summary_df


In [None]:
# Entry point: run the whole pipeline and bind its three return values to
# module-level names so they remain available after the cell finishes.
if __name__ == "__main__":
    results, best_models, summary_df = main()