In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score
from sklearn.decomposition import PCA
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
import warnings
warnings.filterwarnings('ignore')

# Set style for better visualizations
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")
%matplotlib inline

# Load the dataset
df = pd.read_csv('performance.csv')

print("="*60)
print("STUDENT PERFORMANCE DATA MINING PROJECT")
print("="*60)
print("\n‚úÖ Dataset loaded successfully!")
print(f"üìä Dataset Shape: {df.shape}")
print(f"üë• Total Students: {len(df)}")
print("\nUse the interactive widgets below to explore the data!")

STUDENT PERFORMANCE DATA MINING PROJECT

‚úÖ Dataset loaded successfully!
üìä Dataset Shape: (2392, 15)
üë• Total Students: 2392

Use the interactive widgets below to explore the data!


In [3]:
# Create interactive data explorer
def create_data_explorer():
    """Build the interactive data-explorer UI over the notebook-level ``df``.

    The UI offers four analyses (basic statistics, distributions, correlations
    and group comparisons) driven by dropdowns and a multi-column selector.
    Every widget change re-renders the selected analysis into an output area.

    The shared ``df`` is only read: group labels are derived on the fly rather
    than stored as new columns, so other cells see an unchanged frame.

    Returns:
        widgets.VBox: the assembled UI, ready to be passed to ``display``.
    """
    style = {'description_width': 'initial'}

    # Numeric code -> readable label for every supported grouping column.
    group_labels = {
        'Gender': {0: 'Male', 1: 'Female'},
        'Ethnicity': {0: 'Caucasian', 1: 'African American', 2: 'Asian', 3: 'Other'},
        'ParentalEducation': {0: 'None', 1: 'High School', 2: 'Some College',
                              3: 'Bachelor', 4: 'Higher'},
    }
    # Name shown as the index header of the grouped summary table.
    group_index_names = {
        'Gender': 'Gender_Label',
        'Ethnicity': 'Ethnicity_Label',
        'ParentalEducation': 'ParentalEdu_Label',
    }

    # Widget for selecting analysis type
    analysis_type = widgets.Dropdown(
        options=['Basic Statistics', 'Distribution Analysis', 'Correlation Analysis', 'Group Comparisons'],
        value='Basic Statistics',
        description='Analysis Type:',
        style=style,
        layout=widgets.Layout(width='400px')
    )

    # Widget for selecting columns
    column_selector = widgets.SelectMultiple(
        options=df.columns.tolist(),
        value=['GPA', 'StudyTimeWeekly', 'Absences'],
        description='Select Columns:',
        disabled=False,
        style=style,
        layout=widgets.Layout(width='400px', height='150px')
    )

    # Widget for grouping variable
    group_by = widgets.Dropdown(
        options=['None'] + list(group_labels),
        value='None',
        description='Group By:',
        style=style,
        layout=widgets.Layout(width='400px')
    )

    # Widget for plot type (only consulted by the group-comparison analysis)
    plot_type = widgets.Dropdown(
        options=['Histogram', 'Box Plot', 'Scatter Plot', 'Bar Chart'],
        value='Histogram',
        description='Plot Type:',
        style=style,
        layout=widgets.Layout(width='400px')
    )

    # Output widget that receives everything the analyses render
    output = widgets.Output()

    def show_basic_statistics(selected_cols):
        """Show ``describe()`` for the selected columns, or for all if none are selected."""
        display(HTML("<h3>üìä Basic Statistics</h3>"))
        if selected_cols:
            display(df[selected_cols].describe().round(2))
        else:
            display(df.describe().round(2))

    def show_distributions(selected_cols):
        """Draw one histogram (numeric) or value-count bar chart (other) per selected column."""
        display(HTML("<h3>üìà Distribution Analysis</h3>"))
        if not selected_cols:
            print("Please select at least one column")
            return

        # squeeze=False always yields a 2-D array, so a single column needs no special case.
        fig, axes = plt.subplots(1, len(selected_cols),
                                 figsize=(5*len(selected_cols), 4), squeeze=False)
        for ax, col in zip(axes[0], selected_cols):
            if df[col].dtype in ['int64', 'float64']:
                ax.hist(df[col], bins=20, edgecolor='black', alpha=0.7)
                ax.set_title(f'Distribution of {col}')
                ax.set_xlabel(col)
                ax.set_ylabel('Frequency')
            else:
                df[col].value_counts().plot(kind='bar', ax=ax)
                ax.set_title(f'Distribution of {col}')
                ax.tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

    def show_correlations(selected_cols):
        """Draw a correlation heatmap of the numeric selection and list correlations with GPA."""
        display(HTML("<h3>üîó Correlation Analysis</h3>"))
        if not selected_cols:
            print("Please select columns for correlation analysis")
            return

        numeric_cols = df.select_dtypes(include=[np.number]).columns
        numeric_selected = [col for col in selected_cols if col in numeric_cols]
        if len(numeric_selected) <= 1:
            print("Please select at least 2 numeric columns for correlation analysis")
            return

        corr_matrix = df[numeric_selected].corr()
        plt.figure(figsize=(8, 6))
        sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                    square=True, linewidths=1)
        plt.title('Correlation Matrix')
        plt.show()

        # Show correlations with GPA if selected
        if 'GPA' in numeric_selected:
            print("\nüìà Correlations with GPA:")
            gpa_corr = corr_matrix['GPA'].sort_values(ascending=False)
            for col, corr in gpa_corr.items():
                if col != 'GPA':
                    print(f"  {col}: {corr:.3f}")

    def show_group_comparisons(selected_cols):
        """Show per-group means of the selected columns and the requested bar or box plot."""
        display(HTML("<h3>üìä Group Comparisons</h3>"))

        group_col = group_by.value
        if group_col == 'None':
            print("Please select a grouping variable")
            return
        if not selected_cols:
            print("Please select columns for comparison")
            return

        code_to_label = group_labels[group_col]
        # Group by a derived label Series instead of adding a label column to the
        # shared df; codes missing from the mapping become NaN and are dropped.
        labels = df[group_col].map(code_to_label).rename(group_index_names[group_col])
        group_data = df.groupby(labels)[selected_cols].mean()
        display(group_data.round(2))

        # Create visualization
        if plot_type.value == 'Bar Chart':
            group_data.plot(kind='bar', figsize=(10, 6))
            plt.title(f'Average Values by {group_col}')
            plt.xlabel(group_col)
            plt.ylabel('Average Value')
            plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
            plt.tight_layout()
            plt.show()
        elif plot_type.value == 'Box Plot':
            fig, axes = plt.subplots(1, len(selected_cols),
                                     figsize=(5*len(selected_cols), 5), squeeze=False)
            for ax, col in zip(axes[0], selected_cols):
                # One box per group code, in mapping order, for every grouping column.
                data_to_plot = [df.loc[df[group_col] == code, col] for code in code_to_label]
                ax.boxplot(data_to_plot, labels=list(code_to_label.values()))
                # Rotate so the longer group names do not overlap.
                ax.tick_params(axis='x', rotation=45)
                ax.set_title(f'{col} by {group_col}')
                ax.set_ylabel(col)

            plt.tight_layout()
            plt.show()

    # Analysis name (as shown in the dropdown) -> renderer
    handlers = {
        'Basic Statistics': show_basic_statistics,
        'Distribution Analysis': show_distributions,
        'Correlation Analysis': show_correlations,
        'Group Comparisons': show_group_comparisons,
    }

    def update_analysis(change):
        """Re-render the currently selected analysis; ``change`` is ignored."""
        with output:
            clear_output(wait=True)
            handlers[analysis_type.value](list(column_selector.value))

    # Create UI layout
    ui = widgets.VBox([
        widgets.HTML("<h2>üîç Interactive Data Explorer</h2>"),
        widgets.HBox([analysis_type, group_by]),
        widgets.HBox([column_selector, plot_type]),
        output
    ])

    # Re-render whenever any control changes
    for control in (analysis_type, column_selector, group_by, plot_type):
        control.observe(update_analysis, names='value')

    # Initial update so the output area is populated before any interaction
    update_analysis(None)

    return ui

# Build the data explorer UI and render it as this cell's output
data_explorer = create_data_explorer()
display(data_explorer)

VBox(children=(HTML(value='<h2>üîç Interactive Data Explorer</h2>'), HBox(children=(Dropdown(description='Analys‚Ä¶

In [4]:
def create_model_trainer():
    """Build the interactive model-training UI over the notebook-level ``df``.

    The user picks a problem type (classify ``GradeClass`` or regress ``GPA``),
    a model, the feature columns, the test fraction and a random state. Pressing
    the button splits the data, standardises the features, fits the model and
    prints metrics with diagnostic plots into an output area.

    Returns:
        widgets.VBox: the assembled UI, ready to be passed to ``display``.
    """
    style = {'description_width': 'initial'}

    # Problem type selection
    problem_type = widgets.Dropdown(
        options=['Classification (Grade Class)', 'Regression (GPA)'],
        value='Classification (Grade Class)',
        description='Problem Type:',
        style=style,
        layout=widgets.Layout(width='400px')
    )

    # Model selection
    model_selector = widgets.Dropdown(
        options=['Random Forest', 'Decision Tree', 'Logistic Regression (for classification)',
                 'Linear Regression (for regression)'],
        value='Random Forest',
        description='Model:',
        style=style,
        layout=widgets.Layout(width='400px')
    )

    # Feature selection
    feature_selector = widgets.SelectMultiple(
        options=df.columns.tolist(),
        value=['StudyTimeWeekly', 'Absences', 'ParentalSupport', 'Tutoring',
               'Extracurricular', 'Sports', 'Music', 'Volunteering'],
        description='Features:',
        disabled=False,
        style=style,
        layout=widgets.Layout(width='400px', height='200px')
    )

    # Test size slider
    test_size = widgets.FloatSlider(
        value=0.2,
        min=0.1,
        max=0.4,
        step=0.05,
        description='Test Size:',
        style=style,
        layout=widgets.Layout(width='400px')
    )

    # Random state
    random_state = widgets.IntSlider(
        value=42,
        min=0,
        max=100,
        step=1,
        description='Random State:',
        style=style,
        layout=widgets.Layout(width='400px')
    )

    # Train button
    train_button = widgets.Button(
        description='üöÄ Train Model',
        button_style='success',
        layout=widgets.Layout(width='200px')
    )

    # Output widget
    output = widgets.Output()

    def report_classification(y_true, y_pred):
        """Print accuracy and the per-class classification report."""
        print(f"‚úÖ Accuracy: {accuracy_score(y_true, y_pred):.4f}")
        print("\nüìã Classification Report:")
        print(classification_report(y_true, y_pred))

    def report_regression(y_true, y_pred):
        """Print R-squared and RMSE."""
        print(f"‚úÖ R¬≤ Score: {r2_score(y_true, y_pred):.4f}")
        print(f"‚úÖ RMSE: {np.sqrt(mean_squared_error(y_true, y_pred)):.4f}")

    def plot_ranked_bars(feature_names, values, value_name, title):
        """Draw a horizontal bar chart of ``values`` per feature, largest first, on the current axes."""
        ranked = pd.DataFrame({
            'feature': feature_names,
            value_name: values
        }).sort_values(value_name, ascending=False)
        sns.barplot(x=value_name, y='feature', data=ranked)
        plt.title(title)

    def plot_actual_vs_predicted(y_true, y_pred):
        """Scatter predictions against actual GPA with the identity line, on the current axes."""
        plt.scatter(y_true, y_pred, alpha=0.5)
        plt.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)
        plt.xlabel('Actual GPA')
        plt.ylabel('Predicted GPA')
        plt.title('Actual vs Predicted')

    # Training function
    def train_model(b):
        """Button callback: validate the selection, fit the chosen model and render results."""
        with output:
            clear_output(wait=True)

            # Get selected features
            features = list(feature_selector.value)
            if not features:
                print("‚ùå Please select at least one feature!")
                return

            if problem_type.value == 'Classification (Grade Class)':
                target = 'GradeClass'
                problem = 'classification'
            else:
                target = 'GPA'
                problem = 'regression'

            # The selector lists every column of df, so the target itself can be
            # picked as a feature; training on it would make the scores meaningless.
            if target in features:
                features = [name for name in features if name != target]
                print(f"Note: '{target}' is the prediction target and was removed from the features.")
                if not features:
                    print("‚ùå Please select at least one feature!")
                    return

            # StandardScaler cannot handle text columns; refuse them up front.
            non_numeric = [name for name in features
                           if not pd.api.types.is_numeric_dtype(df[name])]
            if non_numeric:
                print(f"Non-numeric columns cannot be used as features: {', '.join(non_numeric)}")
                return

            # Prepare data
            X = df[features]
            y = df[target]

            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=test_size.value, random_state=random_state.value
            )

            # Scale features (fit on the training split only, then apply to the test split)
            scaler = StandardScaler()
            X_train_scaled = scaler.fit_transform(X_train)
            X_test_scaled = scaler.transform(X_test)

            # Train model based on selection
            display(HTML(f"<h3>üìä Training Results - {model_selector.value}</h3>"))

            if model_selector.value == 'Random Forest':
                if problem == 'classification':
                    model = RandomForestClassifier(n_estimators=100, random_state=random_state.value)
                    model.fit(X_train_scaled, y_train)
                    y_pred = model.predict(X_test_scaled)

                    report_classification(y_test, y_pred)

                    plt.figure(figsize=(10, 5))
                    plt.subplot(1, 2, 1)
                    plot_ranked_bars(features, model.feature_importances_,
                                     'importance', 'Feature Importance')

                    plt.subplot(1, 2, 2)
                    cm = confusion_matrix(y_test, y_pred)
                    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
                    plt.title('Confusion Matrix')

                    plt.tight_layout()
                    plt.show()

                else:  # regression
                    model = RandomForestRegressor(n_estimators=100, random_state=random_state.value)
                    model.fit(X_train_scaled, y_train)
                    y_pred = model.predict(X_test_scaled)

                    report_regression(y_test, y_pred)

                    plt.figure(figsize=(12, 5))
                    plt.subplot(1, 2, 1)
                    plot_ranked_bars(features, model.feature_importances_,
                                     'importance', 'Feature Importance')

                    plt.subplot(1, 2, 2)
                    plot_actual_vs_predicted(y_test, y_pred)

                    plt.tight_layout()
                    plt.show()

            elif model_selector.value == 'Decision Tree':
                if problem == 'classification':
                    model = DecisionTreeClassifier(max_depth=5, random_state=random_state.value)
                    model.fit(X_train_scaled, y_train)
                    y_pred = model.predict(X_test_scaled)

                    report_classification(y_test, y_pred)

                    # Visualize tree; class names come from the fitted model so they
                    # always match the classes actually present in the training split.
                    plt.figure(figsize=(20, 10))
                    plot_tree(model, feature_names=features,
                              class_names=[str(label) for label in model.classes_],
                              filled=True, rounded=True, max_depth=3, fontsize=10)
                    plt.title('Decision Tree Visualization (Limited to depth 3)')
                    plt.show()

                else:
                    print("Decision Tree not recommended for regression with this UI. Using Random Forest instead.")
                    model = RandomForestRegressor(n_estimators=100, random_state=random_state.value)
                    model.fit(X_train_scaled, y_train)
                    y_pred = model.predict(X_test_scaled)

                    report_regression(y_test, y_pred)

            elif model_selector.value == 'Logistic Regression (for classification)':
                if problem == 'classification':
                    model = LogisticRegression(max_iter=1000, random_state=random_state.value)
                    model.fit(X_train_scaled, y_train)
                    y_pred = model.predict(X_test_scaled)

                    report_classification(y_test, y_pred)

                    # coef_ holds one row per class (a single row, for the second class,
                    # in the binary case). Only row 0 is plotted, so name its class.
                    coef_class = model.classes_[0] if model.coef_.shape[0] > 1 else model.classes_[1]

                    plt.figure(figsize=(10, 6))
                    plot_ranked_bars(features, model.coef_[0], 'coefficient',
                                     f'Feature Coefficients (class {coef_class})')
                    plt.show()
                else:
                    print("‚ùå This model is for classification only. Please select Classification problem type.")

            elif model_selector.value == 'Linear Regression (for regression)':
                if problem == 'regression':
                    model = LinearRegression()
                    model.fit(X_train_scaled, y_train)
                    y_pred = model.predict(X_test_scaled)

                    report_regression(y_test, y_pred)

                    plt.figure(figsize=(12, 5))
                    plt.subplot(1, 2, 1)
                    plot_ranked_bars(features, model.coef_, 'coefficient', 'Feature Coefficients')

                    plt.subplot(1, 2, 2)
                    plot_actual_vs_predicted(y_test, y_pred)

                    plt.tight_layout()
                    plt.show()
                else:
                    print("‚ùå This model is for regression only. Please select Regression problem type.")

    # Connect button
    train_button.on_click(train_model)

    # Create UI layout
    ui = widgets.VBox([
        widgets.HTML("<h2>ü§ñ Interactive Model Trainer</h2>"),
        widgets.HBox([problem_type, model_selector]),
        feature_selector,
        widgets.HBox([test_size, random_state]),
        train_button,
        output
    ])

    return ui

# Build the model trainer UI and render it as this cell's output
model_trainer = create_model_trainer()
display(model_trainer)

VBox(children=(HTML(value='<h2>ü§ñ Interactive Model Trainer</h2>'), HBox(children=(Dropdown(description='Proble‚Ä¶

In [5]:
def create_risk_predictor():
    """Build the student risk-predictor UI over the notebook-level ``df``.

    The user describes a student through tabbed input widgets. Pressing the
    button predicts the student's GPA with a random-forest regressor trained on
    every row of ``df``, maps the prediction to a risk level, and summarises the
    GPA of students with similar study time and absences.

    The regressor is fitted on the first prediction and reused afterwards; re-run
    this cell to refit it if ``df`` is reloaded.

    Returns:
        widgets.VBox: the assembled UI, ready to be passed to ``display``.
    """
    style = {'description_width': 'initial'}

    # Column order the model is trained on; the input row must follow it exactly.
    feature_names = ['Age', 'Gender', 'Ethnicity', 'ParentalEducation', 'StudyTimeWeekly',
                     'Absences', 'Tutoring', 'ParentalSupport', 'Extracurricular',
                     'Sports', 'Music', 'Volunteering']

    def make_checkbox(label):
        """Create an unchecked yes/no input with the shared styling."""
        return widgets.Checkbox(
            value=False,
            description=label,
            style=style,
            layout=widgets.Layout(width='200px')
        )

    # Input widgets for student features
    study_time = widgets.FloatSlider(
        value=10.0,
        min=0,
        max=20,
        step=0.5,
        description='Study Time (hrs/week):',
        style=style,
        layout=widgets.Layout(width='500px')
    )

    absences = widgets.IntSlider(
        value=5,
        min=0,
        max=30,
        step=1,
        description='Absences:',
        style=style,
        layout=widgets.Layout(width='500px')
    )

    parental_support = widgets.Dropdown(
        options=[(f'Level {i}', i) for i in range(5)],
        value=2,
        description='Parental Support:',
        style=style,
        layout=widgets.Layout(width='500px')
    )

    tutoring = make_checkbox('Receives Tutoring')
    extracurricular = make_checkbox('Extracurricular')
    sports = make_checkbox('Sports')
    music = make_checkbox('Music')
    volunteering = make_checkbox('Volunteering')

    age = widgets.IntSlider(
        value=16,
        min=15,
        max=18,
        step=1,
        description='Age:',
        style=style,
        layout=widgets.Layout(width='500px')
    )

    gender = widgets.Dropdown(
        options=[('Male', 0), ('Female', 1)],
        value=0,
        description='Gender:',
        style=style,
        layout=widgets.Layout(width='500px')
    )

    ethnicity = widgets.Dropdown(
        options=[('Caucasian', 0), ('African American', 1), ('Asian', 2), ('Other', 3)],
        value=0,
        description='Ethnicity:',
        style=style,
        layout=widgets.Layout(width='500px')
    )

    parental_edu = widgets.Dropdown(
        options=[('None', 0), ('High School', 1), ('Some College', 2), ('Bachelor', 3), ('Higher', 4)],
        value=1,
        description='Parental Education:',
        style=style,
        layout=widgets.Layout(width='500px')
    )

    # Predict button
    predict_button = widgets.Button(
        description='üéØ Predict Risk',
        button_style='danger',
        layout=widgets.Layout(width='200px')
    )

    # Output widget
    output = widgets.Output()

    # Holds the fitted regressor so repeated clicks do not refit the same forest.
    model_cache = {}

    def get_model():
        """Return the GPA regressor, fitting it on all of ``df`` the first time it is needed."""
        if 'model' not in model_cache:
            model = RandomForestRegressor(n_estimators=100, random_state=42)
            model.fit(df[feature_names], df['GPA'])
            model_cache['model'] = model
        return model_cache['model']

    def classify_risk(predicted_gpa):
        """Map a predicted GPA to ``(risk_level, risk_color, recommendation)``."""
        if predicted_gpa >= 3.0:
            return ("üü¢ Low Risk", "green",
                    "Student is performing well. Encourage continued engagement.")
        if predicted_gpa >= 2.0:
            return ("üü° Medium Risk", "orange",
                    "Student shows some risk factors. Consider monitoring and support.")
        return ("üî¥ High Risk", "red",
                "Student needs immediate intervention and additional support.")

    # Prediction function
    def predict_risk(b):
        """Button callback: predict GPA for the entered student and render the assessment."""
        with output:
            clear_output(wait=True)

            # One-row frame in the training column order; checkboxes become 0/1.
            X_new = pd.DataFrame([[
                age.value,
                gender.value,
                ethnicity.value,
                parental_edu.value,
                study_time.value,
                absences.value,
                1 if tutoring.value else 0,
                parental_support.value,
                1 if extracurricular.value else 0,
                1 if sports.value else 0,
                1 if music.value else 0,
                1 if volunteering.value else 0
            ]], columns=feature_names)

            # Make prediction
            predicted_gpa = get_model().predict(X_new)[0]

            # Determine risk level
            risk_level, risk_color, recommendation = classify_risk(predicted_gpa)

            # Display results
            display(HTML(f"""
            <div style="background-color: #f0f0f0; padding: 20px; border-radius: 10px;">
                <h3>üéì Student Risk Assessment</h3>
                <hr>
                <table style="width: 100%;">
                    <tr>
                        <td><b>Predicted GPA:</b></td>
                        <td><span style="font-size: 24px; font-weight: bold;">{predicted_gpa:.2f}</span></td>
                    </tr>
                    <tr>
                        <td><b>Risk Level:</b></td>
                        <td><span style="color: {risk_color}; font-weight: bold;">{risk_level}</span></td>
                    </tr>
                    <tr>
                        <td><b>Recommendation:</b></td>
                        <td>{recommendation}</td>
                    </tr>
                </table>
            </div>
            """))

            # Show comparison with similar students
            display(HTML("<h4>üìä Student Profile Summary:</h4>"))

            # Similar = within 3 study hours per week and within 5 absences of the input
            similar_students = df[
                (abs(df['StudyTimeWeekly'] - study_time.value) < 3) &
                (abs(df['Absences'] - absences.value) < 5)
            ]

            if len(similar_students) > 0:
                avg_gpa_similar = similar_students['GPA'].mean()
                display(HTML(f"""
                <p>Based on {len(similar_students)} similar students:</p>
                <ul>
                    <li>Average GPA: {avg_gpa_similar:.2f}</li>
                    <li>Range: {similar_students['GPA'].min():.2f} - {similar_students['GPA'].max():.2f}</li>
                </ul>
                """))
            else:
                # Without this the header above would be followed by nothing.
                display(HTML("<p>No similar students found in the dataset.</p>"))

    # Connect button
    predict_button.on_click(predict_risk)

    # Create UI layout with tabs
    tab = widgets.Tab()

    # Basic info tab
    basic_info = widgets.VBox([
        widgets.HTML("<h4>üìã Basic Information</h4>"),
        age,
        gender,
        ethnicity,
        parental_edu
    ])

    # Academic info tab
    academic_info = widgets.VBox([
        widgets.HTML("<h4>üìö Academic Information</h4>"),
        study_time,
        absences,
        parental_support,
        tutoring
    ])

    # Activities tab
    activities = widgets.VBox([
        widgets.HTML("<h4>‚öΩ Activities</h4>"),
        widgets.HBox([extracurricular, sports]),
        widgets.HBox([music, volunteering])
    ])

    tab.children = [basic_info, academic_info, activities]
    for index, title in enumerate(['Basic Info', 'Academic', 'Activities']):
        tab.set_title(index, title)

    # Main UI
    ui = widgets.VBox([
        widgets.HTML("<h2>üéØ Student Risk Predictor</h2>"),
        widgets.HTML("<p>Enter student information to predict academic risk:</p>"),
        tab,
        widgets.HBox([predict_button], layout=widgets.Layout(justify_content='center', padding='20px')),
        output
    ])

    return ui

# Build the risk predictor UI and render it as this cell's output
risk_predictor = create_risk_predictor()
display(risk_predictor)

VBox(children=(HTML(value='<h2>üéØ Student Risk Predictor</h2>'), HTML(value='<p>Enter student information to pr‚Ä¶

In [6]:
def create_clustering_explorer():
    """Build the interactive K-Means clustering explorer.

    The UI exposes a cluster-count slider, a multi-select of features and a
    visualization dropdown. Clicking the button standardizes the selected
    columns of the global ``df``, fits K-Means on them, and renders into the
    output area: a per-cluster summary table, the chosen plot, and a short
    textual profile of every cluster.

    Returns:
        widgets.VBox: container holding the controls and the output area.
    """
    # Create widgets
    style = {'description_width': 'initial'}

    # Number of clusters
    n_clusters = widgets.IntSlider(
        value=4,
        min=2,
        max=8,
        step=1,
        description='Number of Clusters:',
        style=style,
        layout=widgets.Layout(width='400px')
    )

    # Feature selection for clustering
    cluster_features = widgets.SelectMultiple(
        options=['StudyTimeWeekly', 'Absences', 'GPA', 'ParentalSupport',
                 'Tutoring', 'Extracurricular', 'Sports', 'Music', 'Volunteering'],
        value=['StudyTimeWeekly', 'Absences', 'GPA'],
        description='Clustering Features:',
        disabled=False,
        style=style,
        layout=widgets.Layout(width='400px', height='200px')
    )

    # Visualization type
    viz_type = widgets.Dropdown(
        options=['2D PCA', '3D PCA (if available)', 'Feature Pairs'],
        value='2D PCA',
        description='Visualization:',
        style=style,
        layout=widgets.Layout(width='400px')
    )

    # Run clustering button
    cluster_button = widgets.Button(
        description='üîç Run Clustering',
        button_style='info',
        layout=widgets.Layout(width='200px')
    )

    # Output widget
    output = widgets.Output()

    # Clustering function
    def run_clustering(b):
        """Button callback: cluster the selected features and render the results.

        Args:
            b: the clicked button (unused; required by the ``on_click`` signature).
        """
        with output:
            clear_output(wait=True)

            # Get selected features
            features = list(cluster_features.value)
            if len(features) < 2:
                print("‚ùå Please select at least 2 features for clustering!")
                return

            # Read the widget values once so every step below uses the same settings
            k = n_clusters.value
            selected_viz = viz_type.value

            # Prepare data
            X_cluster = df[features].copy()

            # Mean-impute any missing values before scaling
            if X_cluster.isnull().sum().sum() > 0:
                X_cluster = X_cluster.fillna(X_cluster.mean())

            # Standardize so no feature dominates the distance metric
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X_cluster)

            # Apply K-Means
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            clusters = kmeans.fit_predict(X_scaled)

            # Attach labels to a copy so the global dataframe is left untouched
            df_temp = df.copy()
            df_temp['Cluster'] = clusters

            # Analyze clusters
            display(HTML(f"<h3>üìä Cluster Analysis - {k} Clusters</h3>"))

            # Cluster statistics: per-cluster feature means plus size and share
            cluster_stats = df_temp.groupby('Cluster')[features].mean()
            cluster_stats['Count'] = df_temp.groupby('Cluster').size()
            cluster_stats['Percentage'] = (cluster_stats['Count'] / len(df_temp) * 100).round(1)

            display(cluster_stats.round(2))

            # A 3D projection needs three components, i.e. at least three features
            if selected_viz == '3D PCA (if available)' and len(features) < 3:
                print("3D PCA needs at least 3 features; showing the 2D PCA view instead.")
                selected_viz = '2D PCA'

            # Visualization
            if selected_viz == '2D PCA':
                # Apply PCA for 2D visualization
                pca = PCA(n_components=2)
                X_pca = pca.fit_transform(X_scaled)

                fig, (ax_scatter, ax_profile) = plt.subplots(1, 2, figsize=(12, 5))

                scatter = ax_scatter.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis',
                                             alpha=0.6, s=50)
                fig.colorbar(scatter, ax=ax_scatter, label='Cluster')
                ax_scatter.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%})')
                ax_scatter.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
                ax_scatter.set_title('PCA Visualization of Clusters')

                # Cluster profiles: min-max scale each feature's cluster means to [0, 1]
                normalized_stats = cluster_stats[features].copy()
                for col in normalized_stats.columns:
                    col_min = normalized_stats[col].min()
                    col_range = normalized_stats[col].max() - col_min
                    if col_range == 0:
                        # All clusters share the same mean; avoid 0/0 -> NaN bars
                        normalized_stats[col] = 0.0
                    else:
                        normalized_stats[col] = (normalized_stats[col] - col_min) / col_range

                # Simple bar chart of cluster characteristics
                normalized_stats.T.plot(kind='bar', ax=ax_profile)
                ax_profile.set_title('Normalized Cluster Characteristics')
                ax_profile.set_xlabel('Features')
                ax_profile.set_ylabel('Normalized Value')
                ax_profile.legend(title='Cluster', bbox_to_anchor=(1.05, 1))
                ax_profile.tick_params(axis='x', rotation=45)

                plt.tight_layout()
                plt.show()

                print(f"Total explained variance: {pca.explained_variance_ratio_.sum():.2%}")

            elif selected_viz == '3D PCA (if available)':
                # Project onto the first three principal components
                pca = PCA(n_components=3)
                X_pca = pca.fit_transform(X_scaled)

                fig = plt.figure(figsize=(10, 8))
                ax_3d = fig.add_subplot(111, projection='3d')
                scatter = ax_3d.scatter(X_pca[:, 0], X_pca[:, 1], X_pca[:, 2], c=clusters,
                                        cmap='viridis', alpha=0.6, s=30)
                fig.colorbar(scatter, ax=ax_3d, label='Cluster', shrink=0.7)
                ax_3d.set_xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.2%})')
                ax_3d.set_ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.2%})')
                ax_3d.set_zlabel(f'PC3 ({pca.explained_variance_ratio_[2]:.2%})')
                ax_3d.set_title('3D PCA Visualization of Clusters')

                plt.tight_layout()
                plt.show()

                print(f"Total explained variance: {pca.explained_variance_ratio_.sum():.2%}")

            elif selected_viz == 'Feature Pairs':
                # Create pairplot of selected features with clusters
                plot_df = df_temp[features + ['Cluster']].copy()

                fig, axes = plt.subplots(len(features), len(features), figsize=(15, 15))

                for i, feat_i in enumerate(features):
                    for j, feat_j in enumerate(features):
                        if i == j:
                            # Diagonal - histogram
                            for cluster in range(k):
                                cluster_data = plot_df[plot_df['Cluster'] == cluster][feat_i]
                                axes[i, j].hist(cluster_data, alpha=0.5, bins=15,
                                                label=f'Cluster {cluster}')
                            axes[i, j].set_xlabel(feat_i)
                            if i == 0:
                                axes[i, j].set_ylabel('Frequency')
                        else:
                            # Off-diagonal - scatter
                            for cluster in range(k):
                                cluster_data = plot_df[plot_df['Cluster'] == cluster]
                                axes[i, j].scatter(cluster_data[feat_j], cluster_data[feat_i],
                                                   alpha=0.5, s=20, label=f'Cluster {cluster}')
                            axes[i, j].set_xlabel(feat_j)
                            axes[i, j].set_ylabel(feat_i)

                # One shared legend, taken from an off-diagonal panel (always exists: >= 2 features)
                handles, labels = axes[0, 1].get_legend_handles_labels()
                fig.legend(handles, labels, loc='upper right', bbox_to_anchor=(0.95, 0.95))

                plt.suptitle(f'Cluster Analysis - {k} Clusters', fontsize=16, y=1.02)
                plt.tight_layout()
                plt.show()

            # Cluster interpretation
            display(HTML("<h4>üìù Cluster Interpretation:</h4>"))

            # Iterate over the labels actually present in the summary table
            for cluster in cluster_stats.index:
                cluster_size = cluster_stats.loc[cluster, 'Count']
                cluster_pct = cluster_stats.loc[cluster, 'Percentage']

                # Flag features whose cluster mean deviates >20% from the overall mean
                characteristics = []
                for feat in features:
                    feat_mean = cluster_stats.loc[cluster, feat]
                    overall_mean = df[feat].mean()
                    if feat_mean > overall_mean * 1.2:
                        characteristics.append(f"High {feat}")
                    elif feat_mean < overall_mean * 0.8:
                        characteristics.append(f"Low {feat}")

                # When GPA is among the features, lead with a performance band
                if 'GPA' in features:
                    if cluster_stats.loc[cluster, 'GPA'] >= 3.0:
                        perf = "High Performing"
                    elif cluster_stats.loc[cluster, 'GPA'] >= 2.0:
                        perf = "Average Performing"
                    else:
                        perf = "Low Performing"
                    characteristics.insert(0, perf)

                display(HTML(f"""
                <div style="background-color: #e6f3ff; padding: 10px; margin: 5px; border-radius: 5px;">
                    <b>Cluster {cluster}:</b> {cluster_size} students ({cluster_pct}%)<br>
                    <b>Profile:</b> {', '.join(characteristics) if characteristics else 'Mixed characteristics'}
                </div>
                """))

    # Connect button
    cluster_button.on_click(run_clustering)

    # Create UI
    ui = widgets.VBox([
        widgets.HTML("<h2>üî¨ Interactive Clustering Explorer</h2>"),
        widgets.HTML("<p>Discover natural groupings in student data:</p>"),
        widgets.HBox([n_clusters, viz_type]),
        cluster_features,
        cluster_button,
        output
    ])

    return ui

# Build the clustering explorer UI and render it as this cell's output.
# Nothing is clustered until the user presses the button inside the widget.
clustering_explorer = create_clustering_explorer()
display(clustering_explorer)

VBox(children=(HTML(value='<h2>üî¨ Interactive Clustering Explorer</h2>'), HTML(value='<p>Discover natural group‚Ä¶

In [7]:
def create_dashboard():
    """Build the static four-tab visualization dashboard.

    Each tab is an ``Output`` widget whose figures are drawn immediately, while
    this function runs, from the global ``df``. The global dataframe is only
    read; no columns are added to it.

    Returns:
        widgets.VBox: a title, a subtitle and the ``Tab`` widget holding the
        GPA, academic-factor, correlation and demographics views.
    """
    # Create tabs for different visualizations
    tab = widgets.Tab()

    # Tab 1: GPA Distribution
    def create_gpa_tab():
        """Render the GPA histogram and GPA box plots by gender, age and parental education."""
        output = widgets.Output()
        with output:
            fig, axes = plt.subplots(2, 2, figsize=(14, 10))

            # GPA Histogram
            axes[0, 0].hist(df['GPA'], bins=30, edgecolor='black', alpha=0.7, color='steelblue')
            axes[0, 0].set_title('GPA Distribution')
            axes[0, 0].set_xlabel('GPA')
            axes[0, 0].set_ylabel('Frequency')

            # GPA by Gender
            df.boxplot(column='GPA', by='Gender', ax=axes[0, 1])
            axes[0, 1].set_title('GPA by Gender')
            axes[0, 1].set_xlabel('Gender (0=Male, 1=Female)')

            # GPA by Age
            df.boxplot(column='GPA', by='Age', ax=axes[1, 0])
            axes[1, 0].set_title('GPA by Age')
            axes[1, 0].set_xlabel('Age')

            # GPA by Parental Education
            parental_edu_map = {0: 'None', 1: 'HS', 2: 'Some College', 3: 'Bachelor', 4: 'Higher'}
            # Attach the label column to a temporary frame; writing it into the
            # global df would change the dataset seen by every later cell.
            gpa_by_edu = df.assign(ParentalEdu_Label=df['ParentalEducation'].map(parental_edu_map))
            gpa_by_edu.boxplot(column='GPA', by='ParentalEdu_Label', ax=axes[1, 1])
            axes[1, 1].set_title('GPA by Parental Education')
            axes[1, 1].set_xlabel('Parental Education')
            axes[1, 1].tick_params(axis='x', rotation=45)

            # Replaces the automatic "Boxplot grouped by ..." super-title set by DataFrame.boxplot
            plt.suptitle('GPA Analysis Dashboard', fontsize=14)
            plt.tight_layout()
            plt.show()

        return output

    # Tab 2: Academic Factors
    def create_academic_tab():
        """Render study time, absences, parental support, grade class and activity views."""
        output = widgets.Output()
        with output:
            fig, axes = plt.subplots(2, 3, figsize=(18, 10))

            # Study Time vs GPA
            axes[0, 0].scatter(df['StudyTimeWeekly'], df['GPA'], alpha=0.5, color='green')
            axes[0, 0].set_xlabel('Study Time (hours/week)')
            axes[0, 0].set_ylabel('GPA')
            axes[0, 0].set_title('Study Time vs GPA')

            # Absences vs GPA
            axes[0, 1].scatter(df['Absences'], df['GPA'], alpha=0.5, color='red')
            axes[0, 1].set_xlabel('Absences')
            axes[0, 1].set_ylabel('GPA')
            axes[0, 1].set_title('Absences vs GPA')

            # Parental Support vs GPA: position bars at the levels actually
            # present instead of assuming exactly five levels
            support_means = df.groupby('ParentalSupport')['GPA'].mean()
            axes[0, 2].bar(support_means.index, support_means.values, color='purple', edgecolor='black')
            axes[0, 2].set_xlabel('Parental Support Level')
            axes[0, 2].set_ylabel('Average GPA')
            axes[0, 2].set_title('Parental Support vs GPA')
            axes[0, 2].set_xticks(list(support_means.index))

            # Grade Class Distribution
            grade_counts = df['GradeClass'].value_counts().sort_index()
            axes[1, 0].bar(grade_counts.index, grade_counts.values, color='orange', edgecolor='black')
            axes[1, 0].set_xlabel('Grade Class (0=Highest, 4=Lowest)')
            axes[1, 0].set_ylabel('Count')
            axes[1, 0].set_title('Grade Class Distribution')

            # Activities Impact: mean GPA of participants minus non-participants
            activities = ['Tutoring', 'Extracurricular', 'Sports', 'Music', 'Volunteering']
            impact_data = []
            for activity in activities:
                with_act = df[df[activity] == 1]['GPA'].mean()
                without_act = df[df[activity] == 0]['GPA'].mean()
                impact_data.append(with_act - without_act)

            axes[1, 1].bar(activities, impact_data, color='teal', edgecolor='black')
            axes[1, 1].set_xlabel('Activity')
            axes[1, 1].set_ylabel('GPA Difference')
            axes[1, 1].set_title('Impact of Activities on GPA')
            axes[1, 1].tick_params(axis='x', rotation=45)

            # Study Time Distribution by Grade Class
            for i in range(5):
                class_data = df[df['GradeClass'] == i]['StudyTimeWeekly']
                axes[1, 2].boxplot(class_data, positions=[i], widths=0.6)
            axes[1, 2].set_xlabel('Grade Class')
            axes[1, 2].set_ylabel('Study Time (hours/week)')
            axes[1, 2].set_title('Study Time Distribution by Grade Class')
            axes[1, 2].set_xticks(range(5))

            plt.tight_layout()
            plt.show()

        return output

    # Tab 3: Correlation Heatmap
    def create_correlation_tab():
        """Render the numeric-feature correlation heatmap and list correlations with GPA."""
        output = widgets.Output()
        with output:
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            corr_matrix = df[numeric_cols].corr()

            plt.figure(figsize=(14, 12))
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0,
                        square=True, linewidths=1, cbar_kws={"shrink": 0.8})
            plt.title('Correlation Matrix of All Numeric Features', fontsize=16)
            plt.tight_layout()
            plt.show()

            # Show correlations with GPA, strongest positive first
            print("\nüìà Top Correlations with GPA:")
            gpa_corr = corr_matrix['GPA'].sort_values(ascending=False)
            for col, corr in gpa_corr.items():
                if col != 'GPA':
                    print(f"  {col}: {corr:.3f}")

        return output

    # Tab 4: Student Demographics
    def create_demographics_tab():
        """Render gender, ethnicity, age, parental education and activity participation views."""
        output = widgets.Output()
        with output:
            fig, axes = plt.subplots(2, 3, figsize=(18, 10))

            # Gender Distribution. value_counts() orders by frequency, so sort by
            # code and derive each label from its code to keep slices and labels aligned.
            gender_names = {0: 'Male', 1: 'Female'}
            gender_counts = df['Gender'].value_counts().sort_index()
            gender_labels = [gender_names.get(code, str(code)) for code in gender_counts.index]
            axes[0, 0].pie(gender_counts.values, labels=gender_labels, autopct='%1.1f%%',
                           colors=['lightblue', 'lightcoral'], startangle=90)
            axes[0, 0].set_title('Gender Distribution')

            # Ethnicity Distribution
            ethnicity_labels = ['Caucasian', 'African American', 'Asian', 'Other']
            ethnicity_counts = df['Ethnicity'].value_counts().sort_index()
            axes[0, 1].bar(ethnicity_labels, ethnicity_counts.values, color='skyblue', edgecolor='black')
            axes[0, 1].set_title('Ethnicity Distribution')
            axes[0, 1].set_xlabel('Ethnicity')
            axes[0, 1].set_ylabel('Count')
            axes[0, 1].tick_params(axis='x', rotation=45)

            # Age Distribution
            age_counts = df['Age'].value_counts().sort_index()
            axes[0, 2].bar(age_counts.index, age_counts.values, color='lightgreen', edgecolor='black')
            axes[0, 2].set_title('Age Distribution')
            axes[0, 2].set_xlabel('Age')
            axes[0, 2].set_ylabel('Count')

            # Parental Education Distribution
            edu_labels = ['None', 'HS', 'Some College', 'Bachelor', 'Higher']
            edu_counts = df['ParentalEducation'].value_counts().sort_index()
            axes[1, 0].bar(edu_labels, edu_counts.values, color='gold', edgecolor='black')
            axes[1, 0].set_title('Parental Education Distribution')
            axes[1, 0].set_xlabel('Education Level')
            axes[1, 0].set_ylabel('Count')
            axes[1, 0].tick_params(axis='x', rotation=45)

            # Activities Participation: mean of a 0/1 flag is the participation rate
            activities = ['Tutoring', 'Extracurricular', 'Sports', 'Music', 'Volunteering']
            participation = [df[activity].mean() * 100 for activity in activities]
            axes[1, 1].bar(activities, participation, color='coral', edgecolor='black')
            axes[1, 1].set_title('Activity Participation (%)')
            axes[1, 1].set_xlabel('Activity')
            axes[1, 1].set_ylabel('Participation (%)')
            axes[1, 1].tick_params(axis='x', rotation=45)

            # GPA by Ethnicity (groupby sorts by code, matching ethnicity_labels order)
            ethnicity_gpa = df.groupby('Ethnicity')['GPA'].mean()
            axes[1, 2].bar(ethnicity_labels, ethnicity_gpa.values, color='purple', edgecolor='black')
            axes[1, 2].set_title('Average GPA by Ethnicity')
            axes[1, 2].set_xlabel('Ethnicity')
            axes[1, 2].set_ylabel('Average GPA')
            axes[1, 2].tick_params(axis='x', rotation=45)

            plt.tight_layout()
            plt.show()

        return output

    # Set up tabs
    tab.children = [
        create_gpa_tab(),
        create_academic_tab(),
        create_correlation_tab(),
        create_demographics_tab()
    ]

    tab.set_title(0, 'üìä GPA Analysis')
    tab.set_title(1, 'üìö Academic Factors')
    tab.set_title(2, 'üîó Correlations')
    tab.set_title(3, 'üë• Demographics')

    return widgets.VBox([
        widgets.HTML("<h1 style='text-align: center; color: #2c3e50;'>üìà Student Performance Dashboard</h1>"),
        widgets.HTML("<p style='text-align: center;'>Interactive visualization of student performance data</p>"),
        tab
    ])

# Build the visualization dashboard and render it as this cell's output.
# All figures are drawn while create_dashboard() runs, so this call does the plotting work.
dashboard = create_dashboard()
display(dashboard)

VBox(children=(HTML(value="<h1 style='text-align: center; color: #2c3e50;'>üìà Student Performance Dashboard</h1‚Ä¶

In [10]:
# Fix for the missing DecisionTreeRegressor import
from sklearn.tree import DecisionTreeRegressor  # Add this missing import

def create_model_comparison():
    """Train several classifiers and regressors and render a side-by-side comparison.

    Using the global ``df``, the function predicts ``GradeClass`` (classification)
    and ``GPA`` (regression) from eight behavioural features on one shared 80/20
    split. It draws accuracy and R²/RMSE bar charts, shows result tables, and
    reports 5-fold cross-validated accuracy for the classifiers. All training
    happens while this function runs.

    Returns:
        widgets.VBox: a heading plus the ``Output`` widget with the rendered results.
    """
    # Local import: only this function needs it
    from sklearn.pipeline import make_pipeline

    output = widgets.Output()

    with output:
        display(HTML("<h3>üìä Model Performance Comparison</h3>"))

        # Prepare data
        features = ['StudyTimeWeekly', 'Absences', 'ParentalSupport', 'Tutoring',
                    'Extracurricular', 'Sports', 'Music', 'Volunteering']
        X = df[features]
        y_class = df['GradeClass']
        y_reg = df['GPA']

        # One split for both targets, so the rows are aligned by construction
        # rather than by two calls that happen to share a random_state
        (X_train, X_test,
         y_train_class, y_test_class,
         y_train_reg, y_test_reg) = train_test_split(
            X, y_class, y_reg, test_size=0.2, random_state=42
        )

        # Scale features (statistics come from the training rows only)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Classification models
        class_models = {
            'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
            'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
            'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
        }

        # Regression models
        reg_models = {
            'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
            'Linear Regression': LinearRegression(),
            'Decision Tree': DecisionTreeRegressor(max_depth=5, random_state=42)
        }

        # Evaluate classification models on the held-out test rows
        class_results = []
        for name, model in class_models.items():
            model.fit(X_train_scaled, y_train_class)
            y_pred = model.predict(X_test_scaled)
            accuracy = accuracy_score(y_test_class, y_pred)
            class_results.append({'Model': name, 'Accuracy': accuracy})

        # Evaluate regression models on the held-out test rows
        reg_results = []
        for name, model in reg_models.items():
            model.fit(X_train_scaled, y_train_reg)
            y_pred = model.predict(X_test_scaled)
            r2 = r2_score(y_test_reg, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test_reg, y_pred))
            reg_results.append({'Model': name, 'R¬≤': r2, 'RMSE': rmse})

        # Display results
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Classification comparison
        class_df = pd.DataFrame(class_results)
        axes[0].bar(class_df['Model'], class_df['Accuracy'], color=['skyblue', 'lightgreen', 'coral'])
        axes[0].set_title('Classification Model Accuracy')
        axes[0].set_xlabel('Model')
        axes[0].set_ylabel('Accuracy')
        axes[0].set_ylim(0, 1)
        for i, v in enumerate(class_df['Accuracy']):
            axes[0].text(i, v + 0.01, f'{v:.3f}', ha='center')

        # Regression comparison: RMSE is divided by 4 only to share the axis with R²
        reg_df = pd.DataFrame(reg_results)
        x = np.arange(len(reg_df))
        width = 0.35
        axes[1].bar(x - width/2, reg_df['R¬≤'], width, label='R¬≤', color='skyblue')
        axes[1].bar(x + width/2, reg_df['RMSE']/4, width, label='RMSE/4', color='lightcoral')
        axes[1].set_title('Regression Model Performance')
        axes[1].set_xlabel('Model')
        axes[1].set_ylabel('Score')
        axes[1].set_xticks(x)
        axes[1].set_xticklabels(reg_df['Model'])
        axes[1].legend()

        plt.tight_layout()
        plt.show()

        # Display detailed results
        display(HTML("<h4>üìã Detailed Results:</h4>"))

        # Create HTML tables
        class_html = class_df.to_html(index=False, classes="table table-striped")
        reg_html = reg_df.round(4).to_html(index=False, classes="table table-striped")

        display(HTML(f"""
        <div style="display: flex; gap: 20px;">
            <div style="flex: 1; padding: 10px; background-color: #f8f9fa; border-radius: 5px;">
                <h5>Classification Models</h5>
                {class_html}
            </div>
            <div style="flex: 1; padding: 10px; background-color: #f8f9fa; border-radius: 5px;">
                <h5>Regression Models</h5>
                {reg_html}
            </div>
        </div>
        """))

        # Cross-validation results
        display(HTML("<h4>üîÑ Cross-Validation Results (5-fold):</h4>"))

        cv_results = []
        for name, model in class_models.items():
            # Scale inside the pipeline so each fold fits its scaler on that
            # fold's training part only; pre-scaled data would leak validation
            # statistics. cross_val_score clones the estimator, so the fitted
            # models above are left untouched.
            cv_estimator = make_pipeline(StandardScaler(), model)
            scores = cross_val_score(cv_estimator, X_train, y_train_class, cv=5, scoring='accuracy')
            cv_results.append({
                'Model': name,
                'Mean CV Score': scores.mean(),
                'Std CV Score': scores.std()
            })

        cv_df = pd.DataFrame(cv_results)
        display(cv_df.round(4))

    return widgets.VBox([
        widgets.HTML("<h2>üìä Model Performance Comparison</h2>"),
        output
    ])

In [11]:
# Assemble every interactive component into one tabbed "platform" view.

# Build the components first (construction order is independent of tab order).
data_explorer = create_data_explorer()
model_trainer = create_model_trainer()
risk_predictor = create_risk_predictor()
clustering_explorer = create_clustering_explorer()
dashboard = create_dashboard()
model_comparison = create_model_comparison()

# (title, widget) pairs in the order the tabs should appear.
main_tab_layout = [
    ('üîç Data Explorer', data_explorer),
    ('üìä Dashboard', dashboard),
    ('ü§ñ Model Trainer', model_trainer),
    ('üìà Model Comparison', model_comparison),
    ('üéØ Risk Predictor', risk_predictor),
    ('üî¨ Clustering', clustering_explorer),
]

main_tab = widgets.Tab()
main_tab.children = [tab_widget for tab_title, tab_widget in main_tab_layout]
for tab_index, (tab_title, tab_widget) in enumerate(main_tab_layout):
    main_tab.set_title(tab_index, tab_title)

# Banner shown above the tabs.
platform_banner_html = """
<div style="background-color: #2c3e50; color: white; padding: 20px; border-radius: 10px; margin-bottom: 20px;">
    <h1 style="text-align: center;">üéì Student Performance Analytics Platform</h1>
    <p style="text-align: center; font-size: 16px;">
        Comprehensive Data Mining Project for Student Performance Analysis
    </p>
</div>
"""
display(HTML(platform_banner_html))

display(main_tab)

# Project summary shown below the tabs; the two placeholders receive the
# row count and column count of the loaded dataframe.
project_summary_template = """
<div style="background-color: #f8f9fa; padding: 20px; border-radius: 10px; margin-top: 20px;">
    <h3>üìã Project Overview</h3>
    <p>This interactive data mining platform provides comprehensive analysis of student performance data including:</p>
    <ul>
        <li><b>Data Exploration:</b> Interactive tools to explore student demographics and academic factors</li>
        <li><b>Visualization Dashboard:</b> Comprehensive visualizations of key patterns and relationships</li>
        <li><b>Predictive Modeling:</b> Train and compare multiple ML models for GPA and grade prediction</li>
        <li><b>Risk Assessment:</b> Real-time risk prediction for individual students</li>
        <li><b>Student Segmentation:</b> Discover natural groupings using clustering algorithms</li>
    </ul>
    <p><b>Dataset Size:</b> {} students with {} features</p>
</div>
"""
display(HTML(project_summary_template.format(len(df), len(df.columns))))

Tab(children=(VBox(children=(HTML(value='<h2>üîç Interactive Data Explorer</h2>'), HBox(children=(Dropdown(descr‚Ä¶

In [12]:
# Create export functionality
def create_export_tools():
    """Build the export panel with buttons that write data, statistics and a text report.

    A processed copy of the global ``df`` is prepared up front with two derived
    columns (``TotalActivities`` and ``GPA_Category``). The three buttons write,
    into the current working directory:

    * ``student_performance_processed.csv`` - the processed copy,
    * ``student_statistics.csv`` - ``describe()`` of the processed copy,
    * ``student_performance_report.txt`` - a short plain-text summary.

    Returns:
        widgets.VBox: a heading plus the ``Output`` widget containing the buttons.
    """
    output = widgets.Output()

    with output:
        display(HTML("<h3>üì§ Export Options</h3>"))

        # Export processed data
        df_processed = df.copy()

        # Add derived features
        activity_columns = ['Tutoring', 'Extracurricular', 'Sports', 'Music', 'Volunteering']
        df_processed['TotalActivities'] = df[activity_columns].sum(axis=1)
        # Left-closed, open-ended bins: [..,1) [1,2) [2,3) [3,..). This matches the
        # >= thresholds used in the text report. Right-closed bins starting at 0
        # would leave a GPA of exactly 0.0 uncategorised (NaN) and put 3.0 in 'Average'.
        df_processed['GPA_Category'] = pd.cut(
            df['GPA'],
            bins=[-np.inf, 1.0, 2.0, 3.0, np.inf],
            labels=['Failing', 'Below Average', 'Average', 'Good'],
            right=False
        )

        # Export buttons
        export_data = widgets.Button(
            description='üíæ Export Processed Data',
            button_style='primary',
            layout=widgets.Layout(width='250px', margin='5px')
        )

        export_stats = widgets.Button(
            description='üìä Export Statistics',
            button_style='success',
            layout=widgets.Layout(width='250px', margin='5px')
        )

        export_report = widgets.Button(
            description='üìÑ Export Summary Report',
            button_style='info',
            layout=widgets.Layout(width='250px', margin='5px')
        )

        # Status messages from the button callbacks are shown here
        export_output = widgets.Output()

        def on_export_data(b):
            """Write the processed dataframe to CSV."""
            with export_output:
                clear_output()
                df_processed.to_csv('student_performance_processed.csv', index=False)
                print("‚úÖ Data exported to 'student_performance_processed.csv'")

        def on_export_stats(b):
            """Write descriptive statistics of the processed dataframe to CSV."""
            with export_output:
                clear_output()
                stats = df_processed.describe()
                stats.to_csv('student_statistics.csv')
                print("‚úÖ Statistics exported to 'student_statistics.csv'")

        def on_export_report(b):
            """Write a plain-text summary report of the dataset."""
            with export_output:
                clear_output()

                total_students = len(df)
                gpa = df['GPA']
                # Report bands, highest first; each mask selects the students in that band
                gpa_bands = [
                    ('Good (3.0-4.0)', gpa >= 3.0),
                    ('Average (2.0-2.99)', (gpa >= 2.0) & (gpa < 3.0)),
                    ('Below Average (1.0-1.99)', (gpa >= 1.0) & (gpa < 2.0)),
                    ('Failing (0-0.99)', gpa < 1.0),
                ]

                # Generate summary report
                with open('student_performance_report.txt', 'w', encoding='utf-8') as f:
                    f.write("="*60 + "\n")
                    f.write("STUDENT PERFORMANCE ANALYSIS REPORT\n")
                    f.write("="*60 + "\n\n")

                    f.write(f"Total Students: {total_students}\n")
                    f.write(f"Average GPA: {df['GPA'].mean():.3f}\n")
                    f.write(f"Median Study Time: {df['StudyTimeWeekly'].median():.2f} hours\n")
                    f.write(f"Average Absences: {df['Absences'].mean():.2f}\n\n")

                    f.write("GPA Distribution:\n")
                    for cat, in_band in gpa_bands:
                        count = int(in_band.sum())
                        # Guard the percentage against an empty dataframe
                        share = count / total_students * 100 if total_students else 0.0
                        f.write(f"  {cat}: {count} ({share:.1f}%)\n")

                    f.write("\nKey Findings:\n")
                    f.write("  1. Study time and absences are strong predictors of GPA\n")
                    f.write("  2. Parental support significantly impacts student performance\n")
                    f.write("  3. Extracurricular activities have positive correlation with GPA\n")

                print("‚úÖ Report exported to 'student_performance_report.txt'")

        export_data.on_click(on_export_data)
        export_stats.on_click(on_export_stats)
        export_report.on_click(on_export_report)

        display(widgets.VBox([
            widgets.HBox([export_data, export_stats, export_report]),
            export_output
        ]))

    return widgets.VBox([
        widgets.HTML("<h2>üì• Export Tools</h2>"),
        output
    ])

# Append the export panel as the last tab of the existing main view.
export_tab = create_export_tools()
main_tab.children = [*main_tab.children, export_tab]
export_tab_index = len(main_tab.children) - 1
main_tab.set_title(export_tab_index, 'üì§ Export')

# Show the tab widget again so the new tab is visible under this cell.
display(main_tab)

Tab(children=(VBox(children=(HTML(value='<h2>üîç Interactive Data Explorer</h2>'), HBox(children=(Dropdown(descr‚Ä¶

In [13]:
# Setup and usage guide rendered as a static HTML panel:
# required packages, how to use the tabs, and what each tab contains.
usage_guide_html = """
<div style="background-color: #e8f4f8; padding: 20px; border-radius: 10px; margin-top: 20px;">
    <h3>üì¶ Required Libraries</h3>
    <p>Run the following commands to install required packages:</p>
    <pre style="background-color: #f0f0f0; padding: 10px; border-radius: 5px;">
pip install pandas numpy matplotlib seaborn scikit-learn ipywidgets jupyter
jupyter nbextension enable --py widgetsnbextension
    </pre>
    
    <h3>üöÄ How to Use</h3>
    <ol>
        <li>Run all cells in the Jupyter notebook</li>
        <li>Use the tabs at the top to navigate between different analysis tools</li>
        <li>Adjust widgets and buttons to customize your analysis</li>
        <li>Export results using the Export tab</li>
    </ol>
    
    <h3>üìÅ Project Structure</h3>
    <ul>
        <li><b>Data Explorer:</b> Interactive data inspection and basic statistics</li>
        <li><b>Dashboard:</b> Comprehensive visualizations of student data</li>
        <li><b>Model Trainer:</b> Train and evaluate machine learning models</li>
        <li><b>Model Comparison:</b> Compare different algorithms</li>
        <li><b>Risk Predictor:</b> Predict individual student performance</li>
        <li><b>Clustering:</b> Discover student segments</li>
        <li><b>Export:</b> Save results and reports</li>
    </ul>
</div>
"""
display(HTML(usage_guide_html))