In [1]:
# Complete Student Performance Data Mining Project with UI
# Copy this entire code into ONE Jupyter notebook cell and run

# Import all required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import io
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, mean_squared_error, r2_score
from sklearn.decomposition import PCA
import ipywidgets as widgets
from IPython.display import display, clear_output, HTML
import warnings
warnings.filterwarnings('ignore')

# Set style
# Global plotting defaults for every figure drawn later in this cell.
# NOTE(review): the 'seaborn-v0_8-whitegrid' style-sheet name is, as far as I
# know, only registered by Matplotlib 3.6+ (older releases call it
# 'seaborn-whitegrid') - confirm against the target environment.
plt.style.use('seaborn-v0_8-whitegrid')
# Default seaborn colour cycle for subsequent plots.
sns.set_palette("husl")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Function to generate sample data
def generate_sample_data(n_students=2000, seed=42):
    """Generate a synthetic student-performance dataset for demonstration.

    Parameters
    ----------
    n_students : int, default 2000
        Number of rows (students) to generate.
    seed : int or None, default 42
        Seed for the private random generator. The default reproduces the
        values the function has always produced. ``None`` gives a fresh,
        non-reproducible draw.

    Returns
    -------
    pandas.DataFrame
        One row per student with 15 columns: ``StudentID``, demographic and
        activity indicators, ``StudyTimeWeekly``, ``Absences``, a simulated
        ``GPA`` clipped to [0, 4.0] and a ``GradeClass`` (0.0 = highest GPA
        band ... 4.0 = lowest) derived from the GPA.
    """
    # A private legacy RandomState yields exactly the same stream as
    # ``np.random.seed(seed)`` followed by module-level ``np.random.*`` calls,
    # but it does not reseed NumPy's global generator as a hidden side effect.
    rng = np.random.RandomState(seed)

    # NOTE: the draw order below determines the generated values; keep it
    # stable so a given seed keeps producing the same dataset.
    data = {
        'StudentID': range(1001, 1001 + n_students),
        'Age': rng.choice([15, 16, 17, 18], n_students, p=[0.25, 0.25, 0.25, 0.25]),
        'Gender': rng.choice([0, 1], n_students, p=[0.5, 0.5]),
        'Ethnicity': rng.choice([0, 1, 2, 3], n_students, p=[0.4, 0.3, 0.2, 0.1]),
        'ParentalEducation': rng.choice([0, 1, 2, 3, 4], n_students, p=[0.1, 0.3, 0.3, 0.2, 0.1]),
        'StudyTimeWeekly': rng.uniform(0, 20, n_students),
        'Absences': rng.poisson(5, n_students),
        'Tutoring': rng.choice([0, 1], n_students, p=[0.7, 0.3]),
        'ParentalSupport': rng.choice([0, 1, 2, 3, 4], n_students, p=[0.1, 0.2, 0.3, 0.3, 0.1]),
        'Extracurricular': rng.choice([0, 1], n_students, p=[0.6, 0.4]),
        'Sports': rng.choice([0, 1], n_students, p=[0.55, 0.45]),
        'Music': rng.choice([0, 1], n_students, p=[0.65, 0.35]),
        'Volunteering': rng.choice([0, 1], n_students, p=[0.7, 0.3])
    }

    df = pd.DataFrame(data)

    # Generate GPA as a linear combination of the features plus Gaussian noise
    base_gpa = 2.0
    study_effect = df['StudyTimeWeekly'] * 0.08
    absence_effect = -df['Absences'] * 0.05
    support_effect = df['ParentalSupport'] * 0.15
    tutoring_effect = df['Tutoring'] * 0.3
    activities_effect = (df['Extracurricular'] + df['Sports'] + df['Music'] + df['Volunteering']) * 0.1

    noise = rng.normal(0, 0.3, n_students)

    df['GPA'] = base_gpa + study_effect + absence_effect + support_effect + tutoring_effect + activities_effect + noise
    df['GPA'] = df['GPA'].clip(0, 4.0)

    # Generate GradeClass based on GPA. Bins are right-inclusive; the lowest
    # edge is -0.1 so that a GPA of exactly 0 still falls in the first bin.
    df['GradeClass'] = pd.cut(df['GPA'],
                              bins=[-0.1, 1.0, 2.0, 3.0, 3.5, 4.0],
                              labels=[4.0, 3.0, 2.0, 1.0, 0.0]).astype(float)

    return df

# Load data (use sample data for demonstration)
print("="*60)
print("STUDENT PERFORMANCE DATA MINING PROJECT")
print("="*60)

# Candidate CSV names, tried in order; the first one that loads wins.
possible_files = [
    'Student_performance_data_.csv',
    'Student_performance_data.csv',
    'student_performance_data_.csv',
    'student_performance_data.csv'
]

df = None
load_failed = False  # True when a candidate exists but could not be read
for csv_name in possible_files:
    try:
        df = pd.read_csv(csv_name)
    except FileNotFoundError:
        # Expected for most candidates: just try the next spelling.
        continue
    except (OSError, ValueError) as e:
        # The file is there but unreadable (permissions, empty file, parse or
        # decoding error). Report it instead of hiding it, then keep looking.
        load_failed = True
        print(f"‚ö†Ô∏è  Error loading file: {e}")
        continue
    print(f"‚úÖ Loaded: {csv_name}")
    break

# Fall back to synthetic data when no candidate could be loaded.
if df is None:
    if load_failed:
        print("‚úÖ Using generated sample data instead")
    else:
        print("‚ö†Ô∏è  Using generated sample data...")
    df = generate_sample_data()

print(f"\nüìä Dataset Shape: {df.shape}")
print(f"üë• Total Students: {len(df)}")

# Add derived columns: human-readable labels for the coded categoricals and a
# per-student count of the five binary activity flags.
df['Gender_Label'] = df['Gender'].map({0: 'Male', 1: 'Female'})
df['Ethnicity_Label'] = df['Ethnicity'].map({0: 'Caucasian', 1: 'African American', 2: 'Asian', 3: 'Other'})
df['ParentalEdu_Label'] = df['ParentalEducation'].map({0: 'None', 1: 'HS', 2: 'Some College', 3: 'Bachelor', 4: 'Higher'})
df['TotalActivities'] = df[['Tutoring', 'Extracurricular', 'Sports', 'Music', 'Volunteering']].sum(axis=1)

# Display welcome message
# The tool count in the banner matches the five tabs this notebook builds
# (Data Explorer, Dashboard, Model Trainer, Risk Predictor, Clustering).
display(HTML("""
<div style="background-color: #2c3e50; color: white; padding: 20px; border-radius: 10px; margin: 20px 0;">
    <h1 style="text-align: center;">üéì Student Performance Analytics Platform</h1>
    <p style="text-align: center; font-size: 16px;">
        Interactive Data Mining Project with 5 Analysis Tools
    </p>
</div>
"""))

# ============================================
# UI Component 1: Data Explorer
# ============================================
def create_data_explorer():
    """Build the "Data Explorer" tab.

    The tab holds an analysis-type dropdown, a multi-select of the columns of
    the module-level ``df`` and an output area. The output is re-rendered
    whenever either control's value changes.

    Returns
    -------
    ipywidgets.VBox
        Container with the heading, the two controls and the output area.
    """
    analysis_type = widgets.Dropdown(
        options=['Basic Statistics', 'Distribution Analysis', 'Correlation Analysis', 'Group Comparisons'],
        value='Basic Statistics',
        description='Analysis:',
        layout=widgets.Layout(width='300px')
    )

    column_selector = widgets.SelectMultiple(
        options=df.columns.tolist(),
        value=['GPA', 'StudyTimeWeekly', 'Absences'],
        description='Columns:',
        layout=widgets.Layout(width='300px', height='150px')
    )

    output = widgets.Output()

    # Column used to split students in the 'Group Comparisons' view.
    group_col = 'Gender_Label'

    def update_analysis(change):
        """Redraw the output area for the current control values."""
        selected = list(column_selector.value)
        numeric_cols = set(df.select_dtypes(include=[np.number]).columns)
        numeric_selected = [c for c in selected if c in numeric_cols]
        mode = analysis_type.value

        with output:
            clear_output(wait=True)

            # describe() raises on a frame without columns, so bail out early.
            if not selected:
                print("Select at least one column.")
                return

            if mode == 'Basic Statistics':
                display(df[selected].describe().round(2))

            elif mode == 'Distribution Analysis':
                if not numeric_selected:
                    print("Select at least one numeric column.")
                    return
                n_plots = len(numeric_selected)
                n_cols = min(3, n_plots)
                n_rows = -(-n_plots // n_cols)  # ceiling division
                fig, axes = plt.subplots(n_rows, n_cols,
                                         figsize=(5 * n_cols, 4 * n_rows),
                                         squeeze=False)
                flat_axes = axes.ravel()
                for ax, col in zip(flat_axes, numeric_selected):
                    ax.hist(df[col].dropna(), bins=20, edgecolor='black')
                    ax.set_title(f'{col} Distribution')
                    ax.set_xlabel(col)
                    ax.set_ylabel('Count')
                # Hide the unused cells of the grid.
                for ax in flat_axes[n_plots:]:
                    ax.set_visible(False)
                fig.tight_layout()
                plt.show()

            elif mode == 'Correlation Analysis':
                if len(numeric_selected) > 1:
                    plt.figure(figsize=(8,6))
                    sns.heatmap(df[numeric_selected].corr(), annot=True, cmap='coolwarm', center=0)
                    plt.title('Correlation Matrix')
                    plt.show()
                else:
                    print("Select at least two numeric columns.")

            elif mode == 'Group Comparisons':
                if group_col not in df.columns:
                    print(f"Column '{group_col}' is not available for grouping.")
                elif not numeric_selected:
                    print("Select at least one numeric column.")
                else:
                    # Mean of each selected numeric column per group.
                    display(df.groupby(group_col)[numeric_selected].mean().round(2))

    # names='value' restricts the callback to real value changes; without it
    # the handler fires for every trait a single user action touches.
    analysis_type.observe(update_analysis, names='value')
    column_selector.observe(update_analysis, names='value')
    update_analysis(None)

    return widgets.VBox([
        widgets.HTML("<h3>üîç Data Explorer</h3>"),
        widgets.HBox([analysis_type, column_selector]),
        output
    ])

# ============================================
# UI Component 2: Model Trainer
# ============================================
def create_model_trainer():
    """Build the "Model Trainer" tab.

    Two dropdowns choose the problem type (classification on ``GradeClass``
    or regression on ``GPA``) and the estimator family. Pressing the button
    fits the model on an 80/20 split of the module-level ``df`` and prints
    the evaluation metrics into the tab's output area.

    Returns
    -------
    ipywidgets.VBox
        Container with the heading, the dropdowns, the button and the output.
    """
    problem = widgets.Dropdown(
        options=['Classification', 'Regression'],
        value='Classification',
        description='Problem:',
        layout=widgets.Layout(width='200px')
    )

    model_type = widgets.Dropdown(
        options=['Random Forest', 'Decision Tree'],
        value='Random Forest',
        description='Model:',
        layout=widgets.Layout(width='200px')
    )

    train_btn = widgets.Button(description='üöÄ Train Model', button_style='success')
    output = widgets.Output()

    def train_model(b):
        """Button callback: split, scale, fit, predict and report."""
        with output:
            clear_output()
            feature_names = ['StudyTimeWeekly', 'Absences', 'ParentalSupport', 'TotalActivities']
            X = df[feature_names]

            is_classification = problem.value == 'Classification'
            use_forest = model_type.value == 'Random Forest'

            # The target column depends on the problem type.
            if is_classification:
                y = df['GradeClass']
            else:
                y = df['GPA']

            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42
            )

            # Fit the scaler on the training part only, then apply to both.
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            # Choose the estimator for the (problem, model family) pair.
            if is_classification and use_forest:
                model = RandomForestClassifier(n_estimators=100, random_state=42)
            elif is_classification:
                model = DecisionTreeClassifier(max_depth=5, random_state=42)
            elif use_forest:
                model = RandomForestRegressor(n_estimators=100, random_state=42)
            else:
                model = DecisionTreeRegressor(max_depth=5, random_state=42)

            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            if is_classification:
                print(f"‚úÖ Accuracy: {accuracy_score(y_test, y_pred):.4f}")
                print("\nüìã Classification Report:")
                print(classification_report(y_test, y_pred))
            else:
                print(f"‚úÖ R¬≤ Score: {r2_score(y_test, y_pred):.4f}")
                print(f"‚úÖ RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}")

    train_btn.on_click(train_model)

    return widgets.VBox([
        widgets.HTML("<h3>ü§ñ Model Trainer</h3>"),
        widgets.HBox([problem, model_type]),
        train_btn,
        output
    ])

# ============================================
# UI Component 3: Risk Predictor
# ============================================
def create_risk_predictor():
    """Build the "Student Risk Predictor" tab.

    Four sliders describe a hypothetical student. Pressing the button
    predicts that student's GPA with a random-forest regressor trained on the
    module-level ``df`` and shows a risk label: low at GPA >= 3.0, medium at
    GPA >= 2.0, high otherwise.

    Returns
    -------
    ipywidgets.VBox
        Container with the heading, the sliders, the button and the output.
    """
    features = ['StudyTimeWeekly', 'Absences', 'ParentalSupport', 'TotalActivities']

    study = widgets.FloatSlider(value=10, min=0, max=20, description='Study Time:')
    absences = widgets.IntSlider(value=5, min=0, max=30, description='Absences:')
    support = widgets.IntSlider(value=2, min=0, max=4, description='Parent Support:')
    activities = widgets.IntSlider(value=2, min=0, max=5, description='Activities:')

    predict_btn = widgets.Button(description='üéØ Predict Risk', button_style='danger')
    output = widgets.Output()

    # The model depends only on df and a fixed random_state, so it is fitted
    # once on first use instead of refitting 100 trees on every click.
    model_cache = {}

    def get_model():
        """Return the fitted regressor, training it on the first call."""
        if 'model' not in model_cache:
            fitted = RandomForestRegressor(n_estimators=100, random_state=42)
            fitted.fit(df[features], df['GPA'])
            model_cache['model'] = fitted
        return model_cache['model']

    def predict(b):
        """Button callback: predict GPA for the slider values and show risk."""
        with output:
            clear_output()
            model = get_model()

            # Predict from a frame with the training column names so that
            # scikit-learn's feature-name check is satisfied.
            new_data = pd.DataFrame(
                [[study.value, absences.value, support.value, activities.value]],
                columns=features
            )
            pred_gpa = model.predict(new_data)[0]

            # Determine risk
            if pred_gpa >= 3.0:
                risk = "üü¢ LOW RISK"
                color = "green"
            elif pred_gpa >= 2.0:
                risk = "üü° MEDIUM RISK"
                color = "orange"
            else:
                risk = "üî¥ HIGH RISK"
                color = "red"

            display(HTML(f"""
            <div style="background: #f0f0f0; padding: 15px; border-radius: 10px;">
                <h3>Predicted GPA: {pred_gpa:.2f}</h3>
                <h3 style="color: {color};">{risk}</h3>
            </div>
            """))

    predict_btn.on_click(predict)

    return widgets.VBox([
        widgets.HTML("<h3>üéØ Student Risk Predictor</h3>"),
        study, absences, support, activities,
        predict_btn, output
    ])

# ============================================
# UI Component 4: Clustering Explorer
# ============================================
def create_clustering():
    """Build the "Clustering Explorer" tab.

    A slider selects the number of clusters. Pressing the button runs K-means
    on the standardised ``StudyTimeWeekly``, ``Absences`` and ``GPA`` columns
    of the module-level ``df``. It then shows a 2-component PCA scatter
    coloured by cluster, a bar chart of per-cluster feature means and the
    same means as a table.

    The shared ``df`` is only read, never modified: cluster labels live in a
    local copy, so running the tool does not add a ``Cluster`` column to the
    data the other tabs use.

    Returns
    -------
    ipywidgets.VBox
        Container with the heading, the slider, the button and the output.
    """
    n_clusters = widgets.IntSlider(value=4, min=2, max=8, description='Clusters:')
    run_btn = widgets.Button(description='üîç Run Clustering', button_style='info')
    output = widgets.Output()

    def run_cluster(b):
        """Button callback: cluster, project with PCA and plot."""
        with output:
            clear_output()
            features = ['StudyTimeWeekly', 'Absences', 'GPA']
            X = df[features]

            # Standardise so no feature dominates the distance computation.
            scaler = StandardScaler()
            X_scaled = scaler.fit_transform(X)

            kmeans = KMeans(n_clusters=n_clusters.value, random_state=42, n_init=10)
            clusters = kmeans.fit_predict(X_scaled)

            # PCA for visualization
            pca = PCA(n_components=2)
            X_pca = pca.fit_transform(X_scaled)

            fig, (ax_scatter, ax_bars) = plt.subplots(1, 2, figsize=(10, 5))

            scatter = ax_scatter.scatter(X_pca[:,0], X_pca[:,1], c=clusters, cmap='viridis')
            fig.colorbar(scatter, ax=ax_scatter)
            ax_scatter.set_title('Student Clusters')
            ax_scatter.set_xlabel('PC 1')
            ax_scatter.set_ylabel('PC 2')

            # assign() returns a new frame, leaving the global df untouched.
            cluster_stats = X.assign(Cluster=clusters).groupby('Cluster')[features].mean()
            cluster_stats.plot(kind='bar', ax=ax_bars)
            ax_bars.set_title('Cluster Characteristics')
            ax_bars.tick_params(axis='x', rotation=0)

            fig.tight_layout()
            plt.show()

            display(cluster_stats.round(2))

    run_btn.on_click(run_cluster)

    return widgets.VBox([
        widgets.HTML("<h3>üî¨ Clustering Explorer</h3>"),
        n_clusters, run_btn, output
    ])

# ============================================
# UI Component 5: Dashboard
# ============================================
def create_dashboard():
    """Build the "Quick Dashboard" tab.

    Draws a 2x3 grid of summary plots of the module-level ``df`` once, when
    the tab is created: GPA histogram, GPA by gender, study time vs GPA,
    absences vs GPA, grade-class counts and the mean GPA difference between
    participants and non-participants of each activity.

    Returns
    -------
    ipywidgets.VBox
        Container with the heading and the output area holding the figure.
    """
    output = widgets.Output()

    with output:
        fig, axes = plt.subplots(2, 3, figsize=(15, 10))

        # GPA Distribution
        axes[0,0].hist(df['GPA'], bins=20, edgecolor='black')
        axes[0,0].set_title('GPA Distribution')

        # GPA by Gender
        df.boxplot(column='GPA', by='Gender', ax=axes[0,1])
        axes[0,1].set_title('GPA by Gender')
        # boxplot(by=...) sets a figure-level "Boxplot grouped by ..." title,
        # which would sit above the whole grid; clear it.
        fig.suptitle('')

        # Study Time vs GPA
        axes[0,2].scatter(df['StudyTimeWeekly'], df['GPA'], alpha=0.5)
        axes[0,2].set_title('Study Time vs GPA')
        axes[0,2].set_xlabel('Study Time')
        axes[0,2].set_ylabel('GPA')

        # Absences vs GPA
        axes[1,0].scatter(df['Absences'], df['GPA'], alpha=0.5, color='red')
        axes[1,0].set_title('Absences vs GPA')
        axes[1,0].set_xlabel('Absences')
        axes[1,0].set_ylabel('GPA')

        # Grade Class Distribution
        df['GradeClass'].value_counts().sort_index().plot(kind='bar', ax=axes[1,1])
        axes[1,1].set_title('Grade Class Distribution')

        # Activities Impact: same five flags that make up TotalActivities.
        activities = ['Tutoring', 'Extracurricular', 'Sports', 'Music', 'Volunteering']
        impact = [df[df[a]==1]['GPA'].mean() - df[df[a]==0]['GPA'].mean() for a in activities]
        axes[1,2].bar(activities, impact)
        axes[1,2].set_title('Activity Impact on GPA')
        axes[1,2].tick_params(axis='x', rotation=45)

        plt.tight_layout()
        plt.show()

    return widgets.VBox([
        widgets.HTML("<h3>üìä Quick Dashboard</h3>"),
        output
    ])

# ============================================
# Create Main Tab Interface
# ============================================
tab = widgets.Tab()

# One (caption, factory) pair per tab, in display order. Every factory is
# called exactly once to build that tab's widget tree.
tab_specs = [
    ('üîç Data Explorer', create_data_explorer),
    ('üìä Dashboard', create_dashboard),
    ('ü§ñ Model Trainer', create_model_trainer),
    ('üéØ Risk Predictor', create_risk_predictor),
    ('üî¨ Clustering', create_clustering),
]

# Build all tab contents first, then label them by position.
tab.children = [build_tab() for _, build_tab in tab_specs]
for tab_index, (caption, _) in enumerate(tab_specs):
    tab.set_title(tab_index, caption)

# Display the main interface
display(tab)

# Usage notes shown under the tabs; the two placeholders receive the row and
# column counts of the dataset.
instructions_template = """
<div style="background-color: #e8f4f8; padding: 15px; border-radius: 10px; margin-top: 20px;">
    <h3>üìå How to Use:</h3>
    <ul>
        <li>Click on different tabs to access various analysis tools</li>
        <li>Use sliders, dropdowns, and buttons to interact with the data</li>
        <li>Results will appear below each control</li>
        <li>All visualizations update in real-time</li>
    </ul>
    <p><b>Dataset:</b> {} students, {} features</p>
</div>
"""
student_count = len(df)
feature_count = len(df.columns)
display(HTML(instructions_template.format(student_count, feature_count)))

STUDENT PERFORMANCE DATA MINING PROJECT
‚ö†Ô∏è  Using generated sample data...

üìä Dataset Shape: (2000, 15)
üë• Total Students: 2000


Tab(children=(VBox(children=(HTML(value='<h3>üîç Data Explorer</h3>'), HBox(children=(Dropdown(description='Anal‚Ä¶