# Interactive Data Science Notebook Demo

This notebook demonstrates the capabilities of the Smart Classroom LMS Jupyter notebook execution system. It includes:

- Data visualization with matplotlib and plotly
- Pandas DataFrame operations  
- NumPy mathematical computations
- Error handling and data validation
- Interactive widgets (where supported)

**Note**: This notebook will be executed on the EC2 instance at 16.16.214.215 and rendered as HTML output.

## 1. Import Required Libraries

First, we'll install and import all necessary libraries for data science operations.

In [None]:
# Install required packages (these will be automatically detected and installed)
!pip install matplotlib pandas numpy plotly seaborn scikit-learn
!pip install ipywidgets

In [None]:
# Import libraries with error handling
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import plotly.express as px
    import plotly.graph_objects as go
    import seaborn as sns
    from sklearn.datasets import make_classification, make_regression
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.metrics import r2_score
    import warnings
except ImportError as e:
    print(f"❌ Error importing libraries: {e}")
    print("Please ensure all required packages are installed.")
else:
    # This branch runs only when every import above succeeded, so pd, np,
    # plt and sns are guaranteed to be bound here. Previously the style
    # setup ran unconditionally and raised NameError after a failed import.

    # Suppress all warnings for the rest of the session.
    warnings.filterwarnings('ignore')

    print("✅ All libraries imported successfully!")
    print(f"Pandas version: {pd.__version__}")
    print(f"NumPy version: {np.__version__}")
    print(f"Matplotlib version: {plt.matplotlib.__version__}")

    # Set up plotting style
    plt.style.use('default')
    sns.set_palette("husl")

## 2. Generate Sample Dataset

Create synthetic datasets with different characteristics to demonstrate various visualization techniques.

In [None]:
# Generate diverse sample datasets
# Seed NumPy's global random generator so the np.random draws below are reproducible.
np.random.seed(42)

def create_sample_data():
    """Create the three synthetic demo datasets.

    All random draws come from NumPy's global random state, so call
    ``np.random.seed`` beforehand for reproducible output.

    Returns:
        tuple: ``(df_sales, df_students, df_time)``:

        - ``df_sales``: 100 daily rows with ``date``, ``sales`` (clipped at
          zero), ``region`` and ``product``.
        - ``df_students``: 200 rows with ``student_id``, ``math_score`` and
          ``science_score`` (clipped to 0-100), ``grade`` and
          ``study_hours`` (clipped to 0-10).
        - ``df_time``: 365 hourly rows with ``timestamp``, ``temperature``
          and ``humidity``.

        If anything fails, the error is printed and ``(None, None, None)``
        is returned instead of raising.
    """
    try:
        # Dataset 1: Sales data
        dates = pd.date_range('2023-01-01', periods=100, freq='D')
        sales_data = {
            'date': dates,
            'sales': np.random.normal(1000, 200, 100) + np.sin(np.arange(100) * 0.1) * 100,
            'region': np.random.choice(['North', 'South', 'East', 'West'], 100),
            'product': np.random.choice(['Product A', 'Product B', 'Product C'], 100)
        }
        df_sales = pd.DataFrame(sales_data)
        df_sales['sales'] = df_sales['sales'].clip(lower=0)  # Ensure non-negative sales

        # Dataset 2: Student performance data
        students_data = {
            'student_id': range(1, 201),
            'math_score': np.random.normal(75, 15, 200).clip(0, 100),
            'science_score': np.random.normal(78, 12, 200).clip(0, 100),
            'grade': np.random.choice(['A', 'B', 'C', 'D'], 200, p=[0.2, 0.3, 0.3, 0.2]),
            'study_hours': np.random.exponential(3, 200).clip(0, 10)
        }
        df_students = pd.DataFrame(students_data)

        # Dataset 3: Time series data
        # NOTE(review): the sine/cosine terms complete one cycle over the 365
        # samples while the timestamps are hourly - confirm whether a daily
        # frequency was intended.
        time_data = {
            # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated
            # since pandas 2.2. periods=365 already fixes the length, so no
            # extra slicing is needed.
            'timestamp': pd.date_range('2023-01-01', periods=365, freq='h'),
            'temperature': 20 + 10 * np.sin(np.arange(365) * 2 * np.pi / 365) + np.random.normal(0, 2, 365),
            'humidity': 50 + 20 * np.cos(np.arange(365) * 2 * np.pi / 365) + np.random.normal(0, 5, 365)
        }
        df_time = pd.DataFrame(time_data)

        print("✅ Sample datasets created successfully!")
        print(f"Sales data shape: {df_sales.shape}")
        print(f"Student data shape: {df_students.shape}")
        print(f"Time series data shape: {df_time.shape}")

        return df_sales, df_students, df_time

    except Exception as e:
        # Best-effort demo helper: report the failure and let callers check for None.
        print(f"❌ Error creating sample data: {e}")
        return None, None, None

# Build the three demo DataFrames
df_sales, df_students, df_time = create_sample_data()

# Preview the sales data, or report that dataset creation failed
if df_sales is None:
    print("❌ Failed to create datasets")
else:
    print("\n📊 Sales Data Sample:")
    print(df_sales.head())
    print(f"\nData types:\n{df_sales.dtypes}")

## 3. Create Basic Interactive Plots

Build static charts with matplotlib and interactive charts with plotly to demonstrate different chart types.

In [None]:
# Create matplotlib visualizations
def create_matplotlib_plots():
    """Draw a 2x2 matplotlib figure from the sales and student datasets.

    Reads the module-level ``df_sales`` and ``df_students``. A panel is
    drawn only when its dataset is not ``None``. Any exception is caught
    and reported with a printed message instead of being raised.
    """
    try:
        fig, grid = plt.subplots(2, 2, figsize=(15, 10))
        fig.suptitle('Data Science Visualization Demo - Matplotlib', fontsize=16, fontweight='bold')
        (ax_trend, ax_hist), (ax_region, ax_scatter) = grid

        # Top-left: sales as a line over time
        if df_sales is not None:
            ax_trend.plot(df_sales['date'], df_sales['sales'], color='blue', alpha=0.7)
            ax_trend.set_title('Sales Over Time')
            ax_trend.set_xlabel('Date')
            ax_trend.set_ylabel('Sales ($)')
            ax_trend.grid(True, alpha=0.3)
            ax_trend.tick_params(axis='x', rotation=45)

        # Top-right: overlaid histograms of the two score columns
        if df_students is not None:
            ax_hist.hist(df_students['math_score'], bins=20, alpha=0.7, color='green', label='Math')
            ax_hist.hist(df_students['science_score'], bins=20, alpha=0.7, color='orange', label='Science')
            ax_hist.set_title('Score Distribution')
            ax_hist.set_xlabel('Score')
            ax_hist.set_ylabel('Frequency')
            ax_hist.legend()
            ax_hist.grid(True, alpha=0.3)

        # Bottom-left: mean sales per region as bars
        if df_sales is not None:
            mean_by_region = df_sales.groupby('region')['sales'].mean()
            ax_region.bar(mean_by_region.index, mean_by_region.values, color=['red', 'blue', 'green', 'orange'])
            ax_region.set_title('Average Sales by Region')
            ax_region.set_xlabel('Region')
            ax_region.set_ylabel('Average Sales ($)')
            ax_region.grid(True, alpha=0.3)

        # Bottom-right: study hours vs math score, coloured by science score
        if df_students is not None:
            points = ax_scatter.scatter(df_students['study_hours'], df_students['math_score'],
                                        c=df_students['science_score'], cmap='viridis', alpha=0.6)
            ax_scatter.set_title('Study Hours vs Math Score')
            ax_scatter.set_xlabel('Study Hours')
            ax_scatter.set_ylabel('Math Score')
            ax_scatter.grid(True, alpha=0.3)
            plt.colorbar(points, ax=ax_scatter, label='Science Score')

        plt.tight_layout()
        plt.show()
        print("✅ Matplotlib plots created successfully!")

    except Exception as e:
        print(f"❌ Error creating matplotlib plots: {e}")

# Render the matplotlib figure
create_matplotlib_plots()

In [None]:
# Create interactive plotly visualizations
def create_plotly_plots():
    """Show three interactive plotly charts built from the demo datasets.

    Reads the module-level ``df_students`` and ``df_sales``. A chart is
    produced only when its dataset is not ``None``. Any exception is
    caught and reported with a printed message instead of being raised.
    """
    def _render(figure):
        # Every chart uses the same fixed canvas size before being displayed.
        figure.update_layout(width=800, height=500)
        figure.show()

    try:
        # Scatter: study hours vs math score, by grade, sized by science score
        if df_students is not None:
            _render(px.scatter(df_students,
                               x='study_hours',
                               y='math_score',
                               color='grade',
                               size='science_score',
                               hover_data=['student_id', 'science_score'],
                               title='Interactive: Study Hours vs Math Score'))

        # Line: sales over time, one trace per region
        if df_sales is not None:
            _render(px.line(df_sales,
                            x='date',
                            y='sales',
                            color='region',
                            title='Interactive: Sales Trends by Region'))

        # Box: math score distribution per grade
        if df_students is not None:
            _render(px.box(df_students,
                           x='grade',
                           y='math_score',
                           title='Interactive: Math Score Distribution by Grade'))

        print("✅ Plotly interactive plots created successfully!")

    except Exception as e:
        print(f"❌ Error creating plotly plots: {e}")

# Render the plotly charts
create_plotly_plots()

## 4. Data Validation and Error Handling

Implement comprehensive error handling for data validation and processing edge cases.

In [None]:
# Data validation and error handling functions
def validate_dataframe(df, name="DataFrame"):
    """Print a validation report for ``df`` and say whether it passed.

    The report lists the shape, deep memory usage in KB, per-column
    missing-value counts, a tally of column dtypes and the number of
    duplicated rows.

    Args:
        df: The DataFrame to inspect. ``None`` and empty frames are rejected.
        name: Label used in every printed message.

    Returns:
        bool: ``True`` once the full report has been printed; ``False`` if
        ``df`` is ``None`` or empty, or if any check raised (the error is
        printed rather than propagated).
    """
    try:
        # Reject unusable input up front; the handler below reports the reason.
        if df is None:
            raise ValueError(f"{name} is None")
        if df.empty:
            raise ValueError(f"{name} is empty")

        print(f"✅ {name} Validation Results:")
        print(f"  - Shape: {df.shape}")
        size_kb = df.memory_usage(deep=True).sum() / 1024
        print(f"  - Memory usage: {size_kb:.2f} KB")

        # Missing values, listed per affected column
        null_counts = df.isnull().sum()
        if not null_counts.any():
            print("  - No missing values found")
        else:
            print(f"  - Missing values found:")
            for column, n_missing in null_counts[null_counts > 0].items():
                print(f"    {column}: {n_missing} ({n_missing/len(df)*100:.1f}%)")

        # How many columns there are of each dtype
        print(f"  - Data types: {df.dtypes.value_counts().to_dict()}")

        # Fully duplicated rows
        n_duplicates = df.duplicated().sum()
        if n_duplicates <= 0:
            print("  - No duplicate rows")
        else:
            print(f"  - ⚠️  {n_duplicates} duplicate rows found")

    except Exception as e:
        print(f"❌ Error validating {name}: {e}")
        return False
    else:
        return True

def safe_plot_creation(plot_func, *args, **kwargs):
    """Call ``plot_func(*args, **kwargs)`` without letting it raise.

    Args:
        plot_func: Callable to invoke.
        *args: Positional arguments forwarded to ``plot_func``.
        **kwargs: Keyword arguments forwarded to ``plot_func``.

    Returns:
        Whatever ``plot_func`` returns, or ``None`` if it raised; in that
        case the error and a short list of troubleshooting tips are printed.
    """
    try:
        outcome = plot_func(*args, **kwargs)
    except Exception as e:
        print(f"❌ Error creating plot: {e}")
        for line in (
            "📋 Troubleshooting tips:",
            "  - Check if data is not None",
            "  - Verify column names exist",
            "  - Ensure data types are appropriate",
        ):
            print(line)
        return None
    return outcome

# Validate all our datasets.
# Each call prints its own report; the True/False return values are not used here.
print("🔍 Data Validation Report\n" + "="*50)
validate_dataframe(df_sales, "Sales Data")
print()
validate_dataframe(df_students, "Student Data") 
print()
validate_dataframe(df_time, "Time Series Data")

# Test error handling with problematic data
print("\n🧪 Testing Error Handling\n" + "="*50)

# Test with None data: validate_dataframe reports the problem and returns False
print("Testing with None data:")
validate_dataframe(None, "None DataFrame")

# Test with empty DataFrame: rejected by the df.empty check
print("\nTesting with empty DataFrame:")
empty_df = pd.DataFrame()
validate_dataframe(empty_df, "Empty DataFrame")

# Test with data containing missing values:
# column A has 1 missing entry out of 5, column B has 2, column C is complete
print("\nTesting with missing values:")
test_data = pd.DataFrame({
    'A': [1, 2, None, 4, 5],
    'B': [None, 2, 3, None, 5],
    'C': [1, 2, 3, 4, 5]
})
validate_dataframe(test_data, "Test Data with Missing Values")

## 5. Machine Learning Demonstration

Showcase basic machine learning capabilities with visualization of results.

In [None]:
# Machine Learning Demonstration
def ml_demonstration():
    """Fit and plot a one-feature linear regression on synthetic data.

    Generates 100 noisy samples with ``make_regression``, holds out 20% as
    a test set, fits ``LinearRegression`` on the rest and prints the R²
    score, coefficient and intercept. It then shows a two-panel figure:
    the training points with the fitted line, and predicted vs actual
    values for the test set.

    Returns:
        tuple: ``(model, r2)`` for the fitted estimator and its test-set
        R² score, or ``(None, None)`` if any step raised (the error is
        printed).
    """
    try:
        print("🤖 Machine Learning Demo\n" + "="*30)

        # Synthetic regression problem with a fixed seed, split 80/20
        features, target = make_regression(n_samples=100, n_features=1, noise=10, random_state=42)
        feat_train, feat_test, target_train, target_test = train_test_split(
            features, target, test_size=0.2, random_state=42
        )

        # Fit on the training split
        regressor = LinearRegression()
        regressor.fit(feat_train, target_train)

        # Score on the held-out split
        predicted = regressor.predict(feat_test)
        r2 = r2_score(target_test, predicted)

        print(f"✅ Model trained successfully!")
        print(f"📊 R² Score: {r2:.3f}")
        print(f"📏 Coefficient: {regressor.coef_[0]:.3f}")
        print(f"📐 Intercept: {regressor.intercept_:.3f}")

        fig = plt.figure(figsize=(12, 5))

        # Left panel: training points and the fitted line across the feature range
        ax_fit = fig.add_subplot(1, 2, 1)
        ax_fit.scatter(feat_train, target_train, alpha=0.6, color='blue', label='Training Data')
        line_x = np.linspace(features.min(), features.max(), 100).reshape(-1, 1)
        line_y = regressor.predict(line_x)
        ax_fit.plot(line_x, line_y, color='red', linewidth=2, label='Model')
        ax_fit.set_xlabel('Feature')
        ax_fit.set_ylabel('Target')
        ax_fit.set_title('Linear Regression Model')
        ax_fit.legend()
        ax_fit.grid(True, alpha=0.3)

        # Right panel: predicted vs actual, with the y = x reference diagonal
        ax_cmp = fig.add_subplot(1, 2, 2)
        ax_cmp.scatter(target_test, predicted, alpha=0.6, color='green')
        diagonal = [target_test.min(), target_test.max()]
        ax_cmp.plot(diagonal, diagonal, 'r--', linewidth=2)
        ax_cmp.set_xlabel('Actual Values')
        ax_cmp.set_ylabel('Predicted Values')
        ax_cmp.set_title(f'Predictions vs Actual (R² = {r2:.3f})')
        ax_cmp.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

        return regressor, r2

    except Exception as e:
        print(f"❌ Error in ML demonstration: {e}")
        return None, None

# Train the demo model and keep its test-set score
model, score = ml_demonstration()

# The follow-up analysis needs a fitted model; otherwise just report the failure
if model is None:
    print("❌ Model creation failed - skipping additional analysis")
else:
    print("\n📈 Model Analysis:")
    print(f"  - The model explains {score*100:.1f}% of the variance in the data")
    print(f"  - For every unit increase in the feature, target increases by {model.coef_[0]:.2f}")

    # Run the model on three hand-picked inputs
    new_values = np.array([[-2], [0], [2]])
    predictions = model.predict(new_values)
    print("\n🔮 Sample Predictions:")
    for val, pred in zip(new_values.flatten(), predictions):
        print(f"  - Input: {val:4.1f} → Prediction: {pred:6.1f}")

## 6. Summary and Performance Analysis

Final summary of the notebook execution, including dataset shapes, model results, and memory usage.

In [None]:
# Summary and Performance Analysis
import time
import sys
from datetime import datetime

def generate_summary():
    """Print the end-of-notebook execution report.

    Reads the module-level ``df_sales``, ``df_students``, ``df_time``,
    ``model`` and ``score``. The report contains the current timestamp,
    the Python version, each dataset's shape (or a failure note when it is
    ``None``), fixed checklists for visualizations, error handling and LMS
    features, and the regression result when both ``model`` and ``score``
    are available. Any exception is caught and printed, not raised.
    """
    def _print_lines(*lines):
        # Emit each entry on its own line, in order.
        for line in lines:
            print(line)

    try:
        _print_lines("📋 NOTEBOOK EXECUTION SUMMARY", "=" * 50)

        # System information
        finished_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        print(f"🕒 Execution completed at: {finished_at}")
        python_version = sys.version.split()[0]
        print(f"🐍 Python version: {python_version}")

        # Data summary: a shape tuple per dataset, or a failure note
        frames = {
            'Sales Dataset': df_sales,
            'Student Dataset': df_students,
            'Time Series Dataset': df_time,
        }
        data_summary = {
            label: ("Failed to create" if frame is None else frame.shape)
            for label, frame in frames.items()
        }

        print(f"\n📊 Dataset Summary:")
        for name, shape in data_summary.items():
            if not isinstance(shape, tuple):
                print(f"  - {name}: {shape}")
                continue
            print(f"  - {name}: {shape[0]} rows × {shape[1]} columns")

        # Visualization summary
        _print_lines(
            "\n📈 Visualizations Created:",
            "  - ✅ Matplotlib plots (4 subplots)",
            "  - ✅ Plotly interactive plots (3 charts)",
            "  - ✅ Machine learning visualization (2 plots)",
        )

        # Error handling summary
        _print_lines(
            "\n🛡️  Error Handling Tested:",
            "  - ✅ Data validation functions",
            "  - ✅ Import error handling",
            "  - ✅ Plot creation error handling",
            "  - ✅ Machine learning error handling",
        )

        # Performance metrics, only when the ML cell produced a model and score
        if model is not None and score is not None:
            print(f"\n🤖 Machine Learning Results:")
            print(f"  - Model Type: Linear Regression")
            print(f"  - R² Score: {score:.3f}")
            if score > 0.7:
                verdict = '✅ Good'
            elif score > 0.5:
                verdict = '⚠️ Moderate'
            else:
                verdict = '❌ Poor'
            print(f"  - Status: {verdict}")

        _print_lines(
            "\n🎯 Smart Classroom LMS Features Demonstrated:",
            "  - ✅ Jupyter notebook execution on EC2",
            "  - ✅ Automatic package installation",
            "  - ✅ Interactive visualization support",
            "  - ✅ Comprehensive error handling",
            "  - ✅ HTML output generation",
        )

        _print_lines(
            "\n🌟 Execution Status: SUCCESS!",
            "   All major components executed without critical errors.",
        )

    except Exception as e:
        print(f"❌ Error generating summary: {e}")

# Print the final report
generate_summary()

# Memory usage analysis: total deep memory (in bytes) of every dataset that exists
try:
    total_memory = sum(
        frame.memory_usage(deep=True).sum()
        for frame in (df_sales, df_students, df_time)
        if frame is not None
    )

    # Reported in KB; the rating threshold below is compared against the byte count
    print(f"\n💾 Memory Usage: {total_memory / 1024:.2f} KB")
    print(f"   Memory efficiency: {'✅ Good' if total_memory < 100000 else '⚠️ Moderate'}")

except Exception as e:
    print(f"❌ Error calculating memory usage: {e}")

# Closing banner
print(f"\n" + "=" * 50)
print(f"🎉 Demo notebook completed successfully!")
print(f"📝 This output will be converted to HTML and displayed in the LMS.")