In [None]:
import missingno as msno
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
# Silences every warning category for the rest of the session (including pandas/plotly deprecation notices).
warnings.filterwarnings('ignore')
import plotly.express as px
import plotly.io as pio
# NOTE: this renderer choice does not last - the default is reassigned to 'colab' at the end of this cell.
pio.renderers.default = 'iframe'
import plotly.subplots as sp
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import seaborn as sns
from typing import List
# NOTE: second import of plotly.io (already imported above); harmless, but redundant.
import plotly.io as pio
pio.renderers.default = 'colab'  # instead of 'browser'

**Load the Dataset**

In [None]:
# Load the 'Student Performance Factors' dataset and display the first 5 rows to inspect the data structure.
# NOTE(review): the path is absolute (/content/...), presumably the Colab working directory - the CSV
# has to exist at exactly this location before the cell runs.
df = pd.read_csv('/content/StudentPerformanceFactors.csv')
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [None]:
# Check the dimensions of the dataset as a (rows, columns) tuple
df.shape

(6607, 20)

In [None]:
# Summarize every column: non-null count and dtype (reveals which columns contain missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

In [None]:
# Generate descriptive statistics of numerical columns (count, mean, std, min, quartiles, max).
# Worth scanning the min/max rows for values outside the plausible range of each variable.
df.describe()

Unnamed: 0,Hours_Studied,Attendance,Sleep_Hours,Previous_Scores,Tutoring_Sessions,Physical_Activity,Exam_Score
count,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0,6607.0
mean,19.975329,79.977448,7.02906,75.070531,1.493719,2.96761,67.235659
std,5.990594,11.547475,1.46812,14.399784,1.23057,1.031231,3.890456
min,1.0,60.0,4.0,50.0,0.0,0.0,55.0
25%,16.0,70.0,6.0,63.0,1.0,2.0,65.0
50%,20.0,80.0,7.0,75.0,1.0,3.0,67.0
75%,24.0,90.0,8.0,88.0,2.0,4.0,69.0
max,44.0,100.0,10.0,100.0,8.0,6.0,101.0


In [None]:
# List all column labels of the DataFrame (handy when referencing columns by name in later cells)
df.columns

Index(['Hours_Studied', 'Attendance', 'Parental_Involvement',
       'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours',
       'Previous_Scores', 'Motivation_Level', 'Internet_Access',
       'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'School_Type',
       'Peer_Influence', 'Physical_Activity', 'Learning_Disabilities',
       'Parental_Education_Level', 'Distance_from_Home', 'Gender',
       'Exam_Score'],
      dtype='object')

In [None]:
# Partition the columns by dtype: 64-bit numeric columns vs. object (string) columns
numerical_cols = list(df.select_dtypes(include=['float64', 'int64']).columns)
categorical_cols = list(df.select_dtypes(include=['object']).columns)

# Print one banner-framed listing per group (same layout for both)
for label, names in (("NUMERICAL", numerical_cols), ("CATEGORICAL", categorical_cols)):
    banner = "=" * 50
    print("\n" + banner)
    print(f"{len(names)} {label} COLUMNS:")
    print(banner)
    print(names)


7 NUMERICAL COLUMNS:
['Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores', 'Tutoring_Sessions', 'Physical_Activity', 'Exam_Score']

13 CATEGORICAL COLUMNS:
['Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 'Gender']


In [None]:
# Count missing values per column, most affected columns first
missing_values = df.isnull().sum().sort_values(ascending=False)

# Keep only the columns that actually have missing values
missing_values = missing_values[missing_values > 0]

# Print formatted results
print("\n" + "="*50)
print("MISSING VALUE ANALYSIS")
print("="*50)
if len(missing_values) == 0:
    print("✅ No missing values found in the dataset")
else:
    print(f"⚠️ {len(missing_values)} columns contain missing values:\n")
    print(missing_values.to_string())  # to_string() removes dtype info

    # Share of rows missing, per affected column (same column order as the counts above)
    print("\n" + "-"*50)
    print("PERCENTAGE MISSING:")
    print("-"*50)
    missing_pct = df[missing_values.index].isnull().mean() * 100
    # Format each value individually so every row carries its own '%' sign
    # (appending "%" to the whole to_string() output only tagged the last row).
    print(missing_pct.map("{:.2f}%".format).to_string())


MISSING VALUE ANALYSIS
⚠️ 3 columns contain missing values:

Parental_Education_Level    90
Teacher_Quality             78
Distance_from_Home          67

--------------------------------------------------
PERCENTAGE MISSING:
--------------------------------------------------
Parental_Education_Level    1.36
Teacher_Quality             1.18
Distance_from_Home          1.01%


In [None]:
def check_col(
    df,
    columns: List[str],
    rows: int = 2,
    cols: int = 2,
    color_palette: str = 'Pastel',
    template: str = 'plotly_white',
    fig_height: int = 1000,
    fig_width: int = 800
) -> None:
    """
    Visualize distributions of categorical columns in a grid layout.

    One bar chart of value counts is drawn per column; subplots are filled
    left-to-right, top-to-bottom, and the figure is displayed with fig.show().

    Parameters:
    - df: Input DataFrame
    - columns: List of column names to visualize
    - rows: Minimum number of rows in subplot grid; increased automatically
      when rows * cols is too small to hold one subplot per column
    - cols: Number of columns in subplot grid
    - color_palette: Name of a palette in px.colors.qualitative
    - template: Plotly template name
    - fig_height: Figure height in pixels
    - fig_width: Figure width in pixels

    Returns:
    - None (the figure is shown as a side effect)

    Raises:
    - AttributeError: if color_palette is not an attribute of px.colors.qualitative
    - KeyError: if a name in columns is not a column of df
    """
    columns = list(columns)
    num_plots = len(columns)

    # Make sure the grid can hold every requested column (ceiling division);
    # a too-small grid would otherwise fail when adding a trace to a missing row.
    rows = max(rows, -(-num_plots // cols))

    # Resolve the palette once instead of on every loop iteration
    palette = getattr(px.colors.qualitative, color_palette)

    # Create subplot figure
    fig = sp.make_subplots(
        rows=rows,
        cols=cols,
        subplot_titles=columns,
        horizontal_spacing=0.1,
        vertical_spacing=0.15
    )

    # Generate a plot for each column
    for idx, col in enumerate(columns):
        # Calculate subplot position (1-based, row-major)
        row_pos = (idx // cols) + 1
        col_pos = (idx % cols) + 1

        # Build a two-column frame ['category', 'count'].
        # Naming the index and the values explicitly avoids depending on the
        # column names produced by value_counts().reset_index(), which differ
        # between pandas 1.x ('index', <col>) and pandas 2.x (<col>, 'count').
        count_data = (
            df[col]
            .value_counts()
            .rename_axis('category')
            .reset_index(name='count')
        )

        # Create bar plot (one trace per category because of color='category')
        bar = px.bar(
            count_data,
            x='category',
            y='count',
            color='category',
            text_auto=True,
            color_discrete_sequence=palette
        )

        # Move the traces of the standalone bar figure into the subplot grid
        for trace in bar.data:
            trace.update(
                textposition='outside',
                textfont_size=12,
                showlegend=False
            )
            fig.add_trace(trace, row=row_pos, col=col_pos)

    # Update layout
    fig.update_layout(
        height=fig_height,
        width=fig_width,
        template=template,
        margin=dict(t=100),
        title_text="Categorical Variable Distributions",
        title_x=0.5
    )

    # Adjust subplot titles (they are stored as layout annotations)
    fig.update_annotations(
        font_size=14,
        yshift=20
    )

    fig.show()

In [None]:
# Fill value per column with missing data.
# NOTE(review): these look like each column's most frequent category - confirm against value_counts().
imputation_values = {
    "Teacher_Quality": "Medium",                  # Categorical (ordinal)
    "Parental_Education_Level": "High School",    # Categorical (nominal)
    "Distance_from_Home": "Near"                  # Categorical (ordinal)
}

# Execute imputation with validation
try:
    impute_cols = list(imputation_values)

    # Missing counts before imputation (raises KeyError if a target column is absent)
    pre_impute_missing = df[impute_cols].isnull().sum()

    # Rebind instead of mutating in place: if fillna fails, df is left untouched
    df = df.fillna(value=imputation_values)

    # Missing counts after imputation, to verify completion
    post_impute_missing = df[impute_cols].isnull().sum()

    # Side-by-side report keeps the before/after counts aligned per column
    imputation_report = pd.DataFrame({
        "Before Imputation": pre_impute_missing,
        "After Imputation": post_impute_missing,
    })

    print("\n" + "="*50)
    print("IMPUTATION REPORT")
    print("="*50)
    print(f"{int((pre_impute_missing - post_impute_missing).sum())} missing values replaced")
    print("-"*30)
    print(imputation_report.to_string())

except Exception as e:
    print(f"\n⚠️ Imputation Error: {str(e)}")
    # Later cells assume these columns are complete, so surface the failure
    # instead of continuing with a partially prepared DataFrame.
    raise


IMPUTATION REPORT
235 missing values replaced
------------------------------
Before Imputation:   Teacher_Quality             78
Parental_Education_Level    90
Distance_from_Home          67
After Imputation:    Teacher_Quality             0
Parental_Education_Level    0
Distance_from_Home          0


In [None]:
# Re-count missing values per column and report only the affected ones
missing_values = df.isnull().sum()
has_missing = missing_values > 0
if has_missing.any():
    summary = missing_values[has_missing]
else:
    summary = "No missing values"
print("Missing Values:\n", summary)

# Check columns
def check_cols(df, expected_n=None):
    """
    Print the shape of a DataFrame and return its column labels.

    Parameters:
    - df: DataFrame to inspect
    - expected_n: Optional expected number of columns; when given (including 0),
      a warning line is printed if the actual column count differs

    Returns:
    - list of the DataFrame's column labels, in order
    """
    cols = df.columns
    print(f"\nDataset shape: {len(df)} rows, {len(cols)} columns")
    # Compare against None explicitly so that expected_n=0 is still validated
    if expected_n is not None and len(cols) != expected_n:
        print(f"Warning: Expected {expected_n} columns, found {len(cols)}")
    return list(cols)


Missing Values:
 No missing values


In [None]:
# Cap exam scores at the maximum valid value of 100.
# clip() handles any score above 100, not just the single value 101 seen in describe().
df['Exam_Score'] = df['Exam_Score'].clip(upper=100)

In [None]:
# Numerical feature distributions: one filled line chart of value counts per non-object column
num_cols = df.select_dtypes(exclude='object')
cols = 2
rows = -(-len(num_cols.columns) // cols)  # ceiling division: enough rows for every feature

# Subplot grid, titled with the feature names
fig = sp.make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=num_cols.columns,
    vertical_spacing=0.1,
    horizontal_spacing=0.1
)

# Custom color palette (cycled when there are more features than colors)
colors = ['#404546', '#4C6B73', '#7492C1', '#8A7D4C', '#A7B7A7', '#A69585', '#6D7F8C']

# Fill the grid left-to-right, top-to-bottom
for position, column in enumerate(num_cols.columns):
    grid_row, grid_col = divmod(position, cols)

    # Frequency of each distinct value, ordered by the value itself
    counts = (
        df[column]
        .value_counts()
        .sort_index()
        .reset_index()
        .set_axis(['value', 'count'], axis=1)
    )

    line_style = dict(
        color=colors[(position + 1) % len(colors)],
        width=2,
        shape='spline',
        smoothing=0.7
    )

    trace = go.Scatter(
        x=counts['value'],
        y=counts['count'],
        mode='lines+markers',
        line=line_style,
        fill='tozeroy',
        marker=dict(size=6),
        name=column,
        hovertemplate=f"<b>{column}</b><br>Value: %{{x}}<br>Count: %{{y}}<extra></extra>"
    )
    fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)

# Figure-level layout; height scales with the number of grid rows
fig.update_layout(
    title_text="Numerical Features Distribution",
    height=300 * rows,
    width=1000,
    showlegend=False,
    template='plotly_white',
    margin=dict(t=100, b=50)
)

# Same axis titles on every subplot
fig.update_xaxes(title_text="Value", showgrid=True)
fig.update_yaxes(title_text="Count", showgrid=True)

fig.show()

In [None]:
# Categorical feature distributions: one bar chart of value counts per object column
cat_cols = df.select_dtypes(include='object')
cols = 2
rows = -(-len(cat_cols.columns) // cols)  # ceiling division: enough rows for every feature

# Extended color palette (cycled when there are more features than colors)
colors = ['#404546', '#4C6B73', '#7492C1', '#8A7D4C', '#A7B7A7', '#A69585', '#6D7F8C',
          '#4F7D9E', '#82A1B2', '#A19B8F', '#B59A6A', '#9A7A48', '#706E4C']

# Subplot grid, titled with the feature names
fig = sp.make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=cat_cols.columns,
    vertical_spacing=0.15,
    horizontal_spacing=0.1
)

# Fill the grid left-to-right, top-to-bottom
for position, column in enumerate(cat_cols.columns):
    grid_row, grid_col = divmod(position, cols)

    # Category frequencies, most common first
    count_df = (
        df[column]
        .value_counts()
        .reset_index()
        .set_axis(['category', 'count'], axis=1)
        .sort_values('count', ascending=False)
    )

    bars = go.Bar(
        x=count_df['category'],
        y=count_df['count'],
        name=column,
        marker_color=colors[(position + 1) % len(colors)],
        text=count_df['count'],
        texttemplate='%{text:,}',
        textposition='auto',
        hovertemplate='<b>%{x}</b><br>Count: %{y:,}<extra></extra>'
    )
    fig.add_trace(bars, row=grid_row + 1, col=grid_col + 1)

# Figure-level layout; height scales with the number of grid rows
fig.update_layout(
    title_text='Categorical Features Distribution',
    title_x=0.5,
    height=400 * rows,
    width=1000,
    showlegend=False,  # subplot titles already identify each feature
    template='plotly_white',
    margin=dict(t=100, b=50, l=50, r=50),
    hoverlabel=dict(
        bgcolor='white',
        font_size=12,
        font_family='Arial'
    )
)

# Same axis formatting on every subplot
fig.update_xaxes(tickangle=45, tickfont=dict(size=10))
fig.update_yaxes(title_text='Count')

fig.show()

In [None]:
# Scatter of study hours vs exam score, colored by attendance, with a LOWESS trendline
fig = px.scatter(
    df,
    x="Hours_Studied",
    y="Exam_Score",
    color="Attendance",
    color_continuous_scale=px.colors.sequential.Viridis,
    title="Exam Performance: Study Hours vs Attendance Impact",
    labels={
        "Hours_Studied": "Weekly Study Hours",
        "Exam_Score": "Exam Score (100-point scale)",
        "Attendance": "Attendance Rate (%)"
    },
    hover_data=["Gender", "Previous_Scores"],  # Using existing columns
    trendline="lowess"
)

# Exam score statistics per attendance range (5 equal-width bins).
# observed=False is spelled out so empty bins stay in the table regardless of the pandas default.
attendance_impact = df.groupby(
    pd.cut(df["Attendance"], bins=5), observed=False
)["Exam_Score"].agg(
    ['mean', 'median', 'count']
).rename_axis("Attendance_Range").reset_index()

# Statistics backing the printed insights below
attendance_corr = df["Attendance"].corr(df["Exam_Score"])   # Pearson r on the raw rows
band_gain = attendance_impact['mean'].diff()                # change in mean score vs. the previous range

# Format plot layout
fig.update_layout(
    width=700,
    height=600,
    template='plotly_white',
    title_x=0.5,
    title_font=dict(size=18),
    coloraxis_colorbar=dict(
        title="Attendance %",
        thickness=20,
        len=0.75
    ),
    xaxis=dict(title_font=dict(size=14)),
    yaxis=dict(title_font=dict(size=14))
)

# Add informative annotations (positioned in paper coordinates, top-left corner)
fig.add_annotation(
    text="Higher attendance correlates with<br>better exam performance",
    xref="paper", yref="paper",
    x=0.05, y=0.95,
    showarrow=False,
    font=dict(size=12)
)

fig.show()

# Display attendance impact analysis
print("\n📊 Attendance Impact Analysis:")
print("="*50)
print(attendance_impact.to_markdown(tablefmt="grid", index=False))
print("\nKey Insights:")
print("-"*50)
print(f"• Performance difference between highest and lowest attendance groups: "
      f"{attendance_impact['mean'].iloc[-1] - attendance_impact['mean'].iloc[0]:.1f} points")
print(f"• Pearson correlation between attendance and exam score: r = {attendance_corr:.2f}")
# Report the steepest step between neighbouring ranges, computed from the table
# (the previous text named a fixed row as "strongest correlation" without computing anything).
if band_gain.notna().any():
    steepest = band_gain.idxmax()
    print(f"• Largest gain between adjacent attendance ranges: "
          f"{band_gain.loc[steepest]:+.1f} points, from "
          f"{attendance_impact.loc[steepest - 1, 'Attendance_Range']} to "
          f"{attendance_impact.loc[steepest, 'Attendance_Range']}")


📊 Attendance Impact Analysis:
+--------------------+---------+----------+---------+
| Attendance_Range   |    mean |   median |   count |
| (59.96, 68.0]      | 64.0969 |       64 |    1403 |
+--------------------+---------+----------+---------+
| (68.0, 76.0]       | 65.9284 |       66 |    1327 |
+--------------------+---------+----------+---------+
| (76.0, 84.0]       | 67.2094 |       67 |    1366 |
+--------------------+---------+----------+---------+
| (84.0, 92.0]       | 69.0736 |       69 |    1250 |
+--------------------+---------+----------+---------+
| (92.0, 100.0]      | 70.3093 |       70 |    1261 |
+--------------------+---------+----------+---------+

Key Insights:
--------------------------------------------------
• Performance difference between highest and lowest attendance groups: 6.2 points
• Strongest correlation in (84.0, 92.0] range


In [None]:
# Create a box plot to analyze exam score distribution by parental involvement and resource access
fig = px.box(
    df,
    x="Parental_Involvement",
    y="Exam_Score",
    color="Access_to_Resources",
    category_orders={
        # Both variables use the levels Low / Medium / High in this dataset;
        # the order lists must use the actual values or they have no effect.
        "Parental_Involvement": ["Low", "Medium", "High"],
        "Access_to_Resources": ["Low", "Medium", "High"]
    },
    color_discrete_sequence=px.colors.qualitative.D3,
    title="Exam Score Distribution by Parental Involvement and Resource Access",
    labels={
        "Parental_Involvement": "Level of Parental Involvement",
        "Exam_Score": "Exam Score (0-100)",
        "Access_to_Resources": "Resource Access Level"
    },
    hover_data=["Gender", "School_Type"]                    # Additional context in tooltips
)

# Enhance layout and readability
fig.update_layout(
    width=800,
    height=600,
    template='plotly_white',
    title_x=0.5,                                           # Center-aligned title
    title_font=dict(size=18),
    boxmode='group',                                       # One box per resource level within each x category
    xaxis_title_font=dict(size=14),
    yaxis_title_font=dict(size=14),
    legend_title_font=dict(size=12)
)

# Add reference line for passing score
# NOTE(review): 70 is a hard-coded threshold - confirm it matches the intended pass mark.
fig.add_hline(
    y=70,
    line_dash="dot",
    annotation_text="Passing Threshold",
    annotation_position="bottom right"
)

fig.show()

In [None]:
# Convert to an ordered categorical so each session count gets its own discrete color.
# NOTE(review): this changes the dtype of df['Tutoring_Sessions'] for every later cell
# (df.info() further down reports it as 'category'), so it no longer counts as a numeric column.
df['Tutoring_Sessions'] = pd.Categorical(
    df['Tutoring_Sessions'],
    categories=sorted(df['Tutoring_Sessions'].unique()),
    ordered=True
)

# Create interactive 3D visualization
fig = px.scatter_3d(
    df,
    x="Tutoring_Sessions",
    y="Family_Income",
    z="Exam_Score",
    color="Tutoring_Sessions",
    color_discrete_sequence=px.colors.sequential.Blues_r,
    title="Exam Score Relationship: Tutoring Sessions vs Family Income",
    labels={
        "Tutoring_Sessions": "Tutoring Sessions",
        # Family_Income holds Low / Medium / High levels, not currency amounts
        "Family_Income": "Family Income Level",
        "Exam_Score": "Exam Score (0-100)"
    },
    hover_data=["School_Type", "Parental_Education_Level"],
    symbol="Access_to_Resources"
)

# Enhanced layout with annotations
fig.update_layout(
    width=800,
    height=700,
    template='plotly_dark',
    title_x=0.5,
    title_font=dict(size=18),
    scene=dict(
        xaxis_title_font=dict(size=14),
        yaxis_title_font=dict(size=14),
        zaxis_title_font=dict(size=14),
        camera=dict(eye=dict(x=1.5, y=1.5, z=0.8)),
        # Text label placed at score 70; it marks the threshold height, no plane or line is drawn
        annotations=[
            dict(
                x=0,
                y=0,
                z=70,
                text="Passing Threshold",
                showarrow=False,
                font=dict(color="yellow", size=12),
                bgcolor="rgba(0,0,0,0.5)"
            )
        ]
    ),
    legend_title_text="Tutoring<br>Sessions"
)

# Configure the score axis: fixed 0-100 range, grid lines and thin hover spikes
fig.update_scenes(
    zaxis=dict(
        showspikes=True,
        spikesides=False,
        spikethickness=1,
        showgrid=True,
        range=[0, 100]
    )
)

fig.show()

In [None]:
# Aggregate first: one row per (motivation, peer influence) group with its size and mean score.
# Feeding raw rows with values="Exam_Score" makes the tile value the SUM of scores,
# which was then displayed under an "Avg" label.
treemap_data = (
    df.groupby(["Motivation_Level", "Peer_Influence"], as_index=False)
      .agg(Students=("Exam_Score", "count"), Avg_Score=("Exam_Score", "mean"))
)

# Treemap: tile area = number of students, tile color = average exam score.
# For parent tiles px derives the color as the Students-weighted mean of the children.
# Row-level hover columns (School_Type, Gender) are not passed because they have no
# single value per aggregated tile.
fig = px.treemap(
    treemap_data,
    path=["Motivation_Level", "Peer_Influence"],
    values="Students",
    color="Avg_Score",
    color_continuous_scale='Purp_r',  # Reversed purple scale
    title="Exam Performance: Motivation Level & Peer Influence Impact",
    labels={"Avg_Score": "Avg Exam Score", "Students": "Students"},
    width=800,
    height=650
)

# Enhanced layout configuration
fig.update_layout(
    margin=dict(t=50, l=25, r=25, b=25),
    template='ggplot2',
    title_x=0.5,
    title_font=dict(size=18),
    coloraxis_colorbar=dict(
        title="Avg Score",
        thickness=20,
        len=0.75
    )
)

# Tile text: label, average score (from the color value), student count and share of the entry
fig.update_traces(
    texttemplate="<b>%{label}</b><br>Avg: %{color:.1f}<br>Students: %{value:,}<br>%{percentEntry:.1%}",
    textfont=dict(size=14),
    marker=dict(line=dict(width=0.5, color='white'))
)

# Show the figure
fig.show()

# Student counts per motivation level and peer influence, with row/column totals
print("\n📊 Motivation vs Peer Influence Distribution:")
print("="*50)
pm = pd.crosstab(
    df["Motivation_Level"],
    df["Peer_Influence"],
    margins=True,
    margins_name="Total"
).style\
    .background_gradient(cmap='Purples', axis=1)\
    .format(precision=0)\
    .set_caption("Student Count by Motivation and Peer Influence")

display(pm)

# Average exam score and group size for every motivation / peer influence combination
print("\n🔍 Performance Analysis:")
print("="*50)
performance_matrix = df.groupby(["Motivation_Level", "Peer_Influence"])["Exam_Score"]\
    .agg(['mean', 'count'])\
    .rename(columns={'mean': 'Avg Score', 'count': 'Students'})\
    .style\
    .background_gradient(subset=['Avg Score'], cmap='Purples')\
    .format({'Avg Score': '{:.1f}', 'Students': '{:.0f}'})

display(performance_matrix)


📊 Motivation vs Peer Influence Distribution:


Peer_Influence,Negative,Neutral,Positive,Total
Motivation_Level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
High,286,500,533,1319
Low,417,737,783,1937
Medium,674,1355,1322,3351
Total,1377,2592,2638,6607



🔍 Performance Analysis:


Unnamed: 0_level_0,Unnamed: 1_level_0,Avg Score,Students
Motivation_Level,Peer_Influence,Unnamed: 2_level_1,Unnamed: 3_level_1
High,Negative,67.2,286
High,Neutral,67.7,500
High,Positive,68.0,533
Low,Negative,66.3,417
Low,Neutral,66.7,737
Low,Positive,67.1,783
Medium,Negative,66.4,674
Medium,Neutral,67.3,1355
Medium,Positive,67.8,1322


In [None]:
# Create enhanced box plot comparing exam scores by parental education and learning disabilities
fig = px.box(
    df,
    x="Parental_Education_Level",
    y="Exam_Score",
    color="Learning_Disabilities",
    color_discrete_sequence=px.colors.qualitative.D3,
    category_orders={
        # Order by education level rather than alphabetically
        # (alphabetical sorting would put "College" before "High School").
        "Parental_Education_Level": ["High School", "College", "Postgraduate"],
        "Learning_Disabilities": ["Yes", "No"]  # Consistent ordering
    },
    title="Exam Score Distribution by Parental Education Level and Learning Disability Status",
    labels={
        "Parental_Education_Level": "Highest Parental Education",
        "Exam_Score": "Exam Score (0-100)",
        "Learning_Disabilities": "Learning Disability"
    },
    hover_data=["Gender", "Tutoring_Sessions"]  # Additional context
)

# Professional layout configuration
fig.update_layout(
    width=800,
    height=600,
    template='plotly_white',
    title_x=0.5,
    title_font=dict(size=18),
    boxmode='group',  # One box per disability status within each education level
    xaxis_title_font=dict(size=14),
    yaxis_title_font=dict(size=14),
    legend_title_font=dict(size=12),
    margin=dict(t=80, b=80, l=80, r=80)  # Balanced margins
)

# Add reference line for passing score
# NOTE(review): 70 is a hard-coded threshold - confirm it matches the intended pass mark.
fig.add_hline(
    y=70,
    line_dash="dot",
    line_color="red",
    annotation_text="Passing Threshold (70)",
    annotation_position="bottom right",
    annotation_font=dict(size=12)
)

# Improve hover template.
# NOTE: this template replaces the one px generated, so the hover_data columns above
# are only displayed if the template references them (it currently does not).
fig.update_traces(
    hovertemplate="<b>%{x}</b><br>"
                 "Score: %{y}<br>"
                 "Disability: %{fullData.name}<br>"
                 "<extra></extra>"
)

fig.show()

# Mean / median score and group size per education level and disability status
print("\n📊 Performance Summary:")
print("="*50)
stats = df.groupby(["Parental_Education_Level", "Learning_Disabilities"])["Exam_Score"]\
    .agg(['mean', 'median', 'count'])\
    .rename(columns={'mean': 'Average', 'median': 'Median', 'count': 'Students'})

display(stats.style\
    .background_gradient(subset=['Average', 'Median'], cmap='Blues')\
    .format({'Average': '{:.1f}', 'Median': '{:.1f}', 'Students': '{:.0f}'}))


📊 Performance Summary:


Unnamed: 0_level_0,Unnamed: 1_level_0,Average,Median,Students
Parental_Education_Level,Learning_Disabilities,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
College,No,67.4,67.0,1794
College,Yes,66.1,66.0,195
High School,No,67.0,67.0,2946
High School,Yes,66.1,66.0,367
Postgraduate,No,68.1,68.0,1172
Postgraduate,Yes,67.0,67.0,133


In [None]:
# Create enhanced violin plot comparing exam scores by school type and teacher quality
fig = px.violin(
    df,
    x="School_Type",
    y="Exam_Score",
    color="Teacher_Quality",
    box=True,  # Show box plot inside violin
    points="all",  # Show all data points
    violinmode='group',  # Group violins together
    color_discrete_sequence=px.colors.diverging.BrBG,
    title="Exam Score Distribution by School Type and Teacher Quality",
    labels={
        "School_Type": "Type of School",
        "Exam_Score": "Exam Score (0-100)",
        "Teacher_Quality": "Teacher Quality Rating"
    },
    hover_data=["Parental_Education_Level", "Hours_Studied"],  # Additional context
    category_orders={
        # Teacher_Quality takes the values Low / Medium / High in this dataset;
        # the order list must use those exact values to have any effect.
        "Teacher_Quality": ["Low", "Medium", "High"],
        "School_Type": sorted(df["School_Type"].unique())
    }
)

# Professional layout configuration
fig.update_layout(
    width=900,
    height=650,
    template='plotly_white',
    title_x=0.5,
    title_font=dict(size=20),
    xaxis_title_font=dict(size=14),
    yaxis_title_font=dict(size=14),
    legend_title_font=dict(size=12),
    margin=dict(t=100, b=100, l=100, r=50),
    showlegend=True,
    # Horizontal legend anchored above the plot area, right-aligned
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    )
)

# Add reference lines
# NOTE(review): 70 and 90 are hard-coded thresholds - confirm they match the intended grading scheme.
fig.add_hline(
    y=70,
    line_dash="dot",
    line_color="red",
    annotation_text="Passing Threshold",
    annotation_position="bottom right"
)

fig.add_hline(
    y=90,
    line_dash="dot",
    line_color="green",
    annotation_text="Excellent Threshold",
    annotation_position="top right"
)

# Customize hover template.
# NOTE: this template replaces the one px generated, so the hover_data columns above
# are only displayed if the template references them (it currently does not).
fig.update_traces(
    hovertemplate="<b>%{x}</b><br>"
                 "Score: %{y}<br>"
                 "Teacher Quality: %{fullData.name}<br>"
                 "<extra></extra>",
    meanline_visible=True  # Show mean line
)

fig.show()

# Score statistics and group size per school type and teacher quality
print("\n📊 Performance by School Type and Teacher Quality:")
print("="*60)
stats = df.groupby(["School_Type", "Teacher_Quality"])["Exam_Score"]\
    .agg(['mean', 'median', 'std', 'count'])\
    .rename(columns={
        'mean': 'Average',
        'median': 'Median',
        'std': 'Std Dev',
        'count': 'Students'
    })

display(stats.style\
    .background_gradient(subset=['Average', 'Median'], cmap='BrBG')\
    .format({'Average': '{:.1f}', 'Median': '{:.1f}', 'Std Dev': '{:.2f}', 'Students': '{:.0f}'}))


📊 Performance by School Type and Teacher Quality:


Unnamed: 0_level_0,Unnamed: 1_level_0,Average,Median,Std Dev,Students
School_Type,Teacher_Quality,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Private,High,67.5,67.0,3.69,600
Private,Low,67.0,67.0,3.64,194
Private,Medium,67.2,67.0,3.96,1215
Public,High,67.8,68.0,4.09,1347
Public,Low,66.6,66.0,3.96,463
Public,Medium,67.0,67.0,3.77,2788


In [None]:
# Scatter matrix of physical activity, sleep and exam score;
# points are colored by exam score and use one marker symbol per motivation level
fig = px.scatter_matrix(
    df,
    dimensions=["Physical_Activity", "Sleep_Hours", "Exam_Score"],
    color="Exam_Score",
    color_continuous_scale="Blues_r",
    title="Relationships Between Physical Activity, Sleep Hours, and Exam Scores",
    labels={
        "Physical_Activity": "Physical Activity (hrs/week)",
        "Sleep_Hours": "Sleep Hours (per night)",
        "Exam_Score": "Exam Score (0-100)"
    },
    hover_data=["Gender", "School_Type"],
    symbol="Motivation_Level"
)

# Layout configuration
fig.update_layout(
    width=900,
    height=700,
    template='plotly_white',
    title_x=0.5,
    title_font=dict(size=18),
    coloraxis_colorbar=dict(
        title="Exam Score",
        thickness=20,
        len=0.75
    ),
    margin=dict(t=100, b=50, l=50, r=50)
)

# Style traces: show the diagonal and both triangles of the matrix
fig.update_traces(
    diagonal_visible=True,
    showupperhalf=True,
    showlowerhalf=True,
    marker=dict(
        size=6,
        opacity=0.7,
        line=dict(width=0.5, color='white')
    )
)

fig.show()

# Pearson correlations between the three plotted variables
print("\n📊 Correlation Matrix:")
print("="*40)
corr_matrix = df[["Physical_Activity", "Sleep_Hours", "Exam_Score"]].corr()

# 'Blues_r' is a standard matplotlib colormap (the reversed 'Blues'), so no fallback is needed;
# the former bare `except:` would only have hidden unrelated errors.
display(corr_matrix.style
    .background_gradient(cmap='Blues_r')
    .format("{:.2f}")
    .set_caption("Pearson Correlation Coefficients"))

# Key correlations
print("\n🔍 Key Relationships:")
print("="*40)
print(f"Physical Activity vs Exam Score: r = {corr_matrix.loc['Physical_Activity', 'Exam_Score']:.2f}")
print(f"Sleep Hours vs Exam Score: r = {corr_matrix.loc['Sleep_Hours', 'Exam_Score']:.2f}")
print(f"Physical Activity vs Sleep Hours: r = {corr_matrix.loc['Physical_Activity', 'Sleep_Hours']:.2f}")


📊 Correlation Matrix:


Unnamed: 0,Physical_Activity,Sleep_Hours,Exam_Score
Physical_Activity,1.0,-0.0,0.03
Sleep_Hours,-0.0,1.0,-0.02
Exam_Score,0.03,-0.02,1.0



🔍 Key Relationships:
Physical Activity vs Exam Score: r = 0.03
Sleep Hours vs Exam Score: r = -0.02
Physical Activity vs Sleep Hours: r = -0.00


In [None]:
# Scatter matrix of the same three variables as the previous cell,
# here with a single trace (no symbol split) and a custom hover tooltip
fig = px.scatter_matrix(
    df,
    dimensions=["Physical_Activity", "Sleep_Hours", "Exam_Score"],
    color="Exam_Score",
    color_continuous_scale="Blues",
    title="Exam Scores by Physical Activity and Sleep Patterns",
    labels={
        "Physical_Activity": "Physical Activity (hrs/week)",
        "Sleep_Hours": "Sleep Duration (hrs/night)",
        "Exam_Score": "Exam Score"
    },
    # px stores these columns in each point's customdata, in this order
    hover_data=["Gender", "School_Type"]
)

# Single consolidated layout update
fig.update_layout(
    width=850,
    height=650,
    template='plotly_white',
    title_x=0.5,
    title_font=dict(size=18),
    margin=dict(t=80, b=60, l=60, r=60)
)

# Enhanced marker styling and tooltip.
# A custom hovertemplate replaces the one px generated, so the hover_data columns
# must be referenced explicitly via customdata or they are not shown at all.
fig.update_traces(
    diagonal_visible=True,
    marker=dict(
        size=7,
        opacity=0.8,
        line=dict(width=0.3, color='white')
    ),
    hovertemplate="<b>%{xaxis.title.text}:</b> %{x}<br>" +
                 "<b>%{yaxis.title.text}:</b> %{y}<br>" +
                 "<b>Exam Score:</b> %{marker.color}<br>" +
                 "<b>Gender:</b> %{customdata[0]}<br>" +
                 "<b>School Type:</b> %{customdata[1]}<extra></extra>"
)

fig.show()

# Pearson correlations between the three plotted variables
print("\n📊 Correlation Analysis:")
print("="*40)
corr_matrix = df[["Physical_Activity", "Sleep_Hours", "Exam_Score"]].corr()
display(corr_matrix.style
    .background_gradient(cmap='Blues')
    .format("{:.2f}")
    .set_caption("Pearson Correlation Coefficients"))


📊 Correlation Analysis:


Unnamed: 0,Physical_Activity,Sleep_Hours,Exam_Score
Physical_Activity,1.0,-0.0,0.03
Sleep_Hours,-0.0,1.0,-0.02
Exam_Score,0.03,-0.02,1.0


**The EDA Section is Finished!**

In [None]:
# Preview the first rows again now that the EDA cells have filled missing values and adjusted Exam_Score
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [None]:
# Re-check non-null counts and dtypes after the EDA cells (imputation and dtype changes made above show up here)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   Hours_Studied               6607 non-null   int64   
 1   Attendance                  6607 non-null   int64   
 2   Parental_Involvement        6607 non-null   object  
 3   Access_to_Resources         6607 non-null   object  
 4   Extracurricular_Activities  6607 non-null   object  
 5   Sleep_Hours                 6607 non-null   int64   
 6   Previous_Scores             6607 non-null   int64   
 7   Motivation_Level            6607 non-null   object  
 8   Internet_Access             6607 non-null   object  
 9   Tutoring_Sessions           6607 non-null   category
 10  Family_Income               6607 non-null   object  
 11  Teacher_Quality             6607 non-null   object  
 12  School_Type                 6607 non-null   object  
 13  Peer_Influence    