# Student Performance Analysis
## Cognitive Skills & Student Performance Dashboard

This notebook analyzes the relationship between cognitive skills and academic performance using a synthetic student dataset.

### Dataset Features:
- **student_id**: Unique identifier for each student
- **name**: Student name
- **class**: Academic class/subject
- **comprehension**: Reading and understanding ability (0-100)
- **attention**: Ability to focus on tasks (0-100)
- **focus**: Sustained concentration capability (0-100)
- **retention**: Memory and recall ability (0-100)
- **assessment_score**: Academic performance score (0-100)
- **engagement_time**: Daily study engagement in minutes

In [None]:
# Import required libraries
import json
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Suppress all library warnings so cell output stays readable.
# NOTE(review): this also hides deprecation/convergence warnings from
# scikit-learn and pandas - consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

# Set style for better visualizations.
# The bundled seaborn style sheet is called 'seaborn-v0_8' from matplotlib 3.6
# onwards and 'seaborn' in older releases; use whichever this installation
# provides instead of failing on an unknown style name.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
sns.set_palette("husl")

print("Libraries imported successfully!")

In [None]:
# Load the student dataset (path is relative to the notebook's directory)
df = pd.read_csv('../src/data/students.csv')

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() - that would emit a stray "None" line.
df.info()

print("\nBasic Statistics:")
print(df.describe())

## 1. Exploratory Data Analysis

In [None]:
# Count the null entries in every column
missing_per_column = df.isnull().sum()
print("Missing values:")
print(missing_per_column)

# Count how many rows belong to each class
students_per_class = df['class'].value_counts()
print("\nClass distribution:")
print(students_per_class)

In [None]:
# Histogram grid (2 x 3): one panel per cognitive skill / performance metric
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Distribution of Cognitive Skills and Performance Metrics', fontsize=16)

# Columns to plot and the fill colour used for each panel
cognitive_skills = ['comprehension', 'attention', 'focus', 'retention', 'assessment_score', 'engagement_time']
colors = ['skyblue', 'lightgreen', 'lightcoral', 'lightyellow', 'lightpink', 'lightgray']

# axes.flat walks the grid row by row, so the panels are filled in list order
for panel, skill, color in zip(axes.flat, cognitive_skills, colors):
    pretty_name = skill.replace('_', ' ').title()
    panel.hist(df[skill], bins=20, alpha=0.7, color=color, edgecolor='black')
    panel.set_title(f'Distribution of {pretty_name}')
    panel.set_xlabel(pretty_name)
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 2. Correlation Analysis

In [None]:
# Pairwise correlations between all numerical columns
numerical_features = ['comprehension', 'attention', 'focus', 'retention', 'assessment_score', 'engagement_time']
correlation_matrix = df[numerical_features].corr()

# Draw the matrix as an annotated heatmap on an explicit axes object
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix: Cognitive Skills vs Performance', fontsize=14)
plt.tight_layout()
plt.show()

# List every feature's correlation with the target, highest first
print("\nCorrelation with Assessment Score:")
correlations_with_score = correlation_matrix['assessment_score'].sort_values(ascending=False)
# The target's correlation with itself is skipped
for feature, corr in correlations_with_score.drop('assessment_score').items():
    pretty_name = feature.replace('_', ' ').title()
    print(f"{pretty_name}: {corr:.3f}")

In [None]:
# 2 x 2 grid of scatter plots: each cognitive skill against the assessment score
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Cognitive Skills vs Assessment Score', fontsize=16)

cognitive_features = ['comprehension', 'attention', 'focus', 'retention']
colors = ['blue', 'green', 'red', 'orange']

scores = df['assessment_score']
for panel, feature, color in zip(axes.flat, cognitive_features, colors):
    skill_values = df[feature]
    pretty_name = feature.replace('_', ' ').title()

    panel.scatter(skill_values, scores, alpha=0.6, color=color)

    # Degree-1 least-squares fit drawn as a dashed red trend line
    trend = np.poly1d(np.polyfit(skill_values, scores, 1))
    panel.plot(skill_values, trend(skill_values), "r--", alpha=0.8)

    panel.set_xlabel(pretty_name)
    panel.set_ylabel('Assessment Score')
    panel.set_title(f'{pretty_name} vs Assessment Score')

    # Annotate the panel with the correlation coefficient (axes coordinates,
    # so the label stays in the top-left corner regardless of data range)
    corr = skill_values.corr(scores)
    panel.text(
        0.05, 0.95, f'r = {corr:.3f}',
        transform=panel.transAxes,
        bbox=dict(boxstyle='round', facecolor='white', alpha=0.8),
    )

plt.tight_layout()
plt.show()

## 3. Performance Analysis by Class

In [None]:
# Assessment score spread per class, drawn as box plots on an explicit axes
box_fig, box_ax = plt.subplots(figsize=(14, 8))
sns.boxplot(data=df, x='class', y='assessment_score', ax=box_ax)
plt.xticks(rotation=45, ha='right')
box_ax.set_title('Assessment Score Distribution by Class', fontsize=14)
box_ax.set_xlabel('Class')
box_ax.set_ylabel('Assessment Score')
plt.tight_layout()
plt.show()

# Per-class mean / std / count of the assessment score, best class first
scores_by_class = df.groupby('class')['assessment_score']
class_performance = (
    scores_by_class
    .agg(['mean', 'std', 'count'])
    .round(2)
    .sort_values('mean', ascending=False)
)
print("\nPerformance by Class (sorted by mean score):")
print(class_performance)

## 4. Machine Learning Model for Assessment Score Prediction

In [None]:
# Model inputs: the four cognitive skills plus engagement time; target: assessment score
feature_columns = ['comprehension', 'attention', 'focus', 'retention', 'engagement_time']
X = df[feature_columns]
y = df['assessment_score']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

n_train_rows, n_test_rows = X_train.shape[0], X_test.shape[0]
print(f"Training set size: {n_train_rows}")
print(f"Testing set size: {n_test_rows}")

In [None]:
# Fit an ordinary least-squares model on the training split
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Score it on the held-out split
y_pred_lr = lr_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("Linear Regression Results:")
print(f"Mean Squared Error: {mse_lr:.3f}")
print(f"R² Score: {r2_lr:.3f}")

# Coefficient table ordered by magnitude, so large negative weights rank high too
lr_coefficients = {'feature': feature_columns, 'coefficient': lr_model.coef_}
feature_importance_lr = pd.DataFrame(lr_coefficients).sort_values(
    by='coefficient', key=abs, ascending=False
)

print("\nFeature Coefficients (Linear Regression):")
for entry in feature_importance_lr.itertuples(index=False):
    print(f"{entry.feature}: {entry.coefficient:.3f}")

In [None]:
# Fit a 100-tree random forest on the training split (seeded for reproducibility)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Score it on the held-out split
y_pred_rf = rf_model.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest Results:")
print(f"Mean Squared Error: {mse_rf:.3f}")
print(f"R² Score: {r2_rf:.3f}")

# Importance table taken from the fitted forest, most important feature first
rf_importances = {'feature': feature_columns, 'importance': rf_model.feature_importances_}
feature_importance_rf = pd.DataFrame(rf_importances).sort_values(
    by='importance', ascending=False
)

print("\nFeature Importance (Random Forest):")
for entry in feature_importance_rf.itertuples(index=False):
    print(f"{entry.feature}: {entry.importance:.3f}")

In [None]:
# Predicted-vs-actual scatter for both models, side by side
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# End points of the dashed "perfect prediction" diagonal
actual_range = [y_test.min(), y_test.max()]

# (title, predictions, R², point colour) for each panel, left to right
panels = (
    ('Linear Regression', y_pred_lr, r2_lr, 'blue'),
    ('Random Forest', y_pred_rf, r2_rf, 'green'),
)
for panel_ax, (model_name, predictions, r2_value, point_color) in zip(axes, panels):
    panel_ax.scatter(y_test, predictions, alpha=0.6, color=point_color)
    panel_ax.plot(actual_range, actual_range, 'r--', lw=2)
    panel_ax.set_xlabel('Actual Assessment Score')
    panel_ax.set_ylabel('Predicted Assessment Score')
    panel_ax.set_title(f'{model_name}\nR² = {r2_value:.3f}')

plt.tight_layout()
plt.show()

# Tabulate the test-set metrics of both models
model_comparison = pd.DataFrame([
    {'Model': 'Linear Regression', 'MSE': mse_lr, 'R² Score': r2_lr},
    {'Model': 'Random Forest', 'MSE': mse_rf, 'R² Score': r2_rf},
])

print("\nModel Comparison:")
print(model_comparison)

## 5. Student Clustering for Learning Personas

In [None]:
# Prepare data for clustering (cognitive skills only)
clustering_features = ['comprehension', 'attention', 'focus', 'retention']
X_cluster = df[clustering_features]

# Standardize the features before clustering
scaler = StandardScaler()
X_cluster_scaled = scaler.fit_transform(X_cluster)

# Determine optimal number of clusters using elbow method:
# fit k-means for k = 2..10 and record the inertia of each fit.
inertias = []
k_range = range(2, 11)

for k in k_range:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in version 1.4, so leaving it implicit makes the inertias (and the
    # resulting elbow) depend on the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_cluster_scaled)
    inertias.append(kmeans.inertia_)

# Plot elbow curve
plt.figure(figsize=(10, 6))
plt.plot(k_range, inertias, 'bo-')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')
plt.xticks(k_range)
plt.grid(True, alpha=0.3)
plt.show()

print("Inertia values for different k:")
for k, inertia in zip(k_range, inertias):
    print(f"k={k}: {inertia:.2f}")

In [None]:
# Perform clustering with the chosen k (k=4, picked by reading the elbow curve)
optimal_k = 4
# n_init is pinned explicitly: scikit-learn changed its default from 10 to
# 'auto' in version 1.4, so an implicit value would make the cluster
# assignments depend on the installed version.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_cluster_scaled)

# Add cluster labels to dataframe (re-running this cell overwrites the column)
df['cluster'] = cluster_labels

# Analyze cluster characteristics: per-cluster mean of each skill and outcome
cluster_analysis = df.groupby('cluster')[clustering_features + ['assessment_score', 'engagement_time']].mean().round(2)

print("Cluster Characteristics:")
print(cluster_analysis)

print("\nCluster Sizes:")
print(df['cluster'].value_counts().sort_index())

In [None]:
# Project the standardized skills onto two principal components for plotting
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_cluster_scaled)

pca_fig, pca_ax = plt.subplots(figsize=(12, 8))
colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown']

# One scatter series per cluster so every cluster gets its own legend entry
for cluster_id in range(optimal_k):
    members = cluster_labels == cluster_id
    pca_ax.scatter(
        X_pca[members, 0], X_pca[members, 1],
        c=colors[cluster_id], label=f'Cluster {cluster_id}', alpha=0.7, s=50,
    )

# Overlay the k-means centroids, mapped through the same PCA projection
centers_pca = pca.transform(kmeans.cluster_centers_)
pca_ax.scatter(
    centers_pca[:, 0], centers_pca[:, 1],
    c='black', marker='x', s=200, linewidths=3, label='Centroids',
)

pc1_share, pc2_share = pca.explained_variance_ratio_
pca_ax.set_xlabel(f'First Principal Component (explained variance: {pc1_share:.1%})')
pca_ax.set_ylabel(f'Second Principal Component (explained variance: {pc2_share:.1%})')
pca_ax.set_title('Student Clusters in Cognitive Skills Space (PCA)')
pca_ax.legend()
pca_ax.grid(True, alpha=0.3)
plt.show()

print(f"Total explained variance: {pc1_share + pc2_share:.1%}")

In [None]:
# Create learning personas based on clusters.
# K-means numbers its clusters arbitrarily, so a fixed id -> name table can put
# the "High Achievers" label on the weakest group. Derive the mapping from the
# cluster means instead:
#   - lowest overall skill average  -> "Struggling Learners"
#   - highest overall skill average -> "High Achievers"
#   - of the clusters in between, the one with the most uneven skill profile
#     -> "Inconsistent Learners", the next one -> "Average Performers"
# Any cluster left over keeps a generic "Cluster <id>" name.
skill_profile = cluster_analysis[clustering_features]
clusters_by_skill = skill_profile.mean(axis=1).sort_values().index.tolist()

persona_names = {cluster_id: f"Cluster {cluster_id}" for cluster_id in clusters_by_skill}
if len(clusters_by_skill) >= 2:
    persona_names[clusters_by_skill[0]] = "Struggling Learners"
    persona_names[clusters_by_skill[-1]] = "High Achievers"

middle_clusters = clusters_by_skill[1:-1]
if len(middle_clusters) == 1:
    persona_names[middle_clusters[0]] = "Average Performers"
elif len(middle_clusters) >= 2:
    # Spread across the skill means measures how uneven a cluster's profile is
    most_uneven = skill_profile.loc[middle_clusters].std(axis=1).idxmax()
    persona_names[most_uneven] = "Inconsistent Learners"
    most_typical = next(c for c in middle_clusters if c != most_uneven)
    persona_names[most_typical] = "Average Performers"

# Add persona names to the analysis
cluster_analysis_with_names = cluster_analysis.copy()
cluster_analysis_with_names.index = [persona_names.get(i, f"Cluster {i}") for i in cluster_analysis_with_names.index]

print("Learning Personas Analysis:")
print(cluster_analysis_with_names)

# Visualize persona characteristics: grouped bars, one group per persona
fig, ax = plt.subplots(figsize=(12, 8))
cluster_analysis_with_names[clustering_features].plot(kind='bar', ax=ax)
plt.title('Learning Personas: Cognitive Skills Profile')
plt.xlabel('Learning Personas')
plt.ylabel('Average Score')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Cognitive Skills', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()
plt.show()

## 6. Key Insights and Findings

In [None]:
# Generate comprehensive insights from the results computed in earlier cells
insights = []

# Correlation insights.
# "Strongest" means largest in magnitude, so rank by absolute value; the signed
# coefficient is still what gets reported.
feature_correlations = correlations_with_score.drop('assessment_score')
strongest_corr = feature_correlations.abs().idxmax()
strongest_corr_value = feature_correlations[strongest_corr]
insights.append(f"1. **Strongest Predictor**: {strongest_corr.replace('_', ' ').title()} shows the strongest correlation with assessment scores (r = {strongest_corr_value:.3f})")

# Model performance insights (ties go to Linear Regression)
best_model = "Random Forest" if r2_rf > r2_lr else "Linear Regression"
best_r2 = max(r2_rf, r2_lr)
insights.append(f"2. **Model Performance**: {best_model} achieves the best prediction accuracy with R² = {best_r2:.3f}")

# Class performance insights (class_performance is sorted by mean, best first)
best_class = class_performance.index[0]
worst_class = class_performance.index[-1]
insights.append(f"3. **Class Performance**: {best_class} students perform best (avg: {class_performance.loc[best_class, 'mean']:.1f}), while {worst_class} students need more support (avg: {class_performance.loc[worst_class, 'mean']:.1f})")

# Clustering insights
cluster_sizes = df['cluster'].value_counts().sort_index()
largest_cluster = cluster_sizes.idxmax()
insights.append(f"4. **Learning Personas**: {optimal_k} distinct learning personas identified, with '{persona_names[largest_cluster]}' being the largest group ({cluster_sizes[largest_cluster]} students)")

# Engagement insights: label the correlation by magnitude so that a large
# coefficient is not under-reported as merely "moderate"
eng_corr = df['engagement_time'].corr(df['assessment_score'])
if abs(eng_corr) > 0.7:
    eng_strength = 'strong'
elif abs(eng_corr) > 0.3:
    eng_strength = 'moderate'
else:
    eng_strength = 'weak'
insights.append(f"5. **Engagement Impact**: Study engagement time shows {eng_strength} correlation with performance (r = {eng_corr:.3f})")

print("=" * 60)
print("KEY FINDINGS AND INSIGHTS")
print("=" * 60)

for insight in insights:
    print(insight)
    print()

print("=" * 60)
print("RECOMMENDATIONS")
print("=" * 60)

# Recommendations are filled in from the computed results rather than
# hard-coded, so they cannot contradict the findings printed above.
# "Developing" a feature only makes sense for a positive relationship, hence
# the highest signed correlation is used here.
top_positive_feature = feature_correlations.idxmax().replace('_', ' ')
recommendations = [
    f"• Focus on developing {top_positive_feature} as it shows the strongest positive correlation with performance",
    "• Implement targeted interventions for students in lower-performing classes",
    "• Use personalized learning approaches based on the identified learning personas",
    "• Monitor and support students in the 'Struggling Learners' persona group",
    f"• Leverage the {best_model} model for early identification of at-risk students"
]

for rec in recommendations:
    print(rec)

print("\n" + "=" * 60)

In [None]:
# Export key results for dashboard
def _to_json_safe(value):
    """Recursively convert ``value`` into objects that serialize as strict JSON.

    Dict keys become strings, tuples become lists, numpy scalars become native
    Python scalars, and non-finite floats (NaN / +-inf) become ``None`` so they
    are written as ``null`` instead of the non-standard ``NaN`` / ``Infinity``
    tokens that strict JSON parsers reject.
    """
    if isinstance(value, dict):
        return {str(key): _to_json_safe(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_to_json_safe(item) for item in value]
    if isinstance(value, np.generic):
        value = value.item()
    if isinstance(value, float) and not np.isfinite(value):
        return None
    return value


dashboard_data = _to_json_safe({
    'summary_stats': df[numerical_features].describe().to_dict(),
    'correlations': correlation_matrix.to_dict(),
    # 'std' is NaN for any class with a single student; sanitized to null above
    'class_performance': class_performance.to_dict(),
    'model_performance': {
        'linear_regression': {'r2': r2_lr, 'mse': mse_lr},
        'random_forest': {'r2': r2_rf, 'mse': mse_rf}
    },
    'cluster_analysis': cluster_analysis.to_dict(),
    'feature_importance': feature_importance_rf.to_dict('records')
})

# Save results. allow_nan=False makes json.dump raise rather than silently
# write invalid JSON if a non-finite value ever slips past the sanitizer.
# `json` is imported with the other libraries in the notebook's import cell.
results_path = '../src/data/analysis_results.json'
with open(results_path, 'w', encoding='utf-8') as f:
    json.dump(dashboard_data, f, indent=2, allow_nan=False)

print(f"Analysis results exported to '{results_path}'")
print("\nAnalysis Complete! 🎉")