In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Plot appearance: matplotlib style sheet, seaborn colour palette, default figure size
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 8)})

# Set both pandas display limits (column count, line width) to None
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

print("✅ Libraries imported successfully!")


In [None]:
# Read the CSV into the working DataFrame used by every later cell
df = pd.read_csv('student_performance.csv')

row_count, column_count = df.shape
print(f"📊 Dataset loaded: {row_count} rows, {column_count} columns")
print("\nFirst 5 rows:")
df.head()


In [None]:
# Structural overview (dtypes, non-null counts) followed by descriptive statistics
section_rule = "=" * 50

print("📋 Dataset Info:", section_rule, sep="\n")
df.info()

print("\n📈 Basic Statistics:", section_rule, sep="\n")
df.describe()


In [None]:
# Tabulate missing values per column: absolute count and share of all rows
print("🔍 Missing Values Check:")
print("=" * 50)
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

missing_df = pd.concat(
    [
        missing_values.rename('Missing Count'),
        missing_percentage.rename('Missing Percentage'),
    ],
    axis=1,
)
# Keep only the columns that actually have gaps
missing_df = missing_df.loc[missing_df['Missing Count'].gt(0)]

if not missing_df.empty:
    print(missing_df)
else:
    print("✅ No missing values found!")


In [None]:
# Draw a cell-by-cell map of where values are missing (True = missing)
null_mask = df.isna()

plt.figure(figsize=(10, 6))
sns.heatmap(data=null_mask, cmap='viridis', cbar=True, yticklabels=False)
plt.title('Missing Values Heatmap')
plt.tight_layout()
plt.show()

print("✅ Missing values visualization complete!")


In [None]:
# Separate categorical and numerical columns.
# 'number' matches every numeric dtype (int32, float32, nullable Int64, ...),
# not only int64/float64, so no numeric column is silently left out of the
# later correlation analysis. 'category' is selected alongside 'object' so
# category-typed columns are analysed with the other categorical variables.
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = df.select_dtypes(include='number').columns.tolist()

print(f"📊 Categorical columns: {categorical_cols}")
print(f"📈 Numerical columns: {numerical_cols}")

# Display the frequency table and number of distinct values per categorical column
print("\n🔤 Categorical Variables Analysis:")
print("=" * 50)
for col in categorical_cols:
    print(f"\n{col}:")
    print(df[col].value_counts())
    print(f"Unique values: {df[col].nunique()}")


In [None]:
# Visualize categorical variables on a 2x2 grid (at most the first four columns)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

# Slice instead of checking the index inside the loop; columns beyond the
# grid capacity are not plotted.
plotted_cols = categorical_cols[:len(axes)]
for ax, col in zip(axes, plotted_cols):
    sns.countplot(data=df, x=col, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.tick_params(axis='x', rotation=45)

# Hide any panel that received no plot so the figure does not show empty axes
for unused_ax in axes[len(plotted_cols):]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.show()


In [None]:
# Create study time bands (engineered feature)
def create_study_bands(study_time):
    """Map a numeric study-time value to an ordinal band label.

    Parameters
    ----------
    study_time : int or float
        Study-time value for one row. Missing values (NaN / None / pd.NA)
        are accepted.

    Returns
    -------
    str or None
        'Low' for values up to 1, 'Medium' for values up to 2, 'High' for
        values up to 3, 'Very High' for anything larger, and None when the
        input is missing.

    Notes
    -----
    Ordered thresholds are used instead of exact equality so that missing,
    fractional or sub-1 values are not all lumped into 'Very High'. The
    integer codes 1, 2, 3 and 4+ map exactly as before.
    """
    # A missing input must stay missing rather than become a real category
    if pd.isna(study_time):
        return None
    if study_time <= 1:
        return 'Low'
    if study_time <= 2:
        return 'Medium'
    if study_time <= 3:
        return 'High'
    return 'Very High'

# Label every row with its band and store the result as a new column
band_labels = df['study_time'].apply(create_study_bands)
df['study_time_band'] = band_labels

band_counts = df['study_time_band'].value_counts()
print(
    "✅ Study time bands created!",
    "\nStudy Time Bands Distribution:",
    band_counts,
    sep="\n",
)


In [None]:
# Join the breakfast and lunch values of each row into one "<breakfast>_<lunch>" label
breakfast_part = df['breakfast']
lunch_part = df['lunch']
df['meal_habit'] = breakfast_part + '_' + lunch_part

meal_habit_counts = df['meal_habit'].value_counts()
print(
    "✅ Meal habit feature created!",
    "\nMeal Habit Distribution:",
    meal_habit_counts,
    sep="\n",
)


In [None]:
# Visualize the new engineered features side by side
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Study time bands, drawn in their ordinal order (Low -> Very High) rather
# than seaborn's default data-derived order. Only bands present in the data
# are listed, so no empty bar slots are added.
observed_bands = set(df['study_time_band'].dropna())
band_order = [
    band for band in ['Low', 'Medium', 'High', 'Very High']
    if band in observed_bands
]
sns.countplot(data=df, x='study_time_band', order=band_order, ax=axes[0])
axes[0].set_title('Distribution of Study Time Bands')

# Meal habits
sns.countplot(data=df, x='meal_habit', ax=axes[1])
axes[1].set_title('Distribution of Meal Habits')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()


In [None]:
# Restrict the frame to the numerical columns identified earlier
numerical_df = df.loc[:, numerical_cols]

# Pairwise correlations between those columns (DataFrame.corr defaults)
correlation_matrix = numerical_df.corr()

rounded_matrix = correlation_matrix.round(3)
print("📊 Correlation Matrix:", "=" * 50, rounded_matrix, sep="\n")


In [None]:
# Heatmap of the correlation matrix with the upper triangle (and diagonal) masked out
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

heatmap_options = dict(
    mask=mask,
    annot=True,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": .8},
)

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap of Numerical Variables')
plt.tight_layout()
plt.show()


In [None]:
# Find top correlations with final_grade.
# Variables are ranked by absolute strength, but the printed value keeps its
# sign so that a negative relationship is not reported as a positive one.
signed_grade_correlations = correlation_matrix['final_grade']
grade_correlations = signed_grade_correlations.abs().sort_values(ascending=False)
print("🔝 Top correlations with final_grade:")
print("=" * 50)
for var in grade_correlations.index:
    # Skip the target's correlation with itself
    if var == 'final_grade':
        continue
    corr = signed_grade_correlations[var]
    print(f"{var}: {corr:.3f}")


In [None]:
# Pair plot (scatter grid with KDE diagonals) for a fixed list of variables.
# The list is written out by hand; it is not computed from the correlation ranking.
top_corr_vars = [
    'final_grade',
    'previous_score',
    'attendance_rate',
    'study_time',
]
pair_data = df[top_corr_vars]
sns.pairplot(pair_data, diag_kind='kde')
plt.suptitle('Pair Plot of Top Correlated Variables', y=1.02)
plt.show()


In [None]:
# Create pair plot for top correlated variables
# NOTE(review): this cell repeats the preceding pair-plot cell line for line,
# so the same figure is drawn twice — confirm whether it can be removed.
# The variable list below is hard-coded, not derived from the correlation ranking.
top_corr_vars = ['final_grade', 'previous_score', 'attendance_rate', 'study_time']
sns.pairplot(df[top_corr_vars], diag_kind='kde')
# y=1.02 places the figure title slightly above the top of the grid
plt.suptitle('Pair Plot of Top Correlated Variables', y=1.02)
plt.show()


In [None]:
# Option A: Non-academic factors analysis
print("🔍 Option A: Non-academic factors analysis")
print("=" * 50)

# One row of three box plots of final_grade.
# Each entry: (grouping column, panel title, rotate x tick labels?)
box_panels = [
    ('gender', 'Final Grade by Gender', False),
    ('meal_habit', 'Final Grade by Meal Habit', True),
    ('breakfast', 'Final Grade by Breakfast', False),
]

plt.figure(figsize=(15, 5))
for panel_number, (factor, panel_title, rotate_labels) in enumerate(box_panels, start=1):
    plt.subplot(1, 3, panel_number)
    sns.boxplot(data=df, x=factor, y='final_grade')
    plt.title(panel_title)
    if rotate_labels:
        plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

# Statistical tests: independent two-sample t-tests on final_grade
final_grades = df['final_grade']

# Gender: rows labelled 'Male' vs rows labelled 'Female'
male_grades = final_grades[df['gender'] == 'Male']
female_grades = final_grades[df['gender'] == 'Female']
t_stat, p_value = stats.ttest_ind(male_grades, female_grades)
print(f"\nGender difference t-test p-value: {p_value:.4f}")

# Breakfast: rows labelled 'Yes' vs rows labelled 'No'
breakfast_yes = final_grades[df['breakfast'] == 'Yes']
breakfast_no = final_grades[df['breakfast'] == 'No']
t_stat2, p_value2 = stats.ttest_ind(breakfast_yes, breakfast_no)
print(f"Breakfast impact t-test p-value: {p_value2:.4f}")

print("\n📊 Summary Statistics by Gender:")
gender_summary = df.groupby('gender')['final_grade'].agg(['mean', 'std', 'count'])
print(gender_summary)


In [None]:
# Option B: Study time and parental education interaction
print("🔍 Option B: Study time and parental education interaction")
print("=" * 50)

# Ordinal order of the study-time bands. Without it the pivot columns come
# out alphabetically (High, Low, Medium, Very High) and the count plot uses
# seaborn's default data-derived order.
observed_bands = set(df['study_time_band'].dropna())
band_order = [
    band for band in ['Low', 'Medium', 'High', 'Very High']
    if band in observed_bands
]

# 1. Create pivot table: mean final grade per (parental education, study band)
pivot_table = df.pivot_table(
    values='final_grade',
    index='parental_education',
    columns='study_time_band',
    aggfunc='mean'
)
# Reorder columns Low -> Very High; keep only bands the pivot actually produced
pivot_table = pivot_table[[band for band in band_order if band in pivot_table.columns]]
print("Average Final Grade by Education and Study Time:")
print(pivot_table.round(1))

# 2. Visualization: 2x2 grid, starting with the pivot heatmap
plt.figure(figsize=(12, 8))
plt.subplot(2, 2, 1)
sns.heatmap(pivot_table, annot=True, cmap='YlOrRd', fmt='.1f')
plt.title('Average Final Grade by Education and Study Time')

# 3. Box plot by education level
plt.subplot(2, 2, 2)
sns.boxplot(data=df, x='parental_education', y='final_grade')
plt.title('Final Grade by Parental Education')
plt.xticks(rotation=45)

# 4. Study time distribution by education (bands in ordinal order)
plt.subplot(2, 2, 3)
sns.countplot(data=df, x='study_time_band', hue='parental_education', order=band_order)
plt.title('Study Time Distribution by Education')
plt.xticks(rotation=45)

# 5. Interaction plot: one line per education level, mean grade per study_time value
plt.subplot(2, 2, 4)
# dropna() avoids an empty line labelled "nan" when parental_education has gaps
for education in df['parental_education'].dropna().unique():
    subset = df[df['parental_education'] == education]
    study_means = subset.groupby('study_time')['final_grade'].mean()
    plt.plot(study_means.index, study_means.values, marker='o', label=education)
plt.xlabel('Study Time (hours)')
plt.ylabel('Average Final Grade')
plt.title('Study Time vs Grade by Education')
plt.legend()

plt.tight_layout()
plt.show()

# Statistical analysis: mean grade and group size for every education/band pair
print("\n📊 Summary Statistics:")
summary = df.groupby(['parental_education', 'study_time_band'])['final_grade'].agg(['mean', 'count'])
print(summary.round(1))


In [None]:
# Option A: Non-academic factors analysis
# NOTE(review): this cell repeats the earlier Option A cell — confirm whether
# both copies are needed.
print("🔍 Option A: Non-academic factors analysis")
print("=" * 50)

# 1. Gender analysis
plt.figure(figsize=(15, 5))
plt.subplot(1, 3, 1)
sns.boxplot(data=df, x='gender', y='final_grade')
plt.title('Final Grade by Gender')

# 2. Meal habit analysis
plt.subplot(1, 3, 2)
sns.boxplot(data=df, x='meal_habit', y='final_grade')
plt.title('Final Grade by Meal Habit')
plt.xticks(rotation=45)

# 3. Breakfast impact
plt.subplot(1, 3, 3)
sns.boxplot(data=df, x='breakfast', y='final_grade')
plt.title('Final Grade by Breakfast')

plt.tight_layout()
plt.show()

# Statistical tests: independent two-sample t-tests on final_grade.
# `stats` comes from the notebook's top import cell, so it is not re-imported here.
male_grades = df[df['gender'] == 'Male']['final_grade']
female_grades = df[df['gender'] == 'Female']['final_grade']
t_stat, p_value = stats.ttest_ind(male_grades, female_grades)
# "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
print(f"\nGender difference t-test p-value: {p_value:.4f}")

# Breakfast impact
breakfast_yes = df[df['breakfast'] == 'Yes']['final_grade']
breakfast_no = df[df['breakfast'] == 'No']['final_grade']
t_stat2, p_value2 = stats.ttest_ind(breakfast_yes, breakfast_no)
print(f"Breakfast impact t-test p-value: {p_value2:.4f}")

print("\n📊 Summary Statistics by Gender:")
print(df.groupby('gender')['final_grade'].agg(['mean', 'std', 'count']))
