# Task 1: Student Performance Indicator

This notebook builds a simple regression model to predict students' exam scores and explores key factors.

- Load dataset from `data/` (CSV/XLS/XLSX automatically detected)
- Quick EDA and cleaning
- Visualizations: distribution, study hours vs score, and a well-organized correlation matrix
- Linear Regression model with evaluation metrics (R², RMSE, MAE)
- Feature importance overview
- Optional: Polynomial regression on study hours

> Dataset used: `data/Student_Performance_Factors.csv` if it exists; otherwise the CSV/Excel file under `data/` whose name best matches common student-performance keywords (`student`, `performance`, `score`, `exam`).


In [None]:
# Imports and utility functions
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Optional, Tuple
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error


def find_dataset_path() -> Optional[Tuple[str, str]]:
	"""Locate the dataset file under ``data/`` (relative to the working directory).

	``data/Student_Performance_Factors.csv`` wins outright when it is a regular
	file. Otherwise ``data/`` is walked recursively and every CSV or Excel file
	is ranked by how many of the keywords ``student``, ``performance``,
	``score`` and ``exam`` occur in its lower-cased name; ties go to the name
	that sorts last.

	Names starting with ``~$`` (the lock files Excel keeps next to an open
	workbook) or with ``.`` are skipped. A lock file shares the keywords of
	the real workbook and would otherwise win the tie-break, because ``~``
	sorts after every letter.

	Returns:
		``(path, fmt)`` with ``fmt`` being ``'csv'`` or ``'excel'``, or ``None``
		when ``data/`` is missing or contains no candidate file.
	"""
	data_dir = 'data'
	preferred = os.path.join(data_dir, 'Student_Performance_Factors.csv')
	# isfile rather than exists: a directory with that name cannot be read as CSV.
	if os.path.isfile(preferred):
		return preferred, 'csv'
	if not os.path.isdir(data_dir):
		return None

	keywords = ('student', 'performance', 'score', 'exam')
	candidates = []
	for root, _, files in os.walk(data_dir):
		for name in files:
			if name.startswith(('~$', '.')):
				continue
			lower = name.lower()
			if lower.endswith('.csv'):
				fmt = 'csv'
			elif lower.endswith(('.xlsx', '.xls')):
				fmt = 'excel'
			else:
				continue
			hits = sum(1 for k in keywords if k in lower)
			candidates.append((hits, lower, os.path.join(root, name), fmt))
	if not candidates:
		return None

	# Rank by keyword hits first, then by lower-cased name.
	_, _, path, fmt = max(candidates, key=lambda t: t[:2])
	return path, fmt


def infer_target_column(df: pd.DataFrame) -> Optional[str]:
	"""Guess which column of ``df`` holds the score to predict.

	Keywords are tried in priority order (``exam_score``, ``score``, ``marks``,
	``mark``, ``grade``, ``final``, ``result``). For each keyword a numeric
	column whose lower-cased name equals the keyword is preferred; failing
	that, the first numeric column whose name contains it is used. Checking
	the exact name first keeps e.g. ``Score`` from losing to an earlier
	``Previous_Scores``.

	If no keyword matches, a column literally named ``Exam_Score`` is returned
	(even when it is not numeric), and otherwise the first numeric column.

	Args:
		df: Frame to inspect. Column labels need not be strings.

	Returns:
		The original column label, or ``None`` when nothing qualifies.
	"""
	name_priority = ['exam_score', 'score', 'marks', 'mark', 'grade', 'final', 'result']
	# str() tolerates non-string labels (e.g. integer headers). A list of pairs,
	# unlike a dict keyed by lower-cased name, keeps columns that differ only by case.
	numeric_named = [
		(str(col).lower(), col)
		for col in df.columns
		if pd.api.types.is_numeric_dtype(df[col])
	]
	for key in name_priority:
		for lowered, col in numeric_named:
			if lowered == key:
				return col
		for lowered, col in numeric_named:
			if key in lowered:
				return col
	if 'Exam_Score' in df.columns:
		return 'Exam_Score'
	numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
	return numeric_cols[0] if numeric_cols else None


In [None]:
# Load dataset: locate the file, then read it with the matching pandas reader
found = find_dataset_path()
if not found:
	raise FileNotFoundError("Dataset not found. Place CSV/XLSX in data/.")
path, fmt = found
# Any format tag other than 'csv' is read as an Excel workbook.
df = pd.read_csv(path) if fmt == 'csv' else pd.read_excel(path)
print(f"Loaded: {path} | shape={df.shape}")
# Last expression of the cell: the notebook displays the first rows.
df.head()


In [None]:
# Basic cleaning
# Attempt numeric conversion for object columns. A column is converted only
# when every value parses; otherwise it is left untouched as text. This is
# what pd.to_numeric(errors='ignore') did, but that option is deprecated
# since pandas 2.2, so the failure is handled explicitly here.
for c in df.columns:
	if df[c].dtype == 'object':
		try:
			df[c] = pd.to_numeric(df[c])
		except (ValueError, TypeError):
			# At least one value is not numeric: keep the column as-is.
			continue

# Impute missing values: median for numeric columns, most frequent value
# otherwise. Columns without missing values are skipped.
for c in df.columns:
	if not df[c].isnull().any():
		continue
	if pd.api.types.is_numeric_dtype(df[c]):
		df[c] = df[c].fillna(df[c].median())
	else:
		m = df[c].mode(dropna=True)
		# mode() is empty when the column holds only missing values.
		if not m.empty:
			df[c] = df[c].fillna(m.iloc[0])

# Last expression of the cell: remaining missing-value count per column.
df.isnull().sum()


In [None]:
# Target selection: use 'Exam_Score' when the column exists, otherwise fall
# back to the name-based guess from infer_target_column().
target = 'Exam_Score' if 'Exam_Score' in df.columns else infer_target_column(df)
assert target is not None, "No numeric target found"
print('Target:', target)


In [None]:
# Visualizations: target histogram | study hours vs target | correlation heatmap
import matplotlib as mpl
mpl.rcParams['figure.figsize'] = (12, 4)

fig = plt.figure(figsize=(12, 4))

# Panel 1: distribution of the target.
ax = fig.add_subplot(1, 3, 1)
ax.hist(df[target], bins=20, alpha=0.7)
ax.set_title(f'Distribution of {target}')
ax.set_xlabel(target)
ax.set_ylabel('Frequency')

# Panel 2 (only when such a column exists): first column whose name contains
# both 'hour' and 'stud', plotted against the target.
study_col = next(
	(col for col in df.columns if 'hour' in col.lower() and 'stud' in col.lower()),
	None,
)

if study_col:
	ax = fig.add_subplot(1, 3, 2)
	ax.scatter(df[study_col], df[target], alpha=0.6)
	ax.set_title(f'{study_col} vs {target}')
	ax.set_xlabel(study_col)
	ax.set_ylabel(target)

# Panel 3: correlation matrix of the numeric columns.
ax = fig.add_subplot(1, 3, 3)
numeric_df = df.select_dtypes(include=[np.number])
if numeric_df.shape[1] < 2:
	ax.text(0.5, 0.5, 'Not enough numeric columns', ha='center', va='center')
	ax.axis('off')
else:
	corr = numeric_df.corr(numeric_only=True)
	if target in corr.columns:
		# Keep at most the 8 columns most strongly correlated (by absolute
		# value) with the target, and move the target to the last row/column.
		by_strength = corr[target].abs().sort_values(ascending=False)
		top_n = min(8, len(by_strength))
		order = by_strength.index[:top_n].tolist()
		if target in order:
			order = [name for name in order if name != target]
			order.append(target)
		corr = corr.loc[order, order]
	sns.heatmap(
		corr,
		annot=True,
		fmt='.2f',
		cmap='coolwarm',
		vmin=-1,
		vmax=1,
		center=0,
		square=True,
		linewidths=0.5,
		linecolor='lightgray',
		cbar_kws={'shrink': 0.7},
		ax=ax,
	)
	plt.xticks(rotation=45, ha='right')
	plt.yticks(rotation=0)
	ax.set_title('Correlation Matrix')

plt.tight_layout()
plt.show()


In [None]:
# Modeling: linear regression on all numeric + one-hot encoded features
numeric_features = df.select_dtypes(include=[np.number])
if target not in numeric_features.columns:
	# The target is not numeric yet. Coerce it first and take the median of
	# the coerced values; the median of the raw text column cannot be computed.
	coerced = pd.to_numeric(df[target], errors='coerce')
	if coerced.isna().all():
		raise ValueError(f"Target column {target!r} contains no numeric values")
	df[target] = coerced.fillna(coerced.median())
	numeric_features = df.select_dtypes(include=[np.number])
X = numeric_features.drop(columns=[target], errors='ignore')
y = df[target]

# One-hot encode text columns (first level dropped to avoid redundant columns).
cat_cols = df.select_dtypes(include=['object']).columns
if len(cat_cols) > 0:
	# dtype=float is required: pandas >= 2.0 creates boolean dummy columns by
	# default, and the np.number filter below would silently drop all of them.
	df_enc = pd.get_dummies(df, columns=cat_cols, drop_first=True, dtype=float)
	X = df_enc.select_dtypes(include=[np.number]).drop(columns=[target], errors='ignore')
	y = df_enc[target]

assert X.shape[1] > 0, 'No usable features after preprocessing'

# 80/20 split with a fixed seed so the reported metrics are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation on the held-out split.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)

print({'R2': r2, 'RMSE': rmse, 'MAE': mae})


In [None]:
# Predictions plots: actual-vs-predicted scatter (left), residual histogram (right)
fig = plt.figure(figsize=(10, 4))

ax = fig.add_subplot(1, 2, 1)
ax.scatter(y_test, y_pred, alpha=0.6)
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted')
# Dashed y = x reference line spanning the combined range of both series.
min_val = min(y_test.min(), y_pred.min())
max_val = max(y_test.max(), y_pred.max())
ax.plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=1)

ax = fig.add_subplot(1, 2, 2)
# Residual = actual minus predicted.
residuals = y_test - y_pred
ax.hist(residuals, bins=20, alpha=0.7)
ax.set_title('Residuals Distribution')
ax.set_xlabel('Residual')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()
