In [ ]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# Read the student records from a CSV addressed by a relative path, so the
# notebook works wherever the data file is placed alongside it.
df = pd.read_csv(filepath_or_buffer='student.csv')
# Last expression of the cell: renders a preview of the first rows.
df.head()

In [ ]:
# Quick structural overview of the loaded frame: row count, column names,
# dtypes / non-null counts, and descriptive statistics.
print('Total number of students:', df.shape[0])  # one row per student
print('\nColumns:', df.columns.to_list())

print('\nDataset Info:')
df.info()  # prints its report directly; nothing to wrap in print()

print('\nSummary Statistics:')
summary_table = df.describe()
display(summary_table)  # display() renders the table mid-cell in a notebook

In [ ]:
# Correlation Heatmap
# Correlation is only defined for numeric data. On pandas >= 2.0, calling
# df.corr() on a frame that still contains string columns raises instead of
# silently dropping them, so restrict the frame to numeric/bool columns first.
# select_dtypes works on old and new pandas alike.
numeric_df = df.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=(10,6))
# annot + fmt print each coefficient in its cell, rounded to two decimals.
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Heatmap')
plt.show()

In [ ]:
# Define features (X) and target (y)
# 'G3' is assumed to be the target (final grade) column.
y = df['G3']

# The estimators used later accept numeric input only, so any string or
# categorical feature column would make .fit() fail. One-hot encode such
# columns; get_dummies leaves numeric columns as they are, so an all-numeric
# frame passes through unchanged. drop_first avoids a redundant (perfectly
# collinear) indicator per category, and dtype=int keeps every column numeric.
X = pd.get_dummies(df.drop(columns='G3'), drop_first=True, dtype=int)

# Train-test split: hold out 20% of the rows; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [ ]:
# Candidate regressors, keyed by the display name used in the results table.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
}

# Fit every candidate on the training split and score it on the held-out
# split. Each entry maps model name -> {metric name -> value}.
results = {}
for label, estimator in models.items():
    # fit() returns the estimator itself, so prediction can be chained.
    predictions = estimator.fit(X_train, y_train).predict(X_test)

    squared_error = mean_squared_error(y_test, predictions)
    results[label] = {
        'MAE': mean_absolute_error(y_test, predictions),
        'MSE': squared_error,
        'RMSE': np.sqrt(squared_error),  # same units as the target
        'R2': r2_score(y_test, predictions),
    }

# Display results: transpose so that rows are models and columns are metrics.
results_df = pd.DataFrame(results).transpose()
results_df

In [ ]:
# Pick the model with the highest R2 in the results table and plot its
# predictions against the true test-set grades.
r2_scores = results_df['R2']
best_model_name = r2_scores.idxmax()
best_model = models[best_model_name]  # already fitted in the training cell
y_pred_best = best_model.predict(X_test)

fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(x=y_test, y=y_pred_best, alpha=0.7, ax=ax)
# Set the labels after plotting so they replace any label seaborn derives
# from the input data.
ax.set_xlabel('Actual Grades')
ax.set_ylabel('Predicted Grades')
ax.set_title(f'Actual vs Predicted ({best_model_name})')
plt.show()

In [ ]:
# Persist the best performing model to disk with joblib.
# File name is derived from the model's display name: lower-cased, spaces
# turned into underscores, with a "_model.pkl" suffix.
model_slug = '_'.join(best_model_name.lower().split(' '))
joblib.dump(best_model, model_slug + '_model.pkl')
print(f'Best model "{best_model_name}" saved successfully!')

### 📌 Conclusion
- Multiple models were tested (Linear Regression, Ridge Regression, Random Forest).
- The results table shows performance metrics (MAE, MSE, RMSE, R²).
- The best-performing model (the one with the highest R² on the held-out test set) is automatically identified and saved with joblib.
- Random Forest often performs best because it can capture nonlinear patterns, but the ranking depends on the dataset's characteristics, so rely on the results table rather than assuming a winner.

✅ This notebook is now portable, more robust, and professional.