## Feature engineering

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# 2. Interaction Features
def create_interaction_features(df):
    """Return a copy of ``df`` extended with three interaction columns.

    Columns added (in this order):
      * ``ParentStudyInteraction``: ``ParentalSupport * StudyTimeWeekly``.
      * ``ExtracurricularIntensity``: ``Sports + Music + Volunteering``.
      * ``AcademicInvolvement``: ``StudyTimeWeekly * (1 + 0.5 * Tutoring)``.

    The frame passed in is not modified.
    """
    study_time = df['StudyTimeWeekly']

    # Each unit of Tutoring adds 50% on top of the raw weekly study time;
    # rows with Tutoring == 0 keep their study time unchanged.
    tutoring_multiplier = 1 + 0.5 * df['Tutoring']

    # assign() works on a copy, so the caller's frame stays untouched
    return df.assign(
        ParentStudyInteraction=df['ParentalSupport'] * study_time,
        ExtracurricularIntensity=df['Sports'] + df['Music'] + df['Volunteering'],
        AcademicInvolvement=study_time * tutoring_multiplier,
    )

# 3. Ratio and Aggregate Features
def create_ratio_aggregate_features(df):
    """Return a copy of ``df`` extended with two ratio columns.

    Columns added (in this order):
      * ``StudyEfficiency``: ``StudyTimeWeekly / (Absences + 1)``.
      * ``AcademicBalance``: ``StudyTimeWeekly`` divided by one plus the
        row-wise sum of ``Sports``, ``Music`` and ``Volunteering``.

    The frame passed in is not modified.
    """
    study_time = df['StudyTimeWeekly']
    activity_total = df[['Sports', 'Music', 'Volunteering']].sum(axis=1)

    # Both denominators are shifted by 1 so that a count of zero divides
    # by 1 instead of by 0.
    return df.assign(
        StudyEfficiency=study_time / (df['Absences'] + 1),
        AcademicBalance=study_time / (activity_total + 1),
    )

# Combined function to apply both feature engineering techniques
def engineer_features(df):
    """Run the full feature-engineering pipeline on ``df``.

    The interaction features are added first; the ratio/aggregate features
    are then added to that result, and the combined frame is returned.
    """
    return create_ratio_aggregate_features(create_interaction_features(df))


## Random forest algorithm

In [None]:
# Load the prepared dataset and add the engineered feature columns
df = pd.read_csv('Student_performance_prepped_data.csv')
df_engineered = engineer_features(df)

# Target is the GradeClass column; every other column is used as a feature.
# NOTE(review): GPA stays in X and dominates the feature importances. If
# GradeClass is derived from GPA this is target leakage -- confirm against
# the dataset description before relying on the reported accuracy.
y = df_engineered['GradeClass']
X = df_engineered.drop(columns='GradeClass')

# Hold out 25% of the rows for testing, stratified on the target so the
# class proportions are preserved in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Candidate hyperparameters: 2 x 2 = 4 combinations
param_grid = {
    'n_estimators': [100, 200],  # number of trees in the forest
    'max_depth': [10, 20],       # maximum depth allowed for each tree
}

# Base model with a fixed random seed
rf = RandomForestClassifier(random_state=42)

# Exhaustive search over the grid with 5-fold cross-validation:
# 4 combinations x 5 folds = 20 fits, plus one final refit of the best one
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',  # metric used to rank the combinations
    cv=5,
    n_jobs=-1,           # run the fits on all available CPU cores
)
grid_search.fit(X_train, y_train)

# Best combination, refit on the whole training split
best_model = grid_search.best_estimator_

# Predictions for the held-out test rows
y_pred = best_model.predict(X_test)

## Analyzing the model

In [None]:
# Report the selected hyperparameters and the performance on the test split
test_accuracy = accuracy_score(y_test, y_pred)

print("Best parameters:", grid_search.best_params_)
print("\nAccuracy:", test_accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Pair every feature column with its importance in the tuned forest
feature_names = X.columns
feature_importances = best_model.feature_importances_

importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
})
# Most important features first (original row labels are kept)
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the 15 largest importances, written to disk and
# then closed so the figure is not left open
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=importance_df.head(15), ax=ax)
ax.set_title('Top 15 Feature Importances')
fig.tight_layout()
fig.savefig('feature_importance.png')
plt.close(fig)

print("\nTop 10 Most Important Features:")
print(importance_df.head(10))

# Gather the headline results in a single dict (built in memory only; the
# code in this cell does not write it out)
results = {
    'accuracy': test_accuracy,
    'best_params': grid_search.best_params_,
    'feature_importance': importance_df
}

# Persist the full importance table as CSV
importance_df.to_csv('feature_importance_random_forest_algo.csv', index=False)

print("\nModel training and evaluation complete.")

Best parameters: {'max_depth': 10, 'n_estimators': 200}

Accuracy: 0.9214046822742475

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.41      0.58        27
         1.0       0.86      0.84      0.85        67
         2.0       0.89      0.96      0.92        98
         3.0       0.90      0.91      0.91       103
         4.0       0.95      0.98      0.96       303

    accuracy                           0.92       598
   macro avg       0.92      0.82      0.84       598
weighted avg       0.92      0.92      0.92       598


Top 10 Most Important Features:
                     Feature  Importance
11                       GPA    0.525864
4                   Absences    0.181040
15           StudyEfficiency    0.082008
14       AcademicInvolvement    0.036607
3            StudyTimeWeekly    0.033765
12    ParentStudyInteraction    0.033432
16           AcademicBalance    0.031594
6            ParentalSupport    0.014667

# Explanation of the results

## Best parameters explanation:
  * `max_depth: 10`: Limits each tree to a maximum depth of 10 levels, which helps prevent the individual trees from overfitting.
  * `n_estimators: 200`: The number of trees in the forest. Using more trees increases the execution time of the model, but it generally makes the predictions more stable and accurate.

## Accuracy explanation:
 * Overall accuracy is **92.14%**; how this accuracy is distributed across the individual classes is explored further in the classification report.

## Classification report explanation:
   The classification report provides precision, recall, f1-score, and support for each class. 

   * **Class 0.0**:
     * **Explanation:** The model struggles to identify class 0.0 instances (indicated by the recall of 41% and the f1-score of 58%), likely due to the small number of data points available to train and test the model on (the support shows only 27 test instances). This is not a major issue: this class contains the top-performing students, so high accuracy here is not the biggest concern.

   * **Class 1.0**
     * **Explanation:** Reasonable performance on all metrics: precision is 86% and recall is 84%, giving an **f1-score of 85%**.

   * **Class 2.0 AND 3.0**:
     * **Explanation:** The model performs about equally well on both of these classes. The high recall (**96% for class 2.0** and **91% for class 3.0**) suggests it is reliable at identifying students in these classes; this is also reflected in the **f1-scores of 92% for class 2.0 and 91% for class 3.0**.


   * **Class 4.0**:
     * **Explanation:** The model performs best here, likely due to the large number of samples. This is also the most important class to predict accurately, as it identifies the problem students. High accuracy here is therefore very beneficial.

## The Top 10 most important features 
This shows which features have the greatest influence on the model's predictions of the target variable (GradeClass).

