In [1]:
# Import dependencies
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

## Import data

In [2]:
# Load the pre-processed grades table from disk and preview the first five rows
df = pd.read_csv(filepath_or_buffer='data/preprocessed_grades_data.csv')
df.head(n=5)

Unnamed: 0,Age,Attendance (%),Midterm_Score,Final_Score,Assignments_Avg,Quizzes_Avg,Participation_Score,Projects_Score,Total_Score,Study_Hours_per_Week,...,Department_CS,Department_Engineering,Department_Mathematics,Extracurricular_Activities_No,Extracurricular_Activities_Yes,Internet_Access_at_Home_No,Internet_Access_at_Home_Yes,Family_Income_Level_High,Family_Income_Level_Low,Family_Income_Level_Medium
0,22.0,52.29,55.03,57.82,84.22,74.06,3.99,85.9,56.09,6.2,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,18.0,97.27,97.23,45.8,65.644,94.24,8.32,55.65,50.64,19.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
2,24.0,57.19,67.05,93.68,67.7,85.7,5.05,73.79,70.3,20.7,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
3,24.0,95.15,47.79,80.63,66.06,93.51,6.54,92.12,61.63,24.8,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0
4,23.0,54.18,46.59,78.89,96.85,83.7,5.97,68.42,66.13,15.4,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0


In [3]:
# Feature matrix: every column except the target.
# DataFrame.drop returns a new frame, so `df` itself is left untouched.
X = df.drop(columns=['Encoded_Grade'])
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 26 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age                             5000 non-null   float64
 1   Attendance (%)                  5000 non-null   float64
 2   Midterm_Score                   5000 non-null   float64
 3   Final_Score                     5000 non-null   float64
 4   Assignments_Avg                 5000 non-null   float64
 5   Quizzes_Avg                     5000 non-null   float64
 6   Participation_Score             5000 non-null   float64
 7   Projects_Score                  5000 non-null   float64
 8   Total_Score                     5000 non-null   float64
 9   Study_Hours_per_Week            5000 non-null   float64
 10  Stress_Level (1-10)             5000 non-null   float64
 11  Sleep_Hours_per_Night           5000 non-null   float64
 12  Parent_Education_Level          50

In [4]:
# Target vector: the encoded grade label for every row
y = df.loc[:, 'Encoded_Grade']

## Split data into training/test sets and scale the features

In [6]:
# Hold out 25% of the rows for testing (the train_test_split default, spelled out);
# the fixed seed keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=47,
)

In [7]:
# Standardiser that centres each feature and scales it to unit variance
scaler = StandardScaler(with_mean=True, with_std=True)

In [8]:
# Learn the scaling statistics from the training split only
scaler.fit(X_train)
# fit() returns the estimator itself, so X_scaler is the same fitted object
X_scaler = scaler

In [9]:
# Apply the fitted scaler to both splits, training split first
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

## Fitting the Random Forest Model - Attempt 1

In [11]:
# Random forest configuration for attempt 1: 500 trees, depth capped at 8, fixed seed
rf_model = RandomForestClassifier(
    random_state=47,
    max_depth=8,
    n_estimators=500,
)

In [13]:
# Train the forest on the scaled training features and their labels
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [14]:
# Predict a grade class for every row of the held-out test split
predictions = rf_model.predict(X=X_test_scaled)

In [17]:
# Overall fraction of test rows whose predicted class matches the true class
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

# Print the accuracy, a heading, and the per-class precision/recall/F1 table,
# one item per line
print(
    f"Accuracy Score : {acc_score}",
    "Classification Report",
    classification_report(y_true=y_test, y_pred=predictions),
    sep="\n",
)

Accuracy Score : 0.3704
Classification Report
              precision    recall  f1-score   support

         0.0       0.54      0.70      0.61       368
         1.0       0.26      0.04      0.06       243
         2.0       0.14      0.05      0.07       213
         3.0       0.30      0.57      0.40       214
         4.0       0.24      0.30      0.27       212

    accuracy                           0.37      1250
   macro avg       0.30      0.33      0.28      1250
weighted avg       0.33      0.37      0.32      1250



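## Not great!

Overall accuracy is only 0.37, and classes 1.0 and 2.0 are almost never recovered (recall of 0.04 and 0.05 respectively).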