In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [2]:
# import your data
data = pd.read_csv('mock_data.csv')

In [7]:
data.head()

Unnamed: 0,student_id,student_name,c_internal_exam1_marks,c_internal_exam2_marks,c_attendance_percentage,c_final_exam_marks,c_preparation_time_daily_hours,java_internal_exam1_marks,java_internal_exam2_marks,java_attendance_percentage,...,cn_internal_exam1_marks,cn_internal_exam2_marks,cn_attendance_percentage,cn_final_exam_marks,cn_preparation_time_daily_hours,dsa_internal_exam1_marks,dsa_internal_exam2_marks,dsa_attendance_percentage,dsa_final_exam_marks,dsa_preparation_time_daily_hours
0,1,Marcel Rack,2,18,23.46,14,11.68,35,29,34.43,...,33,38,78.81,25,3.37,11,38,21.86,88,16.76
1,2,Barrett Pitfield,26,25,6.42,84,7.94,31,21,2.45,...,2,49,26.49,1,12.34,0,28,77.7,91,13.11
2,3,Marley Perin,17,32,55.67,29,16.15,6,20,54.37,...,50,48,52.05,58,18.47,32,8,11.64,89,23.69
3,4,Dewain McPhelim,7,11,73.16,20,9.23,28,30,69.54,...,25,39,62.62,50,19.42,16,14,86.84,70,22.3
4,5,Maryjo Grundy,45,6,23.29,97,10.8,1,1,37.48,...,22,8,91.7,61,10.92,25,33,62.14,95,8.54


In [8]:
data.describe()

Unnamed: 0,student_id,c_internal_exam1_marks,c_internal_exam2_marks,c_attendance_percentage,c_final_exam_marks,c_preparation_time_daily_hours,java_internal_exam1_marks,java_internal_exam2_marks,java_attendance_percentage,java_final_exam_marks,...,cn_internal_exam1_marks,cn_internal_exam2_marks,cn_attendance_percentage,cn_final_exam_marks,cn_preparation_time_daily_hours,dsa_internal_exam1_marks,dsa_internal_exam2_marks,dsa_attendance_percentage,dsa_final_exam_marks,dsa_preparation_time_daily_hours
count,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,...,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0,60.0
mean,30.5,24.383333,24.8,46.862333,54.216667,13.377667,21.3,25.066667,47.344833,50.616667,...,24.95,23.633333,42.972,51.733333,11.6475,25.95,22.383333,45.608,51.483333,13.437167
std,17.464249,15.578224,13.834126,31.40698,31.222597,6.362492,14.304557,16.118067,32.18019,31.589578,...,13.517628,15.508299,28.27733,29.52957,6.592724,15.082176,15.569518,28.013407,30.46281,7.455795
min,1.0,1.0,0.0,0.12,0.0,0.91,1.0,0.0,0.58,1.0,...,1.0,0.0,0.87,0.0,0.35,0.0,0.0,0.5,0.0,0.02
25%,15.75,9.75,15.0,19.32,29.75,8.6075,7.75,8.75,20.055,23.0,...,14.0,9.0,21.4925,28.75,5.83,15.75,6.75,20.935,25.75,7.6825
50%,30.5,24.0,23.5,41.88,55.0,13.44,20.5,27.0,40.07,50.0,...,24.5,22.0,42.91,56.0,12.28,25.5,20.5,42.125,50.5,13.855
75%,45.25,37.25,35.25,75.0425,84.75,18.595,32.25,38.0,77.875,81.75,...,36.0,37.25,63.365,78.25,17.8875,37.5,34.25,72.2325,74.25,21.0325
max,60.0,50.0,50.0,99.59,99.0,23.97,49.0,49.0,98.65,100.0,...,50.0,50.0,96.26,100.0,22.45,50.0,50.0,92.89,97.0,23.81


In [10]:
''' finding the names of the top 10 students based on the average of final semester marks for all subjects.
    We can use this data later for evaluating the model.
'''


# Target columns
target_cols = ['c_final_exam_marks', 'java_final_exam_marks', 'python_final_exam_marks', 'cn_final_exam_marks', 'dsa_final_exam_marks']

# Calculate the average final semester marks for each student
data['avg_final_marks'] = data[target_cols].mean(axis=1)

# Sort the data based on the average final semester marks in descending order
sorted_data = data.sort_values(by='avg_final_marks', ascending=False)

# Print the names of the top 10 performers
print("Top 10 Performers (based on average final semester marks):")
top_10_students = sorted_data['student_name'].head(10)
for i, student in enumerate(top_10_students, start=1):
    print(f"{i}. {student}")

Top 10 Performers (based on average final semester marks):
1. Maryjo Grundy
2. Isis Kimpton
3. Rosabella Goldis
4. Indira Arnet
5. Sheelah Glenister
6. Iggie Gemeau
7. Barrett Pitfield
8. Beryle Buckbee
9. Keefer Lowis
10. Francoise Hillum


In [3]:
# splitting variables into features and targets
features = data.drop(['student_name', 'c_final_exam_marks', 'java_final_exam_marks', 'python_final_exam_marks', 'cn_final_exam_marks', 'dsa_final_exam_marks'], axis=1)
targets = data[['c_final_exam_marks', 'java_final_exam_marks', 'python_final_exam_marks', 'cn_final_exam_marks', 'dsa_final_exam_marks']]

In [11]:
features.head()

Unnamed: 0,student_id,c_internal_exam1_marks,c_internal_exam2_marks,c_attendance_percentage,c_preparation_time_daily_hours,java_internal_exam1_marks,java_internal_exam2_marks,java_attendance_percentage,java_preparation_time_daily_hours,python_internal_exam1_marks,...,python_attendance_percentage,python_preparation_time_daily_hours,cn_internal_exam1_marks,cn_internal_exam2_marks,cn_attendance_percentage,cn_preparation_time_daily_hours,dsa_internal_exam1_marks,dsa_internal_exam2_marks,dsa_attendance_percentage,dsa_preparation_time_daily_hours
0,1,2,18,23.46,11.68,35,29,34.43,18.94,16,...,88.14,13.38,33,38,78.81,3.37,11,38,21.86,16.76
1,2,26,25,6.42,7.94,31,21,2.45,15.11,32,...,4.15,20.07,2,49,26.49,12.34,0,28,77.7,13.11
2,3,17,32,55.67,16.15,6,20,54.37,23.43,44,...,38.24,9.49,50,48,52.05,18.47,32,8,11.64,23.69
3,4,7,11,73.16,9.23,28,30,69.54,20.73,48,...,0.2,0.99,25,39,62.62,19.42,16,14,86.84,22.3
4,5,45,6,23.29,10.8,1,1,37.48,5.48,36,...,93.88,10.9,22,8,91.7,10.92,25,33,62.14,8.54


In [12]:
targets.head()

Unnamed: 0,c_final_exam_marks,java_final_exam_marks,python_final_exam_marks,cn_final_exam_marks,dsa_final_exam_marks
0,14,40,100,25,88
1,84,85,88,1,91
2,29,96,17,58,89
3,20,58,43,50,70
4,97,91,48,61,95


In [4]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=42)


In [5]:
# Create a Random Forest Regressor model for each subject in the data
c_model = RandomForestRegressor()
java_model = RandomForestRegressor()
python_model = RandomForestRegressor()
cn_model = RandomForestRegressor()
dsa_model = RandomForestRegressor()

In [13]:
c_model

In [14]:
# Train the models for each subhject
c_model.fit(X_train, y_train['c_final_exam_marks'])
java_model.fit(X_train, y_train['java_final_exam_marks'])
python_model.fit(X_train, y_train['python_final_exam_marks'])
cn_model.fit(X_train, y_train['cn_final_exam_marks'])
dsa_model.fit(X_train, y_train['dsa_final_exam_marks'])

In [15]:

# Evaluate the models on the testing set
c_preds = c_model.predict(X_test)
java_preds = java_model.predict(X_test)
python_preds = python_model.predict(X_test)
cn_preds = cn_model.predict(X_test)
dsa_preds = dsa_model.predict(X_test)


In [16]:
# predicted values
c_preds

array([43.64, 69.63, 48.73, 44.47, 85.  , 67.44, 48.34, 71.5 , 60.68,
       66.69, 55.56, 52.87])

In [17]:
# calculating RMSE values, lower values indicate better accuracy

c_rmse = mean_squared_error(y_test['c_final_exam_marks'], c_preds, squared=False)
java_rmse = mean_squared_error(y_test['java_final_exam_marks'], java_preds, squared=False)
python_rmse = mean_squared_error(y_test['python_final_exam_marks'], python_preds, squared=False)
cn_rmse = mean_squared_error(y_test['cn_final_exam_marks'], cn_preds, squared=False)
dsa_rmse = mean_squared_error(y_test['dsa_final_exam_marks'], dsa_preds, squared=False)




In [18]:
# printing the RMSE values
print(f"C RMSE: {c_rmse}")
print(f"Java RMSE: {java_rmse}")
print(f"Python RMSE: {python_rmse}")
print(f"CN RMSE: {cn_rmse}")
print(f"DSA RMSE: {dsa_rmse}")


C RMSE: 36.520186769328916
Java RMSE: 30.016390800138957
Python RMSE: 35.889372266266605
CN RMSE: 35.5959231045729
DSA RMSE: 36.70884316709894


In [22]:
student_name = "Maryjo Grundy"
student_row = data.loc[data['student_name'] == student_name]
feature_cols= ['student_name', 'c_final_exam_marks', 'java_final_exam_marks', 'python_final_exam_marks', 'cn_final_exam_marks', 'dsa_final_exam_marks']
if not student_row.empty:
    # Extract the feature data for the student
    student_features = student_row[feature_cols]

    # Make predictions for the student
    c_final_marks = c_model.predict(student_features)
    java_final_marks = java_model.predict(student_features)
    python_final_marks = python_model.predict(student_features)
    cn_final_marks = cn_model.predict(student_features)
    dsa_final_marks = dsa_model.predict(student_features)

    # Get the actual marks from the data
    actual_c_marks = student_row['c_final_exam_marks'].values[0]
    actual_java_marks = student_row['java_final_exam_marks'].values[0]
    actual_python_marks = student_row['python_final_exam_marks'].values[0]
    actual_cn_marks = student_row['cn_final_exam_marks'].values[0]
    actual_dsa_marks = student_row['dsa_final_exam_marks'].values[0]

    print(f"Predicted final semester marks for {student_name}:")
    print(f"C: {c_final_marks[0]:.2f} (Actual: {actual_c_marks})")
    print(f"Java: {java_final_marks[0]:.2f} (Actual: {actual_java_marks})")
    print(f"Python: {python_final_marks[0]:.2f} (Actual: {actual_python_marks})")
    print(f"CN: {cn_final_marks[0]:.2f} (Actual: {actual_cn_marks})")
    print(f"DSA: {dsa_final_marks[0]:.2f} (Actual: {actual_dsa_marks})")

    print("\nDifference between actual and predicted marks:")
    print(f"C: {abs(c_final_marks[0] - actual_c_marks):.2f}")
    print(f"Java: {abs(java_final_marks[0] - actual_java_marks):.2f}")
    print(f"Python: {abs(python_final_marks[0] - actual_python_marks):.2f}")
    print(f"CN: {abs(cn_final_marks[0] - actual_cn_marks):.2f}")
    print(f"DSA: {abs(dsa_final_marks[0] - actual_dsa_marks):.2f}")

else:
    print(f"Student '{student_name}' not found in the data.")

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- c_final_exam_marks
- cn_final_exam_marks
- dsa_final_exam_marks
- java_final_exam_marks
- python_final_exam_marks
- ...
Feature names seen at fit time, yet now missing:
- c_attendance_percentage
- c_internal_exam1_marks
- c_internal_exam2_marks
- c_preparation_time_daily_hours
- cn_attendance_percentage
- ...
