In [None]:
import pandas as pd
import numpy as np

# Define the syllabus for each grade
syllabus = {
    'Kindergarten': ['Introduction to numbers', 'Patterns and sequencing', 'Measurement', 'Logical reasoning and problem-solving'],
    'Grade 1': ['Number operations', 'Place value', 'Fractions', 'Geometry', 'Time and money', 'Data handling'],
    'Grade 2': ['Number operations', 'Place value', 'Fractions', 'Geometry', 'Time and money', 'Data handling'],
    'Grade 3': ['Number operations', 'Place value', 'Fractions', 'Geometry', 'Time and money', 'Data handling'],
    'Grade 4': ['Whole numbers', 'Decimals', 'Fractions', 'Percentage', 'Mensuration', 'Symmetry', 'Introduction to algebra'],
    'Grade 5': ['Whole numbers', 'Decimals', 'Fractions', 'Percentage', 'Mensuration', 'Symmetry', 'Introduction to algebra'],
    'Grade 6': ['Rational numbers', 'Exponents and powers', 'Linear equations', 'Introduction to graphs', 'Mensuration', 'Pythagoras theorem', 'Ratio and proportion', 'Triangles', 'Introduction to statistics'],
    'Grade 7': ['Rational numbers', 'Exponents and powers', 'Linear equations', 'Introduction to graphs', 'Mensuration', 'Pythagoras theorem', 'Ratio and proportion', 'Triangles', 'Introduction to statistics'],
    'Grade 8': ['Rational numbers', 'Exponents and powers', 'Linear equations', 'Introduction to graphs', 'Mensuration', 'Pythagoras theorem', 'Ratio and proportion', 'Triangles', 'Introduction to statistics'],
    'Grade 9': ['Algebra', 'Linear equations', 'Quadratic equations', 'Trigonometry', 'Circles', 'Coordinate geometry', 'Mensuration', 'Statistics'],
    'Grade 10': ['Algebra', 'Linear equations', 'Quadratic equations', 'Trigonometry', 'Circles', 'Coordinate geometry', 'Mensuration', 'Statistics'],
    'Grade 11': ['Sets and relations', 'Permutations and combinations', 'Binomial theorem', 'Matrices and determinants', 'Calculus', 'Coordinate geometry', 'Vectors', 'Probability', 'Advanced Statistics'],
    'Grade 12': ['Sets and relations', 'Permutations and combinations', 'Binomial theorem', 'Matrices and determinants', 'Calculus', 'Coordinate geometry', 'Vectors', 'Probability', 'Advanced Statistics']
}

# Generate synthetic dataset
np.random.seed(0)
num_students = 100000
data = {
    'student_name': [f'Student_{i}' for i in range(1, num_students + 1)],
    'grade': np.random.choice(list(syllabus.keys()), size=num_students),
    'total_ques': 15
}

# Generate random quiz results (0 for wrong, 1 for right) for each topic in the syllabus
data['no_of_wrong_answer'] = np.random.randint(0, 16, size=num_students)
data['no_of_right_answer'] = 15 - data['no_of_wrong_answer']
data['ability_score'] = np.round((data['no_of_right_answer'] / 15) * 10, 2)

df = pd.DataFrame(data)
# Generate topics lacking for each student based on the number of wrong answers
for index, row in df.iterrows():
    num_wrong = row['no_of_wrong_answer']
    grade_topics = syllabus[row['grade']]
    if num_wrong > len(grade_topics):
        num_wrong = len(grade_topics)
    topics_lacking = np.random.choice(grade_topics, size=num_wrong, replace=False)
    df.at[index, 'topics_lacking'] = ', '.join(topics_lacking)

# Calculate expected score based on ability score and topics lacking importance
df['expected_score'] = df.apply(lambda x: (x['ability_score'] + (10 - len(x['topics_lacking'].split(', '))) * 10), axis=1)

# Display the first few rows of the synthetic dataset
df.head()


Unnamed: 0,student_name,grade,total_ques,no_of_wrong_answer,no_of_right_answer,ability_score,topics_lacking,expected_score
0,Student_1,Grade 12,15,9,6,4.0,"Advanced Statistics, Matrices and determinants...",14.0
1,Student_2,Grade 5,15,4,11,7.33,"Decimals, Symmetry, Whole numbers, Mensuration",67.33
2,Student_3,Kindergarten,15,9,6,4.0,"Logical reasoning and problem-solving, Measure...",64.0
3,Student_4,Grade 3,15,9,6,4.0,"Data handling, Number operations, Time and mon...",44.0
4,Student_5,Grade 11,15,7,8,5.33,"Permutations and combinations, Vectors, Sets a...",35.33


In [19]:
df.to_csv('performance_pred.csv', index=False)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Encode the 'topics_lacking' column
encoder = OneHotEncoder()
topics_encoded = encoder.fit_transform(df['topics_lacking'].str.get_dummies(', '))

# Combine the encoded topics with the other features
X = pd.concat([df[['no_of_wrong_answer', 'no_of_right_answer', 'ability_score']], pd.DataFrame(topics_encoded.toarray())], axis=1)
y = df['expected_score']
X.columns = X.columns.astype(str)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define the XGBoost model
model = XGBRegressor(objective ='reg:squarederror', n_estimators=20, max_depth=5, learning_rate=0.1)

# Fit the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate and print evaluation metrics
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')
print(f'Mean Absolute Error: {mae}')
print(f'Mean Absolute Percentage Error: {mape}')


Mean Squared Error: 13.579397728442473
R-squared: 0.9839812969390841
Mean Absolute Error: 3.111338289855957
Mean Absolute Percentage Error: 0.11613598877984062
