# **Imports &#8595;**

In [3]:
import numpy as np
import pandas as pd 
import os
import re
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GridSearchCV


  from .autonotebook import tqdm as notebook_tqdm


# **Load Dataset &#8595;**

In [4]:
DATA_PATH = "datasets/eedi-mining-misconceptions-in-mathematics"
EXTERNAL_DATA_PATH = "datasets/eedi-external-dataset"

# train_df = pd.read_csv(f'{EXTERNAL_DATA_PATH}/all_train.csv', index_col='QuestionId') #this contains the original dataset + an external dataset generated by a LLM
train_df = pd.read_csv(f'{DATA_PATH}/train.csv', index_col='QuestionId') 
test_df = pd.read_csv(f'{DATA_PATH}/test.csv')
misconceptions_df = pd.read_csv(f'{DATA_PATH}/misconception_mapping.csv')

pd.options.display.max_colwidth = 300
display(train_df.head(5))
pd.options.display.max_colwidth = 50

Unnamed: 0_level_0,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId
QuestionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,856,Use the order of operations to carry out calculations involving powers,33,BIDMAS,A,\[\r\n3 \times 2+4-5\r\n\]\r\nWhere do the brackets need to go to make the answer equal \( 13 \) ?,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets,,,,1672.0
1,1612,Simplify an algebraic fraction by factorising the numerator,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \)",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify,2142.0,143.0,2142.0,
2,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,"Tom and Katie are discussing the \( 5 \) plants with these heights:\r\n\( 24 \mathrm{~cm}, 17 \mathrm{~cm}, 42 \mathrm{~cm}, 26 \mathrm{~cm}, 13 \mathrm{~cm} \)\r\nTom says if all the plants were cut in half, the range wouldn't change.\r\nKatie says if all the plants grew by \( 3 \mathrm{~cm} \)...",Only\r\nTom,Only\r\nKatie,Both Tom and Katie,Neither is correct,1287.0,,1287.0,1073.0
3,2377,Recall and use the intersecting diagonals properties of a rectangle,88,Properties of Quadrilaterals,C,The angles highlighted on this rectangle with different length sides can never be... ![A rectangle with the diagonals drawn in. The angle on the right hand side at the centre is highlighted in red and the angle at the bottom at the centre is highlighted in yellow.](),acute,obtuse,\( 90^{\circ} \),Not enough information,1180.0,1180.0,,1180.0
4,3387,Substitute positive integer values into formulae involving powers or roots,67,Substitution into Formula,A,The equation \( f=3 r^{2}+3 \) is used to find values in the table below. What is the value covered by the star? \begin{tabular}{|c|c|c|c|c|}\r\n\hline\( r \) & \( 1 \) & \( 2 \) & \( 3 \) & \( 4 \) \\\r\n\hline\( f \) & \( 6 \) & \( 15 \) & \( \color{gold}\bigstar \) & \\\r\n\hline\r\n\end{tabu...,\( 30 \),\( 27 \),\( 51 \),\( 24 \),,,,1818.0


# **Data Preprocessing &#8595;**

In [5]:
def clean(example, columns):
    """
    Cleans the example from the Dataset
    Args:
        example: an example from the Dataset
        columns: columns that will be cleaned

    Returns: update example containing 'clean' columns

    """
    for col in columns:
        text = example[f'{col}']

        # Empty text
        if type(text) not in (str, np.str_) or text=='':
            example[f'clean_{col}'] = ''
            return example

        # 'text' from the example can be of type numpy.str_, let's convert it to a python str
        text = str(text).lower()

        # Clean the text
        text = re.sub("\"", " ", text) # removes the " from certain texts
        text = re.sub("\n", " ", text) # removes the multiple "\n" 
        text = re.sub(r"(\\\w+)(\W)", r" \1 \2", text) # matches with the LaTeX commands like "\hline{}",... and transforms them to " \hline {}"
        text = re.sub(r"([\(|\{|\[|\|])", r" \1", text) # matches every opening parenthesis types and puts spaces on their left
        text = re.sub(r"([\)|\}|\]])", r"\1 ", text) # matches every closing parenthesis types and puts spaces on their right
        text = re.sub(r"\\(?![a-zA-Z])", " ", text) # removes every backslash that is not the start of a LaTeX command
        text = re.sub(r"\( | \)", "", text) # removes the parentheses that appear sometimes from nowhere 
        text = re.sub(r"\[ | \]", "", text) # removes the parentheses that appear sometimes from nowhere
        
        text = re.sub(r" +", " ", text) # cleans the double spaces made by above substitutions
        # Update the example with the cleaned text
        example[f'clean_{col}'] = text.strip()
    return example

# testing_data = {
#     'QuestionText': ["This is a question with a newline\nin the middle"],
#     'AnswerAText': ["Answer A\nwith newline and \\table[test]"],
#     'AnswerBText': ["Answer B\nwith newline and \hline(uwo)"],
#     'AnswerCText': ["Answer C\nwith newline and \color{gold}"],
#     'AnswerDText': ["Answer D\nwith newline and \\begin{tabular}"]
# }
# df = pd.DataFrame(testing_data)
# df = df.apply(clean, axis = 1, columns = columns_to_clean)
# display(df.head(1))

columns_to_clean = ['QuestionText', 'AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText']
train_df = train_df.apply(clean, axis = 1, columns = columns_to_clean)

# Adjust column order
new_order = ['ConstructId', 'ConstructName', 'SubjectId', 'SubjectName', 'CorrectAnswer']
for col in columns_to_clean:
    new_order.append(col)
    new_order.append(f'clean_{col}')
new_order.extend(['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId'])
train_df = train_df[new_order]


display_train_df = train_df[['QuestionText', 'clean_QuestionText','AnswerAText', 'clean_AnswerAText', 'AnswerBText', 'clean_AnswerBText', 'AnswerCText', 'clean_AnswerCText', 'AnswerDText', 'clean_AnswerDText']]
pd.options.display.max_colwidth = 300
display(display_train_df.head(1))
pd.options.display.max_colwidth = 50

Unnamed: 0_level_0,QuestionText,clean_QuestionText,AnswerAText,clean_AnswerAText,AnswerBText,clean_AnswerBText,AnswerCText,clean_AnswerCText,AnswerDText,clean_AnswerDText
QuestionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,\[\r\n3 \times 2+4-5\r\n\]\r\nWhere do the brackets need to go to make the answer equal \( 13 \) ?,[\r 3 \times 2+4-5\r \r where do the brackets need to go to make the answer equal 13 ?,\( 3 \times(2+4)-5 \),3 \times (2+4) -5,\( 3 \times 2+(4-5) \),3 \times 2+ (4-5),\( 3 \times(2+4-5) \),3 \times (2+4-5),Does not need brackets,does not need brackets


# **Reshape Dataset For Training &#8595;**

In [6]:
# train_df columns: QuestionID, ConstructID, ConstructName, CorrectAnswer, SubjectId, SubjectName, QuestionText, Answer[A/B/C/D]Text, Misconception[A/B/C/D]Id

reshaped_data = []
for _, row in train_df.iterrows():
    for answer, misconception_id in zip(
        ['clean_AnswerAText', 'clean_AnswerBText', 'clean_AnswerCText', 'clean_AnswerDText'],
        ['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']
    ): # turn the data into a format where each datapoint (row) represents an answer choice (i.e there are now 4 datapoints for each question)
        reshaped_data.append({
            'QuestionText': row['clean_QuestionText'],
            'AnswerText': row[answer],
            'MisconceptionId': row[misconception_id],
            'SubjectName': row['SubjectName'],
            'ConstructName': row['ConstructName']
        })

reshaped_df = pd.DataFrame(reshaped_data)
display(reshaped_df.head())

# removed columns: QuestionId, ConstructId, CorrectAnswer, SubjectId
# other changes: Answer[A/B/C/D]Text are now in separate datapoints along with their associated Misconception[A/B/C/D]Texts 

Unnamed: 0,QuestionText,AnswerText,MisconceptionId,SubjectName,ConstructName
0,[\r 3 \times 2+4-5\r \r where do the brackets ...,3 \times (2+4) -5,,BIDMAS,Use the order of operations to carry out calcu...
1,[\r 3 \times 2+4-5\r \r where do the brackets ...,3 \times 2+ (4-5),,BIDMAS,Use the order of operations to carry out calcu...
2,[\r 3 \times 2+4-5\r \r where do the brackets ...,3 \times (2+4-5),,BIDMAS,Use the order of operations to carry out calcu...
3,[\r 3 \times 2+4-5\r \r where do the brackets ...,does not need brackets,1672.0,BIDMAS,Use the order of operations to carry out calcu...
4,"simplify the following, if possible: \frac {m^...",m+1,2142.0,Simplifying Algebraic Fractions,Simplify an algebraic fraction by factorising ...


# **TF-IDF & OneHot Encoding&#8595;**

In [7]:
# remove NaN values (dropping all datapoints that do not have misconceptions assigned to them)
# P.S. that means we are also deleting all the rows (answer choices) that are correct
# P.P.S. unless somehow there are correct answers that have misconceptions associated with them
print(reshaped_df['MisconceptionId'].isnull().sum())  # 10582 NaN values yikes :/
reshaped_df = reshaped_df.dropna(subset=['MisconceptionId'])
print(reshaped_df['MisconceptionId'].isnull().sum())  # 0 now yippie


3106
0


In [8]:
# use TF-IDF vectorizer for text data (5000 terms from QuestionText + AnswerText)
vectorizer = TfidfVectorizer(max_features=5000) 
reshaped_df['CombinedText'] = reshaped_df['QuestionText'] + " " + reshaped_df['AnswerText']
X_tfidf = vectorizer.fit_transform(reshaped_df['CombinedText'])

In [9]:
# use One hot encoding for categorical data (create a "column" for each unique subject and construct and represent each row with 0 and 1)
encoder = OneHotEncoder(sparse_output=False)
categorical_features = encoder.fit_transform(reshaped_df[['SubjectName', 'ConstructName']])

# Combine all features
X = hstack([X_tfidf, categorical_features])  
y = reshaped_df['MisconceptionId']
 
print(X.shape) # print shape of X to see total number of features (should be 5000 from tfidf + the number of uniqueconstructs and subjects)


X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42) #Split the data into training and testing


(4370, 3610)


# **Random Forest Training&#8595;**

In [10]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)


# y_pred = rf_classifier.predict(X)
# print(classification_report(y_test, y_pred))


# **Testing&#8595;**

In [11]:
def map_at_25(y_true, y_pred_probs, top_k=25):
    
    map_25 = 0.0
    for true_label, pred_prob in zip(y_true, y_pred_probs):
        # Get top_k predictions
        top_preds = np.argsort(pred_prob)[::-1][:top_k]
        
        if not true_label:
            continue
        
        score = 0.0
        hits = 0
        for i, pred in enumerate(top_preds, start=1):
            if pred == true_label:
                hits += 1
                score += hits / i  # Precision at i
        
        # Average Precision at 25
        map_25 += score / min(1, top_k)
    
    return map_25 / len(y_true)

def ndcg_at_25(y_true, y_pred_probs, k=25):
    ndcg = 0.0
    for true_label, pred_prob in zip(y_true, y_pred_probs):
        top_preds = np.argsort(pred_prob)[::-1][:k]
        if not true_label:
            continue
        
        dcg = 0.0
        for i, pred in enumerate(top_preds, start=1):
            if pred == true_label:
                dcg += 1 / np.log2(i + 1)  # Discounted gain

        ideal_dcg = 1 / np.log2(1 + 1)  # Ideal DCG when correct at rank 1
        ndcg += dcg / ideal_dcg

    return ndcg / len(y_true)

def precision_at_25(y_true, y_pred_probs, k=25):
    precision = 0.0
    for true_label, pred_prob in zip(y_true, y_pred_probs):
        top_preds = np.argsort(pred_prob)[::-1][:k]
        if not true_label:
            continue
        
        correct = 1 if true_label in top_preds else 0
        precision += correct / k

    return precision / len(y_true)

def recall_at_25(y_true, y_pred_probs, k=25):
    recall = 0.0
    for true_label, pred_prob in zip(y_true, y_pred_probs):
        top_preds = np.argsort(pred_prob)[::-1][:k]
        if not true_label:
            continue
        
        correct = 1 if true_label in top_preds else 0
        recall += correct

    return recall / len(y_true)

def f1_at_25(y_true, y_pred_probs, k=25):
    precision = precision_at_25(y_true, y_pred_probs, k)
    recall = recall_at_25(y_true, y_pred_probs, k)
    if precision + recall == 0:
        return 0.0
    return 2 * (precision * recall) / (precision + recall)


In [12]:
y_val_pred_probs = rf_classifier.predict_proba(X_val)
y_val_true = list(y_val)

map25_score = map_at_25(y_val_true, y_val_pred_probs)
ndcg_score = ndcg_at_25(y_val_true, y_val_pred_probs, k=25)
precision_score = precision_at_25(y_val_true, y_val_pred_probs, k=25)
recall_score = recall_at_25(y_val_true, y_val_pred_probs, k=25)
f1_score = f1_at_25(y_val_true, y_val_pred_probs, k=25)

print(f"MAP@25 Score: {map25_score}")
print(f"NDCG@25: {ndcg_score}")
print(f"Precision@25: {precision_score}")
print(f"Recall@25: {recall_score}")
print(f"F1@25: {f1_score}")


MAP@25 Score: 0.0018796992481203006
NDCG@25: 0.0022474406028658933
Precision@25: 0.00013729977116704804
Recall@25: 0.003432494279176201
F1@25: 0.0002640380214750924


In [13]:
y_pred_probs = rf_classifier.predict_proba(X_val)

# print predictions
for idx, (true_label, pred_prob) in enumerate(zip(y_val, y_pred_probs)):
    # Get top 25 predictions and probabilities
    top_preds = np.argsort(pred_prob)[::-1][:25]
    top_probs = pred_prob[top_preds]
    
    # Check if true is within top 25
    in_top_25 = true_label in top_preds
    
    print(f"Example {idx + 1}")
    print(f"True Label: {true_label}")
    print("Top 25 Predictions (Misconception ID: Probability):")
    for pred, prob in zip(top_preds, top_probs):
        print(f"ID {pred}: {prob:.4f}")
    print(f"True Label in Top 25: {in_top_25}\n")

    # Number of questions to print
    if idx == 10:  
        break

Example 1
True Label: 2030.0
Top 25 Predictions (Misconception ID: Probability):
ID 1052: 0.4100
ID 308: 0.0900
ID 272: 0.0300
ID 1269: 0.0300
ID 597: 0.0300
ID 1042: 0.0300
ID 104: 0.0200
ID 612: 0.0200
ID 1270: 0.0200
ID 734: 0.0200
ID 728: 0.0200
ID 342: 0.0200
ID 406: 0.0100
ID 862: 0.0100
ID 1097: 0.0100
ID 664: 0.0100
ID 1101: 0.0100
ID 653: 0.0100
ID 76: 0.0100
ID 1190: 0.0100
ID 246: 0.0100
ID 918: 0.0100
ID 1359: 0.0100
ID 250: 0.0100
ID 835: 0.0100
True Label in Top 25: False

Example 2
True Label: 1794.0
Top 25 Predictions (Misconception ID: Probability):
ID 96: 0.5600
ID 693: 0.4100
ID 404: 0.0200
ID 537: 0.0100
ID 490: 0.0000
ID 491: 0.0000
ID 492: 0.0000
ID 474: 0.0000
ID 475: 0.0000
ID 476: 0.0000
ID 477: 0.0000
ID 478: 0.0000
ID 479: 0.0000
ID 480: 0.0000
ID 481: 0.0000
ID 482: 0.0000
ID 473: 0.0000
ID 484: 0.0000
ID 485: 0.0000
ID 486: 0.0000
ID 487: 0.0000
ID 488: 0.0000
ID 489: 0.0000
ID 483: 0.0000
ID 472: 0.0000
True Label in Top 25: False

Example 3
True Label: 64

In [14]:
#Split the training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
n_estimators_range = [10, 50, 100, 150, 200]

n_estimators_results = {
    "MAP@25": {},
    "NDCG@25": {},
    "Precision@25": {},
    "Recall@25": {},
    "F1@25": {}
}

# test n_estimators
for n in n_estimators_range:
    model = RandomForestClassifier(n_estimators=n, random_state=42)
    model.fit(X_train, y_train)
    y_pred_probs = model.predict_proba(X_test)
    
    # calculate results
    map_25 = map_at_25(y_test, y_pred_probs)
    ndcg_25 = ndcg_at_25(y_test, y_pred_probs)
    precision_25 = precision_at_25(y_test, y_pred_probs)
    recall_25 = recall_at_25(y_test, y_pred_probs)
    f1_25 = f1_at_25(y_test, y_pred_probs)
    
    # store results
    n_estimators_results["MAP@25"][n] = map_25
    n_estimators_results["NDCG@25"][n] = ndcg_25
    n_estimators_results["Precision@25"][n] = precision_25
    n_estimators_results["Recall@25"][n] = recall_25
    n_estimators_results["F1@25"][n] = f1_25
    
    print(f"n_estimators={n}, MAP@25={map_25:.4f}, NDCG@25={ndcg_25:.4f}, "
          f"Precision@25={precision_25:.4f}, Recall@25={recall_25:.4f}, F1@25={f1_25:.4f}")

# best chosen based on MAP@25
best_n_estimators = max(n_estimators_results["MAP@25"], key=n_estimators_results["MAP@25"].get)
print(f"Best n_estimators (based on MAP@25): {best_n_estimators}")


n_estimators=10, MAP@25=0.0016, NDCG@25=0.0022, Precision@25=0.0002, Recall@25=0.0046, F1@25=0.0004
n_estimators=50, MAP@25=0.0019, NDCG@25=0.0025, Precision@25=0.0002, Recall@25=0.0046, F1@25=0.0004
n_estimators=100, MAP@25=0.0019, NDCG@25=0.0022, Precision@25=0.0001, Recall@25=0.0034, F1@25=0.0003
n_estimators=150, MAP@25=0.0020, NDCG@25=0.0028, Precision@25=0.0002, Recall@25=0.0057, F1@25=0.0004
n_estimators=200, MAP@25=0.0021, NDCG@25=0.0033, Precision@25=0.0003, Recall@25=0.0080, F1@25=0.0006
Best n_estimators (based on MAP@25): 200


In [17]:
max_depth_values = [None, 10, 20, 30, 50]

max_depth_results = {
    "MAP@25": {},
    "NDCG@25": {},
    "Precision@25": {},
    "Recall@25": {},
    "F1@25": {}
}

# iterate through max_depth values
for d in max_depth_values:
    model = RandomForestClassifier(n_estimators=100, max_depth=d, random_state=42)
    model.fit(X_train, y_train)
    y_pred_probs = model.predict_proba(X_test)
    
    # calculate results
    map_25 = map_at_25(y_test, y_pred_probs)
    ndcg_25 = ndcg_at_25(y_test, y_pred_probs)
    precision_25 = precision_at_25(y_test, y_pred_probs)
    recall_25 = recall_at_25(y_test, y_pred_probs)
    f1_25 = f1_at_25(y_test, y_pred_probs)
    
    # store results
    max_depth_results["MAP@25"][d] = map_25
    max_depth_results["NDCG@25"][d] = ndcg_25
    max_depth_results["Precision@25"][d] = precision_25
    max_depth_results["Recall@25"][d] = recall_25
    max_depth_results["F1@25"][d] = f1_25
    
    print(f"max_depth={d}, MAP@25={map_25:.4f}, NDCG@25={ndcg_25:.4f}, "
          f"Precision@25={precision_25:.4f}, Recall@25={recall_25:.4f}, F1@25={f1_25:.4f}")

# best chosen based on MAP@25
best_max_depth = max(max_depth_results["MAP@25"], key=max_depth_results["MAP@25"].get)
print(f"Best max_depth (based on MAP@25): {best_max_depth}")



max_depth=None, MAP@25=0.0019, NDCG@25=0.0022, Precision@25=0.0001, Recall@25=0.0034, F1@25=0.0003
max_depth=10, MAP@25=0.0025, NDCG@25=0.0048, Precision@25=0.0005, Recall@25=0.0137, F1@25=0.0011
max_depth=20, MAP@25=0.0024, NDCG@25=0.0047, Precision@25=0.0005, Recall@25=0.0137, F1@25=0.0011
max_depth=30, MAP@25=0.0022, NDCG@25=0.0043, Precision@25=0.0005, Recall@25=0.0126, F1@25=0.0010
max_depth=50, MAP@25=0.0018, NDCG@25=0.0037, Precision@25=0.0005, Recall@25=0.0114, F1@25=0.0009
Best max_depth (based on MAP@25): 10


In [18]:
min_samples_split_values = [2, 5, 10, 20, 30]

min_samples_split_results = {
    "MAP@25": {},
    "NDCG@25": {},
    "Precision@25": {},
    "Recall@25": {},
    "F1@25": {}
}

# iterate through min_samples_split values
for split in min_samples_split_values:
    model = RandomForestClassifier(n_estimators=100, min_samples_split=split, random_state=42)
    model.fit(X_train, y_train)
    y_pred_probs = model.predict_proba(X_test)
    
    # calculate results
    map_25 = map_at_25(y_test, y_pred_probs)
    ndcg_25 = ndcg_at_25(y_test, y_pred_probs)
    precision_25 = precision_at_25(y_test, y_pred_probs)
    recall_25 = recall_at_25(y_test, y_pred_probs)
    f1_25 = f1_at_25(y_test, y_pred_probs)
    
    # store results
    min_samples_split_results["MAP@25"][split] = map_25
    min_samples_split_results["NDCG@25"][split] = ndcg_25
    min_samples_split_results["Precision@25"][split] = precision_25
    min_samples_split_results["Recall@25"][split] = recall_25
    min_samples_split_results["F1@25"][split] = f1_25
    
    print(f"min_samples_split={split}, MAP@25={map_25:.4f}, NDCG@25={ndcg_25:.4f}, "
          f"Precision@25={precision_25:.4f}, Recall@25={recall_25:.4f}, F1@25={f1_25:.4f}")

# best chosen based on MAP@25
best_min_samples_split = max(min_samples_split_results["MAP@25"], key=min_samples_split_results["MAP@25"].get)
print(f"Best min_samples_split (based on MAP@25): {best_min_samples_split}")


min_samples_split=2, MAP@25=0.0019, NDCG@25=0.0022, Precision@25=0.0001, Recall@25=0.0034, F1@25=0.0003
min_samples_split=5, MAP@25=0.0018, NDCG@25=0.0022, Precision@25=0.0001, Recall@25=0.0034, F1@25=0.0003
min_samples_split=10, MAP@25=0.0023, NDCG@25=0.0042, Precision@25=0.0005, Recall@25=0.0114, F1@25=0.0009
min_samples_split=20, MAP@25=0.0023, NDCG@25=0.0042, Precision@25=0.0005, Recall@25=0.0114, F1@25=0.0009
min_samples_split=30, MAP@25=0.0022, NDCG@25=0.0037, Precision@25=0.0004, Recall@25=0.0092, F1@25=0.0007
Best min_samples_split (based on MAP@25): 10


In [19]:
min_samples_leaf_values = [1, 2, 4, 6, 8]

min_samples_leaf_results = {
    "MAP@25": {},
    "NDCG@25": {},
    "Precision@25": {},
    "Recall@25": {},
    "F1@25": {}
}

# iterate through min_samples_leaf values
for leaf in min_samples_leaf_values:
    model = RandomForestClassifier(
        n_estimators=100,min_samples_leaf=leaf, random_state=42)
    model.fit(X_train, y_train)
    y_pred_probs = model.predict_proba(X_test)
    
    # calculate results 
    map_25 = map_at_25(y_test, y_pred_probs)
    ndcg_25 = ndcg_at_25(y_test, y_pred_probs)
    precision_25 = precision_at_25(y_test, y_pred_probs)
    recall_25 = recall_at_25(y_test, y_pred_probs)
    f1_25 = f1_at_25(y_test, y_pred_probs)
    
    # store results
    min_samples_leaf_results["MAP@25"][leaf] = map_25
    min_samples_leaf_results["NDCG@25"][leaf] = ndcg_25
    min_samples_leaf_results["Precision@25"][leaf] = precision_25
    min_samples_leaf_results["Recall@25"][leaf] = recall_25
    min_samples_leaf_results["F1@25"][leaf] = f1_25
    
    print(f"min_samples_leaf={leaf}, MAP@25={map_25:.4f}, NDCG@25={ndcg_25:.4f}, "
          f"Precision@25={precision_25:.4f}, Recall@25={recall_25:.4f}, F1@25={f1_25:.4f}")

# best chosen based on MAP@25
best_min_samples_leaf = max(min_samples_leaf_results["MAP@25"], key=min_samples_leaf_results["MAP@25"].get)
print(f"Best min_samples_leaf (based on MAP@25): {best_min_samples_leaf}")


min_samples_leaf=1, MAP@25=0.0019, NDCG@25=0.0022, Precision@25=0.0001, Recall@25=0.0034, F1@25=0.0003
min_samples_leaf=2, MAP@25=0.0026, NDCG@25=0.0044, Precision@25=0.0005, Recall@25=0.0114, F1@25=0.0009
min_samples_leaf=4, MAP@25=0.0017, NDCG@25=0.0041, Precision@25=0.0005, Recall@25=0.0137, F1@25=0.0011
min_samples_leaf=6, MAP@25=0.0020, NDCG@25=0.0042, Precision@25=0.0005, Recall@25=0.0126, F1@25=0.0010
min_samples_leaf=8, MAP@25=0.0023, NDCG@25=0.0055, Precision@25=0.0007, Recall@25=0.0172, F1@25=0.0013
Best min_samples_leaf (based on MAP@25): 2


In [20]:
# Final model with the best hyperparameters
final_model = RandomForestClassifier(n_estimators=best_n_estimators, max_depth=best_max_depth, min_samples_split=best_min_samples_split, min_samples_leaf=best_min_samples_leaf, max_features='sqrt', random_state=42)

final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)

map_25 = map_at_25(y_test, y_pred_probs)
ndcg_25 = ndcg_at_25(y_test, y_pred_probs)
precision_25 = precision_at_25(y_test, y_pred_probs)
recall_25 = recall_at_25(y_test, y_pred_probs)
f1_25 = f1_at_25(y_test, y_pred_probs)

# Display results
print(f"MAP@25: {map_25:.4f}")
print(f"NDCG@25: {ndcg_25:.4f}")
print(f"Precision@25: {precision_25:.4f}")
print(f"Recall@25: {recall_25:.4f}")
print(f"F1@25: {f1_25:.4f}")

MAP@25: 0.0023
NDCG@25: 0.0055
Precision@25: 0.0007
Recall@25: 0.0172
F1@25: 0.0013


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid

# Define the parameter grid
param_grid = {
    'max_depth': [10, 20,25],
    'min_samples_split': [10, 20, 25],
    'n_estimators': [150, 175, 200]
}

# Fixed parameters
fixed_params = {
    'max_features': 'sqrt',
    'min_samples_leaf': 2,
    'random_state': 42
}

# Track the best combination
best_params = None
best_map_at_25 = 0
results = []

# Loop through all combinations of parameters
for params in ParameterGrid(param_grid):
    # Merge fixed and current grid parameters
    current_params = {**fixed_params, **params}
    
    # Initialize and train the model
    model = RandomForestClassifier(**current_params)
    model.fit(X_train, y_train)
    y_pred_probs = model.predict_proba(X_test)
    
    # Compute metrics
    map_25 = map_at_25(y_test, y_pred_probs)
    ndcg_25 = ndcg_at_25(y_test, y_pred_probs)
    precision_25 = precision_at_25(y_test, y_pred_probs)
    recall_25 = recall_at_25(y_test, y_pred_probs)
    f1_25 = f1_at_25(y_test, y_pred_probs)
    
    # Store results
    results.append((current_params, map_25, ndcg_25, precision_25, recall_25, f1_25))
    
    # Check for best MAP@25
    if map_25 > best_map_at_25:
        best_map_at_25 = map_25
        best_params = current_params

    # Print metrics for the current combination
    print(f"Params: {current_params}")
    print(f"MAP@25: {map_25:.4f}, NDCG@25: {ndcg_25:.4f}, Precision@25: {precision_25:.4f}, Recall@25: {recall_25:.4f}, F1@25: {f1_25:.4f}\n")

# Output the best combination
print("Best Combination (based on MAP@25):")
print(f"Params: {best_params}")
print(f"MAP@25: {best_map_at_25:.4f}")


Params: {'max_features': 'sqrt', 'min_samples_leaf': 2, 'random_state': 42, 'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 150}
MAP@25: 0.0024, NDCG@25: 0.0045, Precision@25: 0.0005, Recall@25: 0.0126, F1@25: 0.0010

Params: {'max_features': 'sqrt', 'min_samples_leaf': 2, 'random_state': 42, 'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 175}
MAP@25: 0.0024, NDCG@25: 0.0045, Precision@25: 0.0005, Recall@25: 0.0126, F1@25: 0.0010

Params: {'max_features': 'sqrt', 'min_samples_leaf': 2, 'random_state': 42, 'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 200}
MAP@25: 0.0024, NDCG@25: 0.0045, Precision@25: 0.0005, Recall@25: 0.0126, F1@25: 0.0010

Params: {'max_features': 'sqrt', 'min_samples_leaf': 2, 'random_state': 42, 'max_depth': 10, 'min_samples_split': 20, 'n_estimators': 150}
MAP@25: 0.0025, NDCG@25: 0.0050, Precision@25: 0.0006, Recall@25: 0.0149, F1@25: 0.0011

Params: {'max_features': 'sqrt', 'min_samples_leaf': 2, 'random_state': 42, 'max_depth':