In [1]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
df = pd.read_csv("recordDS.csv")
transformed_data = []

for index, row in df.iterrows():
    user_id = row['user_id']
    qa_array = eval(row['qa_array'])
    exercises = qa_array[0]
    results = qa_array[1]
    for exercise_id, result in zip(exercises, results):
        transformed_data.append([user_id, exercise_id, result])

transformed_df = pd.DataFrame(transformed_data, columns=['user_id', 'exercise_id', 'result'])


In [4]:
train_data, validation_data = train_test_split(transformed_df, test_size=0.2, random_state=42)
train_data.to_csv("training_set.csv", index=False)
validation_data.to_csv("validation_set.csv", index=False)

El codigo para la funcion predict_rating y evaluate_mae fue obtenido de ChatGPT.
https://chatgpt.com/share/67323595-4c24-8003-a8ff-14ff143c458a 

In [None]:
interaction_matrix = train_data.pivot_table(index='user_id', columns='exercise_id', values='result', fill_value=0)

knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(interaction_matrix.T)

def predict_rating(user_id, exercise_id, n_neighbors=5):
    if exercise_id not in interaction_matrix.columns:
        # If exercise is new or not in the dataset, we may return a default rating
        return interaction_matrix.mean().mean()  # Return the global average rating as a fallback

    # Find the k-nearest neighbors (similar exercises)
    exercise_vector = interaction_matrix[exercise_id].values.reshape(1, -1)
    distances, indices = knn_model.kneighbors(exercise_vector, n_neighbors=n_neighbors)
    
    # Get the similar exercises and their distances
    similar_exercises = interaction_matrix.columns[indices.flatten()]
    similarity_scores = 1 - distances.flatten()  # Convert cosine distances to similarities
    
    # Compute the weighted rating based on similar exercises the user has rated
    user_ratings = interaction_matrix.loc[user_id, similar_exercises]
    if user_ratings.sum() == 0:
        return interaction_matrix.mean().mean()  # Fallback if user hasn't rated similar items

    weighted_ratings = user_ratings * similarity_scores
    rating_prediction = weighted_ratings.sum() / similarity_scores[user_ratings > 0].sum()
    
    return rating_prediction


In [6]:
sample_user_id = 9829
sample_exercise_id = 1
predicted_rating = predict_rating(sample_user_id, sample_exercise_id)
print(f"Predicted rating for user {sample_user_id} on exercise {sample_exercise_id}: {predicted_rating}")

Predicted rating for user 9829 on exercise 1: 0.12994647908989448


In [None]:
def evaluate_mae(validation_data, n_neighbors=5):
    actual_ratings = []
    predicted_ratings = []
    
    for _, row in validation_data.iterrows():
        user_id = row['user_id']
        exercise_id = row['exercise_id']
        actual_result = row['result']

        # Predict rating using the IKNN model
        predicted_result = predict_rating(user_id, exercise_id, n_neighbors)

        # Append actual and predicted ratings for MAE calculation
        actual_ratings.append(actual_result)
        predicted_ratings.append(predicted_result)
    
    # Calculate Mean Absolute Error
    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    return mae

# Calculate and print MAE on the validation data
mae_value = evaluate_mae(validation_data)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}")

Mean Absolute Error (MAE) on validation set: 0.26218199316911567


Para probar cambios y analizar los resultados se usara una fraccion del dataset de validacion

In [11]:
train_data_not_used, validation_data_shortened = train_test_split(validation_data, test_size=0.2, random_state=42)

Actualmente se esta dando un rating entre 1 y 0 de forma continua, ahora se probara entregarlos de forma discreta haciendo aproximaciones.

In [None]:
def predict_rating_discrete(user_id, exercise_id, n_neighbors=5, rating_threshold=0.5):
    if exercise_id not in interaction_matrix.columns:
        # If exercise is new or not in the dataset, we may return a default rating
        return interaction_matrix.mean().mean()

    # Find the k-nearest neighbors (similar exercises)
    exercise_vector = interaction_matrix[exercise_id].values.reshape(1, -1)
    distances, indices = knn_model.kneighbors(exercise_vector, n_neighbors=n_neighbors)
    
    # Get the similar exercises and their distances
    similar_exercises = interaction_matrix.columns[indices.flatten()]
    similarity_scores = 1 - distances.flatten()  # Convert cosine distances to similarities
    
    # Compute the weighted rating based on similar exercises the user has rated
    user_ratings = interaction_matrix.loc[user_id, similar_exercises]
    if user_ratings.sum() == 0:
        return interaction_matrix.mean().mean()  # Fallback if user hasn't rated similar items

    weighted_ratings = user_ratings * similarity_scores
    rating_prediction = weighted_ratings.sum() / similarity_scores[user_ratings > 0].sum()
    
    if rating_prediction >= rating_threshold:
        rating_prediction = 1
    else:
        rating_prediction = 0
    
    return rating_prediction

In [None]:
def evaluate_mae_discrete(validation_data, n_neighbors=5, rating_threshold=0.5):
    actual_ratings = []
    predicted_ratings = []
    
    for _, row in validation_data.iterrows():
        user_id = row['user_id']
        exercise_id = row['exercise_id']
        actual_result = row['result']

        # Predict rating using the IKNN model
        predicted_result = predict_rating_discrete(user_id, exercise_id, n_neighbors, rating_threshold)
        
        # Append actual and predicted ratings for MAE calculation
        actual_ratings.append(actual_result)
        predicted_ratings.append(predicted_result)
    
    # Calculate Mean Absolute Error
    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    return mae

rating_threshold=0.5 n_neighbors=5

In [15]:
mae_value = evaluate_mae_discrete(validation_data_shortened)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}")

Mean Absolute Error (MAE) on validation set: 0.2317791828170453


rating_threshold=0.4 n_neighbors=5

In [24]:
mae_value = evaluate_mae_discrete(validation_data_shortened, rating_threshold=0.4)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}")

Mean Absolute Error (MAE) on validation set: 0.23455208234565236


rating_threshold=0.5 n_neighbors=7

In [25]:
mae_value = evaluate_mae_discrete(validation_data_shortened, n_neighbors=7, rating_threshold=0.5)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}")

Mean Absolute Error (MAE) on validation set: 0.23371158746707477


rating_threshold=0.4 n_neighbors=7

In [26]:
mae_value = evaluate_mae_discrete(validation_data_shortened, n_neighbors=7, rating_threshold=0.4)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}")

Mean Absolute Error (MAE) on validation set: 0.23814822671284605


rating_threshold=0.5 n_neighbors=10

In [27]:
mae_value = evaluate_mae_discrete(validation_data_shortened, n_neighbors=10, rating_threshold=0.5)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}")

Mean Absolute Error (MAE) on validation set: 0.23645875995512475


rating_threshold=0.4 n_neighbors=10

In [28]:
mae_value = evaluate_mae_discrete(validation_data_shortened, n_neighbors=10, rating_threshold=0.4)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}")

Mean Absolute Error (MAE) on validation set: 0.24200455901233892


rating_threshold=0.6 n_neighbors=5

In [29]:
mae_value = evaluate_mae_discrete(validation_data_shortened, n_neighbors=5, rating_threshold=0.6)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}")

Mean Absolute Error (MAE) on validation set: 0.23459829733779583


Actualmente se estan utilizando todos los datos de los usuarios para hacer las predicciones, lo cual puede no ser lo mejor considerando que el conocimiento de los usuarios va progresando, y es mas posible que obtenga buenos resultados en los ejercicios que hizo recientemente. Ademas es la informacion que mas habla acerca de si el usuario tendra una pregunta correcta o no. Por esta razon, se repetira lo echo anteriormente pero solo con informacion de los ejercicios que hizo en su ultimo dia registrado.

In [None]:
df = pd.read_csv("recordDS.csv")

transformed_data_time = []

for index, row in df.iterrows():
    user_id = row['user_id']
    qa_array = eval(row['qa_array'])
    create_time = row['create_time']
    
    exercises = qa_array[0]
    results = qa_array[1]
    
    for exercise_id, result in zip(exercises, results):
        transformed_data_time.append([user_id, exercise_id, create_time, result])

transformed_time_df = pd.DataFrame(transformed_data_time, columns=['user_id', 'exercise_id', 'create_time','result'])

In [None]:
transformed_time_df['create_time'] = pd.to_datetime(transformed_time_df['create_time'])
transformed_time_df['date'] = transformed_time_df['create_time'].dt.date
latest_dates = transformed_time_df.groupby('user_id')['date'].max().reset_index()
latest_dates.columns = ['user_id', 'latest_date']
filtered_df = pd.merge(transformed_time_df, latest_dates, how='inner', left_on=['user_id', 'date'], right_on=['user_id', 'latest_date'])
filtered_df = filtered_df.drop(columns=['date', 'latest_date'])

filtered_df.to_csv("filtered_data.csv", index=False)

In [5]:
train_time_data, validation_time_data = train_test_split(filtered_df, test_size=0.2, random_state=42)
train_time_data.to_csv("training_time_set.csv", index=False)
validation_time_data.to_csv("validation_time_set.csv", index=False)

In [None]:
interaction_matrix = train_time_data.pivot_table(index='user_id', columns='exercise_id', values='result', fill_value=0)
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(interaction_matrix.T)  # Transpose so exercises are the rows

In [None]:
def predict_rating(user_id, exercise_id, n_neighbors=5):
    if exercise_id not in interaction_matrix.columns:
        # If exercise is new or not in the dataset, we may return a default rating
        return interaction_matrix.mean().mean()  # Return the global average rating as a fallback

    # Find the k-nearest neighbors (similar exercises)
    exercise_vector = interaction_matrix[exercise_id].values.reshape(1, -1)
    distances, indices = knn_model.kneighbors(exercise_vector, n_neighbors=n_neighbors)
    
    # Get the similar exercises and their distances
    similar_exercises = interaction_matrix.columns[indices.flatten()]
    similarity_scores = 1 - distances.flatten()  # Convert cosine distances to similarities
    
    # Compute the weighted rating based on similar exercises the user has rated
    user_ratings = interaction_matrix.loc[user_id, similar_exercises]
    if user_ratings.sum() == 0:
        return interaction_matrix.mean().mean()  # Fallback if user hasn't rated similar items

    weighted_ratings = user_ratings * similarity_scores
    rating_prediction = weighted_ratings.sum() / similarity_scores[user_ratings > 0].sum()
    
    return rating_prediction

In [None]:
def evaluate_mae(validation_data, n_neighbors=5):
    actual_ratings = []
    predicted_ratings = []
    
    for _, row in validation_data.iterrows():
        user_id = row['user_id']
        exercise_id = row['exercise_id']
        actual_result = row['result']
        
        # Predict rating using the IKNN model
        predicted_result = predict_rating(user_id, exercise_id, n_neighbors)
        
        # Append actual and predicted ratings for MAE calculation
        actual_ratings.append(actual_result)
        predicted_ratings.append(predicted_result)
    
    # Calculate Mean Absolute Error
    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    return mae

In [None]:
mae_value = evaluate_mae(validation_time_data)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}")

In [8]:
def predict_rating_discrete(user_id, exercise_id, n_neighbors=5, rating_threshold=0.5):
    if exercise_id not in interaction_matrix.columns:
        # If exercise is new or not in the dataset, we may return a default rating
        return interaction_matrix.mean().mean()  # Return the global average rating as a fallback

    # Find the k-nearest neighbors (similar exercises)
    exercise_vector = interaction_matrix[exercise_id].values.reshape(1, -1)
    distances, indices = knn_model.kneighbors(exercise_vector, n_neighbors=n_neighbors)
    
    # Get the similar exercises and their distances
    similar_exercises = interaction_matrix.columns[indices.flatten()]
    similarity_scores = 1 - distances.flatten()  # Convert cosine distances to similarities
    
    # Compute the weighted rating based on similar exercises the user has rated
    user_ratings = interaction_matrix.loc[user_id, similar_exercises]
    if user_ratings.sum() == 0:
        return interaction_matrix.mean().mean()  # Fallback if user hasn't rated similar items

    weighted_ratings = user_ratings * similarity_scores
    rating_prediction = weighted_ratings.sum() / similarity_scores[user_ratings > 0].sum()
    
    if rating_prediction >= rating_threshold:
        rating_prediction = 1
    else:
        rating_prediction = 0
    
    return rating_prediction

In [None]:
def evaluate_mae_discrete(validation_data, n_neighbors=5, rating_threshold=0.5):
    actual_ratings = []
    predicted_ratings = []
    corrects_1 = 0
    corrects_0 = 0
    incorrects_1 = 0
    incorrects_0 = 0
    
    for _, row in validation_data.iterrows():
        user_id = row['user_id']
        exercise_id = row['exercise_id']
        actual_result = row['result']
        
        # Predict rating using the IKNN model
        predicted_result = predict_rating_discrete(user_id, exercise_id, n_neighbors, rating_threshold)
        
        # Append actual and predicted ratings for MAE calculation
        actual_ratings.append(actual_result)
        predicted_ratings.append(predicted_result)

        if actual_result == predicted_result and actual_result == 1:
            corrects_1 += 1
        elif actual_result == predicted_result and actual_result == 0:
            corrects_0 += 1
        elif actual_result != predicted_result and actual_result == 1:
            incorrects_1 += 1
        elif actual_result != predicted_result and actual_result == 0:
            incorrects_0 += 1
    
    # Calculate Mean Absolute Error
    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    return mae, corrects_0, corrects_1, incorrects_0, incorrects_1

In [17]:
mae_value, corrects_0, corrects_1, incorrects_0, incorrects_1 = evaluate_mae_discrete(validation_time_data)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}\n Corrects 0: {corrects_0}\n Corrects 1: {corrects_1}\n Incorrects 0: {incorrects_0}\n Incorrects 1: {incorrects_1}")

Mean Absolute Error (MAE) on validation set: 0.24053330542666393
 Corrects 0: 2360
 Corrects 1: 57754
 Incorrects 0: 19766
 Incorrects 1: 2312


In [18]:
mae_value, corrects_0, corrects_1, incorrects_0, incorrects_1 = evaluate_mae_discrete(validation_time_data, rating_threshold=0.4)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}\n Corrects 0: {corrects_0}\n Corrects 1: {corrects_1}\n Incorrects 0: {incorrects_0}\n Incorrects 1: {incorrects_1}")

Mean Absolute Error (MAE) on validation set: 0.24570412497114516
 Corrects 0: 1076
 Corrects 1: 58613
 Incorrects 0: 21050
 Incorrects 1: 1453


In [10]:
mae_value, corrects_0, corrects_1, incorrects_0, incorrects_1 = evaluate_mae_discrete(validation_time_data, rating_threshold=0.6)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}\n Corrects 0: {corrects_0}\n Corrects 1: {corrects_1}\n Incorrects 0: {incorrects_0}\n Incorrects 1: {incorrects_1}")

Mean Absolute Error (MAE) on validation set: 0.2406184718662201
 Corrects 0: 4604
 Corrects 1: 55503
 Incorrects 0: 17522
 Incorrects 1: 4563


In [11]:
mae_value, corrects_0, corrects_1, incorrects_0, incorrects_1 = evaluate_mae_discrete(validation_time_data, rating_threshold=0.7)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}\n Corrects 0: {corrects_0}\n Corrects 1: {corrects_1}\n Incorrects 0: {incorrects_0}\n Incorrects 1: {incorrects_1}")

Mean Absolute Error (MAE) on validation set: 0.25519409966454604
 Corrects 0: 6857
 Corrects 1: 52052
 Incorrects 0: 15269
 Incorrects 1: 8014


Modelo con dataset equilibrado

In [None]:
incorrect_df = transformed_df[transformed_df['result'] == 0]
correct_df = transformed_df[transformed_df['result'] == 1]

correct_sampled_df = correct_df.sample(n=len(incorrect_df), random_state=42)

balanced_df = pd.concat([incorrect_df, correct_sampled_df]).sample(frac=1, random_state=42).reset_index(drop=True)

balanced_df.to_csv("balanced_data.csv", index=False)

In [14]:
train_balanced_data, validation_balanced_data = train_test_split(balanced_df, test_size=0.2, random_state=42)

In [15]:
interaction_matrix = train_balanced_data.pivot_table(index='user_id', columns='exercise_id', values='result', fill_value=0)
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(interaction_matrix.T)

In [16]:
mae_value, corrects_0, corrects_1, incorrects_0, incorrects_1 = evaluate_mae_discrete(validation_balanced_data)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}\n Corrects 0: {corrects_0}\n Corrects 1: {corrects_1}\n Incorrects 0: {incorrects_0}\n Incorrects 1: {incorrects_1}")

Mean Absolute Error (MAE) on validation set: 0.41087117208457163
 Corrects 0: 11182
 Corrects 1: 41444
 Incorrects 0: 42868
 Incorrects 1: 12686


Modelo con 0s y 1s cambiados por 1s y 2s respectivamente

In [17]:
escalated_df = transformed_df.copy()
escalated_df['result'] = escalated_df['result'].replace({1: 2, 0: 1})
escalated_df.to_csv("escalated_data.csv", index=False)

In [25]:
train_escalated_data, validation_escalated_data = train_test_split(escalated_df, test_size=0.2, random_state=42)

In [26]:
interaction_matrix = train_escalated_data.pivot_table(index='user_id', columns='exercise_id', values='result', fill_value=0)
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(interaction_matrix.T)

In [27]:
def predict_rating_discrete_escalated(user_id, exercise_id, n_neighbors=5, rating_threshold=1.5):
    if exercise_id not in interaction_matrix.columns:
        # If exercise is new or not in the dataset, we may return a default rating
        return interaction_matrix.mean().mean()  # Return the global average rating as a fallback

    # Find the k-nearest neighbors (similar exercises)
    exercise_vector = interaction_matrix[exercise_id].values.reshape(1, -1)
    distances, indices = knn_model.kneighbors(exercise_vector, n_neighbors=n_neighbors)
    
    # Get the similar exercises and their distances
    similar_exercises = interaction_matrix.columns[indices.flatten()]
    similarity_scores = 1 - distances.flatten()  # Convert cosine distances to similarities
    
    # Compute the weighted rating based on similar exercises the user has rated
    user_ratings = interaction_matrix.loc[user_id, similar_exercises]
    if user_ratings.sum() == 0:
        return interaction_matrix.mean().mean()  # Fallback if user hasn't rated similar items

    weighted_ratings = user_ratings * similarity_scores
    rating_prediction = weighted_ratings.sum() / similarity_scores[user_ratings > 0].sum()
    
    if rating_prediction >= rating_threshold:
        rating_prediction = 2
    else:
        rating_prediction = 1
    
    return rating_prediction

In [28]:
def evaluate_mae_discrete_escalated(validation_data, n_neighbors=5, rating_threshold=1.5):
    actual_ratings = []
    predicted_ratings = []
    corrects_1 = 0
    corrects_0 = 0
    incorrects_1 = 0
    incorrects_0 = 0
    
    for _, row in validation_data.iterrows():
        user_id = row['user_id']
        exercise_id = row['exercise_id']
        actual_result = row['result']
        
        # Predict rating using the IKNN model
        predicted_result = predict_rating_discrete(user_id, exercise_id, n_neighbors, rating_threshold)
        
        # Append actual and predicted ratings for MAE calculation
        actual_ratings.append(actual_result)
        predicted_ratings.append(predicted_result)

        if actual_result == predicted_result and actual_result == 2:
            corrects_1 += 1
        elif actual_result == predicted_result and actual_result == 1:
            corrects_0 += 1
        elif actual_result != predicted_result and actual_result == 2:
            incorrects_1 += 1
        elif actual_result != predicted_result and actual_result == 1:
            incorrects_0 += 1
    
    # Calculate Mean Absolute Error
    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    return mae, corrects_0, corrects_1, incorrects_0, incorrects_1

In [29]:
mae_value, corrects_0, corrects_1, incorrects_0, incorrects_1 = evaluate_mae_discrete_escalated(validation_escalated_data)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}\n Corrects 0: {corrects_0}\n Corrects 1: {corrects_1}\n Incorrects 0: {incorrects_0}\n Incorrects 1: {incorrects_1}")

Mean Absolute Error (MAE) on validation set: 0.9084130296577081
 Corrects 0: 35440
 Corrects 1: 0
 Incorrects 0: 18631
 Incorrects 1: 162305


In [32]:
mae_value= evaluate_mae(validation_escalated_data)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}")

Mean Absolute Error (MAE) on validation set: 0.2910657663303868


In [33]:
def predict_rating_aprox(user_id, exercise_id, n_neighbors=5):
    if exercise_id not in interaction_matrix.columns:
        # If exercise is new or not in the dataset, we may return a default rating
        return interaction_matrix.mean().mean()  # Return the global average rating as a fallback

    # Find the k-nearest neighbors (similar exercises)
    exercise_vector = interaction_matrix[exercise_id].values.reshape(1, -1)
    distances, indices = knn_model.kneighbors(exercise_vector, n_neighbors=n_neighbors)
    
    # Get the similar exercises and their distances
    similar_exercises = interaction_matrix.columns[indices.flatten()]
    similarity_scores = 1 - distances.flatten()  # Convert cosine distances to similarities
    
    # Compute the weighted rating based on similar exercises the user has rated
    user_ratings = interaction_matrix.loc[user_id, similar_exercises]
    if user_ratings.sum() == 0:
        return interaction_matrix.mean().mean()  # Fallback if user hasn't rated similar items

    weighted_ratings = user_ratings * similarity_scores
    rating_prediction = weighted_ratings.sum() / similarity_scores[user_ratings > 0].sum()
    
    if abs(rating_prediction-1) < abs(rating_prediction-2):
        rating_prediction = 1
    else:
        rating_prediction = 2
    
    return rating_prediction

In [34]:
def evaluate_mae_aprox(validation_data, n_neighbors=5):
    actual_ratings = []
    predicted_ratings = []
    corrects_1 = 0
    corrects_0 = 0
    incorrects_1 = 0
    incorrects_0 = 0
    
    for _, row in validation_data.iterrows():
        user_id = row['user_id']
        exercise_id = row['exercise_id']
        actual_result = row['result']
        
        # Predict rating using the IKNN model
        predicted_result = predict_rating_aprox(user_id, exercise_id, n_neighbors)
        
        # Append actual and predicted ratings for MAE calculation
        actual_ratings.append(actual_result)
        predicted_ratings.append(predicted_result)

        if actual_result == predicted_result and actual_result == 2:
            corrects_1 += 1
        elif actual_result == predicted_result and actual_result == 1:
            corrects_0 += 1
        elif actual_result != predicted_result and actual_result == 2:
            incorrects_1 += 1
        elif actual_result != predicted_result and actual_result == 1:
            incorrects_0 += 1
    
    # Calculate Mean Absolute Error
    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    return mae, corrects_0, corrects_1, incorrects_0, incorrects_1

In [35]:
mae_value, corrects_0, corrects_1, incorrects_0, incorrects_1 = evaluate_mae_aprox(validation_escalated_data)
print(f"Mean Absolute Error (MAE) on validation set: {mae_value}\n Corrects 0: {corrects_0}\n Corrects 1: {corrects_1}\n Incorrects 0: {incorrects_0}\n Incorrects 1: {incorrects_1}")

Mean Absolute Error (MAE) on validation set: 0.2370169413669552
 Corrects 0: 18592
 Corrects 1: 146616
 Incorrects 0: 35479
 Incorrects 1: 15689


Modelo final con todas las mejoras

In [2]:
#Codigo para reducir el dataset a la info mas reciente de cada usuario
df = pd.read_csv("recordDS.csv")

transformed_data_time = []

for index, row in df.iterrows():
    user_id = row['user_id']
    qa_array = eval(row['qa_array'])
    create_time = row['create_time']
    
    exercises = qa_array[0]
    results = qa_array[1]
    
    for exercise_id, result in zip(exercises, results):
        transformed_data_time.append([user_id, exercise_id, create_time, result])

transformed_time_df = pd.DataFrame(transformed_data_time, columns=['user_id', 'exercise_id', 'create_time','result'])

transformed_time_df['create_time'] = pd.to_datetime(transformed_time_df['create_time'])
transformed_time_df['date'] = transformed_time_df['create_time'].dt.date
latest_dates = transformed_time_df.groupby('user_id')['date'].max().reset_index()
latest_dates.columns = ['user_id', 'latest_date']
filtered_df = pd.merge(transformed_time_df, latest_dates, how='inner', left_on=['user_id', 'date'], right_on=['user_id', 'latest_date'])
filtered_df = filtered_df.drop(columns=['date', 'latest_date'])

In [3]:
#Codigo para balancear el dataset
incorrect_df = filtered_df[filtered_df['result'] == 0]
correct_df = filtered_df[filtered_df['result'] == 1]

correct_sampled_df = correct_df.sample(n=len(incorrect_df), random_state=42)

balanced_df = pd.concat([incorrect_df, correct_sampled_df]).sample(frac=1, random_state=42).reset_index(drop=True)

balanced_df.to_csv("balanced_data.csv", index=False)

In [4]:
#Codigo para cambiar los valores de 0 y 1 a 1 y 2
final_df = balanced_df.copy()
final_df['result'] = final_df['result'].replace({1: 2, 0: 1})
final_df.to_csv("final_dataset.csv", index=False)

In [5]:
#Creacion de los datasets de entrenamiento y validacion
train_final_data, validation_final_data = train_test_split(final_df, test_size=0.2, random_state=42)

In [6]:
#Entrenamiento del modelo
interaction_matrix = train_final_data.pivot_table(index='user_id', columns='exercise_id', values='result', fill_value=0)
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(interaction_matrix.T)

In [14]:
def predict_rating_aprox(user_id, exercise_id, n_neighbors=5):
    if exercise_id not in interaction_matrix.columns:
        # If exercise is new or not in the dataset, we may return a default rating
        return 2  # Return the global average rating as a fallback

    # Find the k-nearest neighbors (similar exercises)
    exercise_vector = interaction_matrix[exercise_id].values.reshape(1, -1)
    distances, indices = knn_model.kneighbors(exercise_vector, n_neighbors=n_neighbors)
    
    # Get the similar exercises and their distances
    similar_exercises = interaction_matrix.columns[indices.flatten()]
    similarity_scores = 1 - distances.flatten()  # Convert cosine distances to similarities
    
    # Compute the weighted rating based on similar exercises the user has rated
    user_ratings = interaction_matrix.loc[user_id, similar_exercises]
    if user_ratings.sum() == 0:
        return 2  # Fallback if user hasn't rated similar items

    weighted_ratings = user_ratings * similarity_scores
    rating_prediction = weighted_ratings.sum() / similarity_scores[user_ratings > 0].sum()
    
    if abs(rating_prediction-1) < abs(rating_prediction-2):
        rating_prediction = 1
    else:
        rating_prediction = 2
    
    return rating_prediction

In [15]:
# Prepare test data
# Assuming `test_final_data` is your test dataset
test_final_data = validation_final_data.copy()  # Avoid modifying the original test dataset

# Function to predict for all rows in test data
def predict_ratings_for_test_data(test_data):
    predictions = []
    for _, row in test_data.iterrows():
        pred = predict_rating_aprox(row['user_id'], row['exercise_id'])
        predictions.append(int(pred))
    return predictions

# Predict ratings for the test set
test_final_data['predicted_result'] = predict_ratings_for_test_data(test_final_data)

# Calculate metrics
y_true = test_final_data['result']  # Actual results
y_pred = test_final_data['predicted_result']  # Predicted results

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, average='binary')  # Use 'macro' if it's a multi-class problem
recall = recall_score(y_true, y_pred, average='binary')
f1 = f1_score(y_true, y_pred, average='binary')

# Display the results
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.6601
Precision: 0.6521
Recall: 0.6910
F1 Score: 0.6710


In [16]:
print(test_final_data)
test_final_data.to_csv("test_final_data.csv", index=False)

        user_id  exercise_id         create_time  result  predicted_result
46322      6131         2041 2020-12-03 14:40:00       1                 2
20928      1805         2483 2020-06-11 10:24:00       2                 2
105803    15090          954 2022-01-22 21:33:00       1                 1
96138     10372         1739 2021-12-10 05:55:00       1                 1
175764    15165         2483 2022-01-26 11:26:00       1                 2
...         ...          ...                 ...     ...               ...
55577      1816         2425 2020-06-11 10:39:00       1                 2
29198      9680         1321 2021-08-17 22:17:00       1                 2
94211      1811         1020 2020-04-09 11:30:00       1                 2
203416     7575         1296 2021-01-31 22:45:00       2                 2
123192     9601         1672 2021-08-14 13:35:00       1                 1

[44138 rows x 5 columns]
