In [1]:
import os
from typing import List

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Root directory of the project; the notebook reads from and writes to
# "<base_path>/data". Set the NEXTQUESTION_BASE_PATH environment variable to
# point at another checkout instead of editing this cell, e.g.
#   NEXTQUESTION_BASE_PATH="C:/Users/99818854/Projetos/GitRep/adaptive_learning"
# When the variable is unset, the original default location is used.
base_path = os.environ.get(
    "NEXTQUESTION_BASE_PATH",
    "/media/bruno/Arquivos/Desenvolvimento/NextQuestion",
)

In [2]:
# Training table. Later cells read the features and the target from it by
# column position and group it by "user_id" / "area".
base = pd.read_csv(f"{base_path}/data/mastery.csv")
# Table to be scored by the trained model (semicolon-separated file). Later
# cells use its "difficulty", "novo_user_id" and "knowledge_area_id" columns.
submit = pd.read_csv(f"{base_path}/data/Submit.csv", sep=";")

In [3]:
# Weights of the linear mastery score, stored as a (6, 1) column vector so
# that np.dot over the six score columns yields one mastery column per row.
mastery_params = [[4.44], [0.33], [0.86], [0.86], [0.46], [0.39]]
mastery = np.dot(base.values[:, 3:-1], mastery_params)

# Features: [column 2 of `base` (the difficulty), mastery score].
# Target: the last column of `base`.
X, y = np.concatenate((base.values[:, 2:3], mastery), axis=1), base.values[:, -1]

# random_state pins the shuffle so the split, and every metric computed from
# it, is the same on each "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# The imputation value is computed from the training split only, so no
# statistic of the test rows is used. NOTE: the NaN mask covers every column,
# so a missing mastery value is also replaced by the mean difficulty.
mean_difficulty = round(np.nanmean(X_train[:, 0]))
X_train[np.isnan(X_train)] = mean_difficulty
X_test[np.isnan(X_test)] = mean_difficulty

# Column-wise maximum of the six points columns for every (user_id, area) pair.
group = (
    base
    .groupby(["user_id", "area"])
    [["points", "points1", "points2", "points3", "points4", "points5"]]
    .agg("max")
)

# Lookup table: user_data[user_id][area] -> mastery score of that pair.
user_data = {}
for position, best_points in enumerate(group.values):
    user_id, area = group.index[position]
    # The dot product with the (6, 1) weight vector returns a 1-element
    # array, hence the [0] to store a scalar.
    user_data.setdefault(user_id, {})[area] = np.dot(best_points, mastery_params)[0]

def search_mastery(user_id, area):
    """Return the mastery score stored for ``user_id`` in ``area``.

    The pair is looked up in the module-level ``user_data`` mapping
    (``user_data[user_id][area]``). Returns 0 when the user is unknown or
    has no entry for the given area.
    """
    no_history = {}
    return user_data.get(user_id, no_history).get(area, 0)

In [23]:
# Logistic regression trained with stochastic gradient descent.
# random_state pins the shuffling of the training data between epochs so the
# fitted model, and therefore the metrics below, are the same on every run.
model = SGDClassifier(
    loss="log_loss", learning_rate="adaptive", eta0=0.001, random_state=42
)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

# For a binary target the flattened confusion matrix is (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
accuracy = (tp + tn) / (tn + fp + fn + tp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

accuracy, precision, recall, f1

(0.6862066666666666,
 0.7164698280884237,
 0.8613575181526221,
 0.7822613474455526)

In [34]:
# Only `difficulty` is imputed. Filling the whole frame would also replace a
# missing user / knowledge-area id with the mean difficulty, which can
# coincide with a real id and attach an unrelated mastery score to the row.
# Ids left as NaN are not found by search_mastery and get its default of 0.
df = submit[["difficulty", "novo_user_id", "knowledge_area_id"]].fillna(
    {"difficulty": mean_difficulty}
)
df["mastery"] = [
    search_mastery(user_id, area)
    for user_id, area in zip(df["novo_user_id"].values, df["knowledge_area_id"].values)
]

# Same feature order as the training matrix: [difficulty, mastery].
df["predict"] = model.predict(df[["difficulty", "mastery"]].values)
# Persist the predictions as a single "acertou" column, without the index.
pd.DataFrame(df["predict"].values, columns=["acertou"]).to_csv(f"{base_path}/data/result.csv", index=False)
# Number of rows per predicted class (displayed as the cell output).
df.groupby("predict")["predict"].count()

predict
0.0     3561
1.0    16439
Name: predict, dtype: int64

In [27]:
# Decision tree using entropy (information gain) as the split criterion.
# random_state pins the random feature permutation used when choosing splits
# so the fitted tree, and the metrics below, are the same on every run.
# Note: this rebinds `model`, replacing the classifier fitted earlier.
model = DecisionTreeClassifier(criterion="entropy", random_state=42)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

# For a binary target the flattened confusion matrix is (tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
accuracy = (tp + tn) / (tn + fp + fn + tp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

accuracy, precision, recall, f1

(0.7138616666666666,
 0.7271512340348539,
 0.9007317079382543,
 0.8046870644983305)