In [1]:
import numpy as np
import pandas as pd
from typing import List
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

base_path = "C:/Users/99818854/Projetos/GitRep/adaptive_learning"
#base_path = "/media/bruno/Arquivos/Desenvolvimento/NextQuestion"

In [2]:
base = pd.read_csv(f"{base_path}/data/mastery.csv")
submit = pd.read_csv(f"{base_path}/data/Submit.csv", sep=";")

In [47]:
mastery_params = [[4.44], [0.33], [0.86], [0.86], [0.46], [0.39]]
mastery = np.dot(base.values[:, 3:-1], mastery_params)
X, y = np.concatenate((base.values[:, 2:3], mastery), axis=1), base.values[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
mean_difficulty = round(np.nanmean(X_train[:, 0]))
X_train[np.isnan(X_train)] = mean_difficulty
X_test[np.isnan(X_test)] = mean_difficulty

user_data = {}
group = base.groupby(["user_id", "area"])[[
    "points", "points1", "points2", "points3", "points4", "points5"
]].agg("max")

for i in range(0, len(group)):
    user_id, area = group.index[i]
    if user_id not in user_data.keys():
        user_data[user_id] = {}
    user_data[user_id][area] = np.dot(group.values[i], mastery_params)[0]

def search_mastery(user_id, area):
    if user_id not in user_data.keys():
        return 0
    if area not in user_data[user_id].keys():
        return 0
    return user_data[user_id][area]

In [52]:
model = SGDClassifier(loss="log", learning_rate="adaptive", eta0=0.001)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
accuracy = (tp + tn) / (tn + fp + fn + tp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

accuracy, precision, recall, f1

(0.684545, 0.711525809084398, 0.8707648873906585, 0.783132571534639)

In [60]:
df = submit[["difficulty", "novo_user_id", "knowledge_area_id"]]
df = df.fillna(mean_difficulty)
df["mastery"] = [search_mastery(df["novo_user_id"].values[i], df["knowledge_area_id"].values[i]) for i in range(0, len(df))]

df["predict"] = model.predict(df[["difficulty", "mastery"]].values)
df.groupby("predict")["predict"].agg(["count", "mean"])

Unnamed: 0_level_0,count,mean
predict,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,3593,0.0
1.0,16407,1.0


In [59]:
model = DecisionTreeClassifier(criterion="entropy")
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
accuracy = (tp + tn) / (tn + fp + fn + tp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

accuracy, precision, recall, f1

(0.713335, 0.7273828039588602, 0.8984923419532542, 0.8039336607964215)

In [24]:
model = RandomForestClassifier(criterion="entropy", n_estimators=50)
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

tn, fp, fn, tp = confusion_matrix(y_test, y_predict).ravel()
accuracy = (tp + tn) / (tn + fp + fn + tp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

accuracy, precision, recall, f1

(0.7144083333333333,
 0.7258686864562492,
 0.9055587392550143,
 0.8058179263296862)

In [4]:
total_models = 15
models: List[SGDClassifier] = []

for i in range(0, total_models):
    X_sample, y_sample = [], []
    while len(X_sample) != len(X_train):
        j = np.random.randint(0, len(X_train))
        X_sample.append(X_train[j])
        y_sample.append(y_train[j])

    model = SGDClassifier(loss="log_loss", learning_rate="adaptive", eta0=0.001)
    model.fit(X_sample, y_sample)
    models.append(model)

In [5]:
def predict(models: List[SGDClassifier], test):
    votes = []
    for model in models:
        y_pred_test = model.predict(test)
        votes.append(y_pred_test)

    result = []
    for i in range(0, len(test)):
        total = []
        for j in range(0, len(votes)):
            total.append(votes[j][i])
        predict = np.mean(total)
        result.append(1 if predict > 0.5 else 0)

    tn, fp, fn, tp = confusion_matrix(y_test, result).ravel()
    return result, tn, fp, fn, tp

In [8]:
votes = []
for model in models:
    y_pred_test = model.predict(X_test)
    votes.append(y_pred_test)

result = []
for i in range(0, len(X_test)):
    total = []
    for j in range(0, len(votes)):
        total.append(votes[j][i])
    predict = np.mean(total)
    result.append(1 if predict > 0.5 else 0)

tn, fp, fn, tp = confusion_matrix(y_test, result).ravel()

In [10]:
accuracy = (tp + tn) / (tn + fp + fn + tp)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
f1 = 2 * (precision * recall) / (precision + recall)

In [7]:
y_pred, accuracy, precision, recall, f1 = predict(models, X_test)

In [8]:
accuracy, precision, recall, f1

(74232, 132846, 55800, 337122)