In [36]:
import os

import numpy as np
import pandas as pd

from glob import glob
from math import sqrt
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error, f1_score, classification_report
from sklearn.model_selection import train_test_split

In [37]:
home = os.path.dirname(os.getcwd())
data_path = os.path.join(home, "data")

In [38]:
dfs = {
    os.path.basename(file_name).split(".")[0]: 
    pd.read_csv(file_name) 
    for file_name in glob(f'{data_path}/*.csv')
}
for k, v in dfs.items():
    print(k)
    locals()[k] = v

problem_data
sample_submissions
test_submissions
train_submissions
user_data


In [39]:
user_data["user_id"] = user_data["user_id"].apply(lambda x: int(x.split('_')[1]))
user_data.sort_values("user_id", inplace=True)

In [40]:
problem_data["problem_id"] = problem_data["problem_id"].apply(lambda x: int(x.split('_')[1]))
problem_data.sort_values("problem_id", inplace=True)

In [41]:
train_submissions["problem_id"] = train_submissions["problem_id"].apply(lambda x: int(x.split('_')[1]))
train_submissions["user_id"] = train_submissions["user_id"].apply(lambda x: int(x.split('_')[1]))

test_submissions["problem_id"] = test_submissions["problem_id"].apply(lambda x: int(x.split('_')[1]))
test_submissions["user_id"] = test_submissions["user_id"].apply(lambda x: int(x.split('_')[1]))

In [42]:
train_data, valid_data = train_test_split(train_submissions, stratify=train_submissions["attempts_range"], test_size=0.2)

In [43]:
num_counts = dict()
mapping = dict()

n_users = len(user_data)
n_problems = len(problem_data)
n_user_rank = len(user_data["rank"].value_counts())
n_countries = len(user_data["country"].value_counts())
n_level_types = len(problem_data["level_type"].value_counts())

user_mapping = {user_id: i for i, user_id in enumerate(user_data["user_id"].values)}
problem_mapping = {problem_id: i for i, problem_id in enumerate(problem_data["problem_id"].values)}

In [58]:
from keras.models import Model
from keras.layers import Embedding, Dense, Input, Concatenate
from keras.initializers import he_normal
from keras.optimizers import Adam

def get_model():
    input_x = Input(shape=(1,))
    x = Embedding(input_dim=n_users, output_dim=50)(input_x)
    
    input_y = Input(shape=(1,))
    y = Embedding(input_dim=n_problems, output_dim=50)(input_y)
    
    merge = Concatenate(axis=-1)([x, y])
    merge = Dense(50, activation="relu", kernel_initializer=he_normal(seed=42))(merge)
    merge = Dense(50, activation="relu", kernel_initializer=he_normal(seed=42))(merge)
    merge = Dense(50, activation="relu", kernel_initializer=he_normal(seed=42))(merge)
    output = Dense(6, activation="sigmoid")(merge)
    model = Model(inputs=[input_x, input_y], outputs=output)
    model.compile(optimizer=Adam(lr=1e-4), loss='categorical_crossentropy', metrics=['acc'])
    #print(model.summary())
    
    return model

In [45]:
train_submissions["user_id"] = train_submissions["user_id"].map(user_mapping)
train_submissions["problem_id"] = train_submissions["problem_id"].map(problem_mapping)

In [46]:
y = pd.get_dummies(train_submissions["attempts_range"])

In [47]:
y = y.values[:, np.newaxis]

In [48]:
y.shape

(155295, 1, 6)

In [None]:
model = get_model()
model.fit([train_submissions["user_id"].values, train_submissions["problem_id"].values], y, batch_size=128, epochs=10, validation_split=0.2)

Train on 124236 samples, validate on 31059 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
 20864/124236 [====>.........................] - ETA: 20s - loss: 0.9382 - acc: 0.6076

In [50]:
problem_prediction = predict(valid_data_matrix, problem_similarity, type='item')
user_prediction = predict(valid_data_matrix, user_similarity, type='user')

NameError: name 'predict' is not defined

In [None]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

def rounded_f1_score(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    prediction = np.round(prediction)
    prediction[prediction > 6] = 6
    prediction[prediction < 1] = 1
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    print(classification_report(ground_truth, prediction))
    
    return f1_score(ground_truth, prediction, average="weighted")

In [None]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, valid_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(problem_prediction, valid_data_matrix)))

print('User-based CF F1: ' + str(rounded_f1_score(user_prediction, valid_data_matrix)))
print('Item-based CF F1: ' + str(rounded_f1_score(problem_prediction, valid_data_matrix)))

In [None]:
print(classification_report([1, 2, 3, 4, 5], [2, 2, 2, 2, 2]))