In [None]:
import riiideducation
import pandas as pd

# Create the competition environment handle; later cells use it for
# env.iter_test() and env.predict().
# You can only call make_env() once, so don't lose it!
env = riiideducation.make_env()


In [None]:
import os
# Print the full path of every file found under /kaggle/input.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Obtain the test-batch iterator from the environment; the prediction loop
# at the end of the notebook consumes it.
# You can only iterate through a result from `env.iter_test()` once
# so be careful not to lose it once you start iterating.
iter_test = env.iter_test()

In [None]:
def make_times_dict_np(df_np):
    """Return the largest timestamp seen for each user, floored at 1.

    Args:
        df_np: iterable of rows where ``row[0]`` is the timestamp and
            ``row[1]`` is the user id.

    Returns:
        dict mapping user id -> max(1, largest timestamp for that user).
        The floor of 1 keeps the value usable as a divisor when a user's
        only timestamp is 0.
    """
    user_times = {}
    for row in df_np:
        timestamp, user_id = row[0], row[1]
        # Start every user at the floor (1) and fold in *every* row, including
        # the first one; previously the first row's timestamp was discarded.
        user_times[user_id] = max(user_times.get(user_id, 1), timestamp)
    return user_times

In [None]:
def create_cols_dict(df):
    """Map every column label of ``df`` to its positional index."""
    return {label: position for position, label in enumerate(df.columns)}

In [None]:
def create_bool_dict(arr, cols_dict, id_column_name, tag_column_name, num_tags=188):
    """Build a 0/1 tag-indicator vector for every row of ``arr``.

    Args:
        arr: 2-D array with one row per question or lecture.
        cols_dict: column name -> column index for ``arr``.
        id_column_name: ``"question_id"`` (tag cell is a space-separated list
            of tag numbers) or ``"lecture_id"`` (tag cell is a single tag).
        tag_column_name: name of the column holding the tag(s).
        num_tags: length of each indicator vector; tags are matched against
            ``0 .. num_tags - 1``.

    Returns:
        dict mapping the row's id value -> ``np.ndarray`` of length
        ``num_tags`` with 1 at every tag the row carries and 0 elsewhere.

    Raises:
        ValueError: if ``id_column_name`` is not one of the supported names.
    """
    if id_column_name not in ("question_id", "lecture_id"):
        raise ValueError(
            "id_column_name must be 'question_id' or 'lecture_id', got %r" % (id_column_name,)
        )

    id_col = cols_dict[id_column_name]
    tag_col = cols_dict[tag_column_name]
    is_question = id_column_name == "question_id"

    bool_dict = {}
    print(arr.shape[0])
    for i in range(arr.shape[0]):
        # Parse the tag cell once per row instead of once per candidate tag.
        raw_tags = str(arr[i, tag_col])
        present = set(raw_tags.split(" ")) if is_question else {raw_tags}
        bool_dict[arr[i, id_col]] = np.array(
            [1 if str(t) in present else 0 for t in range(num_tags)]
        )
    return bool_dict

In [None]:
# Append tag features to the interaction array: each row's tag indicator
# vector (looked up by content_id in the question or lecture dictionary) is
# projected onto the rows of most_important_tag_coefs, and the resulting
# coefficients are added as extra columns.

def categorical_to_binary(new_train_df_np, cols_dict, new_lectures_df_np, new_questions_df_np, 
                          lectures_bool_dict, questions_bool_dict, most_important_tag_coefs, train_data=True): 
    """Append projected tag features as new trailing columns.

    For every row, the tag indicator vector of its ``content_id`` is looked up
    (in ``questions_bool_dict`` when ``content_type_id == 0``, otherwise in
    ``lectures_bool_dict``) and multiplied by ``most_important_tag_coefs``.

    Args:
        new_train_df_np: 2-D array of interaction rows.
        cols_dict: column name -> column index for ``new_train_df_np``.
        new_lectures_df_np, new_questions_df_np: unused; kept for interface
            compatibility.
        lectures_bool_dict, questions_bool_dict: content id -> tag indicator
            vector.
        most_important_tag_coefs: array of shape (n_components, n_tags).
        train_data: unused; kept for interface compatibility.

    Returns:
        A new array with ``n_components`` extra columns on the right; the
        input array is not modified.
    """
    print('here')
    print(most_important_tag_coefs.shape)
    num_tags = most_important_tag_coefs.shape[0]
    print(num_tags)
    # Index of the first appended column; slicing from here (rather than with
    # a negative offset) stays correct even when there are zero components.
    first_new_col = new_train_df_np.shape[1]
    new_train_df_np = np.append(new_train_df_np, -np.ones((new_train_df_np.shape[0], num_tags)), 1)
    print(new_train_df_np.shape)

    type_col = cols_dict['content_type_id']
    id_col = cols_dict['content_id']
    for i in range(new_train_df_np.shape[0]):
        if new_train_df_np[i, type_col] == 0:
            lookup = questions_bool_dict
        else:
            lookup = lectures_bool_dict
        # reshape(-1, 1) accepts whatever vector length the dictionaries were
        # built with instead of assuming exactly 188 tags.
        tags_row = lookup[new_train_df_np[i, id_col]].reshape(-1, 1)
        new_train_df_np[i, first_new_col:] = np.matmul(most_important_tag_coefs, tags_row).reshape(1, num_tags)
    print('encoded tags')

    return new_train_df_np

In [None]:
def normalize_elapsed_time_np(train_df_np, cols_dict):
    """Divide the ``prior_question_elapsed_time`` column by 300000, in place.

    Returns the same array object that was passed in.
    """
    elapsed_col = cols_dict["prior_question_elapsed_time"]
    # Each ``record`` is a view onto one row, so the write lands in the input.
    for record in train_df_np:
        record[elapsed_col] = record[elapsed_col] / 300000
    return train_df_np

In [None]:
def convert_times_to_float_np(df_np, cols_dict, user_times):
    """Divide each row's ``timestamp`` by its user's entry in ``user_times``.

    The array is modified in place and also returned.
    """
    ts_col = cols_dict["timestamp"]
    uid_col = cols_dict["user_id"]
    # Each ``record`` is a view onto one row, so the write lands in the input.
    for record in df_np:
        record[ts_col] = record[ts_col] / user_times[record[uid_col]]
    return df_np

In [None]:
from sklearn.decomposition import PCA


def find_most_important_tags(new_train_df_np, cols_dict, lectures_bool_dict, questions_bool_dict):
    """Fit a 20-component PCA on the per-row tag indicator vectors.

    Each row's 188-long indicator vector is taken from ``questions_bool_dict``
    when ``content_type_id == 0`` and from ``lectures_bool_dict`` otherwise,
    keyed by ``content_id``.

    Returns:
        ``pca.components_`` of the fitted PCA.
    """
    num_tags = 188
    row_count = new_train_df_np.shape[0]
    tags_arr = np.full((row_count, num_tags), -1.0)
    print(tags_arr.shape)

    type_col = cols_dict['content_type_id']
    id_col = cols_dict['content_id']
    for row_idx in range(row_count):
        is_question = new_train_df_np[row_idx, type_col] == 0
        source = questions_bool_dict if is_question else lectures_bool_dict
        tags_arr[row_idx, :] = source[new_train_df_np[row_idx, id_col]]

    pca = PCA(n_components=20)
    pca.fit(tags_arr)
    print('pca done')
    return pca.components_

In [None]:
import numpy as np

# Load only the first 50,000 interaction rows, with explicit column dtypes.
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    low_memory=False,
    nrows=50000,
    dtype={
        'row_id': 'int64',
        'timestamp': 'int64',
        'user_id': 'int32',
        'content_id': 'int16',
        'content_type_id': 'int8',
        'task_container_id': 'int16',
        'user_answer': 'int8',
        'answered_correctly': 'float64',
        'prior_question_elapsed_time': 'float32',
        'prior_question_had_explanation': 'boolean',
    },
)

# Lecture metadata (one tag per lecture).
lectures_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/lectures.csv',
    low_memory=False,
    dtype={
        'lecture_id': 'int64',
        'part': 'int8',
        'tag': 'int32',
        'type_of': 'string',
    },
)

# Question metadata; `tags` is kept as a string and parsed later.
questions_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/questions.csv',
    low_memory=False,
    dtype={
        'question_id': 'int64',
        'bundle_id': 'int64',
        'correct_answer': 'int8',
        'part': 'int8',
        'tags': 'string',
    },
)

def get_data_ready(train_df, lectures_df, questions_df, train_data=True, most_important_tag_coefs=None):
    """Turn a raw interactions frame into a feature array (and labels).

    Steps: convert the frames to arrays (missing values become -1), build the
    per-user timestamp normalisers and the tag indicator dictionaries, append
    the projected tag features, scale ``prior_question_elapsed_time`` and
    ``timestamp``, then drop the columns that are not used as features.

    Args:
        train_df: interactions frame (training rows or one test batch).
        lectures_df: lecture metadata frame with ``lecture_id`` and ``tag``.
        questions_df: question metadata frame with ``question_id`` and ``tags``.
        train_data: when True the frame is expected to contain
            ``answered_correctly`` and ``user_answer``; when False it is
            expected to contain ``prior_group_responses`` and
            ``prior_group_answers_correct`` instead.
        most_important_tag_coefs: tag projection matrix to reuse; when None it
            is fitted on this frame via ``find_most_important_tags``.

    Returns:
        ``(features, y_column, most_important_tag_coefs)`` where ``y_column``
        is the ``answered_correctly`` column for training data and None
        otherwise.
    """
    cols_dict = create_cols_dict(train_df)
    new_train_df_np = train_df.to_numpy(na_value=-1)
    # Pick the two columns by name so this does not depend on `timestamp`
    # sitting immediately before `user_id` in the frame.
    user_times = make_times_dict_np(
        new_train_df_np[:, [cols_dict["timestamp"], cols_dict["user_id"]]]
    )
    print('created user dict')

    new_lectures_df_np = lectures_df.to_numpy(na_value=-1)
    lectures_cols_dict = create_cols_dict(lectures_df)
    lectures_bool_dict = create_bool_dict(new_lectures_df_np, lectures_cols_dict, "lecture_id", "tag")

    new_questions_df_np = questions_df.to_numpy(na_value=-1)
    questions_cols_dict = create_cols_dict(questions_df)
    questions_bool_dict = create_bool_dict(new_questions_df_np, questions_cols_dict, "question_id", "tags")

    print()
    print(lectures_cols_dict)
    print(questions_cols_dict)

    if most_important_tag_coefs is None:
        most_important_tag_coefs = find_most_important_tags(new_train_df_np, cols_dict,
                                                            lectures_bool_dict, questions_bool_dict)

    new_train_df_np = categorical_to_binary(new_train_df_np, cols_dict, new_lectures_df_np, new_questions_df_np,
                                            lectures_bool_dict, questions_bool_dict, most_important_tag_coefs, train_data)
    print("converted categorical to binary")
    new_train_df_np = normalize_elapsed_time_np(new_train_df_np, cols_dict)
    print('normalized elapsed times')
    new_train_df_np = convert_times_to_float_np(new_train_df_np, cols_dict, user_times)
    print('converted times to float')

    print(cols_dict)

    print("dropped extraneous columns")
    # Separate the labels from the features and decide which columns to drop.
    if train_data:
        y_column = new_train_df_np.T[cols_dict['answered_correctly']]
        columns_to_drop = ['answered_correctly', 'user_answer']
    else:
        y_column = None
        columns_to_drop = ['prior_group_responses', 'prior_group_answers_correct']
    columns_to_drop += ['task_container_id', 'content_id', 'user_id', 'row_id']

    # Remove every column in a single np.delete call. Chaining one call per
    # column with indices taken from the original layout is only correct when
    # the columns happen to be removed in descending index order.
    new_train_df_np = np.delete(
        new_train_df_np, [cols_dict[name] for name in columns_to_drop], axis=1
    )

    print('cleaned data')

    return new_train_df_np, y_column, most_important_tag_coefs

In [None]:
# Build the training feature matrix and labels, and keep the fitted tag
# projection so the same coefficients can be reused on the test batches.
(X_train, Y_train, most_important_tag_coefs) = get_data_ready(train_df, lectures_df, questions_df)

In [None]:
# Cast the label column to float before fitting, then show its shape.
Y_train = Y_train.astype('float')
print(Y_train.shape)

In [None]:
# Sanity check: dimensions of the feature matrix.
print(X_train.shape)

In [None]:
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Hold out 20% of the rows for validation and fit a small MLP on the rest.
# Fixed seeds make the split and the network initialisation repeatable, so
# Restart & Run All reproduces the same validation numbers.
# NOTE(review): lecture rows (content_type_id != 0) are not filtered out before
# fitting, so whatever label they carry becomes a training class — confirm
# that is intended.
X_train_split, X_CV, Y_train_split, Y_CV = train_test_split(X_train, Y_train, test_size=0.2, random_state=0)
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2), max_iter=10000, random_state=0)
clf.fit(X_train_split, Y_train_split)

In [None]:
# Predict on the held-out split, then inspect the first 100 true labels,
# class probabilities and predicted classes before reporting the score.
Y_pred = clf.predict(X_CV)
Y_pred_proba = clf.predict_proba(X_CV)
print(Y_pred.shape)
print(Y_CV[:100])
print(Y_pred_proba[:100])
print(Y_pred[:100])
accuracy = clf.score(X_CV, Y_CV)
print(accuracy)

In [None]:
# The training arrays are no longer needed once the model is fitted.
del X_train
del Y_train

# Column of predict_proba that holds P(answered_correctly == 1).
correct_class_col = list(clf.classes_).index(1)

for (test_df, sample_prediction_df) in iter_test:
    # Reuse the tag projection fitted on the training data.
    (X_test, Y_test, c) = get_data_ready(test_df, lectures_df, questions_df, False, most_important_tag_coefs)
    X_test = X_test.astype('float')
    # Submit the probability of a correct answer rather than a hard class
    # label, so the ranking information in the model output is kept.
    # NOTE(review): assumes the grader scores probabilities — confirm against
    # the competition's evaluation rules.
    test_df['answered_correctly'] = clf.predict_proba(X_test)[:, correct_class_col]
    # Only question rows (content_type_id == 0) are submitted.
    submission = test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']]
    print(submission)
    env.predict(submission)