# Model Analysis and Testing

In [None]:
#!pip install plotly gower dateutil kmodes scikit-learn

In [20]:
import os
import ast

import pandas as pd
import numpy as np
from dateutil.relativedelta import relativedelta
from kmodes.kprototypes import KPrototypes
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage, fcluster
import gower

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
# Settings for plot rendering, makes work with HTML output + jupyer lab + static output
pio.renderers.default = "notebook+plotly_mimetype+png"
# pio.renderers.default = "png"
%matplotlib inline

## Load and format data

In [2]:
# Use normpath to make work on windows machines
data_dir = os.path.normpath("../data/")
eedi_dir = os.path.join(data_dir, "Eedi_dataset")
eedi_metadata_dir = os.path.join(eedi_dir, "metadata")

df_train = pd.read_csv(os.path.join(eedi_dir, "train_data", "train_task_3_4.csv"))
df_answer = pd.read_csv(os.path.join(eedi_metadata_dir, "answer_metadata_task_3_4.csv"), parse_dates=["DateAnswered"])
df_student = pd.read_csv(os.path.join(eedi_metadata_dir, "student_metadata_task_3_4.csv"), parse_dates=["DateOfBirth"])
df_question = pd.read_csv(os.path.join(eedi_metadata_dir, "question_metadata_task_3_4.csv"))
df_subject = pd.read_csv(os.path.join(eedi_metadata_dir, "subject_metadata.csv"))

# Join the datasets on the relevant keys, so that we only have to work with two DFs: df_total and df_subject
df_total = df_train.join(df_answer.set_index('AnswerId'), on="AnswerId")\
                    .join(df_student.set_index("UserId"), on="UserId")\
                    .join(df_question.set_index("QuestionId"), on="QuestionId")

# Convert str col to proper dtype, list of ints
df_total.SubjectId = df_total.SubjectId.apply(ast.literal_eval)
# Convert PremiumPupil to proper dtype of bool
df_total.PremiumPupil = df_total.PremiumPupil.fillna(0).astype(bool)
# Make age column and drop DateOfBirth
df_total["Age"] = df_total.apply(lambda x: relativedelta(x.DateAnswered, x.DateOfBirth).years if not pd.isnull(x.DateOfBirth) else np.NaN, axis=1) 
df_total = df_total.drop(columns=["DateOfBirth"])

subject_mapping = pd.Series(df_subject.Name.values,index=df_subject.SubjectId).to_dict()
# Make 1st level category ID
df_total["Category1"] = df_total.SubjectId.apply(lambda x: subject_mapping[x[1]])
# Make column with strs of the parentId
df_subject["ParentName"] = df_subject.ParentId.map(subject_mapping)


print(df_subject)
df_total

     SubjectId                       Name  ParentId  Level   
0            3                      Maths       NaN      0  \
1           32                     Number       3.0      1   
2           33                     BIDMAS     144.0      3   
3           34     Upper and Lower Bounds     141.0      3   
4           35             Calculator Use      32.0      2   
..         ...                        ...       ...    ...   
383       1982  Mixed operation Fractions      39.0      3   
384       1983               Drawing Axes      54.0      3   
385       1985             Reading Scales      98.0      3   
386       1987             Sorting Shapes     174.0      3   
387       1988           Labelling Shapes     174.0      3   

                            ParentName  
0                                  NaN  
1                                Maths  
2                     Basic Arithmetic  
3              Rounding and Estimating  
4                               Number  
..       

Unnamed: 0,QuestionId,UserId,AnswerId,IsCorrect,CorrectAnswer,AnswerValue,DateAnswered,Confidence,GroupId,QuizId,SchemeOfWorkId,Gender,PremiumPupil,SubjectId,Age,Category1
0,898,2111,280203,1,2,2,2019-12-08 17:47:00,,95,86,52562.0,2,False,"[3, 49, 62, 70]",12.0,Algebra
1,767,3062,55638,1,3,3,2019-10-27 20:54:00,25.0,115,39,52562.0,0,False,"[3, 32, 144, 204]",,Number
2,165,1156,386475,1,2,2,2019-10-06 20:16:00,,101,39,52562.0,0,False,"[3, 32, 37, 220]",,Number
3,490,1653,997498,1,4,4,2020-02-27 17:40:00,,46,115,52562.0,0,False,"[3, 49, 81, 406]",,Algebra
4,298,3912,578636,1,3,3,2019-12-27 16:07:00,,314,78,52562.0,2,False,"[3, 71, 74, 180]",11.0,Geometry and Measure
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1382722,80,2608,57945,1,2,2,2019-11-28 15:22:00,,270,65,52562.0,0,False,"[3, 32, 42, 211]",,Number
1382723,707,2549,584230,0,2,1,2020-01-06 16:26:00,,269,109,52562.0,2,False,"[3, 32, 42, 212]",12.0,Number
1382724,840,5901,1138956,1,1,1,2020-01-07 07:14:00,,270,66,52562.0,0,False,"[3, 32, 37, 219]",,Number
1382725,794,3854,1151183,0,1,3,2019-12-15 16:32:00,,339,9,52562.0,0,False,"[3, 49, 62, 70]",,Algebra


## Clustering

Want to cluster over numerical AND categorical variables. 

* Use kmodes/kprototype [link](https://pypi.org/project/kmodes/)
* Use gower distance: [with hier clustering](http://webcache.googleusercontent.com/search?q=cache:https://towardsdatascience.com/clustering-on-numerical-and-categorical-features-6e0ebcf1cbad&strip=0&vwsrc=1&referer=medium-parser)
* 

In [7]:
# cat_cols = ["QuestionId", "UserId", "IsCorrect", "DateAnswered", "Confidence", "GroupId", "QuizId", "Gender", "PremiumPupil", "Age"]
# df_cluster = df_total[]

UserId
1689    827
2851    812
3145    810
2667    810
3014    807
       ... 
900      50
3124     50
139      50
4394     50
2764     50
Name: count, Length: 4918, dtype: int64

### Clustering using the KPrototypes Algorithm

In [33]:
cat_cols = ["QuestionId", "UserId", "GroupId", "QuizId", "Gender", "IsCorrect", "PremiumPupil", "Age"]
cluster_cols = cat_cols + ["DateAnswered", "Confidence"]

df_cluster = df_total[cluster_cols].sample(frac=0.1, random_state=42).fillna(0)
df_cluster.DateAnswered = df_cluster.DateAnswered.dt.strftime("%Y%m%d").astype(int)

kprot = KPrototypes(n_clusters=3)
df_cluster["Clusters"]  = kprot.fit_predict(df_cluster, categorical=list(range(len(cat_cols))))
kprot

KeyboardInterrupt: 

### Clustering using the Gower distance matrix

Gower Similarity is computed as the average of partial similarities (ps) across the m features of the observation, and can be used to calculate non-euclidian (aka more correct) distances between categorical features. 

In [32]:
cat_cols = ["QuestionId", "UserId", "GroupId", "QuizId", "Gender", "IsCorrect", "PremiumPupil", "Age"]
cluster_cols = cat_cols + ["DateAnswered", "Confidence"]

df_cluster = df_total[cluster_cols].sample(frac=0.1, random_state=42).fillna(0)
df_cluster.DateAnswered = df_cluster.DateAnswered.dt.strftime("%Y%m%d").astype(int)

# df_cluster.iloc[0]
# df_cluster["PremiumPupil"].value_counts()

PremiumPupil
False    118319
True      19954
Name: count, dtype: int64

In [22]:
cat_cols = ["QuestionId", "UserId", "GroupId", "QuizId", "Gender", "IsCorrect", "PremiumPupil", "Age"]
cluster_cols = cat_cols + ["DateAnswered", "Confidence"]

df_cluster = df_total[cluster_cols].sample(frac=0.1, random_state=42).fillna(0)
df_cluster.DateAnswered = df_cluster.DateAnswered.dt.strftime("%Y%m%d").astype(int)

gower_dists = gower.gower_matrix(df_cluster, cat_features = [True] * len(cat_cols) + [False] * (len(cluster_cols) - len(cat_cols)))

gower_dists

KeyboardInterrupt: 

In [None]:
cluster_linkages = linkage(gower_dists) 
clusters = fcluster(cluster_linkages, t=3, criterion='maxclust')

cluster

## PCA

Use FAMD to decompose categoricals into numerical values

* Using FAMD [link](https://towardsdatascience.com/famd-how-to-generalize-pca-to-categorical-and-numerical-data-2ddbeb2b9210)
* 

## Random Forest Regression

https://towardsdatascience.com/random-forest-regression-5f605132d19d