In [25]:
import sys
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Resolve the repository root as the parent of the current working directory,
# so that the data folder can be located without hardcoding an absolute path.
repo_dir = Path().resolve().parent

# Make modules under the repository root importable. The membership check keeps
# the cell idempotent: re-running it does not stack duplicate sys.path entries.
if str(repo_dir) not in sys.path:
    sys.path.append(str(repo_dir))

data_dir = repo_dir / "data"

In [9]:
zip_path = data_dir / "predict-student-performance-from-game-play.zip"

if not zip_path.exists():
    !kaggle competitions download -c predict-student-performance-from-game-play -p {data_dir}
    !unzip -o {zip_path} -d {data_dir}

In [19]:
# Read only the first 10,000 rows of the training log to keep exploration
# fast; the test set and the label table are loaded without a row limit.
train_df_sample = pd.read_csv(data_dir.joinpath("train.csv"), nrows=10000)
test_df = pd.read_csv(data_dir.joinpath("test.csv"))
train_labels_df = pd.read_csv(data_dir.joinpath("train_labels.csv"))

In [20]:
# Summarise the sampled training frame: column dtypes, non-null counts and memory usage.
train_df_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   session_id      10000 non-null  int64  
 1   index           10000 non-null  int64  
 2   elapsed_time    10000 non-null  int64  
 3   event_name      10000 non-null  object 
 4   name            10000 non-null  object 
 5   level           10000 non-null  int64  
 6   page            268 non-null    float64
 7   room_coor_x     8830 non-null   float64
 8   room_coor_y     8830 non-null   float64
 9   screen_coor_x   8830 non-null   float64
 10  screen_coor_y   8830 non-null   float64
 11  hover_duration  1144 non-null   float64
 12  text            3383 non-null   object 
 13  fqid            6816 non-null   object 
 14  room_fqid       10000 non-null  object 
 15  text_fqid       3383 non-null   object 
 16  fullscreen      0 non-null      float64
 17  hq              0 non-null      

In [21]:
# Summarise the label table: column dtypes, non-null counts and memory usage.
train_labels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212022 entries, 0 to 212021
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   session_id  212022 non-null  object
 1   correct     212022 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 3.2+ MB


## Data Exploration

#### Files

- train.csv - the training set
- test.csv - the test set
- sample_submission.csv - a sample submission file in the correct format
- train_labels.csv - correct value for all 18 questions for each session in the training set

#### Columns

- session_id - the ID of the session the event took place in
- index - the index of the event for the session
- elapsed_time - how much time has passed (in milliseconds) between the start of the session and when the event was recorded
- event_name - the name of the event type
- name - the event name (e.g. identifies whether a notebook_click is opening or closing the notebook)
- level - what level of the game the event occurred in (0 to 22)
- page - the page number of the event (only for notebook-related events)
- room_coor_x - the x-coordinate of the click in reference to the in-game room (only for click events)
- room_coor_y - the y-coordinate of the click in reference to the in-game room (only for click events)
- screen_coor_x - the x-coordinate of the click in reference to the player’s screen (only for click events)
- screen_coor_y - the y-coordinate of the click in reference to the player’s screen (only for click events)
- hover_duration - how long (in milliseconds) the hover happened for (only for hover events)
- text - the text the player sees during this event
- fqid - the fully qualified ID of the event
- room_fqid - the fully qualified ID of the room the event took place in
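- text_fqid - the fully qualified ID of the text shown during the event (the source description is cut off here; confirm against the competition data page)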
- fullscreen - whether the player is in fullscreen mode
- hq - whether the game is in high-quality
- music - whether the game music is on or off
- level_group - which group of levels - and group of questions - this row belongs to (0-4, 5-12, 13-22)

[EDA example](https://www.kaggle.com/code/nguyenthicamlai/eda-ml-on-game-play-ongoing)