## 1. Package Loading

In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath('../data_prep/'))
import glob
import pickle
import pandas as pd
import log_prep as lp
import grades_prep as gp
import ml_modeling as mm
from sklearn.metrics import accuracy_score

## 2. Log data management process

### Load log data files

#### `log_data` structure

| Element | Content |
| :- | :- |
| `log_data[0]` | 0 |
| `log_data[1]` | dataframe for session 1 |
| `log_data[2]` | dataframe for session 2 |
| `log_data[3]` | dataframe for session 3 |
| `log_data[4]` | dataframe for session 4 |
| `log_data[5]` | dataframe for session 5 |
| `log_data[6]` | dataframe for session 6 |

In [2]:
# Load the per-session log data.
# The list is indexed by session number: element 0 is a placeholder (0) and
# elements 1-6 hold the dataframes for sessions 1-6 (see the table above).
log_data = lp.read_file()
log_data[2] # session 2

1th element in the sessions list represents Session1
2th element in the sessions list represents Session2
3th element in the sessions list represents Session3
4th element in the sessions list represents Session4
5th element in the sessions list represents Session5
6th element in the sessions list represents Session6


Unnamed: 0,session,student_id,exercise,activity,start_time,end_time,idle_time,mouse_wheel,mouse_wheel_click,mouse_click_left,mouse_click_right,mouse_movement,keystroke
0,2,61,Es,Other,16.10.2014 11:55:13,16.10.2014 11:55:20,1607,0,0,2,0,350,0
1,2,61,Es,Aulaweb,16.10.2014 11:55:21,16.10.2014 11:55:26,94,7,0,8,0,334,0
2,2,61,Es_2_1,TextEditor_Es_2_1,16.10.2014 11:55:27,16.10.2014 11:55:31,217,3,0,8,0,210,6
3,2,61,Es_2_1,Study_Es_2_1,16.10.2014 11:55:32,16.10.2014 11:55:33,16,0,0,0,0,0,9
4,2,61,Es_2_1,Aulaweb,16.10.2014 11:55:34,16.10.2014 11:55:34,0,0,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
440,2,22,Es_2_6,Deeds_Es_2_6,16.10.2014 14:5:31,16.10.2014 14:5:31,0,0,0,2,0,32,0
441,2,22,Es_2_6,Other,16.10.2014 14:5:32,16.10.2014 14:5:41,1262,0,0,6,4,259,0
442,2,22,Es_2_6,Other,16.10.2014 14:5:42,16.10.2014 14:5:46,718,0,0,0,0,12,30
443,2,22,Es_2_6,Other,16.10.2014 14:5:47,16.10.2014 14:5:47,0,0,0,0,0,15,2


### Manipulate log data

- Drop irrelevant columns and give simpler column names
- Rename columns
- Transform variables from two dimensions to one
- Replace NaN with 0

In [3]:
# Apply the manipulation steps to the loaded logs.
# The result keeps the session-number indexing of `log_data` (index 2 -> session 2);
# the displayed frame has one row per student ID and one column per <metric>_<activity> pair.
manipulated_log_data = lp.feature_manipulation(log_data)
manipulated_log_data[2] # dataframe for session 2

Unnamed: 0_level_0,DUR_Aulaweb,DUR_Blank,DUR_Deeds,DUR_Diagram,DUR_FSM,DUR_Other,DUR_Properties,DUR_Study,DUR_TextEditor,KS_Aulaweb,...,MW_TextEditor,MWC_Aulaweb,MWC_Blank,MWC_Deeds,MWC_Diagram,MWC_FSM,MWC_Other,MWC_Properties,MWC_Study,MWC_TextEditor
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1108.0,15296.0,33635659.0,8685148.0,0.0,39820.0,3493.0,100878318.0,118425808.0,0.0,...,1191.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,13063.0,27628.0,152742872.0,898633.0,0.0,419913.0,294227.0,14462337.0,347521629.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6891.0,1894304.0,64732143.0,779097.0,0.0,3677630.0,93.0,118419039.0,424103241.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,37073.0,538424.0,126805821.0,255194.0,93.0,3453213.0,1170.0,1972457.0,196191366.0,29.0,...,2022.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0
5,8813.0,11555.0,79309264.0,347358.0,0.0,49656.0,9502.0,48292987.0,275261872.0,0.0,...,1280.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,348051.0,38214.0,30974648.0,9805.0,0.0,2024124.0,33.0,11602697.0,199308768.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
102,9099.0,32547.0,48035277.0,42348.0,0.0,231288.0,0.0,39830462.0,169579497.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
103,41778.0,11014.0,83156320.0,3145933.0,0.0,239583.0,32268.0,161358754.0,535482141.0,0.0,...,1286.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104,292310.0,77768.0,161245059.0,1376193.0,0.0,241945.0,3774.0,229629087.0,202163111.0,0.0,...,724.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Standardize log data

- Apply Standard Scaler

#### `std_manipulated_log_data` structure after standardization

| Element | Content |
| :- | :- |
| `std_manipulated_log_data[0]` | dataframe for session 1 |
| `std_manipulated_log_data[1]` | dataframe for session 2 |
| `std_manipulated_log_data[2]` | dataframe for session 3 |
| `std_manipulated_log_data[3]` | dataframe for session 4 |
| `std_manipulated_log_data[4]` | dataframe for session 5 |
| `std_manipulated_log_data[5]` | dataframe for session 6 |

In [4]:
# Standardize the manipulated log features.
# NOTE: indexing changes here. The standardized list is 0-based, so session k
# is stored at index k - 1 (unlike `log_data` and `manipulated_log_data`).
std_manipulated_log_data = lp.feature_standardization(manipulated_log_data)
std_manipulated_log_data[1] # Session 2

Unnamed: 0_level_0,DUR_Aulaweb,DUR_Blank,DUR_Deeds,DUR_Diagram,DUR_FSM,DUR_Other,DUR_Properties,DUR_Study,DUR_TextEditor,KS_Aulaweb,...,MW_TextEditor,MWC_Aulaweb,MWC_Blank,MWC_Deeds,MWC_Diagram,MWC_FSM,MWC_Other,MWC_Properties,MWC_Study,MWC_TextEditor
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.180077,-0.223501,-0.731890,1.869931,-0.157494,-0.321309,-0.187415,-0.021704,-0.748815,-0.225401,...,0.078659,-0.227076,0.0,-0.180038,0.0,0.0,-0.111111,0.0,-0.283103,-0.250139
2,-0.179064,-0.222049,0.070617,-0.176423,-0.157494,-0.286618,7.977765,-0.568710,0.277439,-0.225401,...,-0.681333,-0.227076,0.0,-0.180038,0.0,0.0,-0.111111,0.0,-0.283103,-0.250139
3,-0.179587,-0.002275,-0.522372,-0.207838,-0.157494,0.010717,-0.282903,0.089327,0.620493,-0.225401,...,-0.681333,-0.227076,0.0,-0.180038,0.0,0.0,-0.111111,0.0,-0.283103,-0.250139
4,-0.177029,-0.161910,-0.104139,-0.345523,6.866187,-0.009766,-0.252656,-0.647769,-0.400458,0.439615,...,0.608930,-0.227076,0.0,1.162062,0.0,0.0,-0.111111,0.0,-0.283103,-0.250139
5,-0.179424,-0.223941,-0.424156,-0.321302,-0.157494,-0.320411,-0.018654,-0.354565,-0.046255,-0.225401,...,0.135451,-0.227076,0.0,-0.180038,0.0,0.0,-0.111111,0.0,-0.283103,-0.250139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101,-0.150673,-0.220802,-0.749819,-0.410013,-0.157494,-0.140200,-0.284588,-0.586811,-0.386493,-0.225401,...,-0.681333,-0.227076,0.0,-0.180038,0.0,0.0,-0.111111,0.0,-0.283103,-0.250139
102,-0.179400,-0.221470,-0.634870,-0.401461,-0.157494,-0.303834,-0.285515,-0.408132,-0.519668,-0.225401,...,-0.681333,-0.227076,0.0,-0.180038,0.0,0.0,-0.111111,0.0,-0.283103,-0.250139
103,-0.176630,-0.224005,-0.398235,0.414184,-0.157494,-0.303077,0.620722,0.361131,1.119425,-0.225401,...,0.139280,-0.227076,0.0,-0.180038,0.0,0.0,-0.111111,0.0,-0.283103,-0.250139
104,-0.155397,-0.216145,0.127902,-0.050916,-0.157494,-0.302861,-0.179523,0.793276,-0.373707,-0.225401,...,-0.219339,-0.227076,0.0,-0.180038,0.0,0.0,-0.111111,0.0,-0.283103,-0.250139


### Check the data structure

In [5]:
# The original data include six sessions, so the standardized list should have six elements:
# std_manipulated_log_data[0] is session 1's log data
# std_manipulated_log_data[5] is session 6's log data
len(std_manipulated_log_data) 

6

### Save the manipulated log data

In [6]:
# Optional: to save the manipulated log data as csv files,
# uncomment the save_data call below.
# lp.save_data(manipulated_log_data) 

## 3. Intermediate and final grade data management process

### Load grades

#### `grades` structure

| Element | Content |
| :- | :- |
| `grades[0]` | intermediate grades |
| `grades[1]` | 1st attempt final grades |
| `grades[2]` | 2nd attempt final grades |


In [7]:
# Load the grade tables:
#   grades[0] = intermediate grades
#   grades[1] = 1st attempt final grades
#   grades[2] = 2nd attempt final grades
grades = gp.read_grades()
grades[1].head() # 1st attempt final grades

  warn(msg)


Unnamed: 0,Student ID,ES 1.1 \n(2 points),ES 1.2 \n(3 points),ES 2.1\n(2 points),ES 2.2\n(3 points),ES 3.1\n(1 points),ES 3.2\n(2 points),ES 3.3\n(2 points),ES 3.4\n(2 points),ES 3.5\n(3 points),ES 4.1\n(15 points),ES 4.2\n(10 points),ES 5.1\n(2 points),ES 5.2\n(10 points),ES 5.3\n(3 points),ES 6.1\n(25 points),ES 6.2\n(15 points),TOTAL\n(100 points)
0,3,2.0,3,1.0,2.0,1,2,2,2.0,3.0,15.0,10.0,1,5.0,3.0,18.0,15,85.0
1,6,2.0,3,2.0,3.0,1,2,2,0.0,3.0,15.0,7.0,2,9.0,3.0,13.0,15,82.0
2,7,2.0,3,1.0,1.5,1,2,0,0.0,3.0,5.0,4.0,0,0.0,3.0,17.0,10,52.5
3,10,2.0,3,2.0,1.5,1,2,0,2.0,3.0,11.0,1.0,2,10.0,1.5,7.0,10,59.0
4,13,2.0,3,2.0,1.5,1,2,2,2.0,3.0,14.5,10.0,2,2.0,3.0,25.0,15,90.0


### Merge 1st and 2nd final grades into one

- prioritize 1st attempt grades over the 2nd
- rebase to 100

#### `final_grades` structure

| Column | Content |
| :- | :- |
| `ID` | student ID |
| `FIN1` | grades for session 1 related questions |
| `FIN2` | grades for session 2 related questions |
| `FIN3` | grades for session 3 related questions |
| `FIN4` | grades for session 4 related questions |
| `FIN5` | grades for session 5 related questions |
| `FIN6` | grades for session 6 related questions |
| `final_score` | Total final score |

In [8]:
# Merge the 1st and 2nd attempt final grades into one frame
# (1st attempt prioritized, scores rebased to 100; columns ID, FIN1-FIN6, final_score).
# NOTE(review): the displayed index contains repeated labels (e.g. 0 appears twice),
# so the index is presumably not reset after combining the two attempts. Confirm that
# downstream code joins on `ID` rather than on the index.
final_grades = gp.final_manipulation(grades[1], grades[2])
final_grades 

Unnamed: 0,ID,FIN1,FIN2,FIN3,FIN4,FIN5,FIN6,final_score
0,1,100.0,30.0,100.0,100.0,100.000000,95.00,87.500000
1,2,100.0,50.0,80.0,68.0,43.333333,12.50,58.972222
0,3,100.0,60.0,100.0,100.0,60.000000,82.50,83.750000
2,4,100.0,30.0,50.0,28.0,10.000000,25.00,40.500000
3,5,100.0,70.0,100.0,20.0,80.000000,7.50,62.916667
...,...,...,...,...,...,...,...,...
50,101,100.0,30.0,60.0,24.0,0.000000,0.00,35.666667
58,102,100.0,50.0,60.0,60.0,0.000000,7.50,46.250000
59,103,100.0,50.0,80.0,12.0,0.000000,0.00,40.333333
60,104,100.0,50.0,100.0,96.0,86.666667,93.75,87.736111


### Rebase the intermediate grades (random scores) to 100

#### `intermediate_grades` structure

| Column | Content |
| :- | :- |
| `ID` | student ID |
| `MID2` | intermediate grades for session 2 |
| `MID3` | intermediate grades for session 3 |
| `MID4` | intermediate grades for session 4 |
| `MID5` | intermediate grades for session 5 |
| `MID6` | intermediate grades for session 6 |

> No intermediate grades for session 1 are provided.

In [9]:
# Rebase the intermediate grades (grades[0]) to 100.
# The result has columns ID and MID2-MID6; there is no MID1 because
# no intermediate grades are provided for session 1.
intermediate_grades = gp.rebase_mid(grades[0])
intermediate_grades

Unnamed: 0,ID,MID2,MID3,MID4,MID5,MID6
0,1,83.33,0.0,90.0,100.0,56.25
1,2,66.67,87.5,90.0,100.0,25.00
2,3,58.33,87.5,90.0,100.0,0.00
3,4,100.00,100.0,100.0,87.5,68.75
4,5,83.33,100.0,100.0,100.0,68.75
...,...,...,...,...,...,...
110,111,50.00,0.0,0.0,75.0,0.00
111,112,0.00,0.0,0.0,0.0,0.00
112,113,0.00,0.0,0.0,0.0,0.00
113,114,0.00,0.0,0.0,0.0,0.00


### Combine the intermediate grades with final grades

In [10]:
# Combine intermediate (MID#) and final (FIN#, final_score) grades into one frame per student ID.
# NOTE(review): the merged frame is shorter than `intermediate_grades` (the displayed index
# ends at 91 vs. 113), so students without final grades appear to be dropped. Confirm the join type.
whole_grades = gp.merge_mid_final(intermediate_grades, final_grades)
whole_grades

Unnamed: 0,ID,MID2,MID3,MID4,MID5,MID6,FIN1,FIN2,FIN3,FIN4,FIN5,FIN6,final_score
0,1,83.33,0.0,90.0,100.0,56.25,100.0,30.0,100.0,100.0,100.000000,95.00,87.500000
1,2,66.67,87.5,90.0,100.0,25.00,100.0,50.0,80.0,68.0,43.333333,12.50,58.972222
2,3,58.33,87.5,90.0,100.0,0.00,100.0,60.0,100.0,100.0,60.000000,82.50,83.750000
3,4,100.00,100.0,100.0,87.5,68.75,100.0,30.0,50.0,28.0,10.000000,25.00,40.500000
4,5,83.33,100.0,100.0,100.0,68.75,100.0,70.0,100.0,20.0,80.000000,7.50,62.916667
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,101,75.00,37.5,0.0,87.5,0.00,100.0,30.0,60.0,24.0,0.000000,0.00,35.666667
89,102,75.00,100.0,80.0,87.5,50.00,100.0,50.0,60.0,60.0,0.000000,7.50,46.250000
90,103,33.33,0.0,0.0,87.5,0.00,100.0,50.0,80.0,12.0,0.000000,0.00,40.333333
91,104,100.00,0.0,90.0,100.0,68.75,100.0,50.0,100.0,96.0,86.666667,93.75,87.736111


### Standardize the whole scores

- Apply Standard Scaler

In [11]:
# Standardize the combined grades.
# Per the displayed output, the result keeps MID2-MID6, FIN2-FIN6 and ID;
# FIN1 and final_score are no longer present.
std_whole_grades = gp.standardize_grades(whole_grades)
std_whole_grades

Unnamed: 0,MID2,MID3,MID4,MID5,MID6,FIN2,FIN3,FIN4,FIN5,FIN6,ID
0,0.873680,-1.739299,0.139648,0.503558,0.221767,-0.881542,0.730296,1.243222,1.569829,1.448182,1
1,0.359568,0.706238,0.139648,0.503558,-0.936902,-0.053427,-0.134895,0.278217,0.011825,-0.824042,2
2,0.102203,0.706238,0.139648,0.503558,-1.863836,0.360631,0.730296,1.243222,0.470062,1.103906,3
3,1.388100,1.055601,0.639157,0.030519,0.685234,-0.881542,-1.432682,-0.928039,-0.904647,-0.479766,4
4,0.873680,1.055601,0.639157,0.503558,0.685234,0.774689,0.730296,-1.169290,1.019946,-0.961753,5
...,...,...,...,...,...,...,...,...,...,...,...
88,0.616624,-0.691212,-4.355936,0.030519,-1.863836,-0.881542,-1.000087,-1.048665,-1.179589,-1.168319,101
89,0.616624,1.055601,-0.359862,0.030519,-0.009967,-0.053427,-1.000087,0.036966,-1.179589,-0.961753,102
90,-0.669273,-1.739299,-4.355936,0.030519,-1.863836,-0.053427,-0.134895,-1.410541,-1.179589,-1.168319,103
91,1.388100,-1.739299,0.139648,0.503558,0.685234,-0.053427,0.730296,1.122596,1.203240,1.413755,104


### Detect grades change between intermediate and final

#### `grades_change` structure

| Newly Added Column | Content |
| :- | :- |
| `RES2` | 0 or 1 |
| `RES3` | 0 or 1 |
| `RES4` | 0 or 1 |
| `RES5` | 0 or 1 |
| `RES6` | 0 or 1 |

> When students' final scores are higher than their intermediate ones, 'RES#' is 0
> When students' final scores are lower than their intermediate ones, 'RES#' is 1

In [12]:
# Add one RES# column per session (2-6), comparing the standardized scores:
# When students' final scores are higher than their intermediate scores, 'RES#' is 0
# When students' final scores are lower than their intermediate scores, 'RES#' is 1
grades_change = gp.get_result(std_whole_grades)
grades_change

Unnamed: 0,MID2,MID3,MID4,MID5,MID6,FIN2,FIN3,FIN4,FIN5,FIN6,ID,RES2,RES3,RES4,RES5,RES6
0,0.873680,-1.739299,0.139648,0.503558,0.221767,-0.881542,0.730296,1.243222,1.569829,1.448182,1,1,0,0,0,0
1,0.359568,0.706238,0.139648,0.503558,-0.936902,-0.053427,-0.134895,0.278217,0.011825,-0.824042,2,1,1,1,1,0
2,0.102203,0.706238,0.139648,0.503558,-1.863836,0.360631,0.730296,1.243222,0.470062,1.103906,3,0,1,0,1,0
3,1.388100,1.055601,0.639157,0.030519,0.685234,-0.881542,-1.432682,-0.928039,-0.904647,-0.479766,4,1,1,1,1,1
4,0.873680,1.055601,0.639157,0.503558,0.685234,0.774689,0.730296,-1.169290,1.019946,-0.961753,5,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88,0.616624,-0.691212,-4.355936,0.030519,-1.863836,-0.881542,-1.000087,-1.048665,-1.179589,-1.168319,101,1,1,0,1,0
89,0.616624,1.055601,-0.359862,0.030519,-0.009967,-0.053427,-1.000087,0.036966,-1.179589,-0.961753,102,1,1,0,1,1
90,-0.669273,-1.739299,-4.355936,0.030519,-1.863836,-0.053427,-0.134895,-1.410541,-1.179589,-1.168319,103,0,0,0,1,0
91,1.388100,-1.739299,0.139648,0.503558,0.685234,-0.053427,0.730296,1.122596,1.203240,1.413755,104,1,0,0,0,0


### Save the manipulated grade data

In [13]:
# Optional: to save the grade data as csv files,
# uncomment the save_grades call below.
# NOTE(review): `std_whole_grades_with_stu_behavior` is not defined anywhere in this notebook;
# presumably `grades_change` (the frame produced by gp.get_result) is intended. Confirm before enabling.
# gp.save_grades(std_whole_grades_with_stu_behavior)

## 4. Merging log activities and grades into one data file

### Leave relevant columns only from intermediate and final grade data

In [14]:
# Keep only the student ID, the grade-change labels and the standardized
# intermediate grades, in that column order.
id_and_label_columns = ['ID', 'RES2', 'RES3', 'RES4', 'RES5', 'RES6']
intermediate_columns = ['MID2', 'MID3', 'MID4', 'MID5', 'MID6']
outcome = grades_change[id_and_label_columns + intermediate_columns]
outcome

Unnamed: 0,ID,RES2,RES3,RES4,RES5,RES6,MID2,MID3,MID4,MID5,MID6
0,1,1,0,0,0,0,0.873680,-1.739299,0.139648,0.503558,0.221767
1,2,1,1,1,1,0,0.359568,0.706238,0.139648,0.503558,-0.936902
2,3,0,1,0,1,0,0.102203,0.706238,0.139648,0.503558,-1.863836
3,4,1,1,1,1,1,1.388100,1.055601,0.639157,0.030519,0.685234
4,5,1,1,1,0,1,0.873680,1.055601,0.639157,0.503558,0.685234
...,...,...,...,...,...,...,...,...,...,...,...
88,101,1,1,0,1,0,0.616624,-0.691212,-4.355936,0.030519,-1.863836
89,102,1,1,0,1,1,0.616624,1.055601,-0.359862,0.030519,-0.009967
90,103,0,0,0,1,0,-0.669273,-1.739299,-4.355936,0.030519,-1.863836
91,104,1,0,0,0,0,1.388100,-1.739299,0.139648,0.503558,0.685234


### Merge log data with grade data

#### `whole_data` structure

| Element | Content |
| :- | :- |
| `whole_data[0]` | student behavior features |
| `whole_data[1]` | student behavior features, intermediate grades for session 2, and grade change in final exam |
| `whole_data[2]` | student behavior features, intermediate grades for session 3, and grade change in final exam |
| `whole_data[3]` | student behavior features, intermediate grades for session 4, and grade change in final exam |
| `whole_data[4]` | student behavior features, intermediate grades for session 5, and grade change in final exam |
| `whole_data[5]` | student behavior features, intermediate grades for session 6, and grade change in final exam |

> Note that no intermediate grades for session 1 are provided.

In [15]:
# Merge each session's standardized log features with the grade outcome.
# `whole_data` is 0-based like `std_manipulated_log_data` (index 5 -> session 6).
# In the displayed frame the session's MID# column is appended and the grade-change label appears as `Y`.
whole_data = lp.merge_all_data(std_manipulated_log_data, outcome)
whole_data[5].head() # represents Session 6

Unnamed: 0,ID,DUR_Aulaweb,DUR_Blank,DUR_Deeds,DUR_Diagram,DUR_FSM,DUR_Other,DUR_Properties,DUR_Study,DUR_TextEditor,...,MWC_Blank,MWC_Deeds,MWC_Diagram,MWC_FSM,MWC_Other,MWC_Properties,MWC_Study,MWC_TextEditor,MID6,Y
0,1,-0.206026,-0.187364,0.402963,-0.035197,-0.544723,-0.259192,0.164625,-0.653497,-0.1646,...,-0.109764,0.0,0.0,-0.199007,-0.156174,-0.109764,-0.170951,-0.19245,0.221767,0
1,2,0.098791,-0.210084,-0.358647,-0.350479,-0.520192,-0.206216,-0.607414,0.730816,-0.235257,...,-0.109764,0.0,0.0,-0.199007,-0.156174,-0.109764,-0.170951,-0.19245,-0.936902,0
2,4,-0.206706,0.134054,-0.398028,-0.171902,-0.359848,-0.259971,-0.388247,-0.735513,-0.222955,...,-0.109764,0.0,0.0,1.890571,-0.156174,-0.109764,-0.170951,-0.19245,0.685234,1
3,5,-0.207138,-0.210165,-0.374556,1.165346,-0.371774,-0.252044,-0.539273,-0.747055,-0.139567,...,-0.109764,0.0,0.0,-0.199007,-0.156174,-0.109764,-0.170951,-0.19245,0.685234,1
4,6,-0.20616,-0.129651,-0.291206,-0.346947,-0.606445,0.714986,0.214414,-0.134238,-0.200661,...,-0.109764,0.0,0.0,-0.199007,-0.156174,-0.109764,-0.170951,-0.19245,0.916968,1


### Save the combined data in a pickle file

In [16]:
# Optional: uncomment the two lines below to persist the merged
# per-session data (`whole_data`) to 'whole_data.pkl'.
# with open('whole_data.pkl', 'wb') as f:
#     pickle.dump(whole_data, f)

## 5. Sort out important features and Predict performance in final

### Subset common important features across all sessions

In [25]:
# Subset common important features across all sessions 
# by using an ensemble learning method fundamentally based on decision trees
# NOTE(review): if `whole_data` is a plain list, `.copy()` is shallow: the new list still
# references the same dataframes, so in-place changes made downstream would also affect `whole_data`.
dt1 = whole_data.copy()
# Arguments: 3 is presumably the number of features to keep and "common" the selection mode
# (the session 2 output shows KS_Deeds, DUR_Other and MID2 plus ID). Verify against ml_modeling.
data_common_important_features = mm.subset_important_features(dt1, 3, "common")
data_common_important_features[1] #Session 2

Unnamed: 0,KS_Deeds,DUR_Other,ID,MID2
0,-0.359030,-0.321309,1,0.873680
1,-0.163113,-0.286618,2,0.359568
2,-0.268846,0.010717,3,0.102203
3,-0.038721,-0.009766,4,1.388100
4,3.083515,-0.320411,5,0.873680
...,...,...,...,...
70,0.147867,-0.148921,100,0.102203
71,-0.853488,-0.140200,101,0.616624
72,-0.415006,-0.303834,102,0.616624
73,0.290918,-0.303077,103,-0.669273


### Subset important features from each session

In [18]:
# Subset important features from each session
# by using an ensemble learning method fundamentally based on decision trees
# NOTE(review): if `whole_data` is a plain list, `.copy()` is shallow: the dataframes
# inside `dt4` are the same objects as those in `whole_data`.
dt4 = whole_data.copy()
# 4 is presumably the number of features kept per session and "different" selects them
# independently for each session. Verify against ml_modeling.
f4 = mm.subset_important_features(dt4, 4, "different")

In [19]:
# Inspect the selected features for session 6 (0-based index 5);
# the displayed frame holds four feature columns plus the label `Y` and `ID`.
f4[5] #Session 6

Unnamed: 0,MID6,KS_TextEditor,DUR_Properties,MW_Other,Y,ID
0,0.221767,0.358436,0.164625,-0.503428,0,1
1,-0.936902,0.239186,-0.607414,1.605516,0,2
2,0.685234,0.618799,-0.388247,-0.430706,1,4
3,0.685234,0.298811,-0.539273,0.514683,1,5
4,0.916968,2.705675,0.214414,1.544914,1,6
...,...,...,...,...,...,...
74,0.221767,-1.096414,2.095253,-0.503428,0,96
75,0.916968,-0.200051,0.658939,-0.164058,1,98
76,-0.473434,-0.567739,-0.432498,-0.479187,1,99
77,-0.009967,-1.136164,0.768677,-0.188299,1,102


### Create pickles for features & outcome and trained models

In [20]:
# For 3, 4 and 5 selected features per session: save each session's
# features-and-outcome frame, then fit every model type on those frames.
for feat_num in range(3, 6):
    # Per-session frames restricted to the `feat_num` most important features.
    df_diff_features_outcome = mm.subset_important_features(dt4, feat_num, "different")

    # Create pickles for features and outcome.
    # List index i holds session i + 1; index 0 (session 1) is skipped because
    # no intermediate grades are provided for session 1.
    for i in range(1, 6):
        filename = '../data_prep/pickles/features_and_outcome/session_' + str(i+1) + '_featnum_' + str(feat_num)
        # Use a context manager so the file handle is always closed
        # (the previous bare open() inside pickle.dump() leaked it).
        with open(filename, 'wb') as pickle_file:
            pickle.dump(df_diff_features_outcome[i], pickle_file)

    # Create pickles for trained model.
    # machine_learning_model receives the whole per-session list and does not depend
    # on the session index, so it runs once per model type here instead of being
    # repeated inside the session loop. Running it after the loop also guarantees that
    # every features pickle is written before any model touches the frames.
    for model in ['KNN', 'DT', 'RF', 'NB', 'LR', 'SVC']:
        mm.machine_learning_model(df_diff_features_outcome, model)
        

### Predict final exam performance compared to intermediate one

In [26]:
# Predict final exam performance by fitting important log data and intermediate score into logistic regression
student_behavior_predictions = mm.machine_learning_model(f4, 'LR')
# Index 2 is session 3 (0-based list); the displayed frame carries MID3, the selected features, Y, ID and Predicted_Y.
student_behavior_predictions[2] # When Predicted_Y = 1, a student is predicted to perform poorly in the final exam

Unnamed: 0,MID3,MCL_Diagram,MCR_Other,MCL_Properties,Y,ID,Predicted_Y
0,0.706238,-0.610939,-0.514530,0.052543,1,2,1
1,0.706238,1.295953,-0.744089,-0.395617,1,3,1
2,1.055601,1.713821,-0.284970,-1.023042,1,4,1
3,1.055601,0.419018,1.092386,-0.305985,1,5,1
4,0.706238,0.625010,-0.744089,-0.395617,1,6,1
...,...,...,...,...,...,...,...
72,0.007513,-0.393176,1.092386,-0.216353,1,99,0
73,-0.341849,-0.758075,1.092386,-0.574881,0,100,0
74,-0.691212,-1.005265,-0.744089,-0.754146,1,101,0
75,1.055601,1.331266,-0.744089,-0.843778,1,102,1


### Model prediction accuracy

In [27]:
# Print the prediction accuracy for sessions 2-6 (list indices 1-5), one value per line.
for session_index in range(1, 6):
    session_predictions = student_behavior_predictions[session_index]
    print(accuracy_score(session_predictions['Y'], session_predictions['Predicted_Y']))

0.7466666666666667
0.8831168831168831
0.7931034482758621
0.7380952380952381
0.7215189873417721
