# Universal Automated Tutor

## 0: Data Import + Cleaning

In [2]:
import pandas as pd
import numpy as np 
import random

In [21]:
# Read in the CSVs, select variables, combine
raw_data04 = pd.read_csv("2004-WPI-Assistments-Math.csv", low_memory=False)
raw_data056 = pd.read_csv("2005-06-WPI-Assistments-Math.csv", low_memory=False)
raw_data = pd.concat([raw_data056, raw_data04])

# Selecting columns and resetting the index in one chain yields a fresh frame,
# so the column assignments below do not write into a slice of raw_data.
data = raw_data[["stud_id", "duration", "student_response_type", "problem_id", "step", "attempt_num",
                 "last_attempt", "outcome", "input", "feedback"]].reset_index(drop=True)

# Create dummy variables for hint, correct, incorrect
data["hint"] = np.where(data["outcome"] == "HINT", 1, 0)
data["correct"] = np.where(data["outcome"] == "CORRECT", 1, 0)
data["incorrect"] = np.where(data["outcome"] == "INCORRECT", 1, 0)

# Filter + clean data
# Rows whose duration is the literal '.' cannot be cast to float, so drop them first.
# .copy() makes the filtered result an independent frame before it is modified.
# The index is deliberately NOT reset here, so row labels stay as they were after the concat.
data = data[data["duration"] != '.'].copy()

# Cast "duration" itself. (Assigning through `data.loc[:, 1]` created a new column
# labelled 1 and left "duration" as strings, which breaks the sums/means taken later.)
data["duration"] = data["duration"].astype(np.float64)

## 1: Distribution Creation

In [4]:
def chi2lower(x):
    '''
    Lower bound used for a problem's distribution:
    the mean of x minus one quarter of its standard deviation.
    '''
    centre = np.mean(x)
    spread = np.std(x)
    return centre - spread / 4


def chi2upper(x):
    '''
    Upper bound used for a problem's distribution:
    the mean of x plus one half of its standard deviation.
    '''
    centre = np.mean(x)
    spread = np.std(x)
    return centre + spread / 2

In [5]:
# Total duration, hints and incorrect attempts for every (problem, student) pair
df = (
    data[["problem_id", "duration", "hint", "stud_id", "incorrect"]]
    .groupby(["problem_id", "stud_id"])
    .sum()
)

# Per-problem lower / upper bounds; the suffix turns e.g. "duration" into "durationLower"
problemDistsLower = df.groupby(["problem_id"]).agg(chi2lower).add_suffix("Lower")
problemDistsUpper = df.groupby(["problem_id"]).agg(chi2upper).add_suffix("Upper")

# One row per problem holding all Lower and Upper columns side by side
problemDists = pd.concat([problemDistsLower, problemDistsUpper], axis=1)

# Create given student EVs: each student's average of their per-problem totals
studentEV = df.groupby(["stud_id"]).mean()

In [24]:
# Inspect the per-problem upper bounds produced by chi2upper (mean + std/2 per column)
problemDistsUpper

Unnamed: 0_level_0,durationUpper,hintUpper,incorrectUpper
problem_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,186.000000,4.000000,2.000000
7,472.600952,7.850396,5.296579
12,253.682629,5.835896,3.399405
21,93.110577,0.786937,1.557401
22,308.807849,3.442042,3.296593
...,...,...,...
13208,121.630857,1.551520,0.666667
13216,164.397613,1.891272,1.411043
13219,99.590531,0.954099,1.474505
13224,278.649221,5.307131,2.530776


## 2: Student Class

In [9]:
class Student:
    '''
    Holds one student's attempt history and selects their next problem.

    Instance fields:
      data      - DataFrame of the attempts recorded for the student so far
      means     - column means of the numeric columns of `data`
                  (the placeholder 0 until they are first computed)
      problem   - text of the most recently chosen question ("" until nextQ runs)
      duration, incorrect, hint - the student's current mean of each of those
                  columns; set whenever the means are recomputed
    '''

    def __init__(self, data):
        '''
        data: DataFrame of the student's initial attempts. It must contain
        numeric "duration", "hint" and "incorrect" columns.
        '''
        self.data = data
        self.means = 0
        self.problem = ""

    def _refreshMeans(self):
        # Recompute the running averages from everything stored in self.data.
        # numeric_only=True skips text columns such as "step" and "stud_id";
        # without it, recent pandas versions raise instead of dropping them.
        self.means = self.data.mean(numeric_only=True)
        self.duration = self.means.duration
        self.incorrect = self.means.incorrect
        self.hint = self.means.hint

    def updateStudent(self, lastQ):
        '''
        Appends the rows in lastQ (the student's latest attempts) to the
        history and recomputes the running means.

        lastQ: DataFrame with the same columns as self.data. A single row
        passed as a Series is also accepted.
        '''
        if isinstance(lastQ, pd.Series):
            # Turn a single row into a one-row frame and restore numeric dtypes,
            # which the transpose would otherwise leave as object.
            lastQ = lastQ.to_frame().T.infer_objects()
        # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
        self.data = pd.concat([self.data, lastQ])
        self._refreshMeans()

    def nextQ(self):
        '''
        Determines a viable next question and stores its text in self.problem.

        Reads the notebook-level frames `problemDists` (per-problem bounds) and
        `data` (the cleaned attempt log, used to look up the question text).

        Raises IndexError when no problem satisfies the selection criteria.
        '''
        # Allow nextQ to be called before any updateStudent call: the means
        # are still the integer placeholder from __init__ at that point.
        if not isinstance(self.means, pd.Series):
            self._refreshMeans()

        EVdur = self.means.duration
        EVinc = self.means.incorrect
        EVhint = self.means.hint

        # NOTE(review): the third condition originally referenced the undefined
        # name `EVincproblemDists`. The stray prefix is dropped here, keeping the
        # comparison that was still visible; confirm this is the intended rule
        # (as opposed to EVinc < problemDists["incorrectUpper"]).
        viableProbs = problemDists[
            (EVdur < problemDists["durationUpper"])
            & (EVhint < problemDists["hintUpper"])
            & (problemDists["incorrectLower"] > EVinc)
        ]

        candidates = viableProbs.index.tolist()
        if not candidates:
            # random.choice would raise IndexError here anyway; keep the
            # exception type but say what actually went wrong.
            raise IndexError("no viable problems for this student's current averages")
        problemNum = random.choice(candidates)

        # Look the question text up in the notebook-level `data` frame (not the
        # constructor argument) by column name. The global `df` is rebound to
        # different frames in different cells, so it is not a safe source.
        stepText = data.loc[data["problem_id"] == problemNum, "step"].iloc[0]
        # Step text looks like "Step0:<id>:<question>": keep what follows the
        # last colon, cut at the first "?", then restore the "?".
        self.problem = stepText.split(':')[-1].split("?")[0] + "?"

## 3: Example Run Through

In [10]:
# Pseudo code for the real flow:
#   studentData = pretest data, with the columns
#                 "step", "problem_id", "duration", "hint", "incorrect", "correct"
#   student = Student(studentData)

# Example: use one student's logged attempts as a stand-in for pretest data
stud = "Stu_fe96fe63d83aa63c4ec667167fc7f1ce"
df = data.loc[:, ["step", "problem_id", "stud_id", "duration", "hint", "incorrect", "correct"]]
newdf = df.loc[df["stud_id"] == stud]

# The student's first ten logged rows act as the pretest
studentData = newdf.iloc[:10]

studentData

Unnamed: 0,step,problem_id,stud_id,duration,hint,incorrect,correct
1266120,Step0:3761:Harry measured all but one angle of...,3761,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,90.0,1,0,0
1266121,Step1:3762:How many angles are there in a pent...,3761,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,37.0,0,1,0
1266122,Step1:3762:How many angles are there in a pent...,3761,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,4.0,0,0,1
1266123,Step0:3761:Harry measured all but one angle of...,3761,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,246.0,1,0,0
1266124,Step1:3762:How many angles are there in a pent...,3761,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,6.0,0,0,1
1266125,Step3:3764:What is the measure of the only ang...,3761,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,29.0,0,0,1
1266126,Step0:193:(0.2)(0.2)(0.2) is equal to which of...,193,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,37.0,0,0,1
1266127,Step0:113:Figure ABCD is translated so that th...,113,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,65.0,1,0,0
1266128,Step1:114:Look at the figure above. Let s find...,113,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,88.0,1,0,0
1266129,Step1:114:Look at the figure above. Let s find...,113,Stu_fe96fe63d83aa63c4ec667167fc7f1ce,21.0,1,0,0


In [23]:
# Build the student from the pretest rows
student = Student(studentData)

# questionData stands in for the next action coming back from the GUI.
# NOTE(review): studentData holds positions 0-9 and this takes 11-12, so
# position 10 is never used - confirm that skipping it is intended.
questionData = newdf.iloc[11:13]

# Only when at least one of those rows is a correct answer do we fold the
# rows into the student's history and pick the next question
if questionData["correct"].eq(1).any():
    student.updateStudent(questionData)
    student.nextQ()

# Question text that goes to the interface next
print(student.problem)

Marisa saved $500 to spend on a vacation. She will spend about $45 per day on her vacation, and she must have $70 left to pay for her bus ride home. Which of the inequalities above best represents the possible number SYSTEM?


# What is left to finish the model?

- Need to find correct answers
- Need to find images
- Need to figure out how to get through steps
- Need to format answer box to accept all close answers